From addd5fceb28e8e7704a6afd73a358369a1c31f62 Mon Sep 17 00:00:00 2001 From: wenbin Date: Wed, 11 Aug 2021 14:06:08 +0800 Subject: [PATCH 001/126] miss format (#34771) --- .../fluid/operators/math/bert_encoder_functor.cu | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/operators/math/bert_encoder_functor.cu b/paddle/fluid/operators/math/bert_encoder_functor.cu index 4d7218cd89e04b..645d1f637183c7 100644 --- a/paddle/fluid/operators/math/bert_encoder_functor.cu +++ b/paddle/fluid/operators/math/bert_encoder_functor.cu @@ -25,6 +25,14 @@ namespace paddle { namespace operators { namespace math { +template +__device__ __forceinline__ T local_rsqrt(T num) { + return rsqrt(static_cast(num)); +} +#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) +__device__ __forceinline__ half local_rsqrt(half num) { return hrsqrt(num); } +#endif + template __device__ inline void LayerNormSmall(T val, const kvp &thread_data, const int ld, const int idx, @@ -39,7 +47,7 @@ __device__ inline void LayerNormSmall(T val, const kvp &thread_data, if (threadIdx.x == 0) { mu = sum_kv.key; - rsigma = rsqrt(sum_kv.value - mu * mu + eps); + rsigma = local_rsqrt(sum_kv.value - mu * mu + eps); } __syncthreads(); @@ -63,7 +71,7 @@ __device__ inline void LayerNorm(const kvp &thread_data, const int ld, if (threadIdx.x == 0) { mu = sum_kv.key; - rsigma = rsqrt(sum_kv.value - mu * mu + eps); + rsigma = local_rsqrt(sum_kv.value - mu * mu + eps); } __syncthreads(); @@ -89,7 +97,7 @@ __device__ inline void LayerNorm2(const kvp &thread_data, const int ld, if (threadIdx.x == 0) { mu = sum_kv.key; - rsigma = rsqrt(sum_kv.value - mu * mu + eps); + rsigma = local_rsqrt(sum_kv.value - mu * mu + eps); } __syncthreads(); From 45af4f2aa7abe0bfe98ceec55f97064a2c298981 Mon Sep 17 00:00:00 2001 From: andyjpaddle <87074272+andyjpaddle@users.noreply.github.com> Date: Wed, 11 Aug 2021 14:19:09 +0800 Subject: [PATCH 002/126] [NPU] add elementwise_min_grad_op_npu,test=develop (#34731) --- .../elementwise/elementwise_min_op_npu.cc | 176 +++++++++++++++++- .../npu/test_elementwise_min_op_npu.py | 132 +++++++++---- 2 files changed, 265 insertions(+), 43 deletions(-) diff --git a/paddle/fluid/operators/elementwise/elementwise_min_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_min_op_npu.cc index 48ac3905f32bd9..84ff28bb3a0e4f 100644 --- a/paddle/fluid/operators/elementwise/elementwise_min_op_npu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_min_op_npu.cc @@ -15,7 +15,9 @@ limitations under the License. */ #include #include +#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/elementwise/elementwise_min_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_npu.h" #include "paddle/fluid/operators/npu_op_runner.h" namespace paddle { @@ -27,31 +29,199 @@ template class ElementwiseMinNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { + auto& dev_ctx = + ctx.template device_context(); auto* x = ctx.Input("X"); auto* y = ctx.Input("Y"); auto* out = ctx.Output("Out"); - auto place = ctx.GetPlace(); out->mutable_data(place); + int axis = ctx.Attr("axis"); + bool direct_compute = false; + auto x_dims = x->dims(); + auto y_dims = y->dims(); + axis = (axis == -1 ? 
std::abs(x_dims.size() - y_dims.size()) : axis); + if (x_dims.size() >= y_dims.size()) { + direct_compute = + y_dims == framework::slice_ddim(x_dims, axis, x_dims.size()); + } else { + direct_compute = + x_dims == framework::slice_ddim(y_dims, axis, y_dims.size()); + } + Tensor transformed_x, transformed_y; + if (direct_compute) { + transformed_x.ShareDataWith(*x); + transformed_y.ShareDataWith(*y); + } else { + NpuElementWiseOpBroadcast(dev_ctx, x, y, axis, &transformed_x, + &transformed_y); + } + const auto& runner = + NpuOpRunner("Minimum", {transformed_x, transformed_y}, {*out}, {}); auto stream = ctx.template device_context() .stream(); - - const auto& runner = NpuOpRunner("Minimum", {*x, *y}, {*out}, {}); runner.Run(stream); } }; +template +class ElementwiseMinGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& dev_ctx = + ctx.template device_context(); + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); + int axis = ctx.Attr("axis"); + axis = (axis == -1 ? std::abs(x->dims().size() - y->dims().size()) : axis); + auto stream = dev_ctx.stream(); + if (dx && dy) { + // dx + dx->mutable_data(ctx.GetPlace()); + Tensor tmp_x; + tmp_x.ShareDataWith(*dx); + if (dx->dims() != dout->dims()) { + std::vector dst_dims_vec_x; + std::vector reduce_axes_x; + auto src_dims_x = dx->dims(); + auto dout_dims = dout->dims(); + + int src_axis_x = (src_dims_x.size() < dout_dims.size() ? axis : 0); + for (int ax = 0; ax < dout_dims.size(); ++ax) { + if ((ax < src_axis_x || ax >= src_axis_x + src_dims_x.size()) || + (dout_dims[ax] > 1 && src_dims_x[ax - src_axis_x] == 1)) { + reduce_axes_x.push_back(ax); + } else { + dst_dims_vec_x.push_back(dout_dims[ax]); + } + } + if (!reduce_axes_x.empty()) { + tmp_x.Resize(framework::make_ddim(dst_dims_vec_x)); + } + } + // dy + dy->mutable_data(ctx.GetPlace()); + Tensor tmp_y; + tmp_y.ShareDataWith(*dy); + if (dy->dims() != dout->dims()) { + std::vector dst_dims_vec_y; + std::vector reduce_axes_y; + auto src_dims_y = dy->dims(); + auto dout_dims = dout->dims(); + + int src_axis_y = (src_dims_y.size() < dout_dims.size() ? axis : 0); + for (int ax = 0; ax < dout_dims.size(); ++ax) { + if ((ax < src_axis_y || ax >= src_axis_y + src_dims_y.size()) || + (dout_dims[ax] > 1 && src_dims_y[ax - src_axis_y] == 1)) { + reduce_axes_y.push_back(ax); + } else { + dst_dims_vec_y.push_back(dout_dims[ax]); + } + } + if (!reduce_axes_y.empty()) { + tmp_y.Resize(framework::make_ddim(dst_dims_vec_y)); + } + } + + const auto& runner = + NpuOpRunner("MinimumGrad", {*dout, *x, *y}, {tmp_x, tmp_y}, + {{"grad_x", true}, {"grad_y", true}}); + runner.Run(stream); + + } else if (dx) { + Tensor zero_tensor(dout->type()); + zero_tensor.mutable_data(y->dims(), ctx.GetPlace()); + FillNpuTensorWithConstant(&zero_tensor, static_cast(0)); + // dx + dx->mutable_data(ctx.GetPlace()); + Tensor tmp_x; + tmp_x.ShareDataWith(*dx); + if (dx->dims() != dout->dims()) { + std::vector dst_dims_vec_x; + std::vector reduce_axes_x; + auto src_dims_x = dx->dims(); + auto dout_dims = dout->dims(); + + int src_axis_x = (src_dims_x.size() < dout_dims.size() ? 
axis : 0); + for (int ax = 0; ax < dout_dims.size(); ++ax) { + if ((ax < src_axis_x || ax >= src_axis_x + src_dims_x.size()) || + (dout_dims[ax] > 1 && src_dims_x[ax - src_axis_x] == 1)) { + reduce_axes_x.push_back(ax); + } else { + dst_dims_vec_x.push_back(dout_dims[ax]); + } + } + if (!reduce_axes_x.empty()) { + tmp_x.Resize(framework::make_ddim(dst_dims_vec_x)); + } + } + + const auto& runner = + NpuOpRunner("MinimumGrad", {*dout, *x, *y}, {tmp_x, zero_tensor}, + {{"grad_x", true}, {"grad_y", true}}); + runner.Run(stream); + + } else if (dy) { + Tensor zero_tensor(dout->type()); + zero_tensor.mutable_data(x->dims(), ctx.GetPlace()); + FillNpuTensorWithConstant(&zero_tensor, static_cast(0)); + + // dy + dy->mutable_data(ctx.GetPlace()); + Tensor tmp_y; + tmp_y.ShareDataWith(*dy); + if (dy->dims() != dout->dims()) { + std::vector dst_dims_vec_y; + std::vector reduce_axes_y; + auto src_dims_y = dy->dims(); + auto dout_dims = dout->dims(); + + int src_axis_y = (src_dims_y.size() < dout_dims.size() ? axis : 0); + for (int ax = 0; ax < dout_dims.size(); ++ax) { + if ((ax < src_axis_y || ax >= src_axis_y + src_dims_y.size()) || + (dout_dims[ax] > 1 && src_dims_y[ax - src_axis_y] == 1)) { + reduce_axes_y.push_back(ax); + } else { + dst_dims_vec_y.push_back(dout_dims[ax]); + } + } + if (!reduce_axes_y.empty()) { + tmp_y.Resize(framework::make_ddim(dst_dims_vec_y)); + } + } + + const auto& runner = + NpuOpRunner("MinimumGrad", {*dout, *x, *y}, {zero_tensor, tmp_y}, + {{"grad_x", true}, {"grad_y", true}}); + runner.Run(stream); + + } else { + std::cout << "error" << std::endl; + } + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; +namespace plat = paddle::platform; REGISTER_OP_NPU_KERNEL( elementwise_min, ops::ElementwiseMinNPUKernel, ops::ElementwiseMinNPUKernel); + +REGISTER_OP_NPU_KERNEL( + elementwise_min_grad, + ops::ElementwiseMinGradNPUKernel, + ops::ElementwiseMinGradNPUKernel); diff --git a/python/paddle/fluid/tests/unittests/npu/test_elementwise_min_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_elementwise_min_op_npu.py index 2034a12c5c0feb..51cf5cdaf6d1af 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_elementwise_min_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_elementwise_min_op_npu.py @@ -18,81 +18,133 @@ import unittest import sys sys.path.append("..") -from op_test import OpTest +from op_test import OpTest, skip_check_grad_ci import paddle import paddle.fluid as fluid +from paddle.fluid import Program, program_guard +import paddle.fluid.core as core paddle.enable_static() SEED = 2021 -class TestElementwiseMin(OpTest): +class TestElementwiseMinOp(OpTest): def setUp(self): self.set_npu() self.op_type = "elementwise_min" self.place = paddle.NPUPlace(0) - self.init_dtype() - np.random.seed(SEED) - x = np.random.uniform(1, 2, [11, 17]).astype(self.dtype) - y = np.random.uniform(1, 2, [11, 17]).astype(self.dtype) - out = np.minimum(x, y) - + self.init_input_output() self.inputs = { - 'X': OpTest.np_dtype_to_fluid_dtype(x), - 'Y': OpTest.np_dtype_to_fluid_dtype(y) + 'X': OpTest.np_dtype_to_fluid_dtype(self.x), + 'Y': OpTest.np_dtype_to_fluid_dtype(self.y) } - self.attrs = {} - self.outputs = {'Out': out} + self.outputs = {'Out': self.out} + self.attrs = {'axis': self.axis} def set_npu(self): self.__class__.use_npu = True + def init_input_output(self): + # If x and y have the same value, the min() is not differentiable. 
+ # So we generate test data by the following method + # to avoid them being too close to each other. + self.x = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype) + self.sgn = np.random.choice([-1, 1], [13, 17]).astype(self.dtype) + self.y = self.x + self.sgn * np.random.uniform( + 0.1, 1, [13, 17]).astype(self.dtype) + self.out = np.minimum(self.x, self.y) + self.axis = -1 + def init_dtype(self): self.dtype = np.float32 def test_check_output(self): self.check_output_with_place(self.place) - # TODO(ascendrc): Min grad test - # def test_check_grad(self): - # if self.dtype == np.float16: - # return - # self.check_grad(['X'], 'Out') - # + def test_check_grad_normal(self): + if self.dtype == np.float16: + return + self.check_grad_with_place( + self.place, + ['X', 'Y'], + 'Out', ) -class TestElementwiseMinFp16(OpTest): - def setUp(self): - self.set_npu() - self.op_type = "elementwise_min" - self.place = paddle.NPUPlace(0) + def test_check_grad_ingore_x(self): + if self.dtype == np.float16: + return - self.init_dtype() - np.random.seed(SEED) - x = np.random.uniform(1, 2, [3, 4]).astype(self.dtype) - y = np.random.uniform(1, 2, [3, 4]).astype(self.dtype) - out = np.minimum(x, y) + self.check_grad_with_place( + self.place, + ['Y'], + 'Out', + no_grad_set=set("X"), ) - self.inputs = { - 'X': OpTest.np_dtype_to_fluid_dtype(x), - 'Y': OpTest.np_dtype_to_fluid_dtype(y) - } - self.attrs = {} - self.outputs = {'Out': out} + def test_check_grad_ingore_y(self): + if self.dtype == np.float16: + return + + self.check_grad_with_place( + self.place, + ['X'], + 'Out', + no_grad_set=set("Y"), ) - def set_npu(self): - self.__class__.use_npu = True - self.__class__.no_need_check_grad = True +class TestElementwiseMinOpFp16(TestElementwiseMinOp): def init_dtype(self): self.dtype = np.float16 - def test_check_output(self): - self.check_output_with_place(self.place, atol=1e-5) + +class TestElementwiseMinOp_Vector(TestElementwiseMinOp): + def init_input_output(self): + self.x = np.random.uniform(1, 2, (100, )).astype(self.dtype) + self.sgn = np.random.choice([-1, 1], (100, )).astype(self.dtype) + self.y = self.x + self.sgn * np.random.uniform(0.1, 1, ( + 100, )).astype(self.dtype) + self.out = np.minimum(self.x, self.y) + self.axis = -1 + + +class TestElementwiseMinOpFp16_Vector(TestElementwiseMinOp_Vector): + def init_dtype(self): + self.dtype = np.float16 + + +@skip_check_grad_ci( + reason="[skip shape check] Use y_shape(1) to test broadcast.") +class TestElementwiseMinOp_scalar(TestElementwiseMinOp): + def init_input_output(self): + self.x = np.random.random_integers(-5, 5, [10, 3, 4]).astype(self.dtype) + self.y = np.array([0.5]).astype(self.dtype) + self.out = np.minimum(self.x, self.y) + self.axis = -1 + + +@skip_check_grad_ci( + reason="[skip shape check] Use y_shape(1) to test broadcast.") +class TestElementwiseMinOpFp16_scalar(TestElementwiseMinOp_scalar): + def init_dtype(self): + self.dtype = np.float16 + + +class TestElementwiseMinOp_broadcast(TestElementwiseMinOp): + def init_input_output(self): + self.x = np.random.uniform(0.5, 1, (2, 3, 100)).astype(self.dtype) + self.sgn = np.random.choice([-1, 1], (100, )).astype(self.dtype) + self.y = self.x[0, 0, :] + self.sgn * \ + np.random.uniform(1, 2, (100, )).astype(self.dtype) + self.out = np.minimum(self.x, self.y.reshape(1, 1, 100)) + self.axis = -1 + + +class TestElementwiseMinOpFp16_broadcast(TestElementwiseMinOp_broadcast): + def init_dtype(self): + self.dtype = np.float16 -class TestElementwiseMinNet(unittest.TestCase): +class 
TestElementwiseMinOpNet(unittest.TestCase): def _test(self, run_npu=True): main_prog = paddle.static.Program() startup_prog = paddle.static.Program() From b5ec65e15a6534f1d9f83a7cdb79743dd4172d5a Mon Sep 17 00:00:00 2001 From: 0x45f <23097963+0x45f@users.noreply.github.com> Date: Wed, 11 Aug 2021 14:42:14 +0800 Subject: [PATCH 003/126] [NPU] Add exp and exp_grad npu op (#34612) * add exp and exp_grad npu op * modify support register type * remove empty line and remove exp_grad support data type int/int64 * move exp and epx_grad kernel to activation_op_npu.cc, delete attrs * move code to activation_op_npu.cc --- paddle/fluid/operators/activation_op_npu.cc | 39 ++++++++++ .../tests/unittests/npu/test_exp_op_npu.py | 73 +++++++++++++++++++ 2 files changed, 112 insertions(+) mode change 100644 => 100755 paddle/fluid/operators/activation_op_npu.cc create mode 100755 python/paddle/fluid/tests/unittests/npu/test_exp_op_npu.py diff --git a/paddle/fluid/operators/activation_op_npu.cc b/paddle/fluid/operators/activation_op_npu.cc old mode 100644 new mode 100755 index 02ce817bcc8b2b..5cf70cc391d8ff --- a/paddle/fluid/operators/activation_op_npu.cc +++ b/paddle/fluid/operators/activation_op_npu.cc @@ -560,6 +560,37 @@ class AtanGradNPUKernel : public framework::OpKernel { } }; +template +class ExpNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + const auto& runner = NpuOpRunner("Exp", {*x}, {*out}, {}); + auto stream = + ctx.template device_context() + .stream(); + runner.Run(stream); + } +}; + +template +class ExpGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* out = ctx.Input("Out"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + dx->mutable_data(ctx.GetPlace()); + auto stream = + ctx.template device_context() + .stream(); + const auto& runner = NpuOpRunner("Mul", {*dout, *out}, {*dx}, {}); + runner.Run(stream); + } +}; + } // namespace operators } // namespace paddle @@ -692,3 +723,11 @@ REGISTER_OP_NPU_KERNEL( ops::AtanGradNPUKernel, ops::AtanGradNPUKernel); + +REGISTER_OP_NPU_KERNEL( + exp, ops::ExpNPUKernel, + ops::ExpNPUKernel); + +REGISTER_OP_NPU_KERNEL( + exp_grad, ops::ExpGradNPUKernel, + ops::ExpGradNPUKernel); diff --git a/python/paddle/fluid/tests/unittests/npu/test_exp_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_exp_op_npu.py new file mode 100755 index 00000000000000..ccd5f0649d8dc6 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_exp_op_npu.py @@ -0,0 +1,73 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function +import unittest + +import numpy as np +from scipy.special import expit, erf + +from paddle.fluid.tests.unittests.op_test import OpTest, convert_float_to_uint16 +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid import compiler, Program, program_guard + +paddle.enable_static() +SEED = 2049 + + +class TestExpNPUOP(OpTest): + def setUp(self): + + self.set_npu() + self.place = paddle.NPUPlace(0) + self.op_type = "exp" + self.init_dtype() + self.init_kernel_type() + + np.random.seed(SEED) + x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype) + out = np.exp(x) + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.outputs = {'Out': out} + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + if self.dtype == np.float16: + return + self.check_grad_with_place(self.place, ['X'], 'Out') + + def init_dtype(self): + self.dtype = np.float32 + + def init_kernel_type(self): + pass + + def set_npu(self): + self.__class__.use_npu = True + + +class TestExpNPUOPFloat64(TestExpNPUOP): + def init_dtype(self): + self.dtype = np.float64 + + +if __name__ == "__main__": + unittest.main() From 88f2f4a40cf0a36bd6eed29ae03be75abbfdf650 Mon Sep 17 00:00:00 2001 From: ShenLiang <1422485404@qq.com> Date: Wed, 11 Aug 2021 14:58:32 +0800 Subject: [PATCH 004/126] [HybridParallel] Support save/load for PipeLineParallel (#34768) * add save/load for pipelineparallel * add save/load --- .../parallel_layers/pp_layers.py | 55 ++++++++- .../unittests/hybrid_parallel_pp_save_load.py | 114 ++++++++++++++++++ .../hybrid_parallel_pp_transformer.py | 3 +- ...test_parallel_dygraph_pipeline_parallel.py | 3 + 4 files changed, 172 insertions(+), 3 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/hybrid_parallel_pp_save_load.py diff --git a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/pp_layers.py b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/pp_layers.py index a3c6a5b5fb665f..f546adc65ea714 100644 --- a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/pp_layers.py +++ b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/pp_layers.py @@ -11,12 +11,18 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+ import math -import paddle import re +import glob +import os +import numpy as np +import random +from functools import partial + +import paddle from paddle.fluid.dygraph.layers import Layer from ...utils.log_util import logger, layer_to_str -from functools import partial __all__ = [] @@ -310,3 +316,48 @@ def forward(self, input): for layer in self.run_function: input = layer(input) return input + + def save_state_dict(self, path): + if self._topo.get_coord(self.global_rank).data != 0: + return + + def _offset_dirname(ckpt_dir, local_layer_idx): + idx = local_layer_idx + self._start_pos + model_rank = self._topo.get_coord(self.global_rank).model + rank_message = "-tensor_" + "{:0>2d}".format(model_rank) + layer_save_path = os.path.join(ckpt_dir, + 'layer_{:0>2d}'.format(idx)) + layer_save_path = layer_save_path + rank_message + '-model_states.pdparams' + return layer_save_path + + os.makedirs(path, exist_ok=True) + for idx, layer in enumerate(self.run_function): + model_save_path = _offset_dirname(path, idx) + if not hasattr(layer, 'state_dict'): + continue + paddle.save(layer.state_dict(), model_save_path) + + logger.info("save model state successfully...") + + def set_state_dir(self, path): + assert os.path.exists( + path), "{} not found, please check the path".format(path) + + for idx, layer in enumerate(self.run_function): + if not hasattr(layer, 'set_state_dict'): + continue + layer_idx = idx + self._start_pos + layer_save_path = os.path.join(path, + 'layer_{0:0>2d}'.format(layer_idx)) + model_files = glob.glob(layer_save_path + "*model_states.pdparams") + model_files.sort() + mp_rank = self._topo.get_coord(self.global_rank).model + mp_world_size = self._topo.get_dim('model') + num_files = len(model_files) + + load_param_path = model_files[mp_rank * num_files // mp_world_size] + model_state_dict = paddle.load(load_param_path) + layer.set_state_dict(model_state_dict) + + self._synchronize_shared_weights() + logger.info("load model state successfully...") diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_save_load.py b/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_save_load.py new file mode 100644 index 00000000000000..e6e27bbb41a8a4 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_save_load.py @@ -0,0 +1,114 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import division +from __future__ import print_function + +import unittest +import paddle +import numpy as np +import random +import os +import shutil +import tempfile +import paddle.distributed as dist +import paddle.distributed.fleet as fleet +from hybrid_parallel_pp_transformer import ModelPipe, set_random_seed + +batch_size = 8 +length = 8 +micro_batch_size = 2 +vocab_size = 128 + + +class TestDistPPSaveLoadTraning(unittest.TestCase): + def setUp(self): + strategy = fleet.DistributedStrategy() + self.model_parallel_size = 1 + self.data_parallel_size = 1 + self.pipeline_parallel_size = 2 + strategy.hybrid_configs = { + "dp_degree": self.data_parallel_size, + "mp_degree": self.model_parallel_size, + "pp_degree": self.pipeline_parallel_size, + } + strategy.pipeline_configs = { + "accumulate_steps": batch_size // micro_batch_size, + "micro_batch_size": micro_batch_size + } + fleet.init(is_collective=True, strategy=strategy) + + def test_pp_model(self): + hcg = fleet.get_hybrid_communicate_group() + word_size = hcg.get_model_parallel_world_size() + dp_id = hcg.get_data_parallel_rank() + pp_id = hcg.get_stage_id() + rank_id = dist.get_rank() + topology = hcg.topology() + set_random_seed(1024, dp_id, rank_id) + + model = ModelPipe(topology) + scheduler = paddle.optimizer.lr.PiecewiseDecay( + boundaries=[2], values=[0.001, 0.002], verbose=True) + optimizer = paddle.optimizer.SGD(learning_rate=scheduler, + parameters=model.parameters()) + + model = fleet.distributed_model(model) + optimizer = fleet.distributed_optimizer(optimizer) + output_dir = tempfile.mkdtemp() + + # warmup step + for step_id in range(2): + x_data = np.random.randint(0, vocab_size, size=[batch_size, length]) + x = paddle.to_tensor(x_data) + x.stop_gradient = True + loss = model.train_batch([x, x], optimizer, scheduler) + + model._layers.save_state_dict(output_dir) + paddle.save(optimizer.state_dict(), + os.path.join(output_dir, "model_state.pdopt")) + + # construct data + test_steps = 5 + np_data = np.random.randint( + 0, vocab_size, size=[test_steps, batch_size, length]) + + origin_loss = [] + for step_id in range(5): + x_data = np_data[step_id, :] + x = paddle.to_tensor(x_data) + x.stop_gradient = True + loss = model.train_batch([x, x], optimizer, scheduler) + origin_loss.append(loss.numpy()) + + # test step + model._layers.set_state_dir(output_dir) + opt_dict = paddle.load(os.path.join(output_dir, "model_state.pdopt")) + optimizer.set_state_dict(opt_dict) + + for step_id in range(5): + x_data = np_data[step_id, :] + x = paddle.to_tensor(x_data) + x.stop_gradient = True + loss = model.train_batch([x, x], optimizer, scheduler) + print("origin loss: ", origin_loss[step_id], "current loss: ", + loss.numpy()) + np.testing.assert_allclose(loss.numpy(), origin_loss[step_id]) + + # finally, remove the model/optimizer path + shutil.rmtree(output_dir) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_transformer.py b/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_transformer.py index 62b1a8b1da6797..524099c6ab05e8 100644 --- a/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_transformer.py +++ b/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_transformer.py @@ -86,7 +86,8 @@ def forward(self, x, mask): product = layers.matmul(x=q, y=k, transpose_y=True, alpha=d_model**-0.5) weights = F.softmax(product + mask) - weights = F.dropout(weights, 0.2) + # TODO(shenliang03) For save/load in PipeLineParallel, can’t support dropout 
temporarily. + # weights = F.dropout(weights, 0.2) tgt = layers.matmul(weights, v) residual = tgt tgt = self.norm1(tgt) diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_pipeline_parallel.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_pipeline_parallel.py index 62e781678c9fc8..003e0c1685cae7 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_pipeline_parallel.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_pipeline_parallel.py @@ -36,6 +36,9 @@ def test_pipeline_parallel(self): def test_hybrid_parallel_transformer(self): self.run_mnist_2gpu('hybrid_parallel_pp_transformer.py') + def test_hybrid_parallel_transformer(self): + self.run_mnist_2gpu('hybrid_parallel_pp_save_load.py') + if __name__ == "__main__": unittest.main() From 3f962e772b69de9d176dfb2bbed8bc7487810964 Mon Sep 17 00:00:00 2001 From: lilong12 Date: Wed, 11 Aug 2021 15:20:25 +0800 Subject: [PATCH 005/126] add the basic apis for auto_parallel (#33804) * add auto_parallel apis --- paddle/fluid/framework/CMakeLists.txt | 2 +- paddle/fluid/framework/framework.proto | 17 + paddle/fluid/framework/process_mesh_desc.cc | 61 +++ paddle/fluid/framework/process_mesh_desc.h | 65 +++ paddle/fluid/framework/proto_desc.h | 8 + paddle/fluid/framework/var_desc.cc | 40 ++ paddle/fluid/framework/var_desc.h | 14 + paddle/fluid/pybind/const_value.cc | 4 + paddle/fluid/pybind/protobuf.cc | 19 +- paddle/fluid/pybind/protobuf.h | 1 + paddle/fluid/pybind/pybind.cc | 1 + python/paddle/distributed/__init__.py | 15 +- .../distributed/auto_parallel/__init__.py | 22 + .../distributed/auto_parallel/interface.py | 438 ++++++++++++++++++ python/paddle/fluid/framework.py | 116 +++++ .../tests/unittests/test_auto_parallel_api.py | 143 ++++++ python/setup.py.in | 1 + 17 files changed, 964 insertions(+), 3 deletions(-) create mode 100644 paddle/fluid/framework/process_mesh_desc.cc create mode 100644 paddle/fluid/framework/process_mesh_desc.h create mode 100644 python/paddle/distributed/auto_parallel/__init__.py create mode 100644 python/paddle/distributed/auto_parallel/interface.py create mode 100644 python/paddle/fluid/tests/unittests/test_auto_parallel_api.py diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 419db670467a01..d0c64f44af3e2f 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -202,7 +202,7 @@ cc_test(operator_exception_test SRCS operator_exception_test.cc DEPS operator op cc_library(version SRCS version.cc) cc_test(version_test SRCS version_test.cc DEPS version) -cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS attribute shape_inference op_info operator glog version) +cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc process_mesh_desc.cc DEPS attribute shape_inference op_info operator glog version) cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc) diff --git a/paddle/fluid/framework/framework.proto b/paddle/fluid/framework/framework.proto index 5c9ec2bad347e0..73103eb28274cd 100644 --- a/paddle/fluid/framework/framework.proto +++ b/paddle/fluid/framework/framework.proto @@ -38,6 +38,13 @@ enum AttrType { FLOAT64S = 12; } +message ProcessMeshDesc { + required int32 id = 1; + required int32 parent_id = 2; + repeated int32 topology = 3; + repeated int32 process_group = 4; +}; + // OpDesc describes an instance of a C++ framework::OperatorBase // derived class type. 
message OpDesc { @@ -167,6 +174,15 @@ message VarType { } message VarDesc { + + message Attr { + required string name = 1; + required AttrType type = 2; + optional int32 i = 3; + optional string s = 4; + repeated int32 ints = 5; + }; + required string name = 1; required VarType type = 2; optional bool persistable = 3 [ default = false ]; @@ -175,6 +191,7 @@ message VarDesc { optional bool need_check_feed = 4 [ default = false ]; optional bool is_parameter = 5 [ default = false ]; optional bool stop_gradient = 6 [ default = false ]; + repeated Attr attrs = 7; } message BlockDesc { diff --git a/paddle/fluid/framework/process_mesh_desc.cc b/paddle/fluid/framework/process_mesh_desc.cc new file mode 100644 index 00000000000000..207e10fc194a82 --- /dev/null +++ b/paddle/fluid/framework/process_mesh_desc.cc @@ -0,0 +1,61 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/process_mesh_desc.h" + +namespace paddle { +namespace framework { + +int32_t ProcessMeshDesc::next_id = -1; + +ProcessMeshDesc::ProcessMeshDesc(const std::vector &topo, + const std::vector &process_group, + int32_t parent_id) { + int32_t cur_id = ++next_id; + desc_.set_id(cur_id); + desc_.set_parent_id(parent_id); + for (size_t i = 0; i != topo.size(); ++i) { + desc_.add_topology(topo[i]); + } + for (size_t i = 0; i != process_group.size(); ++i) { + desc_.add_process_group(process_group[i]); + } + ProcessMeshDescMap::GetInstance().Insert(cur_id, this); +} + +std::vector ProcessMeshDesc::Topology() const { + size_t size = desc_.topology_size(); + std::vector ret(size); + for (auto i = 0; i != desc_.topology_size(); ++i) { + ret[i] = desc_.topology(i); + } + return ret; +} + +std::vector ProcessMeshDesc::ProcessGroup() const { + size_t size = desc_.process_group_size(); + std::vector ret(size); + for (auto i = 0; i != desc_.process_group_size(); ++i) { + ret[i] = desc_.process_group(i); + } + return ret; +} + +ProcessMeshDescMap &ProcessMeshDescMap::GetInstance() { + static ProcessMeshDescMap g_process_mesh_desc_map; + return g_process_mesh_desc_map; +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/process_mesh_desc.h b/paddle/fluid/framework/process_mesh_desc.h new file mode 100644 index 00000000000000..3829da7650f074 --- /dev/null +++ b/paddle/fluid/framework/process_mesh_desc.h @@ -0,0 +1,65 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include + +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/proto_desc.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/macros.h" + +namespace paddle { +namespace framework { + +class ProcessMeshDesc { + public: + ProcessMeshDesc(const std::vector& topo, + const std::vector& process_group, int32_t parent_id); + + int32_t ID() const { return desc_.id(); } + int32_t Parent() const { return desc_.parent_id(); } + + std::vector Topology() const; + std::vector ProcessGroup() const; + + static int32_t next_id; + + private: + proto::ProcessMeshDesc desc_; // not_own +}; + +class ProcessMeshDescMap { + public: + static ProcessMeshDescMap& GetInstance(); + + bool Has(int32_t index) const { return map_.find(index) != map_.end(); } + + void Insert(int32_t index, ProcessMeshDesc* mesh) { + PADDLE_ENFORCE_NE( + Has(index), true, + platform::errors::AlreadyExists("Index (%d) has been used.", index)); + map_.insert(std::make_pair(index, mesh)); + } + + private: + ProcessMeshDescMap() = default; + // Use raw pointer to avoid double free + std::unordered_map map_; + DISABLE_COPY_AND_ASSIGN(ProcessMeshDescMap); +}; +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/proto_desc.h b/paddle/fluid/framework/proto_desc.h index 40521c07829336..73572c4023c09b 100644 --- a/paddle/fluid/framework/proto_desc.h +++ b/paddle/fluid/framework/proto_desc.h @@ -22,5 +22,13 @@ constexpr int kRootBlockIndex = 0; // The Parent Index of root Block, this block does not exist. constexpr int kNoneBlockIndex = -1; +// The Parent Index of root ProcessMesh, this ProcessMesh does not exist. +constexpr int kNoneProcessMeshIndex = -1; + +// If a attribute name has a certain suffix, it means that the +// atrribute is a distributed-related attribute for auto parallel. +// e.g., "mesh_id@PARALLEL". 
+constexpr char kAutoParallelSuffix[] = "@PARALLEL"; + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/var_desc.cc b/paddle/fluid/framework/var_desc.cc index 3f1cf30c7cab1f..c3bdd6ae7f135c 100644 --- a/paddle/fluid/framework/var_desc.cc +++ b/paddle/fluid/framework/var_desc.cc @@ -280,6 +280,46 @@ std::vector VarDesc::mutable_tensor_descs() { } } +std::vector VarDesc::AttrNames() const { + std::vector retv; + retv.reserve(attrs_.size()); + for (auto &attr : attrs_) { + retv.push_back(attr.first); + } + return retv; +} + +void VarDesc::RemoveAttr(const std::string &name) { attrs_.erase(name); } + +void VarDesc::SetAttr(const std::string &name, const Attribute &v) { + // NOTICE(sandyhouse): pybind11 will take the empty list in python as + // the std::vector type in C++; so we have to change the attr's type + // here if we meet this issue + proto::AttrType attr_type = static_cast(v.which() - 1); + if (attr_type == proto::AttrType::INTS && + BOOST_GET_CONST(std::vector, v).size() == 0u) { + // Find current attr via attr name and set the correct attribute value + this->attrs_[name] = std::vector(); + return; + } + bool valid = attr_type == proto::AttrType::INT || + attr_type == proto::AttrType::STRING || + attr_type == proto::AttrType::INTS; + PADDLE_ENFORCE_EQ(valid, true, platform::errors::InvalidArgument( + "The value for attr (%s) must be " + "one of list or int or string.", + name)); + + this->attrs_[name] = v; +} + +Attribute VarDesc::GetAttr(const std::string &name) const { + auto it = attrs_.find(name); + PADDLE_ENFORCE_NE(it, attrs_.end(), platform::errors::NotFound( + "Attribute %s is not found.", name)); + return it->second; +} + bool operator==(const VarDesc &left, const VarDesc &right) { return left.Proto()->SerializeAsString() == right.Proto()->SerializeAsString(); diff --git a/paddle/fluid/framework/var_desc.h b/paddle/fluid/framework/var_desc.h index 567cac5bb24c32..6821165692d2a4 100644 --- a/paddle/fluid/framework/var_desc.h +++ b/paddle/fluid/framework/var_desc.h @@ -19,7 +19,9 @@ limitations under the License. */ #include #include "glog/logging.h" +#include "paddle/fluid/framework/attribute.h" #include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/type_defs.h" namespace paddle { namespace framework { @@ -137,6 +139,17 @@ class VarDesc { desc_.set_need_check_feed(need_check_feed); } + bool HasAttr(const std::string &name) const { + return attrs_.find(name) != attrs_.end(); + } + + std::vector AttrNames() const; + + void SetAttr(const std::string &name, const Attribute &v); + void RemoveAttr(const std::string &name); + + Attribute GetAttr(const std::string &name) const; + private: const proto::VarType::TensorDesc &tensor_desc() const; std::vector tensor_descs() const; @@ -144,6 +157,7 @@ class VarDesc { std::vector mutable_tensor_descs(); proto::VarDesc desc_; + AttributeMap attrs_; }; bool operator==(const VarDesc &left, const VarDesc &right); diff --git a/paddle/fluid/pybind/const_value.cc b/paddle/fluid/pybind/const_value.cc index c24a2b4a088c5c..30d04e62e69d3c 100644 --- a/paddle/fluid/pybind/const_value.cc +++ b/paddle/fluid/pybind/const_value.cc @@ -16,6 +16,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/ir/node.h" #include "paddle/fluid/framework/op_proto_maker.h" #include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/proto_desc.h" #if defined(PADDLE_WITH_DGC) #include "paddle/fluid/framework/details/dgc_const_values.h" @@ -33,6 +34,9 @@ void BindConstValue(pybind11::module* m) { m->def("kControlDepVarName", [] { return framework::ir::Node::kControlDepVarName; }); m->def("kNewGradSuffix", [] { return framework::kNewGradSuffix; }); + m->def("kAutoParallelSuffix", [] { return framework::kAutoParallelSuffix; }); + m->def("kNoneProcessMeshIndex", + [] { return framework::kNoneProcessMeshIndex; }); auto op_proto_and_checker_maker = m->def_submodule("op_proto_and_checker_maker"); diff --git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc index f40a8d12611a36..7cd21785a47591 100644 --- a/paddle/fluid/pybind/protobuf.cc +++ b/paddle/fluid/pybind/protobuf.cc @@ -20,6 +20,7 @@ limitations under the License. */ #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/framework/process_mesh_desc.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/var_desc.h" #include "paddle/fluid/framework/version.h" @@ -84,6 +85,17 @@ void BindProgramDesc(pybind11::module *m) { [](pd::ProgramDesc &self) -> int64_t { return self.Version(); }); } +void BindProcessMeshDesc(pybind11::module *m) { + pybind11::class_(*m, "ProcessMeshDesc", "") + .def(pybind11::init &, + const std::vector &, int32_t>()) + .def_property_readonly("id", &pd::ProcessMeshDesc::ID) + .def_property_readonly("parent", &pd::ProcessMeshDesc::Parent) + .def_property_readonly("topology", &pd::ProcessMeshDesc::Topology) + .def_property_readonly("process_group", + &pd::ProcessMeshDesc::ProcessGroup); +} + void BindBlockDesc(pybind11::module *m) { pybind11::class_ blockdesc(*m, "BlockDesc", ""); g_blockdesc_pytype = (PyTypeObject *)blockdesc.ptr(); // NOLINT @@ -184,7 +196,12 @@ void BindVarDsec(pybind11::module *m) { .def("clear_stop_gradient", &pd::VarDesc::ClearStopGradient) .def("has_stop_gradient", &pd::VarDesc::HasStopGradient) .def("need_check_feed", &pd::VarDesc::NeedCheckFeed) - .def("set_need_check_feed", &pd::VarDesc::SetNeedCheckFeed); + .def("set_need_check_feed", &pd::VarDesc::SetNeedCheckFeed) + .def("has_attr", &pd::VarDesc::HasAttr) + .def("attr_names", &pd::VarDesc::AttrNames) + .def("_set_attr", &pd::VarDesc::SetAttr) + .def("remove_attr", &pd::VarDesc::RemoveAttr) + .def("attr", &pd::VarDesc::GetAttr); pybind11::enum_ vartype(var_desc, "VarType", ""); g_vartype_pytype = (PyTypeObject *)vartype.ptr(); // NOLINT diff --git a/paddle/fluid/pybind/protobuf.h b/paddle/fluid/pybind/protobuf.h index e7370672a88fcf..4c5aa9701cd5a1 100644 --- a/paddle/fluid/pybind/protobuf.h +++ b/paddle/fluid/pybind/protobuf.h @@ -30,6 +30,7 @@ void BindProgramDesc(pybind11::module* m); void BindBlockDesc(pybind11::module* m); void BindVarDsec(pybind11::module* m); void BindOpDesc(pybind11::module* m); +void BindProcessMeshDesc(pybind11::module* m); } // namespace pybind } // namespace paddle diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 040ae26213f5f5..859e0ad916dd47 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -2054,6 +2054,7 @@ All parameter, weight, gradient are variables in Paddle. 
BindOpDesc(&m); BindConstValue(&m); BindGlobalValueGetterSetter(&m); + BindProcessMeshDesc(&m); py::class_(m, "LodRankTable") .def("items", [](framework::LoDRankTable &table) { diff --git a/python/paddle/distributed/__init__.py b/python/paddle/distributed/__init__.py index 7427219285c200..90e6a00e384af6 100644 --- a/python/paddle/distributed/__init__.py +++ b/python/paddle/distributed/__init__.py @@ -36,6 +36,13 @@ from .collective import send # noqa: F401 from .collective import wait # noqa: F401 +from .auto_parallel import shard_tensor # noqa: F401 +from .auto_parallel import shard_op # noqa: F401 +from .auto_parallel import set_shard_mask # noqa: F401 +from .auto_parallel import set_offload_device # noqa: F401 +from .auto_parallel import set_pipeline_stage # noqa: F401 +from .auto_parallel import ProcessMesh # noqa: F401 + from .fleet import BoxPSDataset # noqa: F401 from .entry_attr import ProbabilityEntry # noqa: F401 @@ -69,5 +76,11 @@ "ReduceOp", "wait", "get_rank", - "ProbabilityEntry" + "ProbabilityEntry", + "shard_tensor", + "shard_op", + "set_shard_mask", + "set_offload_device", + "set_pipeline_stage", + "ProcessMesh", ] diff --git a/python/paddle/distributed/auto_parallel/__init__.py b/python/paddle/distributed/auto_parallel/__init__.py new file mode 100644 index 00000000000000..afe8d5652cfa73 --- /dev/null +++ b/python/paddle/distributed/auto_parallel/__init__.py @@ -0,0 +1,22 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .interface import shard_tensor # noqa: F401 +from .interface import shard_op # noqa: F401 +from .interface import set_shard_mask # noqa: F401 +from .interface import set_offload_device # noqa: F401 +from .interface import set_pipeline_stage # noqa: F401 +from .interface import ProcessMesh # noqa: F401 + +__all__ = [] diff --git a/python/paddle/distributed/auto_parallel/interface.py b/python/paddle/distributed/auto_parallel/interface.py new file mode 100644 index 00000000000000..f98cc30131457c --- /dev/null +++ b/python/paddle/distributed/auto_parallel/interface.py @@ -0,0 +1,438 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import numpy +import paddle.fluid.core as core +import paddle +from paddle.fluid.framework import Variable +from paddle.fluid.framework import in_dygraph_mode + +__all__ = [] + +# a map from ProcessMesh ids to the ProcessMesh instances +_g_process_mesh_map = dict() + +# user defined map from logical process ids to physical ones +_user_defined_physical_map = None + + +def _append_attr_suffix(name): + """ + Append auto parallel suffix for distributed attribute name. + """ + return name + core.kAutoParallelSuffix() + + +def _remove_attr_suffix(name): + """ + Remove auto parallel suffix from distributed attribute name. + """ + return name.strip(core.kAutoParallelSuffix()) + + +def _static_mode_check(): + if in_dygraph_mode(): + raise RuntimeError("Auto-parallel only supports static mode, " + "please use paddle.enable_static().") + + +def _get_nested_list_shape(nested_list): + """ + Get the shape of a nested_list. + """ + result = [] + while isinstance(nested_list, list): + result.append(len(nested_list)) + nested_list = nested_list[0] + return result + + +def _flatten_nested_list(nested_list): + """ + Get a list of all items in a nested_list. + Ref: https://stackoverflow.com/questions/952914/how-to-make-a-flat-list-out-of-a-list-of-lists + """ + result = numpy.array(nested_list).flatten().tolist() + return result + + +class ProcessMesh(object): + r""" + The class `Processmesh` describes the topology of logical processes. + A mesh is an N-dimensional array. The shape of the N-dimensional + array represents the topology of logical processes and every + element of the N-dimensional array represent a logical process. For + example, the 2-dimensional array [[2, 4, 5], [0, 1, 3]] + illustrates six logical processes organized as the topology [2, 3], + i.e., the shape of the 2-dimensional array. With the above topology, + there are two parallel groups, where the first parallel group has a + parallel degree of 2 and the second one has a parallel degree of 3. + And the first logical process is the one with id=2. + + Args: + mesh (list): an N-dimensional array (nested list) describes the toplogy + of logical processes. The shape of the N-dimensional array + represents the topology of logical processes and every + element of the N-dimensional array represents a logical process. + parent (ProcessMesh, optional): the parent ProcessMesh. None means + the ProcessMesh is the root one without parent ProcessMesh. + Default: None. + + Returns: + None + + Raises: + ValueError: If `mesh` is not an instance of list. + + Examples: + .. code-block:: python + + import paddle + import paddle.distributed as dist + + paddle.enable_static() + + mesh = dist.ProcessMesh([[2, 4, 5], [0, 1, 3]]) + assert mesh.parent is None + assert mesh.topology == [2, 3] + assert mesh.process_group == [2, 4, 5, 0, 1, 3] + mesh.set_placement([0, 1, 2, 3, 4, 5]) + + """ + + def __init__(self, mesh, parent=None): + _static_mode_check() + if mesh is None or not isinstance(mesh, list): + raise ValueError('mesh must be an instance of list.') + + self._topology = _get_nested_list_shape(mesh) + self._processes = _flatten_nested_list(mesh) + + # Every element of mesh must be >= 0. + assert min(self._processes) >= 0, ('All elements of mesh must be >= 0.') + + unique_ids = set(self._processes) + assert len(unique_ids) == len(self._processes), ( + 'All elements of mesh must be unique.') + + if parent is None: + # For root ProcessMesh, the ids of logical processes must be range + # from 0 to N-1, where N is the number of logical processes. 
+ assert max(self._processes) == len(self._processes) - 1, ( + 'For root ProcessMesh, ids of logical processes must be range ' + 'from 0 to N-1, where N is the number of logical processes.') + + parent_id = core.kNoneProcessMeshIndex() + assert len(_g_process_mesh_map.keys()) == 0, ( + 'The first ProcessMesh must be the root, which has no parent.') + else: + assert len(_g_process_mesh_map.keys()) > 0, ( + 'All ProcessMesh must have a parent except the root one.') + + assert isinstance(parent, ProcessMesh), ( + 'parent must be an instance of ProcessMesh.') + parent_id = parent._desc.id + + # All elements in mesh must belong to its parent + parent_ids = set(parent.process_group) + assert unique_ids <= parent_ids, ( + 'All elements in mesh must belong to its parent.') + + self._desc = core.ProcessMeshDesc(self._topology, self._processes, + parent_id) + + self._id = self._desc.id + self._parent_id = parent_id + assert self._id not in _g_process_mesh_map, ( + "The ProcessMesh with id %d already exists." % self._id) + _g_process_mesh_map[self._id] = self + + @property + def topology(self): + r""" + Get the topology of logical processes belonging to this ProcessMesh. + This is the shape of `mesh` used to initialized this ProcessMesh. + """ + return self._topology + + @property + def process_group(self): + r""" + Get a list of all processes belonging to this ProcessMesh. + """ + return self._processes + + @property + def parent(self): + r""" + Get the parent ProcessMesh. + """ + if self._parent_id == core.kNoneProcessMeshIndex(): return None + assert self._parent_id in _g_process_mesh_map, ( + "parent with id %d does not exist." % self._parent_id) + return _g_process_mesh_map[self._parent_id] + + def set_placement(self, order): + """ + Set the map from logical processes to physical ones using the + user defined order. + + Args: + order (list): order of the physical process ids. + + Returns: + None + + Examples: + .. 
code-block:: python + + import paddle + import paddle.distributed as dist + + paddle.enable_static() + + mesh = dist.ProcessMesh([[2, 4, 5], [0, 1, 3]]) + mesh.set_placement([0, 1, 2, 3, 4, 5]) + + """ + assert self.parent is None, ( + "This function can only be called by the root ProcessMesh.") + unique_ids = set(order) + assert isinstance(order, list) + + assert len(unique_ids) == len(order), ( + "All elements in order must be unique.") + assert min(order) == 0 + assert max(order) == len(order) - 1, ( + "All elements in order must be from 0 to N - 1, where N " + "is the number of physical processes.") + + logical_order = self.process_group + global _user_defined_physical_map + assert _user_defined_physical_map is None, ( + "This function can only be called once.") + _user_defined_physical_map = dict() + + assert len(logical_order) == len(order) + for idx, l_id in enumerate(logical_order): + _user_defined_physical_map[l_id] = order[idx] + + def __eq__(self, other): + assert other and isinstance(other, ProcessMesh) + if self.topology != other.topology or self.process_group != other.process_group: + return False + return True + + def __ne__(self, other): + return not self.__eq__(other) + + +def _dim_mapping_checker(tensor, mesh, dim_mapping): + assert len(tensor.shape) == len(dim_mapping) + mesh_dim = len(mesh.topology) + dim_set = set() + for i in range(len(dim_mapping)): + assert dim_mapping[i] == -1 or (dim_mapping[i] < mesh_dim and + dim_mapping[i] >= 0) + if dim_mapping[i] >= 0: + assert dim_mapping[i] not in dim_set + dim_set.add(dim_mapping[i]) + + +def shard_tensor(x, mesh, dim_mapping): + """ + Add distributed attributes for a tensors. + + Args: + x (Tensor): the tensor to process. + mesh (ProcessMesh): an instance of ProcessMesh to describe the topology of logical processes. + dim_mapping (list): a list to describe the mapping between `x` and `mesh`, + the dimension `i` of `x` is split across the dimension `dims_mapping[i]`, where -1 means + without parition along the corresponding dimension. + + Returns: + Tensor: the tensor `x` itself. + + Examples: + .. code-block:: python + + import paddle + import paddle.distributed as dist + + paddle.enable_static() + + mesh = dist.ProcessMesh([[2, 4, 5], [0, 1, 3]]) + x = paddle.ones([4, 6]) + dist.shard_tensor(x, mesh, [0, -1]) + + """ + _static_mode_check() + _dim_mapping_checker(x, mesh, dim_mapping) + attr_name = _append_attr_suffix('mesh_id') + x._set_attr(attr_name, mesh._id) + attr_name = _append_attr_suffix('dim_mapping') + x._set_attr(attr_name, dim_mapping) + return x + + +def set_shard_mask(x, mask): + """ + Set the mask for a tensor which mask out the tensor from some processes in its mesh. + + Args: + x (Tensor): the tensor to process. + mask (list): a nested list. The shape of `mask` must be the same as the ProcessMesh belonging to + the tensor `x`. Every value of `mask` must be one or zero, where one means + the tenor `x` will be put on the corresponding logical process and zero means the tensor `x` + will not be put on the corresponding logical process. + For example, for a ProcessMesh represented by the 2-dimensional + array [[2, 4, 5], [0, 1, 3]], and a `mask` given by the + 2-dimensional [[1, 0, 1], [0, 1, 0]], + then the tensor `x` will only be put on logical processes 2, 5 and 1. + + Returns: + Tensor: the tensor `x` itself. + + Examples: + .. 
code-block:: python + + import paddle + import paddle.distributed as dist + + paddle.enable_static() + + mesh = dist.ProcessMesh([[2, 4, 5], [0, 1, 3]]) + mask = [[1, 0, 1], [0, 1, 0]] + x = paddle.ones([4, 6]) + dist.set_shard_mask(x, mask) + + """ + _static_mode_check() + assert isinstance(mask, list) + attr_name = _append_attr_suffix('mask') + x._set_attr(attr_name, _flatten_nested_list(mask)) + return x + + +def shard_op(op_fn, mesh, dim_mapping_dict, **kwargs): + """ + Call a functioin and add distributed attributes for ops added by the function. + + Args: + op_fn (callable): a callable object of an API. + mesh (ProcessMesh): an instance of ProcessMesh specifies the topology of logical processes. + dim_mapping_dict (dict): a mapping from tensor's name to its dims_mapping. + The dim_mapping is a list to describe the mapping between a tensor and `mesh`, + the dimension `i` of the tensor is split across the dimension `dim_mapping[i]`, + where -1 means without parition along the corresponding dimension. + kwargs (dict): a dict of parameter passed to the function `op_fn`. + + Returns: + list: the outputs of the function `op_fn`. + + Examples: + .. code-block:: python + + import paddle + import paddle.distributed as dist + + paddle.enable_static() + + mesh = dist.ProcessMesh([[2, 4, 5], [0, 1, 3]]) + x = paddle.ones([4, 6]) + y = paddle.zeros([4, 6]) + kwargs = {'x': x, 'y': y} + dist.shard_op(paddle.add, mesh, None, **kwargs) + + """ + _static_mode_check() + main_prog = paddle.fluid.default_main_program() + main_block = main_prog.global_block() + op_size = len(main_block.ops) + output = op_fn(**kwargs) + new_op_size = len(main_block.ops) + if dim_mapping_dict is None: dim_mapping_dict = dict() + for idx in range(op_size, new_op_size): + op = main_block.ops[idx] + attr_name = _append_attr_suffix('mesh_id') + op._set_attr(attr_name, mesh._id) + for var_name in dim_mapping_dict.keys(): + assert var_name in op.output_arg_names + op.input_arg_names + attr_name = _append_attr_suffix(var_name) + if var_name in op.input_arg_names: + # we use the prefix "IN_" to indicates an input argument name + attr_name = "IN_" + attr_name + else: + # we use the prefix "OUT_" to indicates an input argument name + attr_name = "OUT_" + attr_name + op._set_attr(attr_name, dim_mapping_dict[var_name]) + + if isinstance(output, Variable): + output = [output] + return list(output) + + +def set_offload_device(x, device): + """ + Set the device that the tensor `x` will be put on. + + Args: + x (tensor): the tensor to process. + device (str): the device that the tensor `x` will be put on, e.g., 'cpu'. + + Returns: + Tensor: the tensor `x` itself. + + Examples: + .. code-block:: python + + import paddle + import paddle.distributed as dist + + paddle.enable_static() + + x = paddle.ones([4, 6]) + dist.set_offload_device(x, 'cpu') + + """ + _static_mode_check() + attr_name = _append_attr_suffix("offload_device") + x._set_attr(attr_name, device) + return x + + +def set_pipeline_stage(stage): + """ + Set the pipeline stage of the following ops. + + Args: + stage (int): the pipeline stage the following ops belonging to. + + Returns: + None. + + Examples: + .. 
code-block:: python + + import paddle + import paddle.distributed as dist + + paddle.enable_static() + + dist.set_pipeline_stage(0) + + """ + from paddle.fluid.framework import _set_pipeline_stage + _static_mode_check() + _set_pipeline_stage(stage) diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 02f9fd1a95e2b2..10b7292a0b6bb5 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -72,6 +72,7 @@ _global_expected_place_ = None _current_device = None global_prog_seed = 0 +_current_pipeline_stage = None _global_flags_ = core.globals() @@ -239,6 +240,11 @@ def __impl__(*args, **kwargs): return __impl__ +def _set_pipeline_stage(stage): + global _current_pipeline_stage + _current_pipeline_stage = stage + + # NOTE(zhiqiu): This decorator is used for the APIs of Variable which is only # used to make Variable and VarBase has same interfaces, like numpy. Since VarBase is not exposed in our # official docments, logically, we want to keep VarBase and logically consistent. While, actually, @@ -1873,6 +1879,86 @@ def size(self): type='size', inputs={'Input': [self]}, outputs={'Out': [output]}) return output + def _set_attr(self, name, val): + """ + Set the value of attribute by attribute's name. + + Args: + name(str): the attribute name. + val(int|str|list): the value of the attribute. + """ + self._update_desc_attr(name, val) + + def _has_attr(self, name): + """ + Whether this Variable has the attribute with the name `name` or not. + + Args: + name(str): the attribute name. + + Returns: + bool: True if has this attribute. + """ + return self.desc.has_attr(name) + + def _remove_attr(self, name): + self.desc.remove_attr(name) + + def _update_desc_attr(self, name, val): + """ + Update the value of desc's attribute by attribute's name. + + Args: + name(str): the attribute name. + val(int|str|list): the value of the attribute. + """ + self.desc._set_attr(name, val) + + @property + def attr_names(self): + """Get the names of all attributes defined.""" + return self.desc.attr_names() + + def _get_attr(self, name): + """ + Get the attribute by name. + + Args: + name(str): the attribute name. + + Returns: + int|str|list: The attribute value. The return value + can be any valid attribute type. + """ + return self.desc.attr(name) + + @property + def process_mesh(self): + """ + Get the process mesh belonging to this Variable. + """ + from paddle.distributed.auto_parallel.interface import _g_process_mesh_map + from paddle.distributed.auto_parallel.interface import ProcessMesh + mesh_attr_name = 'mesh_id' + core.kAutoParallelSuffix() + mesh_id = self.desc.attr(mesh_attr_name) + return _g_process_mesh_map[mesh_id] + + @property + def shard_mask(self): + """ + Get shard_mask belonging to this Variable. + """ + mask_attr_name = 'mask' + core.kAutoParallelSuffix() + return self.desc.attr(mask_attr_name) + + @property + def offload_device(self): + """ + Get the offload device of this Variable. + """ + offload_attr_name = 'offload_device' + core.kAutoParallelSuffix() + return self.desc.attr(offload_attr_name) + def get_all_op_protos(): """ @@ -2077,6 +2163,11 @@ def __init__(self, "The Attr(force_cpu) of Op(%s) will be deprecated in the future, " "please use 'device_guard' instead. 'device_guard' has higher priority when they are " "used at the same time." 
% type) + if _current_pipeline_stage is not None: + pipeline_attr_name = 'pipeline_stage' + core.kAutoParallelSuffix( + ) + self._update_desc_attr(pipeline_attr_name, + _current_pipeline_stage) def find_name(var_list, name): for var_name in var_list: @@ -2548,6 +2639,31 @@ def _is_backward_op(self): return False + @property + def process_mesh(self): + """ + Get the process mesh belonging to this Operator. + """ + from paddle.distributed.auto_parallel.interface import _g_process_mesh_map + mesh_attr_name = 'mesh_id' + core.kAutoParallelSuffix() + mesh_id = self.attr(mesh_attr_name) + return _g_process_mesh_map[mesh_id] + + def dims_mapping(self, name): + """ + Get the dims_mapping for the op's var named `name`. + """ + dims_mapping_attr_name = name + core.kAutoParallelSuffix() + return self.attr(dims_mapping_attr_name) + + @property + def pipeline_stage(self): + """ + Get pipeline stage of the Operator. + """ + pipeline_stage_attr_name = 'pipeline_stage' + core.kAutoParallelSuffix() + return self.desc.attr(pipeline_stage_attr_name) + class Block(object): """ diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_api.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_api.py new file mode 100644 index 00000000000000..0fc2e24a3f6bc9 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_api.py @@ -0,0 +1,143 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import functools +import operator +import numpy as np +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +import paddle.nn as nn +import paddle.distributed as dist + +paddle.enable_static() + + +def _flatten_nested_list(nested_list): + result = functools.reduce(operator.iconcat, nested_list, []) + return result + + +def _append_attr_suffix(name): + return name + core.kAutoParallelSuffix() + + +LAST_PP_STAGE = 3 +MASK = [[0, 1], [1, 0], [1, 1]] +MESH = dist.ProcessMesh([[0, 1, 2], [3, 4, 5]]) + + +class SimpleNet(nn.Layer): + def __init__(self, vocab_size=128, hidden_size=4): + super(SimpleNet, self).__init__() + self.mesh = MESH + self.mesh.set_placement([5, 4, 3, 2, 1, 0]) + self.word_embeddings = nn.Embedding(vocab_size, hidden_size) + self.dense1 = nn.Linear(hidden_size, hidden_size) + self.dense2 = nn.Linear(hidden_size, hidden_size // 2) + + def forward(self, x, y): + x = dist.shard_tensor(x, self.mesh, dim_mapping=[0, -1]) + x = dist.set_shard_mask(x, MASK) + emb_out = self.word_embeddings(x) + + dist.set_pipeline_stage(LAST_PP_STAGE) + + y = dist.shard_tensor(y, self.mesh, dim_mapping=[0, -1]) + dist.set_offload_device(y, "gpu:3") + linear1 = self.dense1(y) + out = self.dense2(linear1) + + return x, y, self.mesh + + +class TestAutoParallelAPI(unittest.TestCase): + def test_api(self): + net = SimpleNet() + data1 = fluid.layers.fill_constant(shape=[2, 4], value=1, dtype="int64") + data2 = fluid.layers.fill_constant( + shape=[2, 4], value=2, dtype="float32") + data3 = fluid.layers.fill_constant( + shape=[2, 4], value=4, dtype="float32") + x, y, mesh = net.forward(data1, data2) + mesh_attr = _append_attr_suffix('mesh_id') + x_mesh_id = x._get_attr(mesh_attr) + self.assertEqual(x_mesh_id, mesh._id) + x_mesh = x.process_mesh + + allatts = x.attr_names + self.assertEqual(x_mesh, mesh) + shard_mask_attr = _append_attr_suffix('mask') + self.assertEqual( + x._get_attr(shard_mask_attr), _flatten_nested_list(MASK)) + self.assertEqual(x.shard_mask, _flatten_nested_list(MASK)) + offload_attr = _append_attr_suffix('offload_device') + self.assertEqual(y._get_attr(offload_attr), "gpu:3") + self.assertEqual(y.desc.has_attr(offload_attr), True) + self.assertEqual(y.offload_device, "gpu:3") + y._remove_attr(offload_attr) + self.assertEqual(y._has_attr(offload_attr), False) + ops = paddle.static.default_main_program().block(0).ops + first_op = ops[0] + last_op = ops[-1] + + self.assertEqual(last_op.pipeline_stage, LAST_PP_STAGE) + + DIMS_MAPPING1 = [0, 1, -1] + DIMS_MAPPING2 = [-1, 2, 0] + kwargs = {'x': data2, 'y': data3} + dist.shard_op( + paddle.add, + mesh=mesh, + dim_mapping_dict={ + data2.name: DIMS_MAPPING1, + data3.name: DIMS_MAPPING2 + }, + **kwargs) + ops = paddle.static.default_main_program().block(0).ops + last_op = ops[-1] + + self.assertEqual(last_op.process_mesh, mesh) + attr_name = "IN_" + data2.name + attr_name = _append_attr_suffix(attr_name) + self.assertEqual(last_op.attr(attr_name), DIMS_MAPPING1) + attr_name = "IN_" + data3.name + attr_name = _append_attr_suffix(attr_name) + self.assertEqual(last_op.attr(attr_name), DIMS_MAPPING2) + + def test_process_mesh(self): + mesh1 = dist.ProcessMesh([[0, 1, 2], [3, 4, 5]], parent=MESH) + mesh2 = dist.ProcessMesh([[0, 1, 2], [3, 4, 5]], parent=mesh1) + mesh3 = dist.ProcessMesh([[0, 1], [2, 3]], parent=mesh1) + mesh4 = dist.ProcessMesh([[2, 3], [4, 5]], parent=mesh1) + + self.assertEqual(MESH.parent, None) + self.assertEqual(mesh1.parent, MESH) + 
self.assertEqual(mesh1._desc.parent, MESH._id) + self.assertEqual(mesh3.parent, mesh1) + self.assertEqual(mesh4.parent, mesh1) + self.assertEqual(mesh1, mesh2) + self.assertNotEqual(mesh3, mesh4) + self.assertEqual(mesh2._id, mesh2._desc.id) + self.assertEqual(mesh3.topology, mesh3._desc.topology) + self.assertEqual(mesh3.topology, [2, 2]) + self.assertEqual(mesh3.process_group, [0, 1, 2, 3]) + self.assertEqual(mesh4.process_group, mesh4._desc.process_group) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/setup.py.in b/python/setup.py.in index d530f8483bcde7..07cf4c3a252df8 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -164,6 +164,7 @@ packages=['paddle', 'paddle.distributed.fleet.meta_parallel', 'paddle.distributed.fleet.meta_parallel.pp_utils', 'paddle.distributed.fleet.meta_parallel.parallel_layers', + 'paddle.distributed.auto_parallel', 'paddle.framework', 'paddle.jit', 'paddle.jit.dy2static', From 4d7af372ee6481aaa3f43feba3bb128f2e6e2ae4 Mon Sep 17 00:00:00 2001 From: WangXi Date: Wed, 11 Aug 2021 15:30:58 +0800 Subject: [PATCH 006/126] [hybrid] pp+dp support fp16 allreduce (#34762) --- .../fleet/meta_optimizers/sharding/utils.py | 47 ++--- .../meta_optimizers/sharding_optimizer.py | 51 ++++-- python/paddle/fluid/optimizer.py | 106 +++++++---- .../test_fleet_sharding_meta_optimizer.py | 169 +++++++++++++++++- 4 files changed, 295 insertions(+), 78 deletions(-) diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py b/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py index a0e18eb16b6016..52ef843aa0d751 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py @@ -14,7 +14,7 @@ import paddle from paddle.fluid import core, unique_name from functools import reduce -from paddle.distributed.fleet.meta_optimizers.common import is_loss_grad_op +from paddle.distributed.fleet.meta_optimizers.common import is_loss_grad_op, is_backward_op from paddle.distributed.fleet.meta_optimizers.common import OpRole, OP_ROLE_KEY, OP_ROLE_VAR_KEY import re @@ -431,15 +431,19 @@ def insert_reduce_ops(block, reduce_vars, shard, op_role=OpRole.Backward, - use_calc_stream=False): + use_calc_stream=False, + rank=None): """ _add_allreduce_ops """ + grad_in_this_device = [] for var in reduce_vars: root_id = get_grad_device(var, shard) assert root_id >= 0, "root id should be a positive int, but now root id is {}".format( root_id) + if rank is not None and rank == root_id: + grad_in_this_device.append(var) block._insert_op_without_sync( insert_idx, type='c_reduce_sum', @@ -451,16 +455,23 @@ def insert_reduce_ops(block, 'use_calc_stream': use_calc_stream, OP_ROLE_KEY: op_role }) - return + + return grad_in_this_device def get_grad_device(grad_name, shard): assert "@GRAD" in grad_name, "[{}] should be a grad variable.".format( grad_name) base_name = None - # mind the traversal order + # NOTE: mind the traversal order possible_suffixes = [ - '.cast_fp16@GRAD@MERGED', '.cast_fp16@GRAD', '@GRAD@MERGED', '@GRAD' + # sharding gm + '.cast_fp16@GRAD@MERGED', + '.cast_fp16@GRAD', + # pipeline + '@GRAD@MERGED@FP16', + '@GRAD@MERGED', + '@GRAD', ] for suffix in possible_suffixes: if suffix in grad_name: @@ -487,6 +498,15 @@ def get_first_check_finite_and_unscale_op_idx(block, raise_error=True): return -1 +def get_first_optimize_op_idx(block): + first_opt_op_idx = None + for index, op in reversed(tuple(enumerate(block.ops))): + if is_backward_op(op) and 
first_opt_op_idx is None: + first_opt_op_idx = index + 1 + break + return first_opt_op_idx + + def insert_broadcast_ops(block, insert_idx, ring_id, broadcast2root): """ _add_broadcast_ops @@ -672,23 +692,6 @@ def sharding_predicate(var): return -def get_grad_device(grad_name, shard): - assert "@GRAD" in grad_name, "[{}] should be a grad variable.".format( - grad_name) - base_name = None - # mind the traversal order - possible_suffixes = ['.cast_fp16@GRAD', '@GRAD'] - for suffix in possible_suffixes: - if suffix in grad_name: - base_name = re.sub(suffix, '', grad_name) - break - - assert base_name in shard.global_param2device, "[{}] should be a param variable.".format( - base_name) - - return shard.global_param2device[base_name] - - def append_naive_sync(block, sync_var, ring_id): # NOTE (JZ-LIANG) update this to use barrier sync for more elegent logic # sync within global diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py index a5df9486da4656..93901b38873b95 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py @@ -294,6 +294,8 @@ def _insert_allreduce_for_pp(self): if self.pp_degree == 1: return strategy = self.user_defined_strategy + fp16_allreduce = strategy.fp16_allreduce + main_block = self._main_program.global_block() startup_block = self._startup_program.global_block() @@ -317,33 +319,44 @@ def _insert_allreduce_for_pp(self): main_block._remove_op(idx) accumulated_grad_names = self._pp_optimizer._accumulate_gradients( - main_block) - # accumulated_grad_names = sorted(accumulated_grad_names) + main_block, fp16_allreduce=fp16_allreduce) + + len_of_ops = len(main_block.ops) + first_optimize_op_index = get_first_optimize_op_idx(main_block) + if self.pp_allreduce_in_optimize: - print("persistable FP32 grad: ") - print(accumulated_grad_names) - first_optimize_op_index = get_first_check_finite_and_unscale_op_idx( - main_block, raise_error=strategy.amp) - insert_reduce_ops( + logger.info("Pipeline Persistable grad is {}".format( + accumulated_grad_names)) + # FIXME(wangxi): accumulated_grad get from pipeline is not + # include sharding's param@BroadCast grad when + # pp_allreduce_in_optimize + accumulated_grad_names = insert_reduce_ops( main_block, first_optimize_op_index, self.sharding_ring_id, accumulated_grad_names, self._shard, core.op_proto_and_checker_maker.OpRole.Optimize, - use_calc_stream=True) + use_calc_stream=True, + rank=self.sharding_rank) + + logger.info("PP-Sharding grad is {}".format(accumulated_grad_names)) + first_optimize_op_index += (len(main_block.ops) - len_of_ops) + len_of_ops = len(main_block.ops) + if self.hybrid_dp and self.hybrid_dp_mode == "pp_hybrid_dp": - first_optimize_op_index = get_first_check_finite_and_unscale_op_idx( - main_block, raise_error=strategy.amp) - if first_optimize_op_index >= 0: - insert_allreduce_ops( - main_block, - first_optimize_op_index, - self.dp_ring_id, - accumulated_grad_names, - core.op_proto_and_checker_maker.OpRole.Optimize, - use_calc_stream=True, - user_defined_strategy=strategy) + insert_allreduce_ops( + main_block, + first_optimize_op_index, + self.dp_ring_id, + accumulated_grad_names, + core.op_proto_and_checker_maker.OpRole.Optimize, + use_calc_stream=True, + user_defined_strategy=strategy) + first_optimize_op_index += (len(main_block.ops) - len_of_ops) + len_of_ops = len(main_block.ops) + + # FIXME(wangxi): if 
fp16_allreduce, put cast fp16->fp32 to there? def _adapt_amp_clip_without_sharding(self): if self.sharding_degree > 1: return diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index ab3dbad1ef326d..7ad94f4be3eb2f 100755 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -4528,7 +4528,7 @@ def _rename_arg(self, op, old_name, new_name): op._rename_input(old_name, new_name) op._rename_output(old_name, new_name) - def _create_var(self, block, ref_var, name): + def _create_var(self, block, ref_var, name, dtype=None): """ Create a new var for block, which has the same type, shape and dtype as ref_var, then rename it with the @@ -4537,7 +4537,7 @@ def _create_var(self, block, ref_var, name): new_var = block.create_var( name=name, shape=ref_var.shape, - dtype=ref_var.dtype, + dtype=ref_var.dtype if dtype is None else dtype, type=ref_var.type, lod_level=ref_var.lod_level, persistable=ref_var.persistable, @@ -5044,7 +5044,10 @@ def _rename_gradient_var_name(self, block): new_grad_name = name + "@MERGED" self._rename_arg(op, name, new_grad_name) - def _accumulate_gradients(self, block, pp_allreduce_in_optimize=False): + def _accumulate_gradients(self, + block, + pp_allreduce_in_optimize=False, + fp16_allreduce=False): """ Create a new merged gradient for each parameter and accumulate the corresponding gradient to it. @@ -5052,6 +5055,9 @@ def _accumulate_gradients(self, block, pp_allreduce_in_optimize=False): merged_gradient_names = [] first_opt_op_idx = None + merged_suffix = '@MERGED@FP16' if fp16_allreduce else '@MERGED' + dtype = paddle.float16 if fp16_allreduce else None + for index, op in reversed(tuple(enumerate(list(block.ops)))): # remove the cast op of fp16 grad to fp32 grad if self._is_optimize_op(op) and op.type == 'cast': @@ -5062,12 +5068,10 @@ def _accumulate_gradients(self, block, pp_allreduce_in_optimize=False): block._remove_op(index) continue - if self._is_backward_op(op) and not first_opt_op_idx: + if self._is_backward_op(op) and first_opt_op_idx is None: first_opt_op_idx = index + 1 # no optimize phase if first_opt_op_idx == len(block.ops): return - if block.ops[first_opt_op_idx].type == "c_sync_comm_stream": - first_opt_op_idx += 1 if self._is_backward_op(op) and ( self._op_role_var_key in op.attr_names): @@ -5079,12 +5083,14 @@ def _accumulate_gradients(self, block, pp_allreduce_in_optimize=False): param_name = op_role_var[i] if not block.has_var(param_name): continue if '@BroadCast' in param_name: continue + param_grad_name = param_name + core.grad_var_suffix() - merged_param_grad_name = param_grad_name + '@MERGED' + merged_param_grad_name = param_grad_name + merged_suffix if not block.has_var(merged_param_grad_name): self._create_var(block, block.vars[param_name], - merged_param_grad_name) + merged_param_grad_name, dtype) assert block.has_var(merged_param_grad_name) + param_grad_var = block.var(param_grad_name) merged_param_grad_var = block.var(merged_param_grad_name) merged_param_grad_var.persistable = True @@ -5103,22 +5109,18 @@ def _accumulate_gradients(self, block, pp_allreduce_in_optimize=False): offset += 1 grad_name = op_role_var[i + 1] grad_var = block.vars[grad_name] - if not 'cast_fp16' in grad_name: - block._insert_op( - index=first_opt_op_idx + offset, - type='sum', - inputs={'X': [grad_var, merged_param_grad_var]}, - outputs={'Out': merged_param_grad_var}, - attrs={ - self._op_role_key: self._op_role.Backward, - }) - offset += 1 - merged_gradient_names.append(merged_param_grad_name) - else: - # cast 
gradient to fp32 to accumulate to merged gradient + + is_fp16_grad = 'cast_fp16' in grad_name + need_cast = (is_fp16_grad is not fp16_allreduce) + + if need_cast: + # if fp16_allreduce: + # cast grad to fp16 to accumulate to merged gradient + # else: + # cast grad to fp32 to accumulate to merged gradient cast_grad_var_name = param_grad_name + '@TMP' - cast_grad_var = self._create_var(block, param_grad_var, - cast_grad_var_name) + cast_grad_var = self._create_var( + block, param_grad_var, cast_grad_var_name, dtype) cast_grad_var.persistable = False block._insert_op( index=first_opt_op_idx + offset, @@ -5131,18 +5133,52 @@ def _accumulate_gradients(self, block, pp_allreduce_in_optimize=False): self._op_role_key: self._op_role.Backward, }) offset += 1 - block._insert_op( - index=first_opt_op_idx + offset, - type='sum', - inputs={ - 'X': [merged_param_grad_var, cast_grad_var] - }, - outputs={'Out': merged_param_grad_var}, - attrs={ - self._op_role_key: self._op_role.Backward, - }) - offset += 1 - merged_gradient_names.append(merged_param_grad_name) + grad_var = cast_grad_var + + block._insert_op( + index=first_opt_op_idx + offset, + type='sum', + inputs={'X': [merged_param_grad_var, grad_var]}, + outputs={'Out': merged_param_grad_var}, + attrs={self._op_role_key: self._op_role.Backward, }) + offset += 1 + merged_gradient_names.append(merged_param_grad_name) + + if not fp16_allreduce: return merged_gradient_names + + first_opt_op_idx = None + for index, op in reversed(tuple(enumerate(list(block.ops)))): + if self._is_backward_op(op) and first_opt_op_idx is None: + first_opt_op_idx = index + 1 + break + assert first_opt_op_idx is not None + + # insert cast op from fp16->fp32 + # FIXME(wangxi): maybe put in sharding is better, for some grad + # is not in sharding device. 
+ for fp16_grad_name in merged_gradient_names: + grad_name = fp16_grad_name.replace('@FP16', '') + param_name = fp16_grad_name.replace('@GRAD@MERGED@FP16', '') + + if not block.has_var(grad_name): + self._create_var(block, block.vars[param_name], grad_name) + assert block.has_var(grad_name) + + fp16_grad_var = block.var(fp16_grad_name) + grad_var = block.var(grad_name) + grad_var.persistable = False + + block._insert_op( + index=first_opt_op_idx, + type='cast', + inputs={'X': fp16_grad_var}, + outputs={'Out': grad_var}, + attrs={ + 'in_dtype': fp16_grad_var.dtype, + 'out_dtype': grad_var.dtype, + self._op_role_key: self._op_role.Optimize, + }) + return merged_gradient_names def _add_sub_blocks(self, main_block, program_list): diff --git a/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py index b7cf9dfaec5760..d70a58c7d8ab41 100755 --- a/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py @@ -552,9 +552,9 @@ def test_sharding_with_pp(self): 'c_sync_calc_stream', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_sync_comm_stream', - 'c_sync_comm_stream', 'fill_constant', 'sum', 'fill_constant', + 'fill_constant', 'sum', 'fill_constant', 'sum', 'fill_constant', 'sum', 'fill_constant', 'sum', 'fill_constant', 'sum', - 'fill_constant', 'sum', 'momentum', 'momentum', 'momentum', + 'c_sync_comm_stream', 'momentum', 'momentum', 'momentum', 'momentum', 'momentum' ]) @@ -694,6 +694,171 @@ def test_hybrid_with_mp_pp_amp_gclip(self): self.assertEqual(pp_group_waiting_ports, ['127.0.0.1:36002']) + def test_hybrid_with_pp_dp_amp_fp16allreduce(self): + train_prog, startup_prog = paddle.fluid.Program(), paddle.fluid.Program( + ) + avg_cost, strategy = self.pp_net(train_prog, startup_prog) + strategy.amp = True + strategy.amp_configs = {'custom_black_varnames': ['fc_6.b_0'], } + strategy.sharding = True + strategy.sharding_configs = { + "sharding_degree": 1, + "mp_degree": 1, + "pp_degree": 2, + "dp_degree": 2, + } + strategy.pipeline = True + strategy.pipeline_configs = { + "schedule_mode": "1F1B", + "micro_batch_size": 2, + "accumulate_steps": 4, + } + strategy.fp16_allreduce = True + self.optimizer(avg_cost, strategy, train_prog, startup_prog) + train_prog = train_prog._pipeline_opt['section_program'] + startup_prog = startup_prog._pipeline_opt['startup_program'] + + startup_prog_ops = startup_prog.global_block().ops + main_prog_ops = train_prog.global_block().ops + + # check program + startup_prog_op_types = [op.type for op in startup_prog_ops] + main_prog_op_types = [op.type for op in main_prog_ops] + + # ring: mp, pp_group, pp_pair, pp_pair + self.assertEqual(startup_prog_op_types, [ + 'uniform_random', 'fill_constant', 'uniform_random', + 'fill_constant', 'uniform_random', 'fill_constant', + 'uniform_random', 'fill_constant', 'fill_constant', 'fill_constant', + 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', + 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', + 'fill_constant', 'fill_constant', 'c_gen_nccl_id', 'c_comm_init', + 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', + 'c_gen_nccl_id', 'c_comm_init', 'c_sync_comm_stream' + ]) + + self.assertEqual(main_prog_op_types, [ + 'recv_v2', 'cast', 'mul', 'cast', 'elementwise_add', 'tanh', 'cast', + 'mul', 'cast', 
'elementwise_add', 'tanh', 'cast', 'mul', 'cast', + 'elementwise_add', 'tanh', 'cast', 'mul', 'cast', 'elementwise_add', + 'softmax', 'cross_entropy2', 'mean', 'elementwise_mul', + 'fill_constant', 'scale', 'scale', 'elementwise_mul_grad', + 'mean_grad', 'cross_entropy_grad2', 'softmax_grad', + 'elementwise_add_grad', 'cast', 'mul_grad', 'tanh_grad', + 'elementwise_add_grad', 'mul_grad', 'tanh_grad', + 'elementwise_add_grad', 'mul_grad', 'tanh_grad', + 'elementwise_add_grad', 'mul_grad', 'c_sync_calc_stream', 'send_v2', + 'fill_constant', 'cast', 'sum', 'fill_constant', 'sum', + 'fill_constant', 'sum', 'fill_constant', 'sum', 'fill_constant', + 'sum', 'fill_constant', 'sum', 'fill_constant', 'sum', + 'fill_constant', 'sum', 'coalesce_tensor', 'c_allreduce_sum', + 'cast', 'cast', 'cast', 'cast', 'cast', 'cast', 'cast', 'cast', + 'c_sync_comm_stream', 'check_finite_and_unscale', 'cast', + 'c_allreduce_max', 'cast', 'update_loss_scaling', 'momentum', + 'momentum', 'momentum', 'momentum', 'momentum', 'momentum', + 'momentum', 'momentum' + ]) + + # amp check_finite_and_unscale, allreduce(pp) + self.assertEqual(main_prog_op_types.count('c_allreduce_max'), 1) + + # should has ring id for pp + created_ring_ids = [ + op.desc.attr("ring_id") for op in startup_prog_ops + if op.type == "c_comm_init" + ] + self.assertIn(self.pp_pair_ring_id, created_ring_ids) + self.assertIn(self.dp_ring_id, created_ring_ids) + + # check correctness of pp group + for op in startup_prog_ops: + if op.type == "c_gen_nccl_id" and op.desc.output_arg_names()[ + 0] == "comm_id_0": + pp_group_waiting_ports = op.desc.attr("other_endpoints") + + self.assertEqual(pp_group_waiting_ports, ['127.0.0.1:36003']) + + # check correctness of dp group + for op in startup_prog_ops: + if op.type == "c_gen_nccl_id" and op.desc.output_arg_names()[ + 0] == "comm_id_3": + dp_group_waiting_ports = op.desc.attr("other_endpoints") + + self.assertEqual(dp_group_waiting_ports, ['127.0.0.1:36002']) + + def test_hybrid_with_sharding_pp_amp_fp16allreduce_in_optimize(self): + train_prog, startup_prog = paddle.fluid.Program(), paddle.fluid.Program( + ) + avg_cost, strategy = self.pp_net(train_prog, startup_prog) + strategy.amp = True + strategy.amp_configs = {'custom_black_varnames': ['fc_6.b_0'], } + strategy.sharding = True + strategy.sharding_configs = { + "segment_broadcast_MB": 0.1, + "sharding_degree": 2, + "mp_degree": 1, + "pp_degree": 2, + "dp_degree": 1, + 'pp_allreduce_in_optimize': True, + } + strategy.pipeline = True + strategy.pipeline_configs = { + "schedule_mode": "1F1B", + "micro_batch_size": 2, + "accumulate_steps": 4, + } + strategy.fp16_allreduce = True + self.optimizer(avg_cost, strategy, train_prog, startup_prog) + train_prog = train_prog._pipeline_opt['section_program'] + startup_prog = startup_prog._pipeline_opt['startup_program'] + + startup_prog_ops = startup_prog.global_block().ops + main_prog_ops = train_prog.global_block().ops + + # check program + startup_prog_op_types = [op.type for op in startup_prog_ops] + main_prog_op_types = [op.type for op in main_prog_ops] + + # ring: sharding, pp_group, pp_pair, pp_pair + self.assertEqual(startup_prog_op_types, [ + 'fill_constant', 'uniform_random', 'fill_constant', + 'uniform_random', 'fill_constant', 'fill_constant', 'fill_constant', + 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', + 'fill_constant', 'fill_constant', 'fill_constant', 'c_gen_nccl_id', + 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', + 'c_comm_init', 'c_gen_nccl_id', 
'c_comm_init' + ]) + + # FIXME(wangxi): some bug in sharding+pp with pp_allreduce_in_optimize + # self.assertEqual(main_prog_op_types, []) + + # amp check_finite_and_unscale, allreduce(pp) + self.assertEqual(main_prog_op_types.count('c_allreduce_max'), 2) + + # should has ring id for pp + created_ring_ids = [ + op.desc.attr("ring_id") for op in startup_prog_ops + if op.type == "c_comm_init" + ] + self.assertIn(self.sharding_ring_id, created_ring_ids) + self.assertIn(self.pp_pair_ring_id, created_ring_ids) + + # check correctness of sharding group + for op in startup_prog_ops: + if op.type == "c_gen_nccl_id" and op.desc.output_arg_names()[ + 0] == "comm_id_0": + sharding_group_waiting_ports = op.desc.attr("other_endpoints") + + self.assertEqual(sharding_group_waiting_ports, ['127.0.0.1:36003']) + + # check correctness of pp group + for op in startup_prog_ops: + if op.type == "c_gen_nccl_id" and op.desc.output_arg_names()[ + 0] == "comm_id_1": + pp_group_waiting_ports = op.desc.attr("other_endpoints") + + self.assertEqual(pp_group_waiting_ports, ['127.0.0.1:36002']) + if __name__ == "__main__": unittest.main() From 3f011d82538c9c52bf0c9aee9448bb879d96d642 Mon Sep 17 00:00:00 2001 From: Hao Lin Date: Wed, 11 Aug 2021 15:47:54 +0800 Subject: [PATCH 007/126] Add ext_tensor.slice() API (#34227) * Add ext_tensor.slice() API, test=develop * Call Tensor::mutable_data first to fix bugs and add test for writing to sliced tensor * Fix unit test bug * Fix code format problem, test=develop * Fix code format problem * Fix code format problem * strengthen unit test * Use CustomTensorUtils::ShareDataFrom to simplify codes --- paddle/fluid/extension/include/ext_tensor.h | 12 ++++++- paddle/fluid/extension/src/ext_tensor.cc | 25 +++++++++++++ paddle/fluid/framework/custom_tensor_test.cc | 37 ++++++++++++++++++++ 3 files changed, 73 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/extension/include/ext_tensor.h b/paddle/fluid/extension/include/ext_tensor.h index d40503409fbdc1..7d13f56b02b821 100644 --- a/paddle/fluid/extension/include/ext_tensor.h +++ b/paddle/fluid/extension/include/ext_tensor.h @@ -88,10 +88,20 @@ class PD_DLL_DECL Tensor { /// It's usually used to set the input tensor data. /// \param PlaceType of target place, of which /// the tensor will copy to. - template Tensor copy_to(const PlaceType& place) const; + /// \brief Return a sub-tensor of the given tensor. + /// It is usually used to extract a sub-tensor (which supports + /// modifying the data of the original tensor) to perform further + /// operations. + /// \param begin_idx The index of the start row (inclusive) to slice. + /// The index number begins from 0. + /// \param end_idx The index of the end row (exclusive) to slice. + /// The index number begins from begin_idx + 1. + /// \return The sliced tensor. + Tensor slice(const int64_t begin_idx, const int64_t end_idx) const; + /// \brief Return the shape of the Tensor. 
std::vector shape() const; diff --git a/paddle/fluid/extension/src/ext_tensor.cc b/paddle/fluid/extension/src/ext_tensor.cc index a9e286b4f9b231..317fb7b2270b1f 100644 --- a/paddle/fluid/extension/src/ext_tensor.cc +++ b/paddle/fluid/extension/src/ext_tensor.cc @@ -124,6 +124,21 @@ void DeviceCopy(T *src, T *dst, PlaceType src_plc, PlaceType dst_plc, } \ auto *tensor = static_cast(tensor_.get()); +#define GET_INNER_PLACE \ + platform::Place place; \ + switch (place_) { \ + case PlaceType::kCPU: \ + place = platform::CPUPlace(); \ + break; \ + case PlaceType::kGPU: \ + place = platform::CUDAPlace(); \ + break; \ + default: \ + PADDLE_THROW(platform::errors::Unavailable( \ + "Custom operator unsupported place id(%d)", \ + static_cast(place_))); \ + } + void Tensor::reshape(const std::vector &shape) { GET_CASTED_TENSOR auto new_dim = framework::make_ddim(shape); @@ -257,6 +272,16 @@ Tensor Tensor::copy_to(const PlaceType &target_place) const { return target; } +Tensor Tensor::slice(const int64_t begin_idx, const int64_t end_idx) const { + GET_CASTED_TENSOR + GET_INNER_PLACE + framework::Tensor intermediate = tensor->Slice(begin_idx, end_idx); + Tensor target = Tensor(place_); + framework::CustomTensorUtils::ShareDataFrom( + static_cast(&intermediate), target); + return target; +} + template PD_DLL_DECL Tensor Tensor::copy_to(const PlaceType &target_place) const; template PD_DLL_DECL Tensor diff --git a/paddle/fluid/framework/custom_tensor_test.cc b/paddle/fluid/framework/custom_tensor_test.cc index b2896c74c39a6e..7fbc4f554ba653 100644 --- a/paddle/fluid/framework/custom_tensor_test.cc +++ b/paddle/fluid/framework/custom_tensor_test.cc @@ -92,6 +92,41 @@ void TestAPISizeAndShape() { CHECK(t1.shape() == tensor_shape); } +void TestAPISlice() { + std::vector tensor_shape_origin1 = {5, 5}; + std::vector tensor_shape_sub1 = {3, 5}; + std::vector tensor_shape_origin2 = {5, 5, 5}; + std::vector tensor_shape_sub2 = {1, 5, 5}; +#ifdef PADDLE_WITH_CUDA + auto t1 = paddle::Tensor(paddle::PlaceType::kGPU, tensor_shape_origin1); + t1.mutable_data(); + CHECK(t1.slice(0, 5).shape() == tensor_shape_origin1); + CHECK(t1.slice(0, 3).shape() == tensor_shape_sub1); + auto t2 = paddle::Tensor(paddle::PlaceType::kGPU, tensor_shape_origin2); + t2.mutable_data(); + CHECK(t2.slice(4, 5).shape() == tensor_shape_sub2); +#endif + auto t3 = paddle::Tensor(paddle::PlaceType::kCPU, tensor_shape_origin1); + t3.mutable_data(); + CHECK(t3.slice(0, 5).shape() == tensor_shape_origin1); + CHECK(t3.slice(0, 3).shape() == tensor_shape_sub1); + auto t4 = paddle::Tensor(paddle::PlaceType::kCPU, tensor_shape_origin2); + t4.mutable_data(); + CHECK(t4.slice(4, 5).shape() == tensor_shape_sub2); + + // Test writing function for sliced tensor + auto t = InitCPUTensorForTest(); + auto t_sliced = t.slice(0, 1); + auto* t_sliced_data_ptr = t_sliced.mutable_data(); + for (int64_t i = 0; i < t_sliced.size(); i++) { + t_sliced_data_ptr[i] += static_cast(5); + } + auto* t_data_ptr = t.mutable_data(); + for (int64_t i = 0; i < t_sliced.size(); i++) { + CHECK_EQ(t_data_ptr[i], static_cast(10)); + } +} + template paddle::DataType TestDtype() { std::vector tensor_shape = {5, 5}; @@ -261,6 +296,8 @@ TEST(CustomTensor, copyTest) { TestAPISizeAndShape(); VLOG(2) << "TestPlace"; TestAPIPlace(); + VLOG(2) << "TestSlice"; + TestAPISlice(); VLOG(2) << "TestCast"; GroupTestCast(); VLOG(2) << "TestDtypeConvert"; From 9ed5db28d8b685636aa4a4f125def666b8c38236 Mon Sep 17 00:00:00 2001 From: ronnywang <524019753@qq.com> Date: Wed, 11 Aug 2021 04:42:21 -0500 
Subject: [PATCH 008/126] [NPU] add batch_norm_op_npu and test (#34056) * add batch_norm_op_npu and tests * remove skip.If * fix bug --- paddle/fluid/operators/batch_norm_op_npu.cc | 230 ++++++++++ .../unittests/npu/test_batch_norm_op_npu.py | 428 ++++++++++++++++++ 2 files changed, 658 insertions(+) create mode 100644 paddle/fluid/operators/batch_norm_op_npu.cc create mode 100644 python/paddle/fluid/tests/unittests/npu/test_batch_norm_op_npu.py diff --git a/paddle/fluid/operators/batch_norm_op_npu.cc b/paddle/fluid/operators/batch_norm_op_npu.cc new file mode 100644 index 00000000000000..b4dc10777c651d --- /dev/null +++ b/paddle/fluid/operators/batch_norm_op_npu.cc @@ -0,0 +1,230 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include "paddle/fluid/operators/batch_norm_op.h" + +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +template +class NPUBatchNormOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto &dev_ctx = ctx.template device_context(); + const float epsilon = ctx.Attr("epsilon"); + float momentum = ctx.Attr("momentum"); + const bool is_test = ctx.Attr("is_test"); + const bool use_global_stats = ctx.Attr("use_global_stats"); + const bool trainable_stats = ctx.Attr("trainable_statistics"); + const bool test_mode = is_test && (!trainable_stats); + const std::string data_layout = ctx.Attr("data_layout"); + + const auto *x = ctx.Input("X"); + const auto &x_dims = x->dims(); + PADDLE_ENFORCE_EQ(x_dims.size(), 4, + platform::errors::InvalidArgument( + "The input tensor X's dimension must equal to 4. 
But " + "received X's shape = [%s], X's dimension = [%d].", + x_dims, x_dims.size())); + + auto *y = ctx.Output("Y"); + y->mutable_data(ctx.GetPlace()); + + const auto *scale = ctx.Input("Scale"); + const auto *bias = ctx.Input("Bias"); + + Tensor x_tensor, y_tesnor; + x_tensor.ShareDataWith(*x); + y_tesnor.ShareDataWith(*y); + if (data_layout == "NHWC") { + x_tensor.set_layout(DataLayout::kNHWC); + y_tesnor.set_layout(DataLayout::kNHWC); + } + + bool training = !test_mode && !use_global_stats; + if (!training) { + const auto *est_mean = ctx.Input("Mean"); + const auto *est_var = ctx.Input("Variance"); + framework::Tensor reserve_space1, reserve_space2; + reserve_space1.mutable_data(est_mean->dims(), ctx.GetPlace()); + reserve_space2.mutable_data(est_var->dims(), ctx.GetPlace()); + + const auto &runner = NpuOpRunner( + "BatchNorm", {x_tensor, *scale, *bias, *est_mean, *est_var}, + {y_tesnor, reserve_space1, reserve_space2, reserve_space1, + reserve_space2}, + {{"epsilon", epsilon}, + {"is_training", training}, + {"data_format", data_layout}}); + auto stream = dev_ctx.stream(); + runner.Run(stream); + } else { + // if MomentumTensor is set, use MomentumTensor value, momentum + // is only used in this training branch + if (ctx.HasInput("MomentumTensor")) { + const auto *mom_tensor = ctx.Input("MomentumTensor"); + Tensor mom_cpu; + TensorCopySync(*mom_tensor, platform::CPUPlace(), &mom_cpu); + momentum = mom_cpu.data()[0]; + } + + auto *mean_out = ctx.Output("MeanOut"); + auto *variance_out = ctx.Output("VarianceOut"); + auto *saved_mean = ctx.Output("SavedMean"); + auto *saved_variance = ctx.Output("SavedVariance"); + mean_out->mutable_data(ctx.GetPlace()); + variance_out->mutable_data(ctx.GetPlace()); + saved_mean->mutable_data(ctx.GetPlace()); + saved_variance->mutable_data(ctx.GetPlace()); + + framework::Tensor mean_tmp, variance_tmp; + mean_tmp.mutable_data(mean_out->dims(), ctx.GetPlace()); + variance_tmp.mutable_data(variance_out->dims(), ctx.GetPlace()); + + const auto &runner = NpuOpRunner( + "BatchNorm", {x_tensor, *scale, *bias}, + {y_tesnor, mean_tmp, variance_tmp, *saved_mean, *saved_variance}, + {{"epsilon", epsilon}, + {"is_training", training}, + {"data_format", data_layout}}); + auto stream = dev_ctx.stream(); + runner.Run(stream); + // Ascend can't output the estimated mean and variance + framework::Tensor this_factor_tensor; + this_factor_tensor.mutable_data(framework::make_ddim({1}), + ctx.GetPlace()); + framework::TensorFromVector({static_cast(1. 
- momentum)}, + dev_ctx, &this_factor_tensor); + framework::Tensor momentum_tensor; + momentum_tensor.mutable_data(framework::make_ddim({1}), + ctx.GetPlace()); + framework::TensorFromVector({static_cast(momentum)}, + dev_ctx, &momentum_tensor); + framework::Tensor ones_tensor; + ones_tensor.mutable_data(mean_out->dims(), ctx.GetPlace()); + framework::TensorFromVector( + std::vector(framework::product(mean_out->dims()), 1.0f), + dev_ctx, &ones_tensor); + + const auto &runner1 = NpuOpRunner("AddMatMatElements", + {*mean_out, *saved_mean, ones_tensor, + momentum_tensor, this_factor_tensor}, + {*mean_out}, {}); + runner1.Run(stream); + const auto &runner2 = NpuOpRunner( + "AddMatMatElements", {*variance_out, *saved_variance, ones_tensor, + momentum_tensor, this_factor_tensor}, + {*variance_out}, {}); + runner2.Run(stream); + } + } +}; + +template +class NPUBatchNormGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto &dev_ctx = ctx.template device_context(); + const float epsilon = ctx.Attr("epsilon"); + const std::string data_layout = ctx.Attr("data_layout"); + bool use_global_stats = ctx.Attr("use_global_stats"); + + const auto *y_grad = ctx.Input(framework::GradVarName("Y")); + const auto *scale = ctx.Input("Scale"); + const auto *bias = ctx.Input("Bias"); + auto *saved_mean = ctx.Input("SavedMean"); + auto *saved_variance = ctx.Input("SavedVariance"); + + auto *x_grad = ctx.Output(framework::GradVarName("X")); + auto *scale_grad = ctx.Output(framework::GradVarName("Scale")); + auto *bias_grad = ctx.Output(framework::GradVarName("Bias")); + + const bool is_test = ctx.Attr("is_test"); + use_global_stats = is_test || use_global_stats; + + const Tensor *x = ctx.Input("X"); + const auto &x_dims = x->dims(); + PADDLE_ENFORCE_EQ(x_dims.size(), 4, + platform::errors::InvalidArgument( + "The input tensor X's dimension must equal to 4. 
But " + "received X's shape = [%s], X's dimension = [%d].", + x_dims, x_dims.size())); + + // init output + Tensor scale_grad_tmp, bias_grad_tmp, x_grad_tmp; + if (scale_grad && bias_grad) { + scale_grad->mutable_data(ctx.GetPlace()); + bias_grad->mutable_data(ctx.GetPlace()); + scale_grad_tmp.ShareDataWith(*scale_grad); + bias_grad_tmp.ShareDataWith(*bias_grad); + } else { + scale_grad_tmp.mutable_data(scale->dims(), ctx.GetPlace()); + bias_grad_tmp.mutable_data(bias->dims(), ctx.GetPlace()); + } + + Tensor x_tensor, y_grad_tensor, x_grad_tensor; + x_tensor.ShareDataWith(*x); + y_grad_tensor.ShareDataWith(*y_grad); + if (x_grad) { + x_grad->mutable_data(ctx.GetPlace()); + x_grad_tensor.ShareDataWith(*x_grad); + } else { + x_grad_tensor.mutable_data(x->dims(), ctx.GetPlace()); + } + if (data_layout == "NHWC") { + x_tensor.set_layout(DataLayout::kNHWC); + y_grad_tensor.set_layout(DataLayout::kNHWC); + x_grad_tensor.set_layout(DataLayout::kNHWC); + } + if (!use_global_stats) { + const auto &runner = NpuOpRunner( + "BatchNormGrad", + {y_grad_tensor, x_tensor, *scale, *saved_mean, *saved_variance}, + {x_grad_tensor, scale_grad_tmp, bias_grad_tmp, *saved_mean, + *saved_variance}, // segment fault if no reserve_space_3 and + // reserve_space_4 + {{"epsilon", epsilon}, + {"is_training", true}, + {"data_format", data_layout}}); + auto stream = dev_ctx.stream(); + runner.Run(stream); + } else { + const auto *running_mean = ctx.Input("Mean"); + const auto *running_var = ctx.Input("Variance"); + + const auto &runner = NpuOpRunner( + "BatchNormGrad", + {y_grad_tensor, x_tensor, *scale, *running_mean, *running_var}, + {x_grad_tensor, scale_grad_tmp, bias_grad_tmp, *running_mean, + *running_var}, // segment fault if no reserve_space_3 and + // reserve_space_4 + {{"epsilon", epsilon}, + {"is_training", true}, + {"data_format", data_layout}}); + auto stream = dev_ctx.stream(); + runner.Run(stream); + } + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(batch_norm, ops::NPUBatchNormOpKernel, + ops::NPUBatchNormOpKernel); +REGISTER_OP_NPU_KERNEL(batch_norm_grad, ops::NPUBatchNormGradOpKernel, + ops::NPUBatchNormGradOpKernel); diff --git a/python/paddle/fluid/tests/unittests/npu/test_batch_norm_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_batch_norm_op_npu.py new file mode 100644 index 00000000000000..76c81d2d683a05 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_batch_norm_op_npu.py @@ -0,0 +1,428 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import numpy as np +import sys +sys.path.append("..") +import paddle +import paddle.fluid.core as core +import paddle.fluid as fluid +from paddle.fluid.op import Operator +from op_test import OpTest, _set_use_system_allocator +from paddle.fluid import Program, program_guard + +from test_batch_norm_op import _reference_testing, _cal_mean_variance, _reference_training, _reference_grad + +_set_use_system_allocator(False) +paddle.enable_static() + + +class TestBatchNormOpInference(unittest.TestCase): + def setUp(self): + self.dtype = np.float32 + self.init_kernel_type() + self.data_formats = ["NCHW", "NHWC"] + + def __assert_close(self, tensor, np_array, msg, atol=1e-4): + self.assertTrue(np.allclose(np.array(tensor), np_array, atol=atol), msg) + + def check_with_place(self, place, data_layout, dtype, shape): + epsilon = epsilon = 0.00001 + if len(shape) == 2: + x_shape = shape + c = x_shape[1] + else: + n, h, w, c = shape[0], shape[1], shape[2], shape[3] + if data_layout == "NHWC": + x_shape = [n, h, w, c] + elif data_layout == "NCHW": + x_shape = [n, c, h, w] + else: + raise ValueError("Unknown data layout.") + scale_shape = [c] + + x = np.random.random_sample(x_shape).astype(dtype) + x = x - 0.5 + scale = np.random.random_sample(scale_shape).astype(np.float32) + bias = np.random.random_sample(scale_shape).astype(np.float32) + mean = np.zeros(scale_shape).astype(np.float32) + variance = np.ones(scale_shape).astype(np.float32) + y = _reference_testing(x, scale, bias, mean, variance, epsilon, + data_layout).astype(dtype) + var_dict = locals() + var_names = ["x", "scale", "bias", "mean", "variance", "y"] + ground_truth = {name: var_dict[name] for name in var_names} + ground_truth["saved_mean"] = mean + ground_truth["saved_variance"] = variance + + program = fluid.Program() + with fluid.program_guard(program): + block = program.global_block() + for name in ground_truth: + block.create_var( + name=name, dtype="float32", shape=ground_truth[name].shape) + inputs = { + "X": block.var("x"), + "Scale": block.var("scale"), + "Bias": block.var("bias"), + "Mean": block.var("mean"), + "Variance": block.var("variance") + } + attrs = { + "epsilon": epsilon, + "is_test": True, + "data_layout": data_layout, + "use_mkldnn": False, + "fuse_with_relu": False, + } + outputs = { + "Y": block.var("y"), + "MeanOut": block.var("mean"), # share memory + "VarianceOut": block.var("variance"), # share memory + "SavedMean": block.var("saved_mean"), + "SavedVariance": block.var("saved_variance") + } + block.create_var(name="reserve_space", dtype='float32') + outputs["ReserveSpace"] = block.var('reserve_space') + bn_op = block.append_op( + type="batch_norm", inputs=inputs, outputs=outputs, attrs=attrs) + + program._sync_with_cpp() + + exe = fluid.Executor(place) + out = exe.run( + program, + feed={ + name: ground_truth[name] + for name in ["x", "scale", "bias", "mean", "variance"] + }, + fetch_list=["y"]) + self.__assert_close(var_dict["y"], out[0], "y", atol=1e-3) + + def test_check_output(self): + place = core.NPUPlace(0) + for data_format in self.data_formats: + self.check_with_place(place, data_format, self.dtype, [2, 3, 4, 5]) + + def init_kernel_type(self): + pass + + +class TestFP16BatchNormOpInference(TestBatchNormOpInference): + def setUp(self): + self.dtype = np.float16 + self.init_kernel_type() + self.data_formats = ["NCHW", "NHWC"] + + +class TestBatchNormOpTraining(unittest.TestCase): + def set_npu(self): + self.__class__.use_npu = True + + def 
setUp(self): + self.set_npu() + self.use_mkldnn = False + self.fuse_with_relu = False + self.data_formats = ["NCHW", "NHWC"] + self.momentum = 0.9 + self.use_momentum_variable = False + self.epsilon = 0.00001 + self.init_kernel_type() + self.init_test_case() + + def init_test_case(self): + self.use_global_stats = False + self.no_grad_set = set() + self.fetch_list = [ + "y", 'mean', 'variance', 'saved_mean', 'saved_variance', 'x@GRAD', + 'scale@GRAD', 'bias@GRAD' + ] + + def __assert_close(self, tensor, np_array, msg, atol=1e-4): + np.allclose(np.array(tensor), np_array, atol=atol) + + def ref_forward_backward(self, x, y_grad, scale, bias, mean, variance, + epsilon, momentum, shape, data_layout): + # run forward + y, saved_mean, var_ref = _reference_training(x, scale, bias, epsilon, + data_layout) + mean_out = saved_mean * (1. - momentum) + momentum * mean + variance_out = var_ref * (1. - momentum) + momentum * variance + saved_variance = 1. / np.sqrt(var_ref + epsilon) + # run backward + x_grad, scale_grad, bias_grad = _reference_grad( + x, y_grad, scale, saved_mean, var_ref, epsilon, data_layout) + + return y, mean_out, variance_out, saved_mean, saved_variance, x_grad, scale_grad, bias_grad + + def set_mean_variance(self, scale_shape, x, data_layout): + mean, variance = _cal_mean_variance(x, self.epsilon, data_layout) + mean_pre = np.zeros(scale_shape).astype(np.float32) + variance_pre = np.ones(scale_shape).astype(np.float32) + # computing global mean/variance for one step + if self.use_global_stats: + mom = self.momentum + mean = mean * (1. - mom) + mom * mean_pre + variance = variance * (1. - mom) + mom * variance_pre + return mean, variance + + def test_forward_backward(self): + def test_with_place(place, data_layout, shape): + # attr + epsilon = self.epsilon + momentum = self.momentum + if data_layout == "NCHW": + n, c, h, w = shape[0], shape[1], shape[2], shape[3] + else: + n, h, w, c = shape[0], shape[1], shape[2], shape[3] + scale_shape = [c] + + np.random.seed(123) + x = np.random.random_sample(shape).astype(np.float32) + scale = np.random.random_sample(scale_shape).astype(np.float32) + bias = np.random.random_sample(scale_shape).astype(np.float32) + mean, variance = self.set_mean_variance(scale_shape, x, data_layout) + y_grad = np.random.random_sample(shape).astype(np.float32) + momentum_var = np.array([momentum]).astype(np.float32) + + y, mean_out, variance_out, saved_mean, saved_variance, x_grad, scale_grad, bias_grad = self.ref_forward_backward( + x, y_grad, scale, bias, mean, variance, epsilon, momentum, + shape, data_layout) + + var_dict = locals() + var_dict['y@GRAD'] = y_grad + var_dict['x@GRAD'] = x_grad + var_dict['scale@GRAD'] = scale_grad + var_dict['bias@GRAD'] = bias_grad + + var_names = [ + 'x', 'scale', 'bias', 'mean', 'variance', "y", 'saved_mean', + 'saved_variance', 'momentum_var' + ] + ground_truth = {name: var_dict[name] for name in var_names} + + program = fluid.Program() + with fluid.program_guard(program): + block = program.global_block() + for name in ground_truth: + block.create_var( + name=name, + dtype='float32', + shape=ground_truth[name].shape) + inputs = { + "X": block.var('x'), + "Scale": block.var('scale'), + "Bias": block.var('bias'), + "Mean": block.var('mean'), + "Variance": block.var('variance') + } + attrs = { + "epsilon": epsilon, + "is_test": False, + "data_layout": data_layout, + "use_mkldnn": self.use_mkldnn, + "fuse_with_relu": self.fuse_with_relu, + "use_global_stats": self.use_global_stats + } + if self.use_momentum_variable: + 
inputs['MomentumTensor'] = block.var('momentum_var') + else: + attrs['momentum'] = momentum + + outputs = { + "Y": block.var("y"), + "MeanOut": block.var('mean'), # share memory + "VarianceOut": block.var('variance'), # share memory + "SavedMean": block.var('saved_mean'), + "SavedVariance": block.var('saved_variance') + } + block.create_var(name="reserve_space", dtype='float32') + outputs["ReserveSpace"] = block.var('reserve_space') + bn_op = block.append_op( + type="batch_norm", + inputs=inputs, + outputs=outputs, + attrs=attrs) + block.create_var(name='y@GRAD', dtype='float32', shape=y.shape) + + # generate backward op_desc + grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc( + bn_op.desc, self.no_grad_set, []) + grad_op_desc = grad_op_desc_list[0] + new_op_desc = block.desc.append_op() + new_op_desc.copy_from(grad_op_desc) + for var_name in grad_op_desc.output_arg_names(): + block.desc.var(var_name.encode("ascii")) + grad_op_desc.infer_var_type(block.desc) + grad_op_desc.infer_shape(block.desc) + for arg in grad_op_desc.output_arg_names(): + grad_var = block.desc.find_var(arg.encode("ascii")) + grad_var.set_dtype(core.VarDesc.VarType.FP32) + + program._sync_with_cpp() + + exe = fluid.Executor(place) + out = exe.run(program, + feed={ + name: var_dict[name] + for name in [ + 'x', 'scale', 'bias', 'mean', 'variance', + 'y@GRAD', 'momentum_var' + ] + }, + fetch_list=self.fetch_list) + + for id, name in enumerate(self.fetch_list): + if name == 'variance': + self.__assert_close( + var_dict[name], out[id], name, atol=1e-3) + continue + self.__assert_close(var_dict[name], out[id], name) + print("op test forward passed: ", str(place), data_layout) + + for data_format in self.data_formats: + test_with_place(core.NPUPlace(0), data_format, [2, 3, 4, 5]) + + def init_kernel_type(self): + pass + + +class TestBatchNormOpTrainingCase1(TestBatchNormOpTraining): + def init_test_case(self): + self.use_global_stats = False + self.no_grad_set = set(['scale@GRAD', 'bias@GRAD']) + self.fetch_list = ['y', 'mean', 'variance', 'x@GRAD'] + + +class TestBatchNormOpTrainingMomentumVariable(TestBatchNormOpTraining): + def init_test_case(self): + self.use_momentum_variable = True + self.use_global_stats = False + self.no_grad_set = set() + self.fetch_list = [ + 'y', 'mean', 'variance', 'saved_mean', 'saved_variance', 'x@GRAD', + 'scale@GRAD', 'bias@GRAD' + ] + + +class TestBatchNormOpFreezeStatsTraining(TestBatchNormOpTraining): + def init_test_case(self): + self.use_global_stats = True + self.no_grad_set = set() + self.fetch_list = [ + 'y', 'mean', 'variance', 'x@GRAD', 'scale@GRAD', 'bias@GRAD' + ] + + def reference_grad(self, x, y_grad, scale, mean, var, epsilon, data_format): + if data_format == "NCHW": + x = np.transpose(x, (0, 2, 3, 1)) + y_grad = np.transpose(y_grad, (0, 2, 3, 1)) + + x_grad = scale * y_grad / np.sqrt(var + epsilon) + grad_scale = np.sum(y_grad * (x - mean) / np.sqrt(var + epsilon), + axis=(0, 1, 2)) + grad_offset = np.sum(y_grad, axis=(0, 1, 2)) + + # transfer back to N, C, H, W + if data_format == "NCHW": + x_grad = np.transpose(x_grad, (0, 3, 1, 2)) + x = np.transpose(x, (0, 3, 1, 2)) + y_grad = np.transpose(y_grad, (0, 3, 1, 2)) + + return x_grad, grad_scale, grad_offset + + def ref_forward_backward(self, x, y_grad, scale, bias, mean, variance, + epsilon, momentum, shape, data_layout): + if data_layout != "NCHW" and data_layout != "NHWC": + raise ValueError("Unknown data order.") + + if data_layout == "NCHW": + x = np.transpose(x, (0, 2, 3, 1)) + + # run normalizaton + normalized = 
(x - mean) / np.sqrt(variance + epsilon) + y = normalized * scale + bias + + # transfer back to N, C, H, W + if data_layout == "NCHW": + x = np.transpose(x, (0, 3, 1, 2)) + y = np.transpose(y, (0, 3, 1, 2)) + + mean_out = mean + variance_out = variance + saved_variance = 1. / np.sqrt(variance + epsilon) + # run backward + x_grad, scale_grad, bias_grad = self.reference_grad( + x, y_grad, scale, mean, variance, epsilon, data_layout) + + return y, mean_out, variance_out, mean, saved_variance, x_grad, scale_grad, bias_grad + + +class TestBatchNormOpFreezeStatsAndScaleBiasTraining( + TestBatchNormOpFreezeStatsTraining): + def init_test_case(self): + self.use_global_stats = True + self.no_grad_set = set(['scale@GRAD', 'bias@GRAD']) + self.fetch_list = ['y', 'mean', 'variance', 'x@GRAD'] + + +class TestDygraphBatchNormTrainableStats(unittest.TestCase): + def test_dygraph(self): + places = [fluid.NPUPlace(0)] + for p in places: + shape = [4, 10, 4, 4] + + def compute(x, is_test, trainable_statistics): + with fluid.dygraph.guard(p): + bn = fluid.dygraph.BatchNorm( + shape[1], + is_test=is_test, + trainable_statistics=trainable_statistics) + y = bn(fluid.dygraph.to_variable(x)) + return y.numpy() + + x = np.random.randn(*shape).astype("float32") + y1 = compute(x, False, False) + y2 = compute(x, True, True) + self.assertTrue(np.allclose(y1, y2)) + + def test_static(self): + places = [fluid.NPUPlace(0)] + for p in places: + exe = fluid.Executor(p) + shape = [4, 10, 16, 16] + + def compute(x_np, is_test, trainable_statistics): + with program_guard(Program(), Program()): + bn = fluid.dygraph.BatchNorm( + shape[1], + is_test=is_test, + trainable_statistics=trainable_statistics) + x = fluid.data(name='x', shape=x_np.shape, dtype=x_np.dtype) + y = bn(x) + exe.run(fluid.default_startup_program()) + r = exe.run(feed={'x': x_np}, fetch_list=[y])[0] + return r + + x = np.random.randn(*shape).astype("float32") + y1 = compute(x, False, False) + y2 = compute(x, True, True) + self.assertTrue(np.allclose(y1, y2)) + + +if __name__ == "__main__": + unittest.main() From f6fab559ccdea51daad72884a4bb7da48566b835 Mon Sep 17 00:00:00 2001 From: ronnywang <524019753@qq.com> Date: Wed, 11 Aug 2021 04:43:23 -0500 Subject: [PATCH 009/126] [NPU] add reduce_mean_op_npu and test (#34053) * add reduce_mean_op_npu and test * remove skip.If * update --- .../reduce_ops/reduce_mean_op_npu.cc | 112 +++++++++++ .../unittests/npu/test_reduce_mean_op_npu.py | 184 ++++++++++++++++++ 2 files changed, 296 insertions(+) create mode 100644 paddle/fluid/operators/reduce_ops/reduce_mean_op_npu.cc create mode 100644 python/paddle/fluid/tests/unittests/npu/test_reduce_mean_op_npu.py diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op_npu.cc b/paddle/fluid/operators/reduce_ops/reduce_mean_op_npu.cc new file mode 100644 index 00000000000000..55a6a75740f3a7 --- /dev/null +++ b/paddle/fluid/operators/reduce_ops/reduce_mean_op_npu.cc @@ -0,0 +1,112 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#include "paddle/fluid/operators/reduce_ops/reduce_mean_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_npu.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +template +class NPUReduceMeanOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("X"); + auto* output = ctx.Output("Out"); + output->mutable_data(ctx.GetPlace()); + + bool reduce_all = ctx.Attr("reduce_all"); + auto dims = ctx.Attr>("dim"); + bool keep_dim = ctx.Attr("keep_dim"); + + auto input_dims_vec = framework::vectorize(input->dims()); + if (reduce_all) { + dims.clear(); + for (size_t i = 0; i < input_dims_vec.size(); i++) { + dims.push_back(static_cast(i)); + } + } + + const auto& runner = NpuOpRunner("ReduceMeanD", {*input}, {*output}, + {{"axes", dims}, {"keep_dims", keep_dim}}); + auto stream = + ctx.template device_context() + .stream(); + runner.Run(stream); + } +}; + +template +class NPUReduceMeanGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("X"); + auto* output_grad = ctx.Input(framework::GradVarName("Out")); + auto* input_grad = ctx.Output(framework::GradVarName("X")); + input_grad->mutable_data(ctx.GetPlace()); + + bool reduce_all = ctx.Attr("reduce_all"); + auto reduce_dims = ctx.Attr>("dim"); + auto input_dims_vec = framework::vectorize(input->dims()); + + int reduce_numel = 1; + if (reduce_all) { + reduce_dims.clear(); + for (size_t d = 0; d < input_dims_vec.size(); ++d) { + reduce_dims.push_back(static_cast(d)); + } + } + for (auto& d : reduce_dims) { + if (d < 0) { + d = d + input_dims_vec.size(); + } + reduce_numel *= input_dims_vec[d]; + } + + const auto& runner = + NpuOpRunner("FillV2D", {}, {*input_grad}, + {{"value", 1.0f / static_cast(reduce_numel)}, + {"dims", input_dims_vec}}); + auto stream = + ctx.template device_context() + .stream(); + runner.Run(stream); + + Tensor transformed_input_grad, transformed_out_grad; + Tensor tmp_output_grad; + auto tmp_output_dims_vec = input_dims_vec; + for (auto d : reduce_dims) { + tmp_output_dims_vec[d] = 1; + } + tmp_output_grad.ShareDataWith(*output_grad); + tmp_output_grad.Resize(framework::make_ddim(tmp_output_dims_vec)); + auto& dev_ctx = + ctx.template device_context(); + NpuElementWiseOpBroadcast(dev_ctx, input_grad, &tmp_output_grad, 0, + &transformed_input_grad, + &transformed_out_grad); + const auto& runner2 = + NpuOpRunner("Mul", {transformed_input_grad, transformed_out_grad}, + {*input_grad}, {}); + runner2.Run(stream); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(reduce_mean, ops::NPUReduceMeanOpKernel); +REGISTER_OP_NPU_KERNEL(reduce_mean_grad, ops::NPUReduceMeanGradOpKernel); diff --git a/python/paddle/fluid/tests/unittests/npu/test_reduce_mean_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_reduce_mean_op_npu.py new file mode 100644 index 00000000000000..ed27c335a4e326 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_reduce_mean_op_npu.py @@ -0,0 +1,184 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid + +paddle.enable_static() + + +class TestMeanOp(OpTest): + def set_npu(self): + self.__class__.use_npu = True + + def setUp(self): + self.set_npu() + self.op_type = "reduce_mean" + self.inputs = {'X': np.random.random((5, 6, 10)).astype("float32")} + self.outputs = {'Out': self.inputs['X'].mean(axis=0)} + + def test_check_output(self): + self.check_output_with_place(paddle.NPUPlace(0)) + + def test_check_grad(self): + self.check_grad_with_place(paddle.NPUPlace(0), ['X'], 'Out') + + +class TestMeanOp5D(TestMeanOp): + def setUp(self): + self.set_npu() + self.op_type = "reduce_mean" + self.inputs = { + 'X': np.random.random((1, 2, 5, 6, 10)).astype("float32") + } + self.outputs = {'Out': self.inputs['X'].mean(axis=0)} + + +class TestMeanOp6D(TestMeanOp): + def setUp(self): + self.set_npu() + self.op_type = "reduce_mean" + self.inputs = { + 'X': np.random.random((1, 1, 2, 5, 6, 10)).astype("float32") + } + self.outputs = {'Out': self.inputs['X'].mean(axis=0)} + + +class TestMeanOp8D(TestMeanOp): + def setUp(self): + self.set_npu() + self.op_type = "reduce_mean" + self.inputs = { + 'X': np.random.random((1, 3, 1, 2, 1, 4, 3, 10)).astype("float32") + } + self.attrs = {'dim': (0, 3)} + self.outputs = {'Out': self.inputs['X'].mean(axis=(0, 3))} + + +class Test1DReduce(TestMeanOp): + def setUp(self): + self.set_npu() + self.op_type = "reduce_mean" + self.inputs = {'X': np.random.random(120).astype("float32")} + self.outputs = {'Out': self.inputs['X'].mean(axis=0)} + + +class Test2DReduce0(Test1DReduce): + def setUp(self): + self.set_npu() + self.op_type = "reduce_mean" + self.attrs = {'dim': [0]} + self.inputs = {'X': np.random.random((20, 10)).astype("float32")} + self.outputs = {'Out': self.inputs['X'].mean(axis=0)} + + +class Test2DReduce1(Test1DReduce): + def setUp(self): + self.set_npu() + self.op_type = "reduce_mean" + self.attrs = {'dim': [1]} + self.inputs = {'X': np.random.random((20, 10)).astype("float32")} + self.outputs = { + 'Out': self.inputs['X'].mean(axis=tuple(self.attrs['dim'])) + } + + +class Test3DReduce0(Test1DReduce): + def setUp(self): + self.set_npu() + self.op_type = "reduce_mean" + self.attrs = {'dim': [1]} + self.inputs = {'X': np.random.random((5, 6, 7)).astype("float32")} + self.outputs = { + 'Out': self.inputs['X'].mean(axis=tuple(self.attrs['dim'])) + } + + +class Test3DReduce1(Test1DReduce): + def setUp(self): + self.set_npu() + self.op_type = "reduce_mean" + self.attrs = {'dim': [2]} + self.inputs = {'X': np.random.random((5, 6, 7)).astype("float32")} + self.outputs = { + 'Out': self.inputs['X'].mean(axis=tuple(self.attrs['dim'])) + } + + +class Test3DReduce2(Test1DReduce): + def setUp(self): + self.set_npu() + self.op_type = "reduce_mean" + self.attrs = {'dim': [-2]} + self.inputs = {'X': np.random.random((5, 6, 7)).astype("float32")} + self.outputs = { + 'Out': self.inputs['X'].mean(axis=tuple(self.attrs['dim'])) + } + + +class Test3DReduce3(Test1DReduce): + def 
setUp(self): + self.set_npu() + self.op_type = "reduce_mean" + self.attrs = {'dim': [1, 2]} + self.inputs = {'X': np.random.random((5, 6, 7)).astype("float32")} + self.outputs = { + 'Out': self.inputs['X'].mean(axis=tuple(self.attrs['dim'])) + } + + +class TestKeepDimReduce(Test1DReduce): + def setUp(self): + self.set_npu() + self.op_type = "reduce_mean" + self.inputs = {'X': np.random.random((5, 6, 10)).astype("float32")} + self.attrs = {'dim': [1], 'keep_dim': True} + self.outputs = { + 'Out': self.inputs['X'].mean( + axis=tuple(self.attrs['dim']), keepdims=self.attrs['keep_dim']) + } + + +class TestKeepDim8DReduce(Test1DReduce): + def setUp(self): + self.set_npu() + self.op_type = "reduce_mean" + self.inputs = { + 'X': np.random.random((2, 5, 3, 2, 2, 3, 4, 2)).astype("float32") + } + self.attrs = {'dim': (3, 4, 5), 'keep_dim': True} + self.outputs = { + 'Out': self.inputs['X'].mean( + axis=tuple(self.attrs['dim']), keepdims=self.attrs['keep_dim']) + } + + +class TestReduceAll(Test1DReduce): + def setUp(self): + self.set_npu() + self.op_type = "reduce_mean" + self.inputs = {'X': np.random.random((5, 6, 2, 10)).astype("float32")} + self.attrs = {'reduce_all': True} + self.outputs = {'Out': self.inputs['X'].mean()} + + +if __name__ == '__main__': + unittest.main() From 9e3e08f0a9390a1761c8b8e6d32fd530ccc70b72 Mon Sep 17 00:00:00 2001 From: ronnywang <524019753@qq.com> Date: Wed, 11 Aug 2021 04:43:44 -0500 Subject: [PATCH 010/126] [NPU] add momentum_op_npu and test (#34082) * add momentum_op_npu and test * update * fix hang --- .../operators/optimizers/momentum_op_npu.cc | 96 +++++ paddle/fluid/pybind/pybind.cc | 9 +- .../unittests/npu/test_momentum_op_npu.py | 328 ++++++++++++++++++ 3 files changed, 432 insertions(+), 1 deletion(-) create mode 100644 paddle/fluid/operators/optimizers/momentum_op_npu.cc create mode 100644 python/paddle/fluid/tests/unittests/npu/test_momentum_op_npu.py diff --git a/paddle/fluid/operators/optimizers/momentum_op_npu.cc b/paddle/fluid/operators/optimizers/momentum_op_npu.cc new file mode 100644 index 00000000000000..2f2fdf8527450d --- /dev/null +++ b/paddle/fluid/operators/optimizers/momentum_op_npu.cc @@ -0,0 +1,96 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#include "paddle/fluid/operators/optimizers/momentum_op.h" + +#include "paddle/fluid/operators/npu_op_runner.h" +#include "paddle/fluid/operators/optimizers/sgd_op.h" + +namespace paddle { +namespace operators { + +template +class NPUMomentumOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& dev_ctx = ctx.template device_context(); + + std::string regularization_method = + ctx.Attr("regularization_method"); + auto regularization_coeff = ctx.Attr("regularization_coeff"); + RegularizationType regularization_flag{ + RegularizationType::kNONE}; // disable regularization + if (regularization_method == "l2_decay") { + regularization_flag = RegularizationType::kL2DECAY; + } + + T mu = static_cast(ctx.Attr("mu")); + bool use_nesterov = ctx.Attr("use_nesterov"); + + auto learning_rate = ctx.Input("LearningRate"); + auto param = ctx.Input("Param"); + auto velocity = ctx.Input("Velocity"); + + auto param_out = ctx.Output("ParamOut"); + auto velocity_out = ctx.Output("VelocityOut"); + + param_out->mutable_data(ctx.GetPlace()); + velocity_out->mutable_data(ctx.GetPlace()); + + auto* grad_var = ctx.InputVar("Grad"); + if (grad_var->IsType()) { + auto grad = ctx.Input("Grad"); + Tensor mu_tensor; + mu_tensor.mutable_data(framework::make_ddim({1}), ctx.GetPlace()); + FillNpuTensorWithConstant(&mu_tensor, mu); + + Tensor regularized_grad; + if (regularization_flag == RegularizationType::kL2DECAY) { + regularized_grad.mutable_data(grad->dims(), ctx.GetPlace()); + const auto& runner1 = NpuOpRunner("Muls", {*param}, {regularized_grad}, + {{"value", regularization_coeff}}); + runner1.Run(dev_ctx.stream()); + const auto& runner2 = NpuOpRunner("Add", {regularized_grad, *grad}, + {regularized_grad}, {}); + runner2.Run(dev_ctx.stream()); + } else { + regularized_grad.ShareDataWith(*grad); + } + framework::TensorCopy(*param, ctx.GetPlace(), dev_ctx, param_out); + framework::TensorCopy(*velocity, ctx.GetPlace(), dev_ctx, velocity_out); + // NOTE: ApplyMomentum will change the input + const auto& runner = NpuOpRunner( + "ApplyMomentum", {*param_out, *velocity_out, *learning_rate, + regularized_grad, mu_tensor}, + {*param_out}, {{"use_nesterov", use_nesterov}}); + runner.Run(dev_ctx.stream()); + } else if (grad_var->IsType()) { + PADDLE_ENFORCE_EQ(false, true, platform::errors::PermissionDenied( + "Unsupport SparseMomentum")); + } else { + PADDLE_ENFORCE_EQ(false, true, + platform::errors::PermissionDenied( + "Unsupported Variable Type of Grad " + "in MomentumOp. Excepted LodTensor " + "or SelectedRows, But received [%s]", + paddle::framework::ToTypeName(grad_var->Type()))); + } + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; +REGISTER_OP_NPU_KERNEL(momentum, ops::NPUMomentumOpKernel, + ops::NPUMomentumOpKernel); diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 859e0ad916dd47..589ea088a6cf03 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -2217,7 +2217,14 @@ All parameter, weight, gradient are variables in Paddle. 
#ifdef PADDLE_WITH_ASCEND_CL m.def("get_npu_device_count", platform::GetNPUDeviceCount); - m.def("npu_finalize", []() { platform::AclInstance::Instance().Finalize(); }); + m.def("npu_finalize", []() { + auto &pool = platform::DeviceContextPool::Instance(); + auto devices = platform::GetSelectedNPUDevices(); + for (size_t i = 0; i < devices.size(); ++i) { + pool.Get(platform::NPUPlace(devices[i]))->Wait(); + } + platform::AclInstance::Instance().Finalize(); + }); py::class_(m, "NPUProfConfigWrapper"); diff --git a/python/paddle/fluid/tests/unittests/npu/test_momentum_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_momentum_op_npu.py new file mode 100644 index 00000000000000..b8c261c2555c46 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_momentum_op_npu.py @@ -0,0 +1,328 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.op import Operator +from test_momentum_op import calculate_momentum_by_numpy + +paddle.enable_static() + + +class TestMomentumOp1(OpTest): + def set_npu(self): + self.__class__.use_npu = True + + def setUp(self): + self.set_npu() + self.op_type = "momentum" + self.init_dtype() + self.init_case() + + param = np.random.random(self.shape).astype(self.dtype) + grad = np.random.random(self.shape).astype(self.dtype) + velocity = np.zeros(self.shape).astype(self.dtype) + learning_rate = np.array([0.001]).astype(np.float32) + mu = 0.0001 + + self.inputs = { + 'Param': param, + 'Grad': grad, + 'Velocity': velocity, + 'LearningRate': learning_rate + } + + self.attrs = {'mu': mu, 'use_nesterov': self.use_nesterov} + + param_out, velocity_out = calculate_momentum_by_numpy( + param=param, + grad=grad, + mu=mu, + velocity=velocity, + use_nesterov=self.use_nesterov, + learning_rate=learning_rate) + + self.outputs = {'ParamOut': param_out, 'VelocityOut': velocity_out} + + def init_case(self): + self.shape = (123, 321) + self.use_nesterov = False + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(core.NPUPlace(0)) + + +class TestMomentumOpFp16(TestMomentumOp1): + def init_dtype(self): + self.dtype = np.float16 + + def test_check_output(self): + self.check_output(atol=1e-3) + + +class TestMomentumOp2(TestMomentumOp1): + def init_case(self): + self.shape = (123, 321) + self.use_nesterov = True + + +class TestMomentumV2(unittest.TestCase): + def test_momentum_dygraph(self): + paddle.disable_static(place=fluid.NPUPlace(0)) + value = np.arange(26).reshape(2, 13).astype("float32") + a = paddle.to_tensor(value) + linear = paddle.nn.Linear(13, 5) + # This can be any optimizer supported by dygraph. 
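+        # The expected update rule (mirroring calculate_momentum_by_numpy):
+        #   velocity_out = mu * velocity + grad
+        #   param_out    = param - learning_rate * velocity_out
+        # and, with use_nesterov, the parameter step uses
+        # grad + mu * velocity_out instead of velocity_out.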
+ adam = paddle.optimizer.Momentum( + learning_rate=0.01, momentum=0.9, parameters=linear.parameters()) + out = linear(a) + out.backward() + adam.step() + adam.clear_gradients() + + def test_momentum(self): + paddle.enable_static() + place = fluid.NPUPlace(0) + main = fluid.Program() + with fluid.program_guard(main): + x = fluid.layers.data(name='x', shape=[13], dtype='float32') + y = fluid.layers.data(name='y', shape=[1], dtype='float32') + y_predict = fluid.layers.fc(input=x, size=1, act=None) + cost = fluid.layers.square_error_cost(input=y_predict, label=y) + avg_cost = fluid.layers.mean(cost) + + rms_optimizer = paddle.optimizer.Momentum( + learning_rate=0.1, momentum=0.9) + rms_optimizer.minimize(avg_cost) + + fetch_list = [avg_cost] + train_reader = paddle.batch( + paddle.dataset.uci_housing.train(), batch_size=1) + feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + for data in train_reader(): + exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list) + + def test_raise_error(self): + self.assertRaises( + ValueError, paddle.optimizer.Momentum, learning_rate=None) + self.assertRaises(ValueError, paddle.optimizer.Momentum, momentum=None) + + +class TestMomentumOpWithDecay(OpTest): + def set_npu(self): + self.__class__.use_npu = True + + def setUp(self): + self.set_npu() + self.op_type = "momentum" + self.dtype = np.float32 + self.use_nesterov = True + self.regularization_method = 'l2_decay' + self.regularization_coeff = 0.9 + self.init_config() + + param = np.random.random((123, 321)).astype(self.dtype) + grad = np.random.random((123, 321)).astype(self.dtype) + velocity = np.zeros((123, 321)).astype(self.dtype) + learning_rate = np.array([0.001]).astype(np.float32) + mu = 0.0001 + use_nesterov = self.use_nesterov + regularization_method = self.regularization_method + regularization_coeff = self.regularization_coeff + + self.inputs = { + 'Param': param, + 'Grad': grad, + 'Velocity': velocity, + 'LearningRate': learning_rate + } + + self.attrs = { + 'mu': mu, + 'use_nesterov': use_nesterov, + 'regularization_method': regularization_method, + 'regularization_coeff': regularization_coeff + } + + grad = grad + regularization_coeff * param + + param_out, velocity_out = calculate_momentum_by_numpy( + param=param, + grad=grad, + mu=mu, + velocity=velocity, + use_nesterov=use_nesterov, + learning_rate=learning_rate) + + self.outputs = {'ParamOut': param_out, 'VelocityOut': velocity_out} + + def init_config(self): + pass + + def test_check_output(self): + paddle.enable_static() + self.check_output_with_place(core.NPUPlace(0), atol=3e-3) + + +class TestMomentumOpWithDecayFP16(TestMomentumOpWithDecay): + def init_config(self): + self.dtype = np.float16 + + def test_check_output(self): + paddle.enable_static() + self.check_output(atol=1e-3) + + +class TestMomentumOpWithDecay2(TestMomentumOpWithDecay): + def init_config(self): + self.use_nesterov = False + + +class TestMomentumOpWithDecayAPI(unittest.TestCase): + def _test_momentum_dygraph_common(self, regularization): + paddle.disable_static(fluid.NPUPlace(0)) + inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") + linear = paddle.nn.Linear(10, 10) + inp = paddle.to_tensor(inp) + out = linear(inp) + loss = paddle.mean(out) + # This can be any optimizer supported by dygraph. 
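+        # With l2_decay regularization the NPU kernel folds the decay term
+        # into the gradient first (grad <- grad + regularization_coeff * param),
+        # matching the reference computation in TestMomentumOpWithDecay above.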
+ momentum = paddle.fluid.contrib.optimizer.Momentum( + learning_rate=0.01, + momentum=0.9, + parameter_list=linear.parameters(), + regularization=regularization) + momentum.minimize(loss) + + def test_momentum_dygraph_1(self): + self._test_momentum_dygraph_common( + regularization=paddle.fluid.regularizer.L2Decay( + regularization_coeff=0.1)) + + def test_momentum_static(self): + paddle.enable_static() + place = fluid.NPUPlace(0) + main = fluid.Program() + with fluid.program_guard(main): + x = fluid.layers.data(name='x', shape=[13], dtype='float32') + y = fluid.layers.data(name='y', shape=[1], dtype='float32') + y_predict = fluid.layers.fc(input=x, size=1, act=None) + cost = fluid.layers.square_error_cost(input=y_predict, label=y) + avg_cost = fluid.layers.mean(cost) + + momentum_optimizer = paddle.fluid.contrib.optimizer.Momentum( + learning_rate=0.1, momentum=0.9) + momentum_optimizer.minimize(avg_cost) + + fetch_list = [avg_cost] + train_reader = paddle.batch( + paddle.dataset.uci_housing.train(), batch_size=1) + feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + for data in train_reader(): + exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list) + + +class TestMomentumOpVsMomentumOpWithDecayAPI(unittest.TestCase): + def __update_params(self, momentum, linear): + for i in range(10): + inp = paddle.full( + shape=[2, 2], fill_value=i, dtype='float32').astype("float32") + inp = paddle.to_tensor(inp) + out = linear(inp) + loss = paddle.mean(out) + loss.backward() + momentum.minimize(loss) + linear.clear_gradients() + + def __test_vs(self, place=fluid.NPUPlace(0)): + paddle.disable_static(place=place) + linear_old = paddle.nn.Linear( + 2, + 2, + weight_attr=paddle.nn.initializer.Constant(value=2.0), + bias_attr=paddle.nn.initializer.Constant(value=2.0)) + momentum_old = paddle.fluid.optimizer.Momentum( + learning_rate=0.01, + momentum=0.9, + parameter_list=linear_old.parameters(), + regularization=paddle.fluid.regularizer.L2Decay( + regularization_coeff=0.1)) + self.__update_params(momentum=momentum_old, linear=linear_old) + + linear_new = paddle.nn.Linear( + 2, + 2, + weight_attr=paddle.nn.initializer.Constant(value=2.0), + bias_attr=paddle.nn.initializer.Constant(value=2.0)) + momentum_new = paddle.fluid.contrib.optimizer.Momentum( + learning_rate=0.01, + momentum=0.9, + parameter_list=linear_new.parameters(), + regularization=paddle.fluid.regularizer.L2Decay( + regularization_coeff=0.1)) + self.__update_params(momentum=momentum_new, linear=linear_new) + + self.assertEqual( + (linear_old.weight.numpy() == linear_new.weight.numpy()).all(), + True, + 'the param weight updated by two Momentum optimizers should equal') + + def test_vs(self, place=fluid.NPUPlace(0)): + self.__test_vs(place=place) + + +class TestMomentumV2Group(TestMomentumV2): + def test_momentum_dygraph(self): + paddle.disable_static(place=fluid.NPUPlace(0)) + value = np.arange(26).reshape(2, 13).astype("float32") + a = paddle.to_tensor(value) + linear_1 = paddle.nn.Linear(13, 5) + linear_2 = paddle.nn.Linear(5, 3) + # This can be any optimizer supported by dygraph. 
+ adam = paddle.optimizer.Momentum( + learning_rate=0.01, + parameters=[{ + 'params': linear_1.parameters() + }, { + 'params': linear_2.parameters(), + 'weight_decay': 0.001, + 'learning_rate': 0.1, + 'momentum': 0.99 + }], + weight_decay=0.1, + momentum=0.9) + out = linear_1(a) + out = linear_2(out) + out.backward() + adam.step() + adam.clear_gradients() + + +if __name__ == "__main__": + unittest.main() From d45d311296c24e1766851072548edecc418615d9 Mon Sep 17 00:00:00 2001 From: Roc <30228238+sljlp@users.noreply.github.com> Date: Wed, 11 Aug 2021 18:10:12 +0800 Subject: [PATCH 011/126] split_op for npu (#34699) --- paddle/fluid/operators/split_op_npu.cc | 83 +++++++++ .../tests/unittests/npu/test_split_op_npu.py | 158 ++++++++++++++++++ 2 files changed, 241 insertions(+) create mode 100644 paddle/fluid/operators/split_op_npu.cc create mode 100644 python/paddle/fluid/tests/unittests/npu/test_split_op_npu.py diff --git a/paddle/fluid/operators/split_op_npu.cc b/paddle/fluid/operators/split_op_npu.cc new file mode 100644 index 00000000000000..3cca4f89c10d0e --- /dev/null +++ b/paddle/fluid/operators/split_op_npu.cc @@ -0,0 +1,83 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include + +#include "paddle/fluid/operators/npu_op_runner.h" +#include "paddle/fluid/operators/split_op.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class SplitNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in = ctx.Input("X"); + auto outs = ctx.MultiOutput("Out"); + int num = ctx.Attr("num"); + std::vector sections = ctx.Attr>("sections"); + int axis = ctx.Attr("axis"); + + if (ctx.HasInput("AxisTensor")) { + // TODO(liupeng51): + PADDLE_THROW(platform::errors::Unimplemented( + "The AxisTensor is not supported on NPU now.")); + } + if (ctx.HasInput("SectionsTensorList")) { + // TODO(liupeng51): + PADDLE_THROW(platform::errors::Unimplemented( + "The SectionsTensorList is not supported on NPU now.")); + } + + std::vector outputs; + auto place = ctx.GetPlace(); + for (size_t j = 0; j < outs.size(); ++j) { + outs[j]->mutable_data(ctx.GetPlace()); + outputs.push_back(*outs[j]); + } + auto stream = + ctx.template device_context() + .stream(); + NpuOpRunner runner; + if (sections.size() == 0) { + framework::NPUAttributeMap attr_input = {{"num_split", num}, + {"split_dim", axis}}; + runner.SetType("SplitD").AddInputs({*in}).AddOutputs(outputs).AddAttrs( + attr_input); + } else { + framework::NPUAttributeMap attr_input = { + {"size_splits", sections}, + {"split_dim", axis}, + {"num_split", static_cast(sections.size())}}; + runner.SetType("SplitVD").AddInput(*in).AddOutputs(outputs).AddAttrs( + attr_input); + } + + runner.Run(stream); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(split, ops::SplitNPUKernel, + 
ops::SplitNPUKernel, + ops::SplitNPUKernel); diff --git a/python/paddle/fluid/tests/unittests/npu/test_split_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_split_op_npu.py new file mode 100644 index 00000000000000..fd48ec958e4a4c --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_split_op_npu.py @@ -0,0 +1,158 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core + +paddle.enable_static() +SEED = 2021 + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestCase1(OpTest): + def setUp(self): + self.set_npu() + self.set_example() + self.op_type = "split" + self.place = paddle.NPUPlace(0) + ipt = self.x.astype(self.dtype) + axis = self.axis if isinstance(self.axis, int) else int(self.axis[0]) + tmp_outs = np.split( + ipt, axis=axis, indices_or_sections=self.num_or_sections) + tmp_outs = [o.astype(self.dtype) for o in tmp_outs] + self.outputs = {'Out': []} + self.outs = [] + for i, o in enumerate(tmp_outs): + self.outputs["Out"].append((str(i), o)) + self.outs.append(str(i)) + + self.attrs = {"axis": self.axis, "num": self.num_or_sections} + self.inputs = {} + self.inputs.update({'X': ipt.astype(self.dtype)}) + + def set_npu(self): + self.__class__.use_npu = True + self.__class__.op_type = "split" + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ["X"], self.outs) + + def set_example(self): + self.dtype = "float32" + self.x = np.random.random((2, 4, 6)) + self.axis = 1 + self.num_or_sections = 2 + + +class TestCase2(TestCase1): + def set_example(self): + self.dtype = "float32" + self.x = np.random.random((20, 4, 50)) + self.axis = 0 + self.num_or_sections = 4 + + +class TestCase4(TestCase1): + def set_example(self): + self.dtype = "float16" + self.x = np.random.random((4, 50, 20)) + self.axis = 2 + self.num_or_sections = 4 + + +# Test Sections +class TestCase5(TestCase1): + def set_example(self): + super().set_example() + self.x = np.random.random((2, 10, 4)) + self.axis = 1 + self.num_or_sections = [2, 4, 8] + + def setUp(self): + super().setUp() + self.attrs.update({"sections": [2, 2, 4, 2], "num": 0}) + + +class API_TestSplit(unittest.TestCase): + def test_out(self): + with fluid.program_guard(fluid.Program(), fluid.Program()): + data = fluid.layers.data('data', shape=[-1, 10], dtype='float32') + x0, x1 = paddle.split(data, num_or_sections=(3, 7), axis=1) + place = fluid.NPUPlace(0) + exe = fluid.Executor(place) + input1 = np.random.random([1, 10]).astype('float32') + r0, r1 = exe.run(feed={"data": input1}, fetch_list=[x0, x1]) + ex_x0, ex_x1 = np.split(input1, (3, ), axis=1) + self.assertTrue(np.allclose(ex_x0, r0)) + 
self.assertTrue(np.allclose(ex_x1, r1)) + + +class API_TestSplit2(unittest.TestCase): + def test_out(self): + with fluid.program_guard(fluid.Program(), fluid.Program()): + data = fluid.layers.data('data', shape=[-1, 10], dtype='float32') + x0, x1 = paddle.split(data, num_or_sections=2, axis=1) + place = fluid.NPUPlace(0) + exe = fluid.Executor(place) + input1 = np.random.random([1, 10]).astype('float32') + r0, r1 = exe.run(feed={"data": input1}, fetch_list=[x0, x1]) + ex_x0, ex_x1 = np.split(input1, 2, axis=1) + self.assertTrue(np.allclose(ex_x0, r0)) + self.assertTrue(np.allclose(ex_x1, r1)) + + +class API_TestDygraphSplit(unittest.TestCase): + def test_out1(self): + with fluid.dygraph.guard(paddle.NPUPlace(0)): + input_1 = np.random.random([4, 6, 6]).astype("int32") + # input is a variable which shape is [4, 6, 6] + input = fluid.dygraph.to_variable(input_1) + x0, x1, x2 = paddle.split(input, num_or_sections=3, axis=1) + x0_out = x0.numpy() + x1_out = x1.numpy() + x2_out = x2.numpy() + ex_x0, ex_x1, ex_x2 = np.split(input_1, 3, axis=1) + self.assertTrue(np.allclose(ex_x0, x0_out)) + self.assertTrue(np.allclose(ex_x1, x1_out)) + self.assertTrue(np.allclose(ex_x2, x2_out)) + + def test_out2(self): + with fluid.dygraph.guard(paddle.NPUPlace(0)): + input_1 = np.random.random([4, 6, 6]).astype("int32") + # input is a variable which shape is [4, 6, 6] + input = fluid.dygraph.to_variable(input_1) + x0, x1, x2 = paddle.split(input, num_or_sections=[1, 2, 3], axis=1) + x0_out = x0.numpy() + x1_out = x1.numpy() + x2_out = x2.numpy() + ex_x0, ex_x1, ex_x2 = np.split(input_1, (1, 3), axis=1) + self.assertTrue(np.allclose(ex_x0, x0_out)) + self.assertTrue(np.allclose(ex_x1, x1_out)) + self.assertTrue(np.allclose(ex_x2, x2_out)) + + +if __name__ == '__main__': + unittest.main() From 234c21ac8b0ddef8cc22b441b35660fffc01a7a7 Mon Sep 17 00:00:00 2001 From: pangyoki Date: Wed, 11 Aug 2021 19:02:11 +0800 Subject: [PATCH 012/126] [NPU] add while, read_from_array and write_to_array npu op (#34755) * add while read_from_array write_to_array npu op * optimize unittest --- paddle/fluid/operators/array_operator.h | 3 +- .../operators/controlflow/while_op_helper.cc | 11 +- paddle/fluid/operators/sum_op_npu.cc | 97 +++++++++---- .../tests/unittests/npu/test_while_op_npu.py | 130 ++++++++++++++++++ 4 files changed, 211 insertions(+), 30 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/npu/test_while_op_npu.py diff --git a/paddle/fluid/operators/array_operator.h b/paddle/fluid/operators/array_operator.h index 44063f233caf80..af44a77c8131db 100644 --- a/paddle/fluid/operators/array_operator.h +++ b/paddle/fluid/operators/array_operator.h @@ -47,7 +47,8 @@ class ArrayOp : public framework::OperatorBase { size_t offset; if (platform::is_gpu_place(i_tensor.place()) || - platform::is_xpu_place(i_tensor.place())) { + platform::is_xpu_place(i_tensor.place()) || + platform::is_npu_place(i_tensor.place())) { // FIXME: Avoid copy from GPU to CPU framework::Tensor t; framework::TensorCopy(i_tensor, platform::CPUPlace(), dev_ctx, &t); diff --git a/paddle/fluid/operators/controlflow/while_op_helper.cc b/paddle/fluid/operators/controlflow/while_op_helper.cc index 5c94c0827100b3..63b273fdbb8bdf 100644 --- a/paddle/fluid/operators/controlflow/while_op_helper.cc +++ b/paddle/fluid/operators/controlflow/while_op_helper.cc @@ -212,14 +212,17 @@ bool GetCondData(const framework::LoDTensor &cond) { if (platform::is_cpu_place(cond.place())) { return cond.data()[0]; } - // when platform::is_gpu_place(cond.place()) is 
true + // when platform::is_gpu_place(cond.place()) or + // platform::is_npu_place(cond.place()) is true std::unique_ptr cpu_cond{new framework::LoDTensor()}; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_ASCEND_CL) framework::TensorCopySync(cond, platform::CPUPlace(), cpu_cond.get()); #else PADDLE_THROW(platform::errors::PreconditionNotMet( - "This version of PaddlePaddle does NOT support GPU but got GPU tensor " - "Cond in WhileOp. Please compile WITH_GPU option.")); + "This version of PaddlePaddle does NOT support GPU/NPU but got GPU/NPU " + "tensor " + "Cond in WhileOp. Please compile WITH_GPU or WITH_ASCEND_CL option.")); #endif return cpu_cond->data()[0]; } diff --git a/paddle/fluid/operators/sum_op_npu.cc b/paddle/fluid/operators/sum_op_npu.cc index a6032236c01ac3..a4f75e369e392e 100644 --- a/paddle/fluid/operators/sum_op_npu.cc +++ b/paddle/fluid/operators/sum_op_npu.cc @@ -27,36 +27,83 @@ using Tensor = framework::Tensor; template class SumNPUKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto x = ctx.MultiInput("X"); - auto* out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); + void Compute(const framework::ExecutionContext &ctx) const override { + auto out_var = ctx.OutputVar("Out"); + if (out_var->IsType()) { + auto *out = out_var->GetMutable(); + auto x = ctx.MultiInput("X"); + out->mutable_data(ctx.GetPlace()); - auto place = ctx.GetPlace(); + auto place = ctx.GetPlace(); - int n = static_cast(x.size()); - if (n == 1) { - TensorCopy(*x[0], place, out); - return; - } + int n = static_cast(x.size()); + if (n == 1) { + TensorCopy(*x[0], place, out); + return; + } - std::vector inputs; - std::vector names; - for (int i = 0; i < n; ++i) { - if (x[i] && x[i]->numel() > 0) { - inputs.push_back(*x[i]); - names.push_back("x" + std::to_string(i)); - } else { - continue; + std::vector inputs; + std::vector names; + for (int i = 0; i < n; ++i) { + if (x[i] && x[i]->numel() > 0) { + inputs.push_back(*x[i]); + names.push_back("x" + std::to_string(i)); + } else { + continue; + } } - } - auto stream = - ctx.template device_context() - .stream(); - NpuOpRunner runner{"AddN", {inputs}, {*out}, {{"N", n}}}; - runner.AddInputNames(names); - runner.Run(stream); + auto stream = + ctx.template device_context() + .stream(); + NpuOpRunner runner{"AddN", {inputs}, {*out}, {{"N", n}}}; + runner.AddInputNames(names); + runner.Run(stream); + } else if (out_var->IsType()) { + auto in_vars = ctx.MultiInputVar("X"); + bool in_place = out_var == in_vars[0]; + auto &out_array = *out_var->GetMutable(); + for (size_t i = in_place ? 
1 : 0; i < in_vars.size(); ++i) { + PADDLE_ENFORCE_EQ(in_vars[i]->IsType(), true, + platform::errors::InvalidArgument( + "Only support all inputs are TensorArray, " + "but inputs[%d] is not TensorArray.", + i)); + auto &in_array = in_vars[i]->Get(); + + for (size_t i = 0; i < in_array.size(); ++i) { + if (in_array[i].IsInitialized() && (in_array[i].numel() != 0)) { + if (i >= out_array.size()) { + out_array.resize(i + 1); + } + if (!out_array[i].IsInitialized() || (out_array[i].numel() == 0)) { + framework::TensorCopy(in_array[i], in_array[i].place(), + ctx.device_context(), &out_array[i]); + out_array[i].set_lod(in_array[i].lod()); + } else { + PADDLE_ENFORCE_EQ( + out_array[i].lod(), in_array[i].lod(), + platform::errors::InvalidArgument( + "The lod message between inputs[%d] and" + " outputs[%d] must be same, but now is not same.", + i, i)); + auto stream = ctx.template device_context< + paddle::platform::NPUDeviceContext>() + .stream(); + NpuOpRunner runner{ + "Add", {out_array[i], in_array[i]}, {out_array[i]}, {}}; + runner.Run(stream); + } + } + } + } + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Expected type of Output(out) must be Tensor or " + "LoDTensorArray. But got " + "unsupport type: %s.", + framework::ToTypeName(out_var->Type()))); + } } }; diff --git a/python/paddle/fluid/tests/unittests/npu/test_while_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_while_op_npu.py new file mode 100644 index 00000000000000..a388761d5e3843 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_while_op_npu.py @@ -0,0 +1,130 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
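+
+# These tests build a small static program with two chained layers.While
+# loops that accumulate three input tensors through a LoDTensorArray,
+# run it on NPUPlace(0) (forward only, and with append_backward), and
+# compare the accumulated sum against numpy.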
+ +from __future__ import print_function + +import unittest +import paddle +import paddle.fluid.layers as layers +from paddle.fluid.executor import Executor +import paddle.fluid.core as core +import paddle.fluid as fluid +from paddle.fluid.backward import append_backward +import numpy +from paddle.fluid import compiler, Program, program_guard + +paddle.enable_static() + + +class TestWhileOp(unittest.TestCase): + def simple_net(self): + d0 = layers.data( + "d0", shape=[10], append_batch_size=False, dtype='float32') + d1 = layers.data( + "d1", shape=[10], append_batch_size=False, dtype='float32') + d2 = layers.data( + "d2", shape=[10], append_batch_size=False, dtype='float32') + # fill_constant npu op doesn't support int64 + i = layers.zeros(shape=[1], dtype='int32') + i = layers.cast(i, 'int64') + i.stop_gradient = True + init = layers.zeros(shape=[10], dtype='float32') + mem_array = layers.array_write(x=init, i=i) + data_array = layers.array_write(x=d0, i=i) + i = layers.increment(i) + layers.array_write(d1, i, array=data_array) + i = layers.increment(i) + layers.array_write(d2, i, array=data_array) + i = layers.zeros(shape=[1], dtype='int32') + i = layers.cast(i, 'int64') + i.stop_gradient = True + array_len = layers.fill_constant(shape=[1], dtype='int32', value=5) + array_len = layers.cast(array_len, 'int64') + array_len.stop_gradient = True + cond = layers.ones(shape=[1], dtype='int32') + cond = layers.cast(cond, 'bool') + j = layers.fill_constant(shape=[1], dtype='int32', value=1) + j = layers.cast(j, 'int64') + j.stop_gradient = True + array_len2 = layers.fill_constant(shape=[1], dtype='int32', value=3) + array_len2 = layers.cast(array_len2, 'int64') + array_len2.stop_gradient = True + cond2 = layers.logical_or(x=j, y=array_len2) + cond2 = layers.ones(shape=[1], dtype='int32') + cond2 = layers.cast(cond2, 'bool') + while_op = layers.While(cond=cond) + while_op2 = layers.While(cond=cond2) + with while_op.block(): + d = layers.array_read(array=data_array, i=i) + prev = layers.array_read(array=mem_array, i=i) + result = layers.sums(input=[d, prev]) + + i = layers.increment(x=i, in_place=True) + layers.array_write(result, i=i, array=mem_array) + layers.less_than(x=i, y=array_len, cond=cond) + + with while_op2.block(): + d2 = layers.array_read(array=data_array, i=j) + prev2 = layers.array_read(array=mem_array, i=j) + result2 = layers.sums(input=[d2, prev2]) + + j = layers.increment(x=j, in_place=True) + layers.array_write(result2, i=j, array=mem_array) + layers.less_than(x=j, y=array_len2, cond=cond2) + sum_result = layers.array_read(array=mem_array, i=j) + loss = layers.mean(sum_result) + return loss, sum_result + + def test_simple_net(self): + paddle.enable_static() + main_program = fluid.Program() + startup_program = fluid.Program() + with fluid.program_guard(main_program, startup_program): + loss, sum_result = self.simple_net() + + append_backward(loss) + + npu_place = paddle.NPUPlace(0) + exe = Executor(npu_place) + d = [] + + for i in range(3): + d.append(numpy.random.random(size=[10]).astype('float32')) + + outs = exe.run(feed={'d0': d[0], + 'd1': d[1], + 'd2': d[2]}, + fetch_list=[sum_result]) + self.assertAlmostEqual(numpy.sum(d), numpy.sum(outs[0]), delta=0.01) + + def test_simple_net_forward(self): + paddle.enable_static() + main_program = fluid.Program() + startup_program = fluid.Program() + with fluid.program_guard(main_program, startup_program): + self.simple_net() + + npu_place = paddle.NPUPlace(0) + exe = Executor(npu_place) + d = [] + + for i in range(3): + 
d.append(numpy.random.random(size=[10]).astype('float32')) + + for _ in range(2): + exe.run(main_program, feed={'d0': d[0], 'd1': d[1], 'd2': d[2]}) + + +if __name__ == '__main__': + unittest.main() From fc537d4f035053b01ea871bdb7586ac2e39cc95f Mon Sep 17 00:00:00 2001 From: Fan Zhang Date: Wed, 11 Aug 2021 19:29:07 +0800 Subject: [PATCH 013/126] [NPU] Support npu op flatten_contiguous_range_grad (#34798) --- paddle/fluid/operators/flatten_op_npu.cc | 33 +++++++++++++++++++ .../test_flatten_contiguous_range_op_npu.py | 18 ++++++++-- 2 files changed, 48 insertions(+), 3 deletions(-) mode change 100644 => 100755 python/paddle/fluid/tests/unittests/npu/test_flatten_contiguous_range_op_npu.py diff --git a/paddle/fluid/operators/flatten_op_npu.cc b/paddle/fluid/operators/flatten_op_npu.cc index 1569760fe3b96f..9252716f3acfc1 100644 --- a/paddle/fluid/operators/flatten_op_npu.cc +++ b/paddle/fluid/operators/flatten_op_npu.cc @@ -78,6 +78,25 @@ class FlattenContiguousRangeNPUKernel : public framework::OpKernel { } }; +template +class FlattenContiguousRangeGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto *d_x = ctx.Output(framework::GradVarName("X")); + auto *d_out = + ctx.Input(framework::GradVarName("Out")); + + auto xshape_dims = ctx.Input("XShape")->dims(); + auto x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size()); + + d_x->mutable_data(ctx.GetPlace(), d_out->type()); + framework::TensorCopy( + *d_out, ctx.GetPlace(), + ctx.template device_context(), d_x); + d_x->Resize(x_dims); + } +}; + } // namespace operators } // namespace paddle @@ -110,3 +129,17 @@ REGISTER_OP_NPU_KERNEL( int8_t>, ops::FlattenContiguousRangeNPUKernel); +REGISTER_OP_NPU_KERNEL( + flatten_contiguous_range_grad, + ops::FlattenContiguousRangeGradNPUKernel, + ops::FlattenContiguousRangeGradNPUKernel, + ops::FlattenContiguousRangeGradNPUKernel, + ops::FlattenContiguousRangeGradNPUKernel, + ops::FlattenContiguousRangeGradNPUKernel, + ops::FlattenContiguousRangeGradNPUKernel); diff --git a/python/paddle/fluid/tests/unittests/npu/test_flatten_contiguous_range_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_flatten_contiguous_range_op_npu.py old mode 100644 new mode 100755 index 88e711dcf068e6..742d156c7f5f1b --- a/python/paddle/fluid/tests/unittests/npu/test_flatten_contiguous_range_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_flatten_contiguous_range_op_npu.py @@ -49,7 +49,7 @@ def test_check_output(self): self.check_output_with_place(self.place, no_check_set=["XShape"]) def test_check_grad(self): - pass + self.check_grad_with_place(self.place, ["X"], "Out") def init_test_case(self): self.in_shape = (3, 2, 5, 4) @@ -163,13 +163,13 @@ def init_attrs(self): } -class TestFlattenOp_int(TestFlattenOp): +class TestFlattenOp_int32(TestFlattenOp): def init_test_case(self): self.in_shape = (3, 2, 5, 4) self.start_axis = 0 self.stop_axis = 1 self.new_shape = (6, 5, 4) - self.dtype = np.int + self.dtype = np.int32 def init_attrs(self): self.attrs = { @@ -177,6 +177,9 @@ def init_attrs(self): "stop_axis": self.stop_axis } + def test_check_grad(self): + pass + class TestFlattenOp_uint8(TestFlattenOp): def init_test_case(self): @@ -192,6 +195,9 @@ def init_attrs(self): "stop_axis": self.stop_axis } + def test_check_grad(self): + pass + class TestFlattenOp_int8(TestFlattenOp): def init_test_case(self): @@ -207,6 +213,9 @@ def init_attrs(self): "stop_axis": self.stop_axis } + def test_check_grad(self): + pass + class 
TestFlattenOp_int64(TestFlattenOp): def init_test_case(self): @@ -222,6 +231,9 @@ def init_attrs(self): "stop_axis": self.stop_axis } + def test_check_grad(self): + pass + class TestFlatten2OpError(unittest.TestCase): def test_errors(self): From 3429c04b1c6ef4ebd6426f87ff946d0d6087c1ab Mon Sep 17 00:00:00 2001 From: Wangzheee <634486483@qq.com> Date: Wed, 11 Aug 2021 19:48:30 +0800 Subject: [PATCH 014/126] [Paddle TRT]fix_fc_int8_convert; fix_reshape_convert (#34787) * fix_fc_reshape_convert * fix --- .../ir/quant_conv2d_dequant_fuse_pass.cc | 37 ++++++ .../fluid/inference/tensorrt/convert/fc_op.cc | 108 ++++++++++-------- paddle/fluid/inference/tensorrt/op_teller.cc | 13 ++- 3 files changed, 106 insertions(+), 52 deletions(-) diff --git a/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc b/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc index b48c8c6e70a939..354db8acf87a73 100644 --- a/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc +++ b/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc @@ -153,6 +153,43 @@ QuantDequantFusePass::QuantDequantFusePass() { .AddAttr("data_format") .IsStringIn({"NCHW", "NHWC", "AnyLayout"}) .End(); + AddOpCompat(OpCompat("depthwise_conv2d")) + .AddInput("Input") + .IsTensor() + .End() + .AddInput("Filter") + .IsTensor() + .End() + .AddInput("Bias") + .IsTensor() + .IsOptional() + .End() + .AddInput("ResidualData") + .IsTensor() + .IsOptional() + .End() + .AddOutput("Output") + .IsTensor() + .End() + .AddAttr("strides") + .IsType>() + .End() + .AddAttr("paddings") + .IsType>() + .End() + .AddAttr("padding_algorithm") + .IsOptional() + .IsStringIn({"EXPLICIT", "SAME", "VALID"}) + .End() + .AddAttr("groups") + .IsNumGE(1) + .End() + .AddAttr("dilations") + .IsType>() + .End() + .AddAttr("data_format") + .IsStringIn({"NCHW", "NHWC", "AnyLayout"}) + .End(); AddOpCompat(OpCompat("mul")) .AddInput("X") .IsTensor() diff --git a/paddle/fluid/inference/tensorrt/convert/fc_op.cc b/paddle/fluid/inference/tensorrt/convert/fc_op.cc index 74bb854e55f823..ef50f3db42c6f5 100644 --- a/paddle/fluid/inference/tensorrt/convert/fc_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/fc_op.cc @@ -33,6 +33,53 @@ namespace tensorrt { */ class FcOpConverter : public OpConverter { public: + nvinfer1::ILayer* reshape_before_fc(nvinfer1::ITensor* before_fc, + nvinfer1::Dims x_dim, + int x_num_col_dims) { + // add shuffle before fc + nvinfer1::Dims reshape_before_fc_dim; + reshape_before_fc_dim.nbDims = x_num_col_dims + 3; + // padding shape "* x q x 1 x 1" + for (int i = 0; i < reshape_before_fc_dim.nbDims; i++) { + reshape_before_fc_dim.d[i] = 1; + } + for (int i = 0; i < x_dim.nbDims; i++) { + if (i < x_num_col_dims) { + reshape_before_fc_dim.d[i] = 0; + } else { + if (x_dim.d[i] < 0) { + reshape_before_fc_dim.d[x_num_col_dims] = -1; + break; + } + reshape_before_fc_dim.d[x_num_col_dims] *= x_dim.d[i]; + } + } + auto* reshape_before_fc_layer = + TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *before_fc); + reshape_before_fc_layer->setReshapeDimensions(reshape_before_fc_dim); + return reshape_before_fc_layer; + } + + nvinfer1::ILayer* reshape_after_fc(nvinfer1::ITensor* after_fc, + nvinfer1::Dims x_dim, int x_num_col_dims) { + // add shuffle after fc + nvinfer1::Dims reshape_after_fc_dim; + if (engine_->use_oss() && engine_->with_ernie() && x_dim.nbDims == 4 && + x_dim.d[2] == 1 && x_dim.d[3] == 1 && x_num_col_dims == 1) { + // If use tensorrt'oss, the x_dim and x_num_col_dims need change + reshape_after_fc_dim.nbDims = 4; + } else { + 
reshape_after_fc_dim.nbDims = x_num_col_dims + 1; + } + for (int i = 0; i < reshape_after_fc_dim.nbDims; i++) { + reshape_after_fc_dim.d[i] = 0; + } + auto* reshape_after_fc_layer = + TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *after_fc); + reshape_after_fc_layer->setReshapeDimensions(reshape_after_fc_dim); + return reshape_after_fc_layer; + } + void operator()(const framework::proto::OpDesc& op, const framework::Scope& scope, bool test_mode) override { VLOG(3) << "convert a fluid fc op to tensorrt fc layer without bias"; @@ -92,10 +139,8 @@ class FcOpConverter : public OpConverter { "it's %d-dimensional.", Y_t->dims().size())); // a matrix size_t n_output = Y_t->dims()[1]; - int m = Y_t->dims()[0]; int n = Y_t->dims()[1]; - auto tranpose_weight = [](const float* src, float* dst, int m, int n) { for (int i = 0; i < m; i++) { for (int j = 0; j < n; j++) { @@ -119,47 +164,35 @@ class FcOpConverter : public OpConverter { auto* fc_layer_int8 = TRT_ENGINE_ADD_LAYER(engine_, Convolution, *inputs, n_output, nv_ksize, weight.get(), bias.get()); - engine_->SetTensorDynamicRange(fc_layer_int8->getOutput(0), out_scale); + auto* fc_after_reshape_int8 = reshape_after_fc( + fc_layer_int8->getOutput(0), x_dim, x_num_col_dims); + engine_->SetTensorDynamicRange(fc_after_reshape_int8->getOutput(0), + out_scale); if (activation_type == "relu") { nvinfer1::IActivationLayer* relu_layer_int8 = TRT_ENGINE_ADD_LAYER( - engine_, Activation, *(fc_layer_int8->getOutput(0)), + engine_, Activation, *(fc_after_reshape_int8->getOutput(0)), nvinfer1::ActivationType::kRELU); RreplenishLayerAndOutput(relu_layer_int8, "relu_after_fc_shuffle", {output_name}, test_mode); } else { - RreplenishLayerAndOutput(fc_layer_int8, "shuffle_after_fc", + RreplenishLayerAndOutput(fc_after_reshape_int8, "shuffle_after_fc", {output_name}, test_mode); } } else { // add fc layer - auto* fc_layer_before = + auto* fc_layer_float = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *inputs, n_output, weight.get(), bias.get()); - fc_layer_before->setName( - ("fc_layer_before(Output: " + output_name + ")").c_str()); - // add shuffle after fc - nvinfer1::Dims reshape_after_fc_dim; - if (engine_->use_oss() && engine_->with_ernie() && x_dim.nbDims == 4 && - x_dim.d[2] == 1 && x_dim.d[3] == 1 && x_num_col_dims == 1) { - // If use tensorrt'oss, the x_dim and x_num_col_dims need change - reshape_after_fc_dim.nbDims = 4; - } else { - reshape_after_fc_dim.nbDims = x_num_col_dims + 1; - } - for (int i = 0; i < reshape_after_fc_dim.nbDims; i++) { - reshape_after_fc_dim.d[i] = 0; - } - auto* fc_layer_float = TRT_ENGINE_ADD_LAYER( - engine_, Shuffle, *fc_layer_before->getOutput(0)); - fc_layer_float->setReshapeDimensions(reshape_after_fc_dim); + auto* fc_after_reshape_float = reshape_after_fc( + fc_layer_float->getOutput(0), x_dim, x_num_col_dims); if (activation_type == "relu") { nvinfer1::IActivationLayer* relu_layer_float = TRT_ENGINE_ADD_LAYER( - engine_, Activation, *(fc_layer_float->getOutput(0)), + engine_, Activation, *(fc_after_reshape_float->getOutput(0)), nvinfer1::ActivationType::kRELU); RreplenishLayerAndOutput(relu_layer_float, "relu_after_fc_shuffle", {output_name}, test_mode); } else { - RreplenishLayerAndOutput(fc_layer_float, "shuffle_after_fc", + RreplenishLayerAndOutput(fc_after_reshape_float, "shuffle_after_fc", {output_name}, test_mode); } } @@ -169,12 +202,10 @@ class FcOpConverter : public OpConverter { weight_data_tmp.reserve(Y_t->numel()); memcpy(weight_data_tmp.data(), weight_data, Y_t->numel() * sizeof(float)); 
tranpose_weight(weight_data_tmp.data(), weight_data, m, n); - TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT, static_cast(weight_data), static_cast(Y_t->numel())}; weight.dims.assign({n, m}); - float* bias_data = nullptr; int bias_num = 0; if (with_bias) { @@ -204,28 +235,7 @@ class FcOpConverter : public OpConverter { "converter expects x_dim.nbDims > x_num_col_dims, but " "x_dim.nbDims : %d, x_num_col_dims : %d.", x_dim.nbDims, x_num_col_dims)); - // add shuffle before fc - nvinfer1::Dims reshape_before_fc_dim; - reshape_before_fc_dim.nbDims = x_num_col_dims + 3; - // padding shape "* x q x 1 x 1" - for (int i = 0; i < reshape_before_fc_dim.nbDims; i++) { - reshape_before_fc_dim.d[i] = 1; - } - for (int i = 0; i < x_dim.nbDims; i++) { - if (i < x_num_col_dims) { - reshape_before_fc_dim.d[i] = 0; - } else { - if (x_dim.d[i] < 0) { - reshape_before_fc_dim.d[x_num_col_dims] = -1; - break; - } - reshape_before_fc_dim.d[x_num_col_dims] *= x_dim.d[i]; - } - } - auto* reshape_before_fc_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *X); - reshape_before_fc_layer->setReshapeDimensions(reshape_before_fc_dim); - reshape_before_fc_layer->setName( - ("shuffle_before_fc(Output: " + output_name + ")").c_str()); + auto* reshape_before_fc_layer = reshape_before_fc(X, x_dim, x_num_col_dims); auto* reshape_itensor = reshape_before_fc_layer->getOutput(0); if (enable_int8) { engine_->SetTensorDynamicRange(reshape_itensor, in_scale); diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index bfe3dfc85eecdd..72f20790f35242 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -698,15 +698,22 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, return false; } } + if (op_type == "reshape" || op_type == "reshape2") { if (!desc.HasAttr("shape")) { return false; } // Paddle-TRT does not support the input tensors: Shape and ShapeTensor auto reshape_inputs = desc.Inputs(); - if (reshape_inputs.find("Shape") != reshape_inputs.end() || - reshape_inputs.find("ShapeTensor") != reshape_inputs.end()) { - return false; + if (reshape_inputs.find("Shape") != reshape_inputs.end()) { + if (desc.Input("Shape").size() >= 1) { + return false; + } + } + if (reshape_inputs.find("ShapeTensor") != reshape_inputs.end()) { + if (desc.Input("ShapeTensor").size() >= 1) { + return false; + } } std::vector shape = BOOST_GET_CONST(std::vector, desc.GetAttr("shape")); From 9d02313c31bfdf7c68647320bfa279e4be994d21 Mon Sep 17 00:00:00 2001 From: WeiXin Date: Wed, 11 Aug 2021 20:15:11 +0800 Subject: [PATCH 015/126] `set_value_grad` propagate gradients to `Input` and `TensorValue` (#34304) * add set_value_grad op * add unittest. * polish unittest. * polish code. * support cuda kernel * polish code according to CI * polish code. * polish code * remove *.pyc * polish code. * add unittest to improve coverage. * polish code. 
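
Illustrative example (a minimal dygraph sketch of the intended behaviour,
assuming the public paddle 2.x tensor API; the tensor names below are made
up for illustration):

    import paddle

    x = paddle.ones([4, 4])
    x.stop_gradient = False
    v = paddle.full([2, 4], 2.0)
    v.stop_gradient = False

    y = x * 1          # work on a non-leaf copy so x keeps its own grad
    y[1:3] = v         # dispatched to set_value / set_value_grad
    y.sum().backward()

    # Expected with this patch: x.grad is 1.0 outside rows 1:3 and 0.0
    # inside them, while v.grad is all ones (the corresponding slice of
    # the incoming gradient).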
--- paddle/fluid/operators/set_value_op.cc | 93 ++++-- paddle/fluid/operators/set_value_op.cu | 8 + paddle/fluid/operators/set_value_op.h | 268 +++++++++++++++ .../tests/unittests/test_set_value_op.py | 314 +++++++++++++++++- 4 files changed, 656 insertions(+), 27 deletions(-) diff --git a/paddle/fluid/operators/set_value_op.cc b/paddle/fluid/operators/set_value_op.cc index 9a6c43dee6d9d1..41e6d2d40061e8 100644 --- a/paddle/fluid/operators/set_value_op.cc +++ b/paddle/fluid/operators/set_value_op.cc @@ -157,39 +157,26 @@ class SetValueGradMaker : public framework::SingleGradOpMaker { protected: void Apply(GradOpPtr op) const override { if (this->HasInput("ValueTensor")) { - op->SetType("slice"); - op->SetInput("Input", this->OutputGrad("Out")); + op->SetType("set_value_grad"); + + op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + op->SetInput("ValueTensor", this->Input("ValueTensor")); if (this->HasInput("StartsTensorList")) { op->SetInput("StartsTensorList", this->Input("StartsTensorList")); } if (this->HasInput("EndsTensorList")) { op->SetInput("EndsTensorList", this->Input("EndsTensorList")); } + if (this->HasInput("StepsTensorList")) { + op->SetInput("StepsTensorList", this->Input("StepsTensorList")); + } + + op->SetAttrMap(this->Attrs()); + + op->SetOutput(framework::GradVarName("ValueTensor"), + this->InputGrad("ValueTensor")); + op->SetOutput(framework::GradVarName("Input"), this->InputGrad("Input")); - // convert std::vector to std::vector - std::vector axes_int64 = static_cast>( - BOOST_GET_CONST(std::vector, this->GetAttr("axes"))); - std::vector starts_int64 = static_cast>( - BOOST_GET_CONST(std::vector, this->GetAttr("starts"))); - std::vector ends_int64 = static_cast>( - BOOST_GET_CONST(std::vector, this->GetAttr("ends"))); - std::vector decrease_axes_int64 = - static_cast>(BOOST_GET_CONST( - std::vector, this->GetAttr("decrease_axes"))); - - std::vector axes(axes_int64.begin(), axes_int64.end()); - std::vector starts(starts_int64.begin(), starts_int64.end()); - std::vector ends(ends_int64.begin(), ends_int64.end()); - std::vector decrease_axes(decrease_axes_int64.begin(), - decrease_axes_int64.end()); - - op->SetAttr("axes", axes); - op->SetAttr("starts", starts); - op->SetAttr("ends", ends); - op->SetAttr("decrease_axis", decrease_axes); - op->SetAttr("infer_flags", std::vector({})); - - op->SetOutput("Out", this->InputGrad("ValueTensor")); } else { op->SetType("assign"); op->SetInput("X", this->OutputGrad("Out")); @@ -198,6 +185,50 @@ class SetValueGradMaker : public framework::SingleGradOpMaker { } }; +class SetValueGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), "Input", + framework::GradVarName("Out"), "set_value_grad"); + + auto in_dims = ctx->GetInputDim(framework::GradVarName("Out")); + PADDLE_ENFORCE_LT( + in_dims.size(), 7, + platform::errors::InvalidArgument( + "The dimension of set_value_grad operator's input should be less " + "than 7, but received dimension is %d.", + in_dims.size())); + + if (ctx->HasOutput(framework::GradVarName("ValueTensor"))) { + ctx->ShareDim("ValueTensor", + /*->*/ framework::GradVarName("ValueTensor")); + ctx->ShareLoD("ValueTensor", + /*->*/ framework::GradVarName("ValueTensor")); + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + auto 
in_tensor = ctx.Input(framework::GradVarName("Out")); + return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType( + ctx, framework::GradVarName("Out")), + in_tensor->place()); + } + framework::OpKernelType GetKernelTypeForVar( + const std::string &var_name, const Tensor &tensor, + const framework::OpKernelType &expected_kernel_type) const override { + if (var_name == "StartsTensorList" || var_name == "EndsTensorList" || + var_name == "StepsTensorList") { + return expected_kernel_type; + } + return framework::OpKernelType(expected_kernel_type.data_type_, + tensor.place(), tensor.layout()); + } +}; + DECLARE_INPLACE_OP_INFERER(SetValueOpInplaceInferer, {"Input", "Out"}); } // namespace operators @@ -218,6 +249,16 @@ REGISTER_OP_CPU_KERNEL( ops::SetValueKernel, ops::SetValueKernel); +REGISTER_OPERATOR(set_value_grad, ops::SetValueGrad); + +REGISTER_OP_CPU_KERNEL( + set_value_grad, + ops::SetValueGradKernel, + ops::SetValueGradKernel, + ops::SetValueGradKernel, + ops::SetValueGradKernel, + ops::SetValueGradKernel); + REGISTER_OP_VERSION(set_value) .AddCheckpoint( R"ROC( diff --git a/paddle/fluid/operators/set_value_op.cu b/paddle/fluid/operators/set_value_op.cu index b65e1691b99c5d..f9701b0acaac76 100644 --- a/paddle/fluid/operators/set_value_op.cu +++ b/paddle/fluid/operators/set_value_op.cu @@ -22,3 +22,11 @@ REGISTER_OP_CUDA_KERNEL( ops::SetValueKernel, ops::SetValueKernel, ops::SetValueKernel); + +REGISTER_OP_CUDA_KERNEL( + set_value_grad, + ops::SetValueGradKernel, + ops::SetValueGradKernel, + ops::SetValueGradKernel, + ops::SetValueGradKernel, + ops::SetValueGradKernel); diff --git a/paddle/fluid/operators/set_value_op.h b/paddle/fluid/operators/set_value_op.h index eed8a9c9b22bc8..72b94dfa772792 100644 --- a/paddle/fluid/operators/set_value_op.h +++ b/paddle/fluid/operators/set_value_op.h @@ -22,8 +22,10 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/assign_value_op.h" +#include "paddle/fluid/operators/eigen/eigen_function.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/operators/slice_utils.h" +#include "paddle/fluid/operators/strided_slice_op.h" #include "paddle/fluid/operators/utils.h" #include "paddle/fluid/platform/enforce.h" @@ -31,6 +33,24 @@ namespace paddle { namespace operators { using Tensor = framework::Tensor; +using DDim = framework::DDim; + +inline void GetOffsets(const DDim& big_dim, const DDim& small_dim, + DDim start_offset, int cur_dim, + std::vector* offsets) { + if (cur_dim == big_dim.size()) { + offsets->push_back(start_offset); + return; + } + if (small_dim[cur_dim] == big_dim[cur_dim]) { + GetOffsets(big_dim, small_dim, start_offset, cur_dim + 1, offsets); + } else { + for (int i = 0; i < big_dim[cur_dim]; i++) { + GetOffsets(big_dim, small_dim, start_offset, cur_dim + 1, offsets); + start_offset[cur_dim] += 1; + } + } +} inline std::string GetValueName(framework::proto::VarType::Type data_type) { std::string value_name; @@ -292,5 +312,253 @@ class SetValueKernel : public framework::OpKernel { } }; +template +class SetValueGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + int rank = ctx.Input(framework::GradVarName("Out"))->dims().size(); + + switch (rank) { + case 1: + SetValueGradCompute<1>(ctx); + break; + case 2: + SetValueGradCompute<2>(ctx); + break; + case 3: + SetValueGradCompute<3>(ctx); + break; + case 4: + 
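The GetOffsets helper above enumerates every offset at which the smaller (broadcast) value shape sits inside the sliced-out gradient region, so the kernel can accumulate one gradient tile per offset. A minimal Python rendering of the same recursion, for illustration only:

.. code-block:: python

    def get_offsets(big_dim, small_dim, start, cur, offsets):
        # Walk the dimensions; where the shapes differ, fan out one offset
        # per position along the big dimension (a broadcast dimension).
        if cur == len(big_dim):
            offsets.append(tuple(start))
            return
        if small_dim[cur] == big_dim[cur]:
            get_offsets(big_dim, small_dim, list(start), cur + 1, offsets)
        else:
            start = list(start)
            for _ in range(big_dim[cur]):
                get_offsets(big_dim, small_dim, list(start), cur + 1, offsets)
                start[cur] += 1

    offsets = []
    get_offsets([2, 3], [1, 3], [0, 0], 0, offsets)
    # offsets == [(0, 0), (1, 0)]: the value tile is accumulated at both rows
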
SetValueGradCompute<4>(ctx); + break; + case 5: + SetValueGradCompute<5>(ctx); + break; + case 6: + SetValueGradCompute<6>(ctx); + break; + default: + PADDLE_THROW(platform::errors::InvalidArgument( + "The rank of set_value_grad's input should be less than 7, but " + "received %d.", + rank)); + } + } + + private: + template + void SetValueGradCompute(const framework::ExecutionContext& context) const { + auto starts = context.Attr>("starts"); + auto ends = context.Attr>("ends"); + auto steps = context.Attr>("steps"); + + auto axes_int64 = context.Attr>("axes"); + std::vector axes(axes_int64.begin(), axes_int64.end()); + + auto starts_indices = Eigen::DSizes(); + auto ends_indices = Eigen::DSizes(); + auto steps_indices = Eigen::DSizes(); + auto reverse_axis = Eigen::array(); + + auto list_new_ends_tensor = + context.MultiInput("EndsTensorList"); + auto list_new_starts_tensor = + context.MultiInput("StartsTensorList"); + auto list_new_steps_tensor = + context.MultiInput("StepsTensorList"); + + if (list_new_starts_tensor.size() > 0) { + starts = GetDataFromTensorList(list_new_starts_tensor); + } + + if (list_new_ends_tensor.size() > 0) { + ends = GetDataFromTensorList(list_new_ends_tensor); + } + + if (list_new_steps_tensor.size() > 0) { + steps = GetDataFromTensorList(list_new_steps_tensor); + } + + auto in = context.Input(framework::GradVarName("Out")); + PADDLE_ENFORCE_EQ( + in->IsInitialized(), true, + platform::errors::PermissionDenied( + "The input of `set_value_grad`(%s) has not been initialized", + framework::GradVarName("Out"))); + auto grad_value = context.Output( + framework::GradVarName("ValueTensor")); + auto grad_input = + context.Output(framework::GradVarName("Input")); + auto in_dims = in->dims(); + + auto decrease_axis_int64 = + context.Attr>("decrease_axes"); + std::vector decrease_axis(decrease_axis_int64.begin(), + decrease_axis_int64.end()); + std::vector infer_flags(axes.size(), 1); + std::vector out_dims_vector(in_dims.size(), -1); + StridedSliceOutDims(starts, ends, steps, axes, infer_flags, in_dims, + decrease_axis, out_dims_vector.data(), axes.size(), + false); + + framework::DDim out_dims(framework::make_ddim(out_dims_vector)); + + std::vector reverse_vector(starts.size(), 0); + StridedSliceFunctor(starts.data(), ends.data(), steps.data(), axes.data(), + reverse_vector.data(), in_dims, infer_flags, + decrease_axis, starts.size()); + + for (size_t axis = 0; axis < D; axis++) { + starts_indices[axis] = 0; + ends_indices[axis] = out_dims[axis]; + steps_indices[axis] = 1; + reverse_axis[axis] = false; + } + + for (size_t axis = 0; axis < axes.size(); axis++) { + int axis_index = axes[axis]; + starts_indices[axis_index] = starts[axis]; + ends_indices[axis_index] = ends[axis]; + steps_indices[axis_index] = steps[axis]; + reverse_axis[axis_index] = (reverse_vector[axis] == 1) ? 
true : false; + } + + bool need_reverse = false; + for (size_t axis = 0; axis < axes.size(); axis++) { + if (reverse_vector[axis] == 1) { + need_reverse = true; + break; + } + } + + auto& dev_ctx = context.template device_context(); + auto& place = + *context.template device_context().eigen_device(); + math::SetConstant set_zero; + + if (grad_input) { + // Set gradient of `Input` + TensorCopy(*in, context.GetPlace(), grad_input); + + auto grad_input_t = + framework::EigenTensor::From(*grad_input); + + framework::Tensor tmp(grad_input->type()); + tmp.mutable_data(out_dims, context.GetPlace()); + set_zero(dev_ctx, &tmp, static_cast(0)); + auto tmp_t = framework::EigenTensor::From(tmp); + + grad_input_t.stridedSlice(starts_indices, ends_indices, steps_indices) + .device(place) = tmp_t; + } + if (grad_value) { + grad_value->mutable_data(context.GetPlace()); + set_zero(dev_ctx, grad_value, static_cast(0)); + + auto in_t = framework::EigenTensor::From(*in); + + if (grad_value->dims() == out_dims) { + auto grad_value_t = + framework::EigenTensor::From(*grad_value); + if (need_reverse) { + framework::Tensor tmp(grad_value->type()); + tmp.mutable_data(out_dims, context.GetPlace()); + set_zero(dev_ctx, &tmp, static_cast(0)); + auto tmp_t = framework::EigenTensor::From(tmp); + + tmp_t.device(place) = + in_t.stridedSlice(starts_indices, ends_indices, steps_indices); + grad_value_t.device(place) = tmp_t.reverse(reverse_axis); + } else { + grad_value_t.device(place) = + in_t.stridedSlice(starts_indices, ends_indices, steps_indices); + } + } else { + int out_dims_size = out_dims.size(); + auto grad_value_dims = grad_value->dims(); + auto fake_grad_value_dims = out_dims; + + // Create an extented shape according to the rules of broadcast. + auto grad_value_dims_size = grad_value_dims.size(); + + int num_decrease = 0; + + int decrease_axis_size = decrease_axis.size(); + for (int i = 0; i < out_dims_size; i++) { + if (decrease_axis.end() != + std::find(decrease_axis.begin(), decrease_axis.end(), i)) { + fake_grad_value_dims[i] = 1; + num_decrease++; + } else if (i < out_dims_size - (grad_value_dims_size + + decrease_axis_size - num_decrease)) { + fake_grad_value_dims[i] = 1; + } else { + auto index_grad = + i - (out_dims_size - (grad_value_dims_size + + decrease_axis_size - num_decrease)); + fake_grad_value_dims[i] = grad_value_dims[index_grad]; + + PADDLE_ENFORCE_EQ((out_dims[i] == grad_value_dims[index_grad]) || + (grad_value_dims[index_grad] == 1), + true, + platform::errors::InvalidArgument( + "An error occurred while calculating %s: " + "[%s] can not be accumulated into [%s].", + framework::GradVarName("ValueTensor"), + out_dims, grad_value_dims)); + } + } + + VLOG(3) << "Dimensions of " << framework::GradVarName("ValueTensor") + << "([" << grad_value_dims << "])is broadcasted into [" + << fake_grad_value_dims << "]."; + + auto extent = Eigen::DSizes(); + auto offset = out_dims; + for (int i = 0; i < out_dims_size; i++) { + offset[i] = 0; + extent[i] = fake_grad_value_dims[i]; + } + std::vector offsets; + GetOffsets(out_dims, fake_grad_value_dims, offset, 0, &offsets); + + auto grad_value_t = + framework::EigenTensor:: + From(*grad_value, fake_grad_value_dims); + + framework::Tensor tmp(grad_value->type()); + tmp.mutable_data(out_dims, context.GetPlace()); + set_zero(dev_ctx, &tmp, static_cast(0)); + auto tmp_t = framework::EigenTensor::From(tmp); + + tmp_t.device(place) = + in_t.stridedSlice(starts_indices, ends_indices, steps_indices); + + // accumulate gradient + for (auto offset : offsets) { + 
grad_value_t.device(place) = + grad_value_t + + tmp_t.slice(framework::EigenDim::From(offset), extent); + } + if (need_reverse) { + framework::Tensor tmp_value(grad_value->type()); + tmp_value.mutable_data(fake_grad_value_dims, context.GetPlace()); + auto tmp_value_t = + framework::EigenTensor::From(tmp_value); + tmp_value_t.device(place) = grad_value_t.reverse(reverse_axis); + grad_value_t.device(place) = tmp_value_t; + } + } + } + } +}; + } // namespace operators } // namespace paddle diff --git a/python/paddle/fluid/tests/unittests/test_set_value_op.py b/python/paddle/fluid/tests/unittests/test_set_value_op.py index 6f2f669913eb6a..d26055b3166d6e 100644 --- a/python/paddle/fluid/tests/unittests/test_set_value_op.py +++ b/python/paddle/fluid/tests/unittests/test_set_value_op.py @@ -20,6 +20,8 @@ import numpy as np import paddle +from paddle.fluid.layer_helper import LayerHelper +from functools import reduce class TestSetValueBase(unittest.TestCase): @@ -915,7 +917,317 @@ def test_dynamic(self): loss.backward() self.assertTrue(var.grad.shape == x.grad[0, :, 0, 0].shape) - self.assertTrue((var.grad == x.grad[0, :, 0, 0]).all()) + # + self.assertTrue((0 == x.grad[0, :, 0, 0]).all()) + + +class TestGradientTruncated(unittest.TestCase): + def test_consistent_with_competitor(self): + paddle.disable_static() + + def set_value(t, value): + a = t * t + a[0, 1] = value + y = a * a + return y.sum() + + # case 1 + array = np.arange( + 1, 1 + 2 * 3 * 4, dtype="float32").reshape([1, 2, 1, 3, 1, 4]) + value = np.arange(100, 104, dtype="float32").reshape(1, 4) + + inps = paddle.to_tensor(array, stop_gradient=False) + value = paddle.to_tensor(value, stop_gradient=False) + + loss = set_value(inps, value) + loss.backward() + + value_grad = np.array([[600., 606., 612., 618.]]) + input_grad = np.array( + [[[[[[4., 32., 108., 256.]], [[500., 864., 1372., 2048.]], + [[2916., 4000., 5324., 6912.]]]], + [[[[0., 0., 0., 0.]], [[0., 0., 0., 0.]], [[0., 0., 0., 0.]]]]]]) + self.assertTrue( + np.array_equal(inps.grad.numpy(), input_grad), + msg="The gradient of value should be \n{},\n but reveived {}". + format(input_grad, inps.grad.numpy())) + self.assertTrue( + np.array_equal(value.grad.numpy(), value_grad), + msg="The gradient of input should be \n{},\n but reveived {}". + format(value_grad, value.grad.numpy())) + + # case 2 + array = np.arange(1, 2 * 3 * 4 + 1, dtype="float32").reshape([4, 2, 3]) + value = np.arange(100, 100 + 1, dtype="float32") + + inps2 = paddle.to_tensor(array, stop_gradient=False) + value2 = paddle.to_tensor(value, stop_gradient=False) + + loss = set_value(inps2, value2) + loss.backward() + + value_grad2 = np.array([600.]) + input_grad2 = np.array( + [[[4., 32., 108.], [0., 0., 0.]], [[1372., 2048., 2916.], + [4000., 5324., 6912.]], + [[8788., 10976., 13500.], [16384., 19652., 23328.]], + [[27436., 32000., 37044.], [42592., 48668., 55296.]]]) + self.assertTrue( + np.array_equal(inps2.grad.numpy(), input_grad2), + msg="The gradient of value should be \n{},\n but reveived {}". + format(input_grad, inps2.grad.numpy())) + self.assertTrue( + np.array_equal(value2.grad.numpy(), value_grad2), + msg="The gradient of input should be \n{},\n but reveived {}". 
+ format(value_grad, value2.grad.numpy())) + + # case 3 + def set_value3(t, value): + a = t * t + a[0, :, 0, :] = value + y = a * a + return y.sum() + + array = np.arange( + 1, 1 + 2 * 3 * 4, dtype="float32").reshape([4, 3, 1, 1, 2, 1]) + value = np.arange(100, 100 + 2, dtype="float32").reshape(1, 2, 1) + + inps = paddle.to_tensor(array, stop_gradient=False) + value = paddle.to_tensor(value, stop_gradient=False) + + loss = set_value3(inps, value) + loss.backward() + + value_grad = np.array([[[600.], [606.]]]) + input_grad = np.array( + [[[[[[0.], [0.]]]], [[[[0.], [0.]]]], [[[[0.], [0.]]]]], + [[[[[1372.], [2048.]]]], [[[[2916.], [4000.]]]], + [[[[5324.], [6912.]]]]], [[[[[8788.], [10976.]]]], [[[[13500.], + [16384.]]]], + [[[[19652.], [23328.]]]]], + [[[[[27436.], [32000.]]]], [[[[37044.], [42592.]]]], + [[[[48668.], [55296.]]]]]]) + self.assertTrue( + np.array_equal(inps.grad.numpy(), input_grad), + msg="The gradient of value should be \n{},\n but reveived {}". + format(input_grad, inps.grad.numpy())) + self.assertTrue( + np.array_equal(value.grad.numpy(), value_grad), + msg="The gradient of input should be \n{},\n but reveived {}". + format(value_grad, value.grad.numpy())) + + #case 4: step >0 + def set_value4(t, value): + a = t * t + a[0, :, 0, ::3] = value + y = a * a + return y.sum() + + array = np.arange( + 1, 1 + 2 * 3 * 4, dtype="float32").reshape([2, 3, 1, 4, 1]) + value = np.arange(100, 100 + 2, dtype="float32").reshape(1, 2, 1) + + inps = paddle.to_tensor(array, stop_gradient=False) + value = paddle.to_tensor(value, stop_gradient=False) + + loss = set_value4(inps, value) + loss.backward() + + value_grad = np.array([[[600.], [606.]]]) + input_grad = np.array([[[[[0.], [32.], [108.], + [0.]]], [[[0.], [864.], [1372.], [0.]]], + [[[0.], [4000.], [5324.], [0.]]]], + [[[[8788.], [10976.], [13500.], [16384.]]], + [[[19652.], [23328.], [27436.], [32000.]]], + [[[37044.], [42592.], [48668.], [55296.]]]]]) + self.assertTrue( + np.array_equal(inps.grad.numpy(), input_grad), + msg="The gradient of value should be \n{},\n but reveived {}". + format(input_grad, inps.grad.numpy())) + self.assertTrue( + np.array_equal(value.grad.numpy(), value_grad), + msg="The gradient of input should be \n{},\n but reveived {}". + format(value_grad, value.grad.numpy())) + + # case 5:a[0].shape==value.shape + def set_value5(t, value): + a = t * t + a[0] = value + y = a * a + return y.sum() + + array = np.arange(1, 1 + 2 * 3 * 4, dtype="float32").reshape([2, 3, 4]) + value = np.arange(100, 100 + 12, dtype="float32").reshape(3, 4) + + inps = paddle.to_tensor(array, stop_gradient=False) + value = paddle.to_tensor(value, stop_gradient=False) + + loss = set_value5(inps, value) + loss.backward() + + value_grad = np.array([[200., 202., 204., 206.], + [208., 210., 212., 214.], + [216., 218., 220., 222.]]) + input_grad = np.array([[[0., 0., 0., 0.], [0., 0., 0., 0.], + [0., 0., 0., 0.]], + [[8788., 10976., 13500., 16384.], + [19652., 23328., 27436., 32000.], + [37044., 42592., 48668., 55296.]]]) + self.assertTrue( + np.array_equal(inps.grad.numpy(), input_grad), + msg="The gradient of value should be \n{},\n but reveived {}". + format(input_grad, inps.grad.numpy())) + self.assertTrue( + np.array_equal(value.grad.numpy(), value_grad), + msg="The gradient of input should be \n{},\n but reveived {}". 
+ format(value_grad, value.grad.numpy())) + + def test_static_graph(self): + paddle.enable_static() + + to_string = lambda x, i, : x + '_' + str(i) + numel = lambda input_shape: reduce(lambda x, y: x * y, input_shape) + + def op1(x): + value = paddle.fluid.layers.fill_constant([1], "float32", 1) + # test stop_gradient + value.stop_gradient = True + x.stop_gradient = False + start = paddle.fluid.layers.fill_constant( + [1], "int32", 5, force_cpu=True) + end = paddle.fluid.layers.fill_constant( + [1], "int32", 0, force_cpu=True) + step = paddle.fluid.layers.fill_constant( + [1], "int32", -2, force_cpu=True) + + inputs = { + 'Input': x, + 'ValueTensor': value, + 'StartsTensorList': [start, ], + 'EndsTensorList': [end, ], + 'StepsTensorList': [step, ] + } + + helper = LayerHelper("set_value") + y = helper.create_variable_for_type_inference(dtype=x.dtype) + + helper.append_op( + type="set_value", + inputs=inputs, + outputs={'Out': y}, + attrs={'axes': [0]}) + + return y, value + + def op2(x): + value = paddle.fluid.layers.fill_constant([1, 3, 2], "float32", 1) + # test stop_gradient + value.stop_gradient = False + x.stop_gradient = False + attrs = { + 'axes': [0], + 'starts': [6], + 'ends': [0], + 'steps': [-4], + 'decrease_axes': [], + 'none_axes': [], + 'dtype': paddle.float32 + } + inputs = {'Input': x, 'ValueTensor': value} + + helper = LayerHelper("set_value") + y = helper.create_variable_for_type_inference(dtype=x.dtype) + + helper.append_op( + type="set_value", + inputs=inputs, + outputs={'Out': y}, + attrs=attrs) + + return y, value + + def op3(x): + value = paddle.fluid.layers.fill_constant([1], "float32", 1) + x.stop_gradient = True + value.stop_gradient = False + start = paddle.fluid.layers.fill_constant( + [1], "int32", 0, force_cpu=True) + end = paddle.fluid.layers.fill_constant( + [1], "int32", 5, force_cpu=True) + step = paddle.fluid.layers.fill_constant( + [1], "int32", 3, force_cpu=True) + + inputs = { + 'Input': x, + 'ValueTensor': value, + 'StartsTensorList': [start, ], + 'EndsTensorList': [end, ], + 'StepsTensorList': [step, ] + } + + helper = LayerHelper("set_value") + y = helper.create_variable_for_type_inference(dtype=x.dtype) + + helper.append_op( + type="set_value", + inputs=inputs, + outputs={'Out': y}, + attrs={'axes': [0]}) + + return y, value + + def set_value(array, i, op): + name_x = to_string('x', i) + x = paddle.static.data( + name=name_x, shape=array.shape, dtype='float32') + + # set_value_op in __get/setitem__ is an inplace operation. + # When `input.stop_gradient = True` and `value.stop_gradient = False`, + # set_value_grad_op will not be run during backward. 
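All of the cases above rely on the same truncation rule: positions of the intermediate tensor that are overwritten by set_value contribute nothing to the gradient of the original input, while the written value receives the gradient of those positions. A minimal standalone sketch, with the expected gradients worked out by hand purely for illustration:

.. code-block:: python

    import paddle

    a = paddle.to_tensor([1., 2., 3.], stop_gradient=False)
    v = paddle.to_tensor([10.], stop_gradient=False)

    b = a * a          # b = [1., 4., 9.]
    b[0] = v           # overwrite one element; gradient to a[0] is truncated
    loss = (b * b).sum()
    loss.backward()

    # By the chain rule the overwritten position contributes nothing to a.grad:
    # expected a.grad == [0., 32., 108.] and v.grad == [20.]
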
+ y, value = op(x) + + y2 = y + 1 + loss = paddle.fluid.layers.reduce_sum(y2) + sgd = paddle.optimizer.Adam() + sgd.minimize(loss) + place = paddle.fluid.CPUPlace( + ) if not paddle.fluid.core.is_compiled_with_cuda( + ) else paddle.fluid.CUDAPlace(0) + + prog = paddle.static.default_main_program() + exe = paddle.static.Executor(place) + exe.run(paddle.static.default_startup_program()) + fetch_list = [] + if not x.stop_gradient: + fetch_list.append(x.grad_name) + if not value.stop_gradient: + fetch_list.append(value.grad_name) + out = exe.run(prog, feed={x.name: array}, fetch_list=fetch_list) + return out + + input_shape = [7, 6, 5, 4, 3, 2] + + array = np.arange( + 0, numel(input_shape), dtype="float32").reshape(input_shape) + + for i in range(len(input_shape)): + program = paddle.static.Program() + with paddle.static.program_guard(program): + out1 = set_value(array, i, op1) + self.assertTrue((out1[0][5:0:-2] == 0).all()) + + if len(array.shape) > 2: + program2 = paddle.static.Program() + with paddle.static.program_guard(program2): + out2 = set_value(array, i, op2) + self.assertTrue((out2[0][6:0:-4] == 0).all()) + + program3 = paddle.static.Program() + with paddle.static.program_guard(program3): + out3 = set_value(array, i, op3) + self.assertTrue((numel(out1[0][0:5:3].shape) == out3[0]).all()) + + array = array[0] if __name__ == '__main__': From 99f8f5c80551d685cd50b110d62d0d043fd18b8a Mon Sep 17 00:00:00 2001 From: zhangbo9674 <82555433+zhangbo9674@users.noreply.github.com> Date: Wed, 11 Aug 2021 20:41:59 +0800 Subject: [PATCH 016/126] [AMP] add state_dict and load_state_dict and unittest for class GradScaler (#34300) * add state_dict and load_state_dict and unittest for class GradScaler * refine unittest for coverage of load_state_dict * refine comments of code-block * refine some comments * refine state_dict code and unittest * add #require gpu, xpu for GradScaler get/set example code * add #require gpu, xpu for GradScaler get/set example code * refine example code * refine unittest for state_dict * refine unittest for state_dict * fix bug of DataLoader in TestGradScalerStateDict * add flag FLAGS_cudnn_deterministic --- python/paddle/amp/grad_scaler.py | 78 +++++++++- .../paddle/fluid/dygraph/amp/loss_scaler.py | 52 +++++++ .../test_imperative_auto_mixed_precision.py | 143 ++++++++++++++++++ 3 files changed, 268 insertions(+), 5 deletions(-) diff --git a/python/paddle/amp/grad_scaler.py b/python/paddle/amp/grad_scaler.py index 827a320b2cc9c4..18c436a0bb95f7 100644 --- a/python/paddle/amp/grad_scaler.py +++ b/python/paddle/amp/grad_scaler.py @@ -47,7 +47,7 @@ class GradScaler(AmpScaler): Examples: .. code-block:: python - + import paddle model = paddle.nn.Conv2D(3, 2, 3, bias_attr=True) @@ -91,7 +91,7 @@ def scale(self, var): Examples: .. code-block:: python - + import paddle model = paddle.nn.Conv2D(3, 2, 3, bias_attr=True) @@ -156,6 +156,7 @@ def is_enable(self): Examples: .. code-block:: python + # required: gpu,xpu import paddle scaler = paddle.amp.GradScaler(enable=True, init_loss_scaling=1024, @@ -178,7 +179,8 @@ def is_use_dynamic_loss_scaling(self): Examples: .. code-block:: python - + + # required: gpu,xpu import paddle scaler = paddle.amp.GradScaler(enable=True, init_loss_scaling=1024, @@ -202,6 +204,7 @@ def get_init_loss_scaling(self): Examples: .. 
code-block:: python + # required: gpu,xpu import paddle scaler = paddle.amp.GradScaler(enable=True, init_loss_scaling=1024, @@ -220,11 +223,12 @@ def set_init_loss_scaling(self, new_init_loss_scaling): Set the initial loss scaling factor by `new_init_loss_scaling`. Args: - new_init_loss_scaling(int): The new_init_loss_scaling used to update initial loss scaling factor. + new_init_loss_scaling(float): The new_init_loss_scaling used to update initial loss scaling factor. Examples: .. code-block:: python - + + # required: gpu,xpu import paddle scaler = paddle.amp.GradScaler(enable=True, init_loss_scaling=1024, @@ -250,6 +254,7 @@ def get_incr_ratio(self): Examples: .. code-block:: python + # required: gpu,xpu import paddle scaler = paddle.amp.GradScaler(enable=True, init_loss_scaling=1024, @@ -273,6 +278,7 @@ def set_incr_ratio(self, new_incr_ratio): Examples: .. code-block:: python + # required: gpu,xpu import paddle scaler = paddle.amp.GradScaler(enable=True, init_loss_scaling=1024, @@ -298,6 +304,7 @@ def get_decr_ratio(self): Examples: .. code-block:: python + # required: gpu,xpu import paddle scaler = paddle.amp.GradScaler(enable=True, init_loss_scaling=1024, @@ -321,6 +328,7 @@ def set_decr_ratio(self, new_decr_ratio): Examples: .. code-block:: python + # required: gpu,xpu import paddle scaler = paddle.amp.GradScaler(enable=True, init_loss_scaling=1024, @@ -346,6 +354,7 @@ def get_incr_every_n_steps(self): Examples: .. code-block:: python + # required: gpu,xpu import paddle scaler = paddle.amp.GradScaler(enable=True, init_loss_scaling=1024, @@ -369,6 +378,7 @@ def set_incr_every_n_steps(self, new_incr_every_n_steps): Examples: .. code-block:: python + # required: gpu,xpu import paddle scaler = paddle.amp.GradScaler(enable=True, init_loss_scaling=1024, @@ -394,6 +404,7 @@ def get_decr_every_n_nan_or_inf(self): Examples: .. code-block:: python + # required: gpu,xpu import paddle scaler = paddle.amp.GradScaler(enable=True, init_loss_scaling=1024, @@ -417,6 +428,7 @@ def set_decr_every_n_nan_or_inf(self, new_decr_every_n_nan_or_inf): Examples: .. code-block:: python + # required: gpu,xpu import paddle scaler = paddle.amp.GradScaler(enable=True, init_loss_scaling=1024, @@ -432,3 +444,59 @@ def set_decr_every_n_nan_or_inf(self, new_decr_every_n_nan_or_inf): """ super(GradScaler, self).set_decr_every_n_nan_or_inf(new_decr_every_n_nan_or_inf) + + def state_dict(self): + """ + Returns the state of the scaler as a `dict`, If this instance is not enabled, returns an empty dict. + + Reurns: + A dict of scaler includes: + init_loss_scaling (float, optional): The initial loss scaling factor. + incr_ratio(float, optional): The multiplier to use when increasing the loss scaling. + decr_ratio(float, optional): The less-than-one-multiplier to use when decreasing the loss scaling. + incr_every_n_steps(int, optional): Increases loss scaling every n consecutive steps with finite gradients. + decr_every_n_nan_or_inf(int, optional): Decreases loss scaling every n accumulated steps with nan or inf gradients. + + Examples: + + .. code-block:: python + + # required: gpu,xpu + import paddle + + scaler = paddle.amp.GradScaler(enable=True, + init_loss_scaling=1024, + incr_ratio=2.0, + decr_ratio=0.5, + incr_every_n_steps=1000, + decr_every_n_nan_or_inf=2, + use_dynamic_loss_scaling=True) + scaler_state = scaler.state_dict() + """ + return super(GradScaler, self).state_dict() + + def load_state_dict(self, state_dict): + """ + Loads the scaler state. + + Args: + state_dict(dict): scaler state. 
Should be an object returned from a call to `GradScaler.state_dict()`. + + Examples: + + .. code-block:: python + + # required: gpu,xpu + import paddle + + scaler = paddle.amp.GradScaler(enable=True, + init_loss_scaling=1024, + incr_ratio=2.0, + decr_ratio=0.5, + incr_every_n_steps=1000, + decr_every_n_nan_or_inf=2, + use_dynamic_loss_scaling=True) + scaler_state = scaler.state_dict() + scaler.load_state_dict(scaler_state) + """ + super(GradScaler, self).load_state_dict(state_dict) diff --git a/python/paddle/fluid/dygraph/amp/loss_scaler.py b/python/paddle/fluid/dygraph/amp/loss_scaler.py index 96ee4514ac2b93..2065bec8af3bc4 100644 --- a/python/paddle/fluid/dygraph/amp/loss_scaler.py +++ b/python/paddle/fluid/dygraph/amp/loss_scaler.py @@ -357,3 +357,55 @@ def set_decr_every_n_nan_or_inf(self, new_decr_every_n_nan_or_inf): new_decr_every_n_nan_or_inf(int): The new_decr_every_n_nan_or_inf used to update the num `n`, `n` represent decreases loss scaling every `n` accumulated steps with nan or inf gradients. """ self._decr_every_n_nan_or_inf = new_decr_every_n_nan_or_inf + + def state_dict(self): + """ + Returns the state of the scaler as a `dict`, If this instance is not enabled, returns an empty dict. + + Reurns: + A dict of scaler includes: + scale (tensor): The loss scaling factor. + incr_ratio(float): The multiplier to use when increasing the loss scaling. + decr_ratio(float): The less-than-one-multiplier to use when decreasing the loss scaling. + incr_every_n_steps(int): Increases loss scaling every n consecutive steps with finite gradients. + decr_every_n_nan_or_inf(int): Decreases loss scaling every n accumulated steps with nan or inf gradients. + incr_count(int): The number of recent consecutive unskipped steps. + decr_count(int): The number of recent consecutive skipped steps. + use_dynamic_loss_scaling(bool): Whether to use dynamic loss scaling. If False, fixed loss_scaling is used. If True, the loss scaling is updated dynamicly. Default is True. + """ + return { + "scale": self._scale.numpy(), + "incr_ratio": self._incr_ratio, + "decr_ratio": self._decr_ratio, + "incr_every_n_steps": self._incr_every_n_steps, + "decr_every_n_nan_or_inf": self._decr_every_n_nan_or_inf, + "incr_count": self._incr_count, + "decr_count": self._decr_count, + "use_dynamic_loss_scaling": self._use_dynamic_loss_scaling + } if self._enable else {} + + def load_state_dict(self, state_dict): + """ + Loads the scaler state. + + Args: + state_dict(dict): scaler state. Should be an object returned from a call to `AmpScaler.state_dict()`. 
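In a training script the scaler state is usually checkpointed together with the model and optimizer state; a minimal save/restore sketch (the file name is illustrative):

.. code-block:: python

    # required: gpu,xpu
    import paddle

    scaler = paddle.amp.GradScaler(enable=True, init_loss_scaling=1024)
    # ... run some scaled iterations ...
    paddle.save(scaler.state_dict(), 'scaler.pdparams')

    # later, or in a new process
    scaler2 = paddle.amp.GradScaler(enable=True)
    scaler2.load_state_dict(paddle.load('scaler.pdparams'))
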
+ """ + if not self._enable: + return + + if len(state_dict) == 0: + raise RuntimeError( + "The input state dict is empty, possibly because it was saved " + "from a disabled instance of GradScaler.") + + self._init_loss_scaling = state_dict["scale"][0] + self._scale = to_variable( + np.array([self._init_loss_scaling]).astype(np.float32)) + self._incr_ratio = state_dict["incr_ratio"] + self._decr_ratio = state_dict["decr_ratio"] + self._incr_every_n_steps = state_dict["incr_every_n_steps"] + self._decr_every_n_nan_or_inf = state_dict["decr_every_n_nan_or_inf"] + self._incr_count = state_dict["incr_count"] + self._decr_count = state_dict["decr_count"] + self._use_dynamic_loss_scaling = state_dict["use_dynamic_loss_scaling"] diff --git a/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py b/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py index e3d2bda8921287..17d50ed8c19de0 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py @@ -237,6 +237,37 @@ def test_get_and_set(self): scaler.set_init_loss_scaling(100) self.assertEqual(scaler.get_init_loss_scaling() == 100, True) + def test_state_dict_and_load_state_dict(self): + with fluid.dygraph.guard(): + scaler1 = paddle.amp.GradScaler( + enable=True, + init_loss_scaling=14, + incr_ratio=233.0, + decr_ratio=0.523, + incr_every_n_steps=1090, + decr_every_n_nan_or_inf=20, + use_dynamic_loss_scaling=True) + scaler_state = scaler1.state_dict() + scaler2 = paddle.amp.GradScaler(enable=True) + scaler2.load_state_dict(scaler_state) + self.assertEqual(scaler2.get_init_loss_scaling() == 14, True) + self.assertEqual(scaler2.get_incr_ratio() == 233.0, True) + self.assertEqual(scaler2.get_decr_ratio() == 0.523, True) + self.assertEqual(scaler2.get_incr_every_n_steps() == 1090, True) + self.assertEqual(scaler2.get_decr_every_n_nan_or_inf() == 20, True) + + scaler3 = paddle.amp.GradScaler(enable=False) + scaler3.load_state_dict(scaler_state) + self.assertEqual(scaler3.is_enable() == False, True) + + def test_state_dict_and_load_state_dict_error(self): + def test_error(): + state_empty = {} + scaler = paddle.amp.GradScaler(enable=True) + scaler.load_state_dict(state_empty) + + self.assertRaises(RuntimeError, test_error) + def reader_decorator(reader): def __reader__(): @@ -248,6 +279,112 @@ def __reader__(): return __reader__ +class TestGradScalerStateDict(unittest.TestCase): + def train_resnet(self, + enable_amp=True, + use_data_loader=True, + use_save_load=True): + seed = 90 + + batch_size = train_parameters["batch_size"] + batch_num = 4 + + paddle.seed(seed) + paddle.framework.random._manual_program_seed(seed) + + resnet = ResNet(use_cudnn=True) + optimizer = optimizer_setting( + train_parameters, parameter_list=resnet.parameters()) + np.random.seed(seed) + train_reader = paddle.batch( + paddle.dataset.flowers.train(use_xmap=False), batch_size=batch_size) + + dy_param_init_value = {} + for param in resnet.parameters(): + dy_param_init_value[param.name] = param.numpy() + + program = None + scaler = paddle.amp.GradScaler( + enable=enable_amp, init_loss_scaling=2.**10) + + if use_data_loader: + train_reader = paddle.batch( + reader_decorator(paddle.dataset.flowers.train(use_xmap=False)), + batch_size=batch_size, + drop_last=True) + train_loader = fluid.io.DataLoader.from_generator( + capacity=4, + use_double_buffer=True, + iterable=True, + return_list=True) + 
train_loader.set_sample_list_generator(train_reader) + train_reader = train_loader + + for batch_id, data in enumerate(train_reader()): + if batch_id >= batch_num: + break + if use_data_loader: + img, label = data + else: + dy_x_data = np.array([x[0].reshape(3, 224, 224) + for x in data]).astype('float32') + if len(np.array([x[1] + for x in data]).astype('int64')) != batch_size: + continue + y_data = np.array( + [x[1] for x in data]).astype('int64').reshape(-1, 1) + + img = paddle.to_tensor(dy_x_data) + label = paddle.to_tensor(y_data) + label.stop_gradient = True + + with paddle.amp.auto_cast(enable=enable_amp): + out = resnet(img) + + loss = paddle.nn.functional.cross_entropy(input=out, label=label) + avg_loss = paddle.mean(x=loss) + + dy_out = avg_loss.numpy() + + scaled_loss = scaler.scale(avg_loss) + scaled_loss.backward() + + scaler.minimize(optimizer, scaled_loss) + + dy_grad_value = {} + for param in resnet.parameters(): + if param.trainable: + np_array = np.array(param._grad_ivar().value().get_tensor()) + dy_grad_value[param.name + fluid.core.grad_var_suffix( + )] = np_array + + resnet.clear_gradients() + + dy_param_value = {} + for param in resnet.parameters(): + dy_param_value[param.name] = param.numpy() + + if use_save_load and batch_id == 2: + paddle.save(scaler.state_dict(), 'ResNet_model.pdparams') + dict_load = paddle.load('ResNet_model.pdparams') + scaler.load_state_dict(dict_load) + if use_data_loader: + train_reader._reset() + return dy_out, dy_param_value, dy_grad_value + + def test_with_state_dict(self): + if fluid.core.is_compiled_with_cuda(): + fluid.set_flags({"FLAGS_cudnn_deterministic": True}) + with fluid.dygraph.guard(): + out_use_state_dict = self.train_resnet( + enable_amp=True, use_data_loader=True, use_save_load=True) + out_no_state_dict = self.train_resnet( + enable_amp=True, use_data_loader=True, use_save_load=False) + print('save_load:', out_use_state_dict[0], out_no_state_dict[0]) + self.assertTrue( + np.allclose(out_use_state_dict[0], out_no_state_dict[0])) + + class TestResnet2(unittest.TestCase): """ Use paddle-2.0 API @@ -338,6 +475,8 @@ def train_resnet(self, enable_amp=True, use_data_loader=False): return dy_out, dy_param_value, dy_grad_value def test_resnet(self): + if fluid.core.is_compiled_with_cuda(): + fluid.set_flags({"FLAGS_cudnn_deterministic": True}) with fluid.dygraph.guard(): out_fp32 = self.train_resnet(enable_amp=False) out_amp = self.train_resnet(enable_amp=True) @@ -345,6 +484,8 @@ def test_resnet(self): self.assertTrue(np.allclose(out_fp32[0], out_amp[0], atol=1.e-2)) def test_with_data_loader(self): + if fluid.core.is_compiled_with_cuda(): + fluid.set_flags({"FLAGS_cudnn_deterministic": True}) with fluid.dygraph.guard(): out_fp32 = self.train_resnet(enable_amp=False, use_data_loader=True) out_amp = self.train_resnet(enable_amp=True, use_data_loader=True) @@ -425,6 +566,8 @@ def train_resnet(self, enable_amp=True): return dy_out, dy_param_value, dy_grad_value def test_resnet(self): + if fluid.core.is_compiled_with_cuda(): + fluid.set_flags({"FLAGS_cudnn_deterministic": True}) out_fp32 = self.train_resnet(enable_amp=False) out_amp = self.train_resnet(enable_amp=True) print(out_fp32[0], out_amp[0]) From 0a5c99e85892b5c1d6603788a5a635b9528df042 Mon Sep 17 00:00:00 2001 From: Jacek Czaja Date: Wed, 11 Aug 2021 14:49:07 +0200 Subject: [PATCH 017/126] [oneDNN] Fix to issue #34554 (#34623) * - Added softmax without caching * - Binary is no longer manually cached * - Activation onednn caching removed * - Removed manual caching of activation 
* - modified UT * - fix * - fix * - fixes to building * - fix * - fix * - fix to UT * - Faulty UT workaround * - approval workaround * - Fixes after review * - compilation fixes * - more lint fixes * - more fixes after review * - fixes after another round of review --- .../mkldnn/elementwise_mkldnn_op.h | 19 +- .../mkldnn/elementwise_mul_mkldnn_op.cc | 10 +- .../operators/mkldnn/activation_mkldnn_op.cc | 11 +- .../operators/mkldnn/caching_tests.cmake | 2 +- .../fluid/operators/mkldnn/scale_mkldnn_op.cc | 8 +- .../operators/mkldnn/softmax_mkldnn_op.cc | 105 ++-- .../operators/mkldnn/test_mkldnn_caching.cc | 84 ++-- paddle/fluid/platform/mkldnn_reuse.h | 476 ++++++++++++------ 8 files changed, 436 insertions(+), 279 deletions(-) diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h index ddad70a6a5f31c..ffcdc079985fa6 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h @@ -47,13 +47,24 @@ class EltwiseMKLDNNKernel : public framework::OpKernel { float scale_o = ctx.Attr("Scale_out"); int axis = ctx.Attr("axis"); - platform::BinaryMKLDNNHandler handler( - BINARY_OP, axis, dev_ctx, mkldnn_engine, ctx.GetPlace(), x, y, z, - scale_x, scale_y, scale_o, ctx.OutputName("Out")); + platform::BinaryMKLDNNHandler handler(BINARY_OP, axis, mkldnn_engine, + ctx.GetPlace(), x, y, z, scale_x, + scale_y, scale_o); const auto src_x_memory = handler.AcquireSrcMemory(x); const auto src_y_memory = handler.AcquireSecondSrcMemory(y); - const auto dst_memory = handler.AcquireDstMemory(z); + // (jczaja) For Inplace src and dst should be the same memory object. + // So x should share buffer with z. But UT mechanics is testing inplace + // execution for this op not checking that x can be bradcasted to match in + // shape y tensor. + // This is wrong as when x is to be broadcasted then z(out) will match the + // shape of y which is bigger than x. Hence if x is smaller in shape than z + // and they share a buffer (of + // shape x) then this buffer is not big enough to hold result of elementwise + // operation. + auto dst_memory = (x->numel() == z->numel() && x->IsSharedBufferWith(*z)) + ? 
src_x_memory + : handler.AcquireDstMemory(z); const auto binary_prim = handler.AcquireForwardPrimitive(); diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc index 1c246e8d189370..af4aab8047888a 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc @@ -48,9 +48,8 @@ class EltwiseMulMKLDNNGradKernel : public ElemwiseGradKernel { if (dx) { // dx = dout*y platform::BinaryMKLDNNHandler handler( - dnnl::algorithm::binary_mul, axis, dev_ctx, mkldnn_engine, - ctx.GetPlace(), dout, y, dx, 1.0f, 1.0f, 1.0f, - ctx.InputName(framework::GradVarName("Out"))); + dnnl::algorithm::binary_mul, axis, mkldnn_engine, ctx.GetPlace(), + dout, y, dx, 1.0f, 1.0f, 1.0f); const auto src_dout_memory = handler.AcquireSrcMemory(dout); const auto src_y_memory = handler.AcquireSecondSrcMemory(y); @@ -75,9 +74,8 @@ class EltwiseMulMKLDNNGradKernel : public ElemwiseGradKernel { // Handler is having nullptr passed instead of output tensor as // we want Dst buffer to be allocated by oneDNN not to use Tensor platform::BinaryMKLDNNHandler handler( - dnnl::algorithm::binary_mul, axis, dev_ctx, mkldnn_engine, - ctx.GetPlace(), dout, x, nullptr, 1.0f, 1.0f, 1.0f, - ctx.InputName(framework::GradVarName("Out"))); + dnnl::algorithm::binary_mul, axis, mkldnn_engine, ctx.GetPlace(), + dout, x, nullptr, 1.0f, 1.0f, 1.0f); const auto src_dout_memory = handler.AcquireSrcMemory(dout); const auto src_x_memory = handler.AcquireSecondSrcMemory(x); diff --git a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc index 3b92d2e2d88913..d992890adeec3e 100644 --- a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc @@ -79,15 +79,15 @@ void eltwise_forward(const framework::ExecutionContext &ctx, paddle::platform::errors::PreconditionNotMet( "Operator DNNL eletwise_forward must use CPUPlace")); auto &dev_ctx = ctx.template device_context(); + const auto &mkldnn_engine = dev_ctx.GetEngine(); const auto *x = ctx.Input("X"); auto *y = ctx.Output("Out"); bool is_inplaced = x->IsSharedBufferWith(*y); - platform::ActivationMKLDNNHandler handler(algorithm, ctx, dev_ctx, - ctx.GetPlace(), x, - ctx.InputName("X"), is_inplaced); + platform::ActivationMKLDNNHandler handler(algorithm, ctx, mkldnn_engine, + ctx.GetPlace(), x); auto src_memory_p = handler.AcquireSrcMemory(x); auto dst_memory_p = is_inplaced ? 
src_memory_p : handler.AcquireDstMemory(y); @@ -106,13 +106,14 @@ template void eltwise_grad(const framework::ExecutionContext &ctx, mkldnn::algorithm algorithm) { auto &dev_ctx = ctx.template device_context(); + const auto &mkldnn_engine = dev_ctx.GetEngine(); const auto *x = ctx.Input("X"); const auto *diff_y = ctx.Input(framework::GradVarName("Out")); auto *diff_x = ctx.Output(framework::GradVarName("X")); - platform::ActivationMKLDNNHandler handler( - algorithm, ctx, dev_ctx, ctx.GetPlace(), x, diff_y, ctx.InputName("X")); + platform::ActivationMKLDNNHandler handler(algorithm, ctx, mkldnn_engine, + ctx.GetPlace(), x, diff_y); auto src_memory_p = handler.AcquireBackwardSrcMemory(x); auto diff_dst_memory_p = handler.AcquireDiffDstMemory(diff_y); diff --git a/paddle/fluid/operators/mkldnn/caching_tests.cmake b/paddle/fluid/operators/mkldnn/caching_tests.cmake index 4130c295b203eb..d7c295672e0021 100644 --- a/paddle/fluid/operators/mkldnn/caching_tests.cmake +++ b/paddle/fluid/operators/mkldnn/caching_tests.cmake @@ -1 +1 @@ -cc_test(test_mkldnn_caching SRCS mkldnn/test_mkldnn_caching.cc DEPS op_registry elementwise_mul_op elementwise_add_op activation_op softmax_op softmax scope device_context enforce) +cc_test(test_mkldnn_caching SRCS mkldnn/test_mkldnn_caching.cc DEPS op_registry elementwise_mul_op elementwise_add_op activation_op softmax_op conv_op im2col vol2col softmax scope device_context enforce) diff --git a/paddle/fluid/operators/mkldnn/scale_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/scale_mkldnn_op.cc index ae17048b5d568b..84ac14d04b85b3 100644 --- a/paddle/fluid/operators/mkldnn/scale_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/scale_mkldnn_op.cc @@ -29,6 +29,7 @@ class ScaleMKLDNNKernel : public framework::OpKernel { void RunKernel(const framework::ExecutionContext& ctx) const { const auto& dev_ctx = ctx.template device_context(); + const auto& mkldnn_engine = dev_ctx.GetEngine(); auto* x = ctx.Input("X"); auto* out = ctx.Output("Out"); @@ -36,11 +37,12 @@ class ScaleMKLDNNKernel : public framework::OpKernel { bool is_inplaced = x->IsSharedBufferWith(*out); platform::ActivationMKLDNNHandler handler( - mkldnn::algorithm::eltwise_linear, ctx, dev_ctx, ctx.GetPlace(), x, - ctx.InputName("X"), is_inplaced); + mkldnn::algorithm::eltwise_linear, ctx, mkldnn_engine, ctx.GetPlace(), + x); auto src_memory_p = handler.AcquireSrcMemory(x); - auto dst_memory_p = handler.AcquireDstMemory(out); + auto dst_memory_p = + is_inplaced ? src_memory_p : handler.AcquireDstMemory(out); auto activation_p = handler.AcquireForwardPrimitive(); auto& astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream(); diff --git a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc index e065800e4d1c71..b0f27719bf9adc 100644 --- a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc @@ -32,69 +32,56 @@ using platform::to_void_cast; template class SoftmaxMKLDNNHandler - : public platform::MKLDNNHandlerT { + : public platform::MKLDNNHandlerNoCachingT { public: - SoftmaxMKLDNNHandler(const MKLDNNDeviceContext& dev_ctx, - const mkldnn::engine mkldnn_engine, + SoftmaxMKLDNNHandler(const mkldnn::engine mkldnn_engine, platform::Place cpu_place, const Tensor* input, - Tensor* output, const int axis, - const std::string uniq_name, bool is_inplaced) - : platform::MKLDNNHandlerT( - dev_ctx, mkldnn_engine, cpu_place, - // Softmax may be inplace then uniq_name is no longer unique - is_inplaced ? 
platform::CreateKey( - dev_ctx, framework::vectorize(input->dims()), - axis, uniq_name) - : platform::CreateKey( - dev_ctx, framework::vectorize(input->dims()), - uniq_name)) { - if (!this->isCached()) { - PADDLE_ENFORCE_EQ( - input->dims(), output->dims(), - platform::errors::InvalidArgument( - "The shape of input and output tensor must be identical.")); - - auto softmax_tz = framework::vectorize(input->dims()); - auto md = memory::desc(softmax_tz, platform::MKLDNNGetDataType(), - input->format()); - - this->AcquireForwardPrimitiveDescriptor(prop_kind::forward_scoring, md, - axis); - } + Tensor* output, const int axis) + : platform::MKLDNNHandlerNoCachingT( + mkldnn_engine, cpu_place) { + PADDLE_ENFORCE_EQ( + input->dims(), output->dims(), + platform::errors::InvalidArgument( + "The shape of input and output tensor must be identical.")); + + auto softmax_tz = framework::vectorize(input->dims()); + auto md = memory::desc(softmax_tz, platform::MKLDNNGetDataType(), + input->format()); + + this->AcquireForwardPrimitiveDescriptor(prop_kind::forward_scoring, md, + axis); } SoftmaxMKLDNNHandler(const framework::ExecutionContext& ctx, - const MKLDNNDeviceContext& dev_ctx, + const mkldnn::engine mkldnn_engine, platform::Place cpu_place, const Tensor* out, const Tensor* out_grad, Tensor* in_x_grad, const std::string& unique_name) - : platform::MKLDNNHandlerT( - dev_ctx, dev_ctx.GetEngine(), cpu_place, - platform::CreateKey(dev_ctx, framework::vectorize(out->dims()), - unique_name)) { - if (!this->isBwdCached()) { - PADDLE_ENFORCE_EQ( - out_grad->dims(), in_x_grad->dims(), - platform::errors::InvalidArgument("The shape of softmax_grad's input " - "and output must be identical.")); - - auto dims = out_grad->dims(); // input and output share the same shape - const int axis = CanonicalAxis(ctx.Attr("axis"), dims.size()); - auto softmax_tz = framework::vectorize(dims); - - auto data_softmax_md = MKLDNNMemDesc( - softmax_tz, platform::MKLDNNGetDataType(), out->format()); - auto diff_softmax_md = MKLDNNMemDesc( - softmax_tz, platform::MKLDNNGetDataType(), out_grad->format()); - - this->AcquireForwardPrimitiveDescriptor(prop_kind::forward_scoring, - data_softmax_md, axis); - this->AcquireBackwardPrimitiveDescriptor(diff_softmax_md, data_softmax_md, - axis); - } + : platform::MKLDNNHandlerNoCachingT( + mkldnn_engine, cpu_place) { + PADDLE_ENFORCE_EQ(out_grad->dims(), in_x_grad->dims(), + platform::errors::InvalidArgument( + "The shape of softmax_grad's input " + "and output must be identical, but shapes differ, " + "out_grad: %s in_grad: %s", + out_grad->dims(), in_x_grad->dims())); + + auto dims = out_grad->dims(); // input and output share the same shape + const int axis = CanonicalAxis(ctx.Attr("axis"), dims.size()); + auto softmax_tz = framework::vectorize(dims); + + auto data_softmax_md = MKLDNNMemDesc( + softmax_tz, platform::MKLDNNGetDataType(), out->format()); + auto diff_softmax_md = MKLDNNMemDesc( + softmax_tz, platform::MKLDNNGetDataType(), out_grad->format()); + + this->AcquireForwardPrimitiveDescriptor(prop_kind::forward_scoring, + data_softmax_md, axis); + this->AcquireBackwardPrimitiveDescriptor(diff_softmax_md, data_softmax_md, + axis); } }; @@ -111,9 +98,8 @@ class SoftmaxMKLDNNKernel : public paddle::framework::OpKernel { const int axis = CanonicalAxis(ctx.Attr("axis"), input->dims().size()); - SoftmaxMKLDNNHandler handler(dev_ctx, mkldnn_engine, ctx.GetPlace(), - input, output, axis, ctx.OutputName("Out"), - is_inplaced); + SoftmaxMKLDNNHandler handler(mkldnn_engine, ctx.GetPlace(), 
input, + output, axis); auto softmax_src_memory_p = handler.AcquireSrcMemory(input); // For Inplace src and and dst are the same memory object @@ -149,11 +135,12 @@ class SoftmaxMKLDNNGradKernel : public paddle::framework::OpKernel { paddle::platform::errors::PreconditionNotMet( "Operator DNNL SoftmaxGrad must use CPUPlace")); auto& dev_ctx = ctx.template device_context(); + const auto& mkldnn_engine = dev_ctx.GetEngine(); const Tensor* output = ctx.Input("Out"); auto* out_grad = ctx.template Input(framework::GradVarName("Out")); auto* in_x_grad = ctx.template Output(framework::GradVarName("X")); - SoftmaxMKLDNNHandler handler(ctx, dev_ctx, ctx.GetPlace(), output, + SoftmaxMKLDNNHandler handler(ctx, mkldnn_engine, ctx.GetPlace(), output, out_grad, in_x_grad, ctx.InputName("Out")); auto dst_memory_p = handler.AcquireDstMemory(output); diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc index cad4f47ec14022..7251653793f899 100644 --- a/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc +++ b/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc @@ -33,6 +33,8 @@ USE_OP(relu); USE_OP_DEVICE_KERNEL(relu, MKLDNN); USE_OP(softmax); USE_OP_DEVICE_KERNEL(softmax, MKLDNN); +USE_OP(conv2d); +USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, FP32); namespace paddle { namespace operators { @@ -64,16 +66,19 @@ class CacheTester { template void RunOperator(const platform::Place &place, const std::string &op_type, - const framework::DDim &dims, const std::string &output_name, - bool inplace = false) { + const framework::DDim &dims, const std::string &first_input) { framework::Scope scope; std::map num_inputs = {{"softmax", 1}, {"relu", 1}, + {"conv2d", 2}, {"elementwise_add", 2}, {"elementwise_mul", 2}}; - std::string first_input = inplace == true ? output_name : "x"; + std::string first_input_var_name = (op_type == "conv2d") ? "Input" : "X"; + std::string second_input_var_name = (op_type == "conv2d") ? "Filter" : "Y"; + std::string output_var_name = (op_type == "conv2d") ? "Output" : "Out"; + std::string output_name = "output"; std::vector input_names = { {first_input, scope.Var(first_input)->GetMutable()}, @@ -113,71 +118,40 @@ void RunOperator(const platform::Place &place, const std::string &op_type, auto &pool = platform::DeviceContextPool::Instance(); - auto op = num_inputs[op_type] > 1 - ? framework::OpRegistry::CreateOp( - op_type, {{"X", {first_input}}, {"Y", {"x1"}}}, - {{"Out", {output_name}}}, {{"use_mkldnn", {true}}}) - : framework::OpRegistry::CreateOp( - op_type, {{"X", {first_input}}}, {{"Out", {output_name}}}, - {{"use_mkldnn", {true}}}); + auto op = + num_inputs[op_type] > 1 + ? 
framework::OpRegistry::CreateOp( + op_type, {{first_input_var_name, {first_input}}, + {second_input_var_name, {"x1"}}}, + {{output_var_name, {output_name}}}, {{"use_mkldnn", {true}}}) + : framework::OpRegistry::CreateOp( + op_type, {{first_input_var_name, {first_input}}}, + {{output_var_name, {output_name}}}, {{"use_mkldnn", {true}}}); op->Run(scope, place); pool.Get(place)->Wait(); } -TEST(test_softmax_reuse_cache, cpu_place) { - framework::DDim dims({32, 64}); +TEST(test_conv2d_reuse_cache, cpu_place) { + framework::DDim dims({1, 16, 32, 64}); platform::CPUPlace p; CacheTester ct; - RunOperator(p, "softmax", dims, "softmax_out"); - RunOperator(p, "softmax", dims, "softmax_out"); - PADDLE_ENFORCE_EQ(ct.Analyze(4), true, + RunOperator(p, "conv2d", dims, "input_signal"); + RunOperator(p, "conv2d", dims, "input_signal"); + PADDLE_ENFORCE_EQ(ct.Analyze(9), true, platform::errors::InvalidArgument( - "Wrong number of cached oneDNN objects")); + "Invalid number of cached oneDNN objects")); } -TEST(test_softmax_noreuse_cache, cpu_place) { - framework::DDim dims({32, 64}); +TEST(test_conv2d_noreuse_cache, cpu_place) { + framework::DDim dims({1, 16, 32, 64}); platform::CPUPlace p; CacheTester ct; - RunOperator(p, "softmax", dims, "softmax_out"); - RunOperator(p, "softmax", dims, "softmax_out2"); - PADDLE_ENFORCE_EQ(ct.Analyze(8), true, + RunOperator(p, "conv2d", dims, "input_signal"); + RunOperator(p, "conv2d", dims, "input_signal2"); + PADDLE_ENFORCE_EQ(ct.Analyze(18), true, platform::errors::InvalidArgument( - "Wrong number of cached oneDNN objects")); -} - -TEST(test_softmax_inplace_cache, cpu_place) { - framework::DDim dims({32, 64}); - platform::CPUPlace p; - CacheTester ct; - RunOperator(p, "softmax", dims, "softmax_out"); - RunOperator(p, "softmax", dims, "softmax_out", true); - PADDLE_ENFORCE_EQ(ct.Analyze(7), true, - platform::errors::InvalidArgument( - "Wrong number of cached oneDNN objects")); -} - -TEST(test_relu_inplace_cache, cpu_place) { - framework::DDim dims({32, 64}); - platform::CPUPlace p; - CacheTester ct; - RunOperator(p, "relu", dims, "relu_out"); - RunOperator(p, "relu", dims, "relu_out", true); - PADDLE_ENFORCE_EQ(ct.Analyze(7), true, - platform::errors::InvalidArgument( - "Wrong number of cached oneDNN objects")); -} - -TEST(test_elementwise_add_reuse_cache, cpu_place) { - framework::DDim dims({32, 64}); - platform::CPUPlace p; - CacheTester ct; - RunOperator(p, "elementwise_add", dims, "elementwise_add_out"); - RunOperator(p, "relu", dims, "elementwise_add_out", true); - PADDLE_ENFORCE_EQ(ct.Analyze(8), true, - platform::errors::InvalidArgument( - "Wrong number of cached oneDNN objects")); + "Invalid number of cached oneDNN objects")); } } // namespace operators diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index f63d45d7ff6ae6..95b8e0c610b1d4 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -34,6 +34,211 @@ using framework::Tensor; using user_function = std::function(const float*)>; using memory = mkldnn::memory; +template +class MKLDNNHandlerNoCachingT { + public: + MKLDNNHandlerNoCachingT(mkldnn::engine engine, platform::Place cpu_place) + : engine_(engine), place_(cpu_place), fwd_pd_(nullptr), bwd_pd_(nullptr) { + platform::MKLDNNDeviceContext::tls().log_lib_version(); + } + + std::shared_ptr AcquireForwardPrimitive() { + return std::make_shared(*fwd_pd_); + } + + std::shared_ptr AcquireBackwardPrimitive() { + return std::make_shared(*bwd_pd_); + } + + std::shared_ptr 
AcquireBackwardWeightsPrimitive() { + PADDLE_ENFORCE_NOT_NULL( + bwd_w_pd_, platform::errors::Unavailable("BWD_PD should be set when " + "getting BWD prim .")); + return std::make_shared(*bwd_w_pd_); + } + + std::shared_ptr AcquireSrcMemory( + const framework::Tensor* input) { + const T* input_data = input->data(); + return this->AcquireMemoryFromPrimitive(fwd_pd_->src_desc(), + to_void_cast(input_data)); + } + + template + std::shared_ptr AcquireDstMemory(framework::Tensor* output) { + T_out* ptr = + output->mutable_data(place_, fwd_pd_->dst_desc().get_size()); + return this->AcquireMemoryFromPrimitive(fwd_pd_->dst_desc(), ptr); + } + + template + std::shared_ptr AcquireDstMemory(void) { + return this->AcquireMemoryFromPrimitive(fwd_pd_->dst_desc()); + } + + template + std::shared_ptr AcquireDstMemory( + const framework::Tensor* output) { + const T_out* output_data = output->data(); + return this->AcquireMemoryFromPrimitive(bwd_pd_->dst_desc(), + to_void_cast(output_data)); + } + + std::shared_ptr AcquireDiffDstMemory( + const framework::Tensor* diffdst) { + const T* ptr = diffdst->data(); + return this->AcquireMemoryFromPrimitive(bwd_pd_->diff_dst_desc(), + to_void_cast(ptr)); + } + + std::shared_ptr AcquireDiffSrcMemory( + framework::Tensor* diffsrc) { + T* ptr = + diffsrc->mutable_data(place_, bwd_pd_->diff_src_desc().get_size()); + return this->AcquireMemoryFromPrimitive(bwd_pd_->diff_src_desc(), ptr); + } + + // Buffer of given Tensor is used for oneDNN computation + std::shared_ptr AcquireDiffWeightsMemory( + framework::Tensor* diff_weights) { + PADDLE_ENFORCE_NOT_NULL( + bwd_w_pd_, + platform::errors::Unavailable( + "BWD_W_PD should be set when getting BWD grad of weights.")); + T* ptr = diff_weights->mutable_data( + place_, bwd_w_pd_->diff_weights_desc().get_size()); + return this->AcquireMemoryFromPrimitive(bwd_w_pd_->diff_weights_desc(), + ptr); + } + + // Buffer is allocated by oneDNN to store computation results + std::shared_ptr AcquireDiffWeightsMemory(void) { + PADDLE_ENFORCE_NOT_NULL( + bwd_w_pd_, + platform::errors::Unavailable( + "BWD_W_PD should be set when getting BWD grad of weights.")); + return this->AcquireMemoryFromPrimitive(bwd_w_pd_->diff_weights_desc()); + } + + protected: + // If your primitive descriptor requires attributes, pass them as a + // first argument and paramters to descriptor constructor in the following + // arguments. Otherwise, all arguments will be forwarded to descriptor + // constructor, including the first one. + template + void AcquireForwardPrimitiveDescriptor(Arg&& first_arg, Args&&... args) { + CreateForwardPrimitiveDescriptor(first_arg, std::forward(args)...); + } + + // Using sfinae to specialise variadic function. Workaround for not having + // if constexpr in C++ 11. + template + typename std::enable_if::type, + dnnl::primitive_attr>::value>::type + CreateForwardPrimitiveDescriptor(First&& first, Args&&... args) { + auto fwd_desc = typename TForward::desc(std::forward(args)...); + fwd_pd_ = std::make_shared( + fwd_desc, first, engine_); + } + + template + typename std::enable_if::type, + dnnl::primitive_attr>::value>::type + CreateForwardPrimitiveDescriptor(First&& first, Args&&... args) { + auto fwd_desc = typename TForward::desc(std::forward(first), + std::forward(args)...); + fwd_pd_ = + std::make_shared(fwd_desc, engine_); + } + + template + void AcquireBackwardPrimitiveDescriptor(Args&&... 
args) { + // fwd_pd_ is set during grad by calling + // AcquireForwardPrimitiveDescriptor + PADDLE_ENFORCE_NOT_NULL(fwd_pd_, + platform::errors::Unavailable( + "Get MKLDNN Forward primitive %s failed.")); + auto bwd_desc = typename TBackward::desc(std::forward(args)...); + bwd_pd_ = std::make_shared( + bwd_desc, engine_, *fwd_pd_); + } + + template + void AcquireBackwardWeightsPrimitiveDescriptor(Args&&... args) { + // fwd_pd_ is set during grad by calling + // AcquireForwardPrimitiveDescriptor + PADDLE_ENFORCE_NOT_NULL(fwd_pd_, + platform::errors::Unavailable( + "Get MKLDNN Forward primitive %s failed.")); + auto bwd_desc = + typename TBackward_params::desc(std::forward(args)...); + bwd_w_pd_ = std::make_shared( + bwd_desc, engine_, *fwd_pd_); + } + + std::shared_ptr AcquireMemoryFromPrimitive( + mkldnn::memory::desc md, void* ptr) { + return std::make_shared(md, engine_, ptr); + } + + std::shared_ptr AcquireMemoryFromPrimitive( + mkldnn::memory::desc md) { + return std::make_shared(md, engine_); + } + + void AcquireReorder(const std::shared_ptr& user_memory_p, + const std::shared_ptr& target_memory_p) { + auto reorder_p = + std::make_shared(*user_memory_p, *target_memory_p); + + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); + + platform::RecordEvent record_reorder("int_reorder", + platform::EventRole::kUniqueOp); + reorder_p->execute(astream, {{MKLDNN_ARG_FROM, *user_memory_p}, + {MKLDNN_ARG_TO, *target_memory_p}}); + astream.wait(); + } + + template + std::shared_ptr AcquireMemoryWithReorder( + const mkldnn::memory::desc& user_md, + const mkldnn::memory::desc& target_md, void* ptr, + const std::string& suffix, bool is_persistent = false, + std::function(const F*)> custom_reorder_func = {}) { + std::shared_ptr target_memory_p; + if (custom_reorder_func) { + auto reordered_data = + custom_reorder_func(reinterpret_cast(ptr)); + ptr = reinterpret_cast(reordered_data.get()); + } + auto user_memory_p = std::make_shared(user_md, engine_, ptr); + if (user_md != target_md) { + target_memory_p = std::make_shared(target_md, engine_); + auto reorder_p = + std::make_shared(*user_memory_p, *target_memory_p); + + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); + platform::RecordEvent record_reorder("int_reorder", + platform::EventRole::kUniqueOp); + reorder_p->execute(astream, {{MKLDNN_ARG_FROM, *user_memory_p}, + {MKLDNN_ARG_TO, *target_memory_p}}); + astream.wait(); + } else { + target_memory_p = user_memory_p; + } + return target_memory_p; + } + + mkldnn::engine engine_; + platform::Place place_; + std::shared_ptr fwd_pd_; + std::shared_ptr bwd_pd_; + std::shared_ptr bwd_w_pd_; +}; + template @@ -79,7 +284,7 @@ class MKLDNNHandlerT { std::static_pointer_cast(dev_ctx_.GetBlob(key_p)); if (backward_p == nullptr) { PADDLE_ENFORCE_NOT_NULL(bwd_w_pd_, platform::errors::Unavailable( - "Error: BWD_PD should be set when " + "BWD_PD should be set when " "getting BWD prim witk key: %s .", key_p)); backward_p = std::make_shared(*bwd_w_pd_); @@ -138,7 +343,7 @@ class MKLDNNHandlerT { PADDLE_ENFORCE_NOT_NULL( bwd_w_pd_, platform::errors::Unavailable( - "Error: BWD_W_PD should be set when getting BWD grad of weights.")); + "BWD_W_PD should be set when getting BWD grad of weights.")); T* ptr = diff_weights->mutable_data( place_, bwd_w_pd_->diff_weights_desc().get_size()); return this->AcquireMemoryFromPrimitive(bwd_w_pd_->diff_weights_desc(), ptr, @@ -150,7 +355,7 @@ class MKLDNNHandlerT { PADDLE_ENFORCE_NOT_NULL( bwd_w_pd_, platform::errors::Unavailable( - "Error: BWD_W_PD 
should be set when getting BWD grad of weights.")); + "BWD_W_PD should be set when getting BWD grad of weights.")); return this->AcquireMemoryFromPrimitive(bwd_w_pd_->diff_weights_desc(), "@diff_wei_mem_p"); } @@ -589,70 +794,70 @@ class MKLDNNHandler { }; template -class BinaryMKLDNNHandler : public platform::MKLDNNHandlerT { +class BinaryMKLDNNHandler + : public platform::MKLDNNHandlerNoCachingT { public: BinaryMKLDNNHandler(const dnnl::algorithm algo, const int axis, - const MKLDNNDeviceContext& dev_ctx, const mkldnn::engine engine, platform::Place cpu_place, const Tensor* x, const Tensor* y, Tensor* z, - float scale_x, float scale_y, float scale_z, - const std::string& uniq_name) - : platform::MKLDNNHandlerT( - dev_ctx, engine, cpu_place, - platform::CreateKey(dev_ctx, framework::vectorize(x->dims()), - uniq_name)) { - if (!this->isCached()) { - PADDLE_ENFORCE_EQ( - x->layout(), DataLayout::kMKLDNN, - platform::errors::InvalidArgument("Wrong layout set for X tensor.")); - PADDLE_ENFORCE_NE( - x->format(), MKLDNNMemoryFormat::undef, - platform::errors::InvalidArgument("Wrong format set for X tensor.")); - - PADDLE_ENFORCE_EQ( - y->layout(), DataLayout::kMKLDNN, - platform::errors::InvalidArgument("Wrong layout set for Y tensor.")); - PADDLE_ENFORCE_NE( - y->format(), MKLDNNMemoryFormat::undef, - platform::errors::InvalidArgument("Wrong format set for Y tensor.")); - - const auto src_x_tz = framework::vectorize(x->dims()); - const auto src_y_tz = framework::vectorize(y->dims()); - // if output tensor(z) is nullptr then we are computing into oneDNN - // managed buffer - auto rankdiff = x->dims().size() - y->dims().size(); - const auto dst_tz = (z == nullptr) ? (rankdiff > 0 ? src_x_tz : src_y_tz) - : framework::vectorize(z->dims()); - - auto src0_md = dnnl::memory::desc( - src_x_tz, platform::MKLDNNGetDataType(), x->format()); - auto src1_md = dnnl::memory::desc( - src_y_tz, platform::MKLDNNGetDataType(), y->format()); - if (rankdiff > 0) { // Second input is of smaller rank than first - std::vector dims1_ex(rankdiff, 1); - dims1_ex.insert(next(dims1_ex.begin(), (axis == -1 ? rankdiff : axis)), - src_y_tz.begin(), src_y_tz.end()); - src1_md = src1_md.reshape(dims1_ex); - } else if (rankdiff < 0) { // First input is of smaller than second - std::vector dims0_ex(-rankdiff, 1); - dims0_ex.insert(next(dims0_ex.begin(), (axis == -1 ? -rankdiff : axis)), - src_x_tz.begin(), src_x_tz.end()); - src0_md = src0_md.reshape(dims0_ex); - } - const auto dst_md = memory::desc(dst_tz, platform::MKLDNNGetDataType(), - MKLDNNMemoryFormat::any); - - auto attributes = CreateAttributes(algo, scale_x, scale_y, scale_z); - this->AcquireForwardPrimitiveDescriptor(attributes, algo, src0_md, - src1_md, dst_md); + float scale_x, float scale_y, float scale_z) + : platform::MKLDNNHandlerNoCachingT(engine, cpu_place) { + PADDLE_ENFORCE_EQ( + x->layout(), DataLayout::kMKLDNN, + platform::errors::InvalidArgument( + "Wrong layout set for X tensor. Expected: %d (kMKLDNN), Actual: %d", + DataLayout::kMKLDNN, x->layout())); + PADDLE_ENFORCE_NE(x->format(), MKLDNNMemoryFormat::undef, + platform::errors::InvalidArgument( + "Wrong format set for X tensor : %d (undef)", + static_cast(x->format()))); + + PADDLE_ENFORCE_EQ( + y->layout(), DataLayout::kMKLDNN, + platform::errors::InvalidArgument( + "Wrong layout set for Y tensor. 
Expected: %d (kMKLDNN), Actual: %d", + DataLayout::kMKLDNN, y->layout())); + PADDLE_ENFORCE_NE(y->format(), MKLDNNMemoryFormat::undef, + platform::errors::InvalidArgument( + "Wrong format set for Y tensor : %d (undef)", + static_cast(y->format()))); + + const auto src_x_tz = framework::vectorize(x->dims()); + const auto src_y_tz = framework::vectorize(y->dims()); + // if output tensor(z) is nullptr then we are computing into oneDNN + // managed buffer + auto rankdiff = x->dims().size() - y->dims().size(); + const auto dst_tz = (z == nullptr) ? (rankdiff > 0 ? src_x_tz : src_y_tz) + : framework::vectorize(z->dims()); + + auto src0_md = dnnl::memory::desc( + src_x_tz, platform::MKLDNNGetDataType(), x->format()); + auto src1_md = dnnl::memory::desc( + src_y_tz, platform::MKLDNNGetDataType(), y->format()); + if (rankdiff > 0) { // Second input is of smaller rank than first + std::vector dims1_ex(rankdiff, 1); + dims1_ex.insert(next(dims1_ex.begin(), (axis == -1 ? rankdiff : axis)), + src_y_tz.begin(), src_y_tz.end()); + src1_md = src1_md.reshape(dims1_ex); + } else if (rankdiff < 0) { // First input is of smaller than second + std::vector dims0_ex(-rankdiff, 1); + dims0_ex.insert(next(dims0_ex.begin(), (axis == -1 ? -rankdiff : axis)), + src_x_tz.begin(), src_x_tz.end()); + src0_md = src0_md.reshape(dims0_ex); } + const auto dst_md = memory::desc(dst_tz, platform::MKLDNNGetDataType(), + MKLDNNMemoryFormat::any); + + auto attributes = CreateAttributes(algo, scale_x, scale_y, scale_z); + this->AcquireForwardPrimitiveDescriptor(attributes, algo, src0_md, src1_md, + dst_md); } std::shared_ptr AcquireSecondSrcMemory( const framework::Tensor* input) { const T* input_data = input->data(); - return this->AcquireMemoryFromPrimitive( - this->fwd_pd_->src1_desc(), to_void_cast(input_data), "@src1_mem_p"); + return this->AcquireMemoryFromPrimitive(this->fwd_pd_->src1_desc(), + to_void_cast(input_data)); } private: @@ -775,111 +980,95 @@ class ReductionMKLDNNHandler template class ActivationMKLDNNHandler - : public MKLDNNHandlerT { + : public MKLDNNHandlerNoCachingT { public: ActivationMKLDNNHandler(mkldnn::algorithm algorithm, const framework::ExecutionContext& ctx, - const MKLDNNDeviceContext& dev_ctx, Place cpu_place, - const framework::Tensor* in_x, - const std::string& unique_name, bool is_inplaced) - : platform::MKLDNNHandlerT( - dev_ctx, dev_ctx.GetEngine(), cpu_place, - is_inplaced ? platform::CreateKey( - dev_ctx, framework::vectorize(in_x->dims()), "a", - algorithm, unique_name) - : platform::CreateKey( - dev_ctx, framework::vectorize(in_x->dims()), "a", - unique_name)) { - if (!this->isCached()) { - float alpha = ctx.HasAttr("alpha") ? ctx.Attr("alpha") : 0; - float beta = ctx.HasAttr("beta") ? ctx.Attr("beta") : 0; - // eltwise_linear means we are in scale op - if (algorithm == mkldnn::algorithm::eltwise_linear) { - bool bias_after_scale = ctx.Attr("bias_after_scale"); - auto* scale_tensor = ctx.Input("ScaleTensor"); - alpha = (scale_tensor == nullptr) ? 
ctx.Attr("scale") - : (float)*(scale_tensor->data()); - beta = ctx.Attr("bias"); - // if bias_after_scale == true - // out = scale*X + bias - // else - // out = scale*(X + bias) = scale*X + scale*bias - if (!bias_after_scale) beta *= alpha; - } else { - // paddle uses beta but mkldnn uses alpha for swish - if (algorithm == mkldnn::algorithm::eltwise_swish) { - std::swap(alpha, beta); - } else if (algorithm == dnnl::algorithm::eltwise_bounded_relu) { - alpha = ctx.Attr("threshold"); - } + const mkldnn::engine engine, Place cpu_place, + const framework::Tensor* in_x) + : platform::MKLDNNHandlerNoCachingT(engine, + cpu_place) { + float alpha = ctx.HasAttr("alpha") ? ctx.Attr("alpha") : 0; + float beta = ctx.HasAttr("beta") ? ctx.Attr("beta") : 0; + // eltwise_linear means we are in scale op + if (algorithm == mkldnn::algorithm::eltwise_linear) { + bool bias_after_scale = ctx.Attr("bias_after_scale"); + auto* scale_tensor = ctx.Input("ScaleTensor"); + alpha = (scale_tensor == nullptr) ? ctx.Attr("scale") + : (float)*(scale_tensor->data()); + beta = ctx.Attr("bias"); + // if bias_after_scale == true + // out = scale*X + bias + // else + // out = scale*(X + bias) = scale*X + scale*bias + if (!bias_after_scale) beta *= alpha; + } else { + // paddle uses beta but mkldnn uses alpha for swish + if (algorithm == mkldnn::algorithm::eltwise_swish) { + std::swap(alpha, beta); + } else if (algorithm == dnnl::algorithm::eltwise_bounded_relu) { + alpha = ctx.Attr("threshold"); } + } - PADDLE_ENFORCE(in_x->dims().size() >= 1 || in_x->dims().size() <= 6, - platform::errors::Unimplemented( - "Input dimension size can be 1, 2, 3, 4, " - "5, or 6, but now the dimension size is", - in_x->dims().size())); + PADDLE_ENFORCE(in_x->dims().size() >= 1 || in_x->dims().size() <= 6, + platform::errors::Unimplemented( + "Input dimension size can be 1, 2, 3, 4, " + "5, or 6, but now the dimension size is", + in_x->dims().size())); - auto src_tz = framework::vectorize(in_x->dims()); - auto src_fmt = - src_tz.size() == 2 ? MKLDNNMemoryFormat::nc : in_x->format(); - auto md = mkldnn::memory::desc(src_tz, platform::MKLDNNGetDataType(), - src_fmt); + auto src_tz = framework::vectorize(in_x->dims()); + auto src_fmt = src_tz.size() == 2 ? MKLDNNMemoryFormat::nc : in_x->format(); + auto md = + mkldnn::memory::desc(src_tz, platform::MKLDNNGetDataType(), src_fmt); - this->AcquireForwardPrimitiveDescriptor( - mkldnn::prop_kind::forward_training, algorithm, md, alpha, beta); - } + this->AcquireForwardPrimitiveDescriptor(mkldnn::prop_kind::forward_training, + algorithm, md, alpha, beta); } ActivationMKLDNNHandler(mkldnn::algorithm algorithm, const framework::ExecutionContext& ctx, - const MKLDNNDeviceContext& dev_ctx, Place cpu_place, - const framework::Tensor* in_x, const Tensor* out_grad, - const std::string& unique_name) - : platform::MKLDNNHandlerT( - dev_ctx, dev_ctx.GetEngine(), cpu_place, - platform::CreateKey(dev_ctx, framework::vectorize(in_x->dims()), - "a", unique_name)) { - if (!this->isBwdCached()) { - float alpha = ctx.HasAttr("alpha") ? ctx.Attr("alpha") : 0; - float beta = ctx.HasAttr("beta") ? 
ctx.Attr("beta") : 0; - - // paddle uses beta but mkldnn uses alpha for swish - if (algorithm == mkldnn::algorithm::eltwise_swish) { - std::swap(alpha, beta); - } else if (algorithm == dnnl::algorithm::eltwise_bounded_relu) { - alpha = ctx.Attr("threshold"); - } + const mkldnn::engine engine, Place cpu_place, + const framework::Tensor* in_x, const Tensor* out_grad) + : platform::MKLDNNHandlerNoCachingT(engine, + cpu_place) { + float alpha = ctx.HasAttr("alpha") ? ctx.Attr("alpha") : 0; + float beta = ctx.HasAttr("beta") ? ctx.Attr("beta") : 0; + + // paddle uses beta but mkldnn uses alpha for swish + if (algorithm == mkldnn::algorithm::eltwise_swish) { + std::swap(alpha, beta); + } else if (algorithm == dnnl::algorithm::eltwise_bounded_relu) { + alpha = ctx.Attr("threshold"); + } - auto diff_dst_tz = framework::vectorize(out_grad->dims()); + auto diff_dst_tz = framework::vectorize(out_grad->dims()); - auto src_fmt = - diff_dst_tz.size() == 2 ? MKLDNNMemoryFormat::nc : in_x->format(); - auto diff_fmt = - diff_dst_tz.size() == 2 ? MKLDNNMemoryFormat::nc : out_grad->format(); + auto src_fmt = + diff_dst_tz.size() == 2 ? MKLDNNMemoryFormat::nc : in_x->format(); + auto diff_fmt = + diff_dst_tz.size() == 2 ? MKLDNNMemoryFormat::nc : out_grad->format(); - auto dims = framework::vectorize(in_x->dims()); - auto diff_dst_md = platform::MKLDNNMemDesc( - dims, platform::MKLDNNGetDataType(), diff_fmt); - auto src_md = platform::MKLDNNMemDesc( - dims, platform::MKLDNNGetDataType(), src_fmt); + auto dims = framework::vectorize(in_x->dims()); + auto diff_dst_md = platform::MKLDNNMemDesc( + dims, platform::MKLDNNGetDataType(), diff_fmt); + auto src_md = platform::MKLDNNMemDesc( + dims, platform::MKLDNNGetDataType(), src_fmt); - this->AcquireForwardPrimitiveDescriptor( - mkldnn::prop_kind::forward_training, algorithm, src_md, alpha, beta); - this->AcquireBackwardPrimitiveDescriptor(algorithm, diff_dst_md, src_md, - alpha, beta); - } + this->AcquireForwardPrimitiveDescriptor(mkldnn::prop_kind::forward_training, + algorithm, src_md, alpha, beta); + this->AcquireBackwardPrimitiveDescriptor(algorithm, diff_dst_md, src_md, + alpha, beta); } std::shared_ptr AcquireBackwardSrcMemory( const framework::Tensor* input) { const T* input_data = input->data(); return this->AcquireMemoryFromPrimitive(this->bwd_pd_->src_desc(), - to_void_cast(input_data), - "@bwd-src_mem_p"); + to_void_cast(input_data)); } }; @@ -1430,11 +1619,6 @@ using ConvMKLDNNHandler = mkldnn::convolution_backward_data, mkldnn::convolution_backward_weights>; -using ConvTransposeMKLDNNHandler = - ConvMKLDNNTemplateHandler; - template static std::shared_ptr SetDstMemory( const framework::ExecutionContext& ctx, framework::Tensor* output, From 1c31d9d3fafaed1357b3e4496a09a688b208c363 Mon Sep 17 00:00:00 2001 From: Peihan Date: Thu, 12 Aug 2021 09:57:48 +0800 Subject: [PATCH 018/126] add det_mv3_db & LeViT test case in pr-ci-inference (#34803) * add det_mv3_db & LeViT test case in pr-ci-inference * fix LeViT model dir bugs * fix grammar error --- .../inference/tests/infer_ut/CMakeLists.txt | 2 +- paddle/fluid/inference/tests/infer_ut/run.sh | 52 +++++ .../inference/tests/infer_ut/test_LeViT.cc | 179 ++++++++++++++++++ .../tests/infer_ut/test_det_mv3_db.cc | 158 ++++++++++++++++ .../inference/tests/infer_ut/test_suite.h | 2 +- 5 files changed, 391 insertions(+), 2 deletions(-) create mode 100644 paddle/fluid/inference/tests/infer_ut/test_LeViT.cc create mode 100644 paddle/fluid/inference/tests/infer_ut/test_det_mv3_db.cc diff --git 
a/paddle/fluid/inference/tests/infer_ut/CMakeLists.txt b/paddle/fluid/inference/tests/infer_ut/CMakeLists.txt index f546ef2b45e0a7..16d1f211a860f9 100644 --- a/paddle/fluid/inference/tests/infer_ut/CMakeLists.txt +++ b/paddle/fluid/inference/tests/infer_ut/CMakeLists.txt @@ -66,7 +66,7 @@ else() if(WITH_MKL) set(FLAG_OPENMP "-fopenmp") endif() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 ${FLAG_OPENMP}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17 ${FLAG_OPENMP}") endif() if(WITH_GPU) diff --git a/paddle/fluid/inference/tests/infer_ut/run.sh b/paddle/fluid/inference/tests/infer_ut/run.sh index ba38a4489035fb..64ada23767f1fa 100755 --- a/paddle/fluid/inference/tests/infer_ut/run.sh +++ b/paddle/fluid/inference/tests/infer_ut/run.sh @@ -65,6 +65,18 @@ for model_name in $download_list; do download $url_prefix $model_name done +ocr_download_list='ocr_det_mv3_db' +for model_name in $ocr_download_list; do + url_prefix="https://paddle-qa.bj.bcebos.com/inference_model/2.1.1/ocr" + download $url_prefix $model_name +done + +clas_download_list='LeViT' +for model_name in $clas_download_list; do + url_prefix="https://paddle-qa.bj.bcebos.com/inference_model/2.1.1/class" + download $url_prefix $model_name +done + # compile and run test cd $current_dir mkdir -p build @@ -92,6 +104,46 @@ if [ $USE_TENSORRT == ON -a $TEST_GPU_CPU == ON ]; then fi fi +# ---------tensorrt det_mv3_db on linux--------- +if [ $USE_TENSORRT == ON -a $TEST_GPU_CPU == ON ]; then + cmake .. -DPADDLE_LIB=${inference_install_dir} \ + -DWITH_MKL=$TURN_ON_MKL \ + -DDEMO_NAME=test_det_mv3_db \ + -DWITH_GPU=$TEST_GPU_CPU \ + -DWITH_STATIC_LIB=OFF \ + -DUSE_TENSORRT=$USE_TENSORRT \ + -DTENSORRT_ROOT=$TENSORRT_ROOT_DIR \ + -DWITH_GTEST=ON + make -j$(nproc) + ./test_det_mv3_db \ + --modeldir=$DATA_DIR/ocr_det_mv3_db/ocr_det_mv3_db \ + --gtest_output=xml:test_det_mv3_db.xml + if [ $? -ne 0 ]; then + echo "test_det_mv3_db runs failed" >> ${current_dir}/build/test_summary.txt + EXIT_CODE=1 + fi +fi + +# ---------tensorrt LeViT on linux--------- +if [ $USE_TENSORRT == ON -a $TEST_GPU_CPU == ON ]; then + cmake .. -DPADDLE_LIB=${inference_install_dir} \ + -DWITH_MKL=$TURN_ON_MKL \ + -DDEMO_NAME=test_LeViT \ + -DWITH_GPU=$TEST_GPU_CPU \ + -DWITH_STATIC_LIB=OFF \ + -DUSE_TENSORRT=$USE_TENSORRT \ + -DTENSORRT_ROOT=$TENSORRT_ROOT_DIR \ + -DWITH_GTEST=ON + make -j$(nproc) + ./test_LeViT \ + --modeldir=$DATA_DIR/LeViT/LeViT \ + --gtest_output=xml:test_LeViT.xml + if [ $? -ne 0 ]; then + echo "test_LeViT runs failed" >> ${current_dir}/build/test_summary.txt + EXIT_CODE=1 + fi +fi + if [[ -f ${current_dir}/build/test_summary.txt ]];then echo "=====================test summary======================" cat ${current_dir}/build/test_summary.txt diff --git a/paddle/fluid/inference/tests/infer_ut/test_LeViT.cc b/paddle/fluid/inference/tests/infer_ut/test_LeViT.cc new file mode 100644 index 00000000000000..f115d1f898c94a --- /dev/null +++ b/paddle/fluid/inference/tests/infer_ut/test_LeViT.cc @@ -0,0 +1,179 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "test_suite.h" // NOLINT + +DEFINE_string(modeldir, "", "Directory of the inference model."); + +namespace paddle_infer { + +paddle::test::Record PrepareInput(int batch_size) { + // init input data + int channel = 3; + int width = 224; + int height = 224; + paddle::test::Record image_Record; + int input_num = batch_size * channel * width * height; + std::vector input_data(input_num, 1); + image_Record.data = input_data; + image_Record.shape = std::vector{batch_size, channel, width, height}; + image_Record.type = paddle::PaddleDType::FLOAT32; + return image_Record; +} + +TEST(test_LeViT, analysis_gpu_bz1) { + // init input data + std::map my_input_data_map; + my_input_data_map["x"] = PrepareInput(1); + // init output data + std::map infer_output_data, + truth_output_data; + // prepare groudtruth config + paddle_infer::Config config, config_no_ir; + config_no_ir.SetModel(FLAGS_modeldir + "/inference.pdmodel", + FLAGS_modeldir + "/inference.pdiparams"); + config_no_ir.SwitchIrOptim(false); + // prepare inference config + config.SetModel(FLAGS_modeldir + "/inference.pdmodel", + FLAGS_modeldir + "/inference.pdiparams"); + // get groudtruth by disbale ir + paddle_infer::services::PredictorPool pred_pool_no_ir(config_no_ir, 1); + SingleThreadPrediction(pred_pool_no_ir.Retrive(0), &my_input_data_map, + &truth_output_data, 1); + // get infer results + paddle_infer::services::PredictorPool pred_pool(config, 1); + SingleThreadPrediction(pred_pool.Retrive(0), &my_input_data_map, + &infer_output_data); + // check outputs + CompareRecord(&truth_output_data, &infer_output_data); + std::cout << "finish test" << std::endl; +} + +TEST(test_LeViT, trt_fp32_bz2) { + // init input data + std::map my_input_data_map; + my_input_data_map["x"] = PrepareInput(2); + // init output data + std::map infer_output_data, + truth_output_data; + // prepare groudtruth config + paddle_infer::Config config, config_no_ir; + config_no_ir.SetModel(FLAGS_modeldir + "/inference.pdmodel", + FLAGS_modeldir + "/inference.pdiparams"); + config_no_ir.SwitchIrOptim(false); + // prepare inference config + config.SetModel(FLAGS_modeldir + "/inference.pdmodel", + FLAGS_modeldir + "/inference.pdiparams"); + config.EnableUseGpu(100, 0); + config.EnableTensorRtEngine( + 1 << 20, 2, 6, paddle_infer::PrecisionType::kFloat32, false, false); + // get groudtruth by disbale ir + paddle_infer::services::PredictorPool pred_pool_no_ir(config_no_ir, 1); + SingleThreadPrediction(pred_pool_no_ir.Retrive(0), &my_input_data_map, + &truth_output_data, 1); + // get infer results + paddle_infer::services::PredictorPool pred_pool(config, 1); + SingleThreadPrediction(pred_pool.Retrive(0), &my_input_data_map, + &infer_output_data); + // check outputs + CompareRecord(&truth_output_data, &infer_output_data); + std::cout << "finish test" << std::endl; +} + +TEST(test_LeViT, serial_diff_batch_trt_fp32) { + int max_batch_size = 5; + // prepare groudtruth config + paddle_infer::Config config, config_no_ir; + config_no_ir.SetModel(FLAGS_modeldir + "/inference.pdmodel", + FLAGS_modeldir + "/inference.pdiparams"); + config_no_ir.SwitchIrOptim(false); + paddle_infer::services::PredictorPool pred_pool_no_ir(config_no_ir, 1); + // prepare inference config + config.SetModel(FLAGS_modeldir + "/inference.pdmodel", + FLAGS_modeldir + "/inference.pdiparams"); + config.EnableUseGpu(100, 0); + config.EnableTensorRtEngine(1 << 20, max_batch_size, 6, + 
paddle_infer::PrecisionType::kFloat32, false, + false); + paddle_infer::services::PredictorPool pred_pool(config, 1); + + for (int i = 1; i < max_batch_size; i++) { + // init input data + std::map my_input_data_map; + my_input_data_map["x"] = PrepareInput(i); + // init output data + std::map infer_output_data, + truth_output_data; + // get groudtruth by disbale ir + SingleThreadPrediction(pred_pool_no_ir.Retrive(0), &my_input_data_map, + &truth_output_data, 1); + // get infer results + SingleThreadPrediction(pred_pool.Retrive(0), &my_input_data_map, + &infer_output_data); + // check outputs + CompareRecord(&truth_output_data, &infer_output_data); + } + std::cout << "finish test" << std::endl; +} + +TEST(test_LeViT, multi_thread4_trt_fp32_bz2) { + int thread_num = 4; + // init input data + std::map my_input_data_map; + my_input_data_map["x"] = PrepareInput(2); + // init output data + std::map infer_output_data, + truth_output_data; + // prepare groudtruth config + paddle_infer::Config config, config_no_ir; + config_no_ir.SetModel(FLAGS_modeldir + "/inference.pdmodel", + FLAGS_modeldir + "/inference.pdiparams"); + config_no_ir.SwitchIrOptim(false); + // prepare inference config + config.SetModel(FLAGS_modeldir + "/inference.pdmodel", + FLAGS_modeldir + "/inference.pdiparams"); + config.EnableUseGpu(100, 0); + config.EnableTensorRtEngine( + 1 << 20, 2, 6, paddle_infer::PrecisionType::kFloat32, false, false); + // get groudtruth by disbale ir + paddle_infer::services::PredictorPool pred_pool_no_ir(config_no_ir, 1); + SingleThreadPrediction(pred_pool_no_ir.Retrive(0), &my_input_data_map, + &truth_output_data, 1); + + // get infer results from multi threads + std::vector threads; + services::PredictorPool pred_pool(config, thread_num); + for (int i = 0; i < thread_num; ++i) { + threads.emplace_back(paddle::test::SingleThreadPrediction, + pred_pool.Retrive(i), &my_input_data_map, + &infer_output_data, 2); + } + + // thread join & check outputs + for (int i = 0; i < thread_num; ++i) { + LOG(INFO) << "join tid : " << i; + threads[i].join(); + CompareRecord(&truth_output_data, &infer_output_data); + } + + std::cout << "finish multi-thread test" << std::endl; +} + +} // namespace paddle_infer + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + ::google::ParseCommandLineFlags(&argc, &argv, true); + return RUN_ALL_TESTS(); +} diff --git a/paddle/fluid/inference/tests/infer_ut/test_det_mv3_db.cc b/paddle/fluid/inference/tests/infer_ut/test_det_mv3_db.cc new file mode 100644 index 00000000000000..c5920d3b2d8d55 --- /dev/null +++ b/paddle/fluid/inference/tests/infer_ut/test_det_mv3_db.cc @@ -0,0 +1,158 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
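+//
+// Test plan (summary of the cases in this file): each case first runs the
+// det_mv3_db model with IR optimization switched off to collect groundtruth
+// outputs, then runs it again under the configuration being tested (the
+// default analysis config, or TensorRT FP32 with the dynamic shape ranges
+// set in PrepareDynamicShape) and compares both output Record maps with
+// CompareRecord.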
+ +#include "test_suite.h" // NOLINT + +DEFINE_string(modeldir, "", "Directory of the inference model."); + +namespace paddle_infer { + +paddle::test::Record PrepareInput(int batch_size, int image_shape = 640) { + // init input data + int channel = 3; + int width = image_shape; + int height = image_shape; + paddle::test::Record image_Record; + int input_num = batch_size * channel * width * height; + std::vector input_data(input_num, 1); + image_Record.data = input_data; + image_Record.shape = std::vector{batch_size, channel, width, height}; + image_Record.type = paddle::PaddleDType::FLOAT32; + return image_Record; +} + +void PrepareDynamicShape(paddle_infer::Config* config, int max_batch_size = 4) { + // set dynamic shape range + std::map> min_input_shape = { + {"x", {1, 3, 50, 50}}, + {"conv2d_92.tmp_0", {1, 120, 20, 20}}, + {"conv2d_91.tmp_0", {1, 24, 10, 10}}, + {"conv2d_59.tmp_0", {1, 96, 20, 20}}, + {"nearest_interp_v2_1.tmp_0", {1, 256, 10, 10}}, + {"nearest_interp_v2_2.tmp_0", {1, 256, 20, 20}}, + {"conv2d_124.tmp_0", {1, 256, 20, 20}}, + {"nearest_interp_v2_3.tmp_0", {1, 64, 20, 20}}, + {"nearest_interp_v2_4.tmp_0", {1, 64, 20, 20}}, + {"nearest_interp_v2_5.tmp_0", {1, 64, 20, 20}}, + {"elementwise_add_7", {1, 56, 2, 2}}, + {"nearest_interp_v2_0.tmp_0", {1, 256, 2, 2}}}; + std::map> max_input_shape = { + {"x", {max_batch_size, 3, 2000, 2000}}, + {"conv2d_92.tmp_0", {max_batch_size, 120, 400, 400}}, + {"conv2d_91.tmp_0", {max_batch_size, 24, 200, 200}}, + {"conv2d_59.tmp_0", {max_batch_size, 96, 400, 400}}, + {"nearest_interp_v2_1.tmp_0", {max_batch_size, 256, 200, 200}}, + {"nearest_interp_v2_2.tmp_0", {max_batch_size, 256, 400, 400}}, + {"conv2d_124.tmp_0", {max_batch_size, 256, 400, 400}}, + {"nearest_interp_v2_3.tmp_0", {max_batch_size, 64, 400, 400}}, + {"nearest_interp_v2_4.tmp_0", {max_batch_size, 64, 400, 400}}, + {"nearest_interp_v2_5.tmp_0", {max_batch_size, 64, 400, 400}}, + {"elementwise_add_7", {max_batch_size, 56, 400, 400}}, + {"nearest_interp_v2_0.tmp_0", {max_batch_size, 256, 400, 400}}}; + std::map> opt_input_shape = { + {"x", {1, 3, 640, 640}}, + {"conv2d_92.tmp_0", {1, 120, 160, 160}}, + {"conv2d_91.tmp_0", {1, 24, 80, 80}}, + {"conv2d_59.tmp_0", {1, 96, 160, 160}}, + {"nearest_interp_v2_1.tmp_0", {1, 256, 80, 80}}, + {"nearest_interp_v2_2.tmp_0", {1, 256, 160, 160}}, + {"conv2d_124.tmp_0", {1, 256, 160, 160}}, + {"nearest_interp_v2_3.tmp_0", {1, 64, 160, 160}}, + {"nearest_interp_v2_4.tmp_0", {1, 64, 160, 160}}, + {"nearest_interp_v2_5.tmp_0", {1, 64, 160, 160}}, + {"elementwise_add_7", {1, 56, 40, 40}}, + {"nearest_interp_v2_0.tmp_0", {1, 256, 40, 40}}}; + config->SetTRTDynamicShapeInfo(min_input_shape, max_input_shape, + opt_input_shape); +} + +TEST(test_det_mv3_db, analysis_gpu_bz4) { + // init input data + std::map my_input_data_map; + my_input_data_map["x"] = PrepareInput(4, 640); + // init output data + std::map infer_output_data, + truth_output_data; + // prepare groudtruth config + paddle_infer::Config config, config_no_ir; + config_no_ir.SetModel(FLAGS_modeldir + "/inference.pdmodel", + FLAGS_modeldir + "/inference.pdiparams"); + config_no_ir.SwitchIrOptim(false); + // prepare inference config + config.SetModel(FLAGS_modeldir + "/inference.pdmodel", + FLAGS_modeldir + "/inference.pdiparams"); + // get groudtruth by disbale ir + paddle_infer::services::PredictorPool pred_pool_no_ir(config_no_ir, 1); + SingleThreadPrediction(pred_pool_no_ir.Retrive(0), &my_input_data_map, + &truth_output_data, 1); + // get infer results + 
paddle_infer::services::PredictorPool pred_pool(config, 1); + SingleThreadPrediction(pred_pool.Retrive(0), &my_input_data_map, + &infer_output_data); + // check outputs + CompareRecord(&truth_output_data, &infer_output_data, 1e-4); + std::cout << "finish test" << std::endl; +} + +TEST(test_det_mv3_db, multi_thread2_trt_fp32_dynamic_shape_bz2) { + int thread_num = 2; // thread > 2 may OOM + // init input data + std::map my_input_data_map; + my_input_data_map["x"] = PrepareInput(2, 640); + // init output data + std::map infer_output_data, + truth_output_data; + // prepare groudtruth config + paddle_infer::Config config, config_no_ir; + config_no_ir.SetModel(FLAGS_modeldir + "/inference.pdmodel", + FLAGS_modeldir + "/inference.pdiparams"); + config_no_ir.SwitchIrOptim(false); + // prepare inference config + config.SetModel(FLAGS_modeldir + "/inference.pdmodel", + FLAGS_modeldir + "/inference.pdiparams"); + config.EnableUseGpu(100, 0); + config.EnableTensorRtEngine( + 1 << 20, 2, 3, paddle_infer::PrecisionType::kFloat32, false, false); + PrepareDynamicShape(&config, 4); + // get groudtruth by disbale ir + paddle_infer::services::PredictorPool pred_pool_no_ir(config_no_ir, 1); + SingleThreadPrediction(pred_pool_no_ir.Retrive(0), &my_input_data_map, + &truth_output_data, 1); + + // get infer results from multi threads + std::vector threads; + services::PredictorPool pred_pool(config, thread_num); + for (int i = 0; i < thread_num; ++i) { + threads.emplace_back(paddle::test::SingleThreadPrediction, + pred_pool.Retrive(i), &my_input_data_map, + &infer_output_data, 2); + } + + // thread join & check outputs + for (int i = 0; i < thread_num; ++i) { + LOG(INFO) << "join tid : " << i; + threads[i].join(); + CompareRecord(&truth_output_data, &infer_output_data, 1e-4); + } + + std::cout << "finish multi-thread test" << std::endl; +} + +} // namespace paddle_infer + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + ::google::ParseCommandLineFlags(&argc, &argv, true); + return RUN_ALL_TESTS(); +} diff --git a/paddle/fluid/inference/tests/infer_ut/test_suite.h b/paddle/fluid/inference/tests/infer_ut/test_suite.h index 0e116b01847bfb..b0da828998ca24 100644 --- a/paddle/fluid/inference/tests/infer_ut/test_suite.h +++ b/paddle/fluid/inference/tests/infer_ut/test_suite.h @@ -51,7 +51,7 @@ void SingleThreadPrediction(paddle_infer::Predictor *predictor, // inference for (size_t i = 0; i < repeat_times; ++i) { - predictor->Run(); + CHECK(predictor->Run()); } // get output data to Record From bc543e35d659086083b1acb00c60c148e88cc502 Mon Sep 17 00:00:00 2001 From: Fan Zhang Date: Thu, 12 Aug 2021 10:39:06 +0800 Subject: [PATCH 019/126] [NPU] Support npu op expand_v2 and expand_v2_grad (#34764) * [NPU] Support npu op expand_v2 and expand_v2_grad * [NPU] Support npu op expand_v2 and expand_v2_grad * [NPU] Support npu op expand_v2 and expand_v2_grad * update test_expand_v2_op_npu.py * update test_expand_v2_op_npu.py * modify expand_v2_op_npu.cc * modify expand_v2_op_npu.cc --- paddle/fluid/operators/expand_v2_op.h | 16 +- paddle/fluid/operators/expand_v2_op_npu.cc | 191 ++++++++++++ .../unittests/npu/test_expand_v2_op_npu.py | 277 ++++++++++++++++++ 3 files changed, 483 insertions(+), 1 deletion(-) create mode 100644 paddle/fluid/operators/expand_v2_op_npu.cc create mode 100755 python/paddle/fluid/tests/unittests/npu/test_expand_v2_op_npu.py diff --git a/paddle/fluid/operators/expand_v2_op.h b/paddle/fluid/operators/expand_v2_op.h index a720bd7b551823..08131b71064287 100644 --- 
a/paddle/fluid/operators/expand_v2_op.h +++ b/paddle/fluid/operators/expand_v2_op.h @@ -36,6 +36,12 @@ inline std::vector get_expand_shape( TensorCopySync(*shape_tensor, platform::CPUPlace(), &cpu_shape_tensor); shape_data = cpu_shape_tensor.data(); } +#ifdef PADDLE_WITH_ASCEND_CL + if (platform::is_npu_place(shape_tensor->place())) { + TensorCopySync(*shape_tensor, platform::CPUPlace(), &cpu_shape_tensor); + shape_data = cpu_shape_tensor.data(); + } +#endif auto vec_shape = std::vector(shape_data, shape_data + shape_tensor->numel()); return vec_shape; @@ -52,7 +58,15 @@ inline std::vector get_expand_shape( framework::Tensor temp; TensorCopySync(*tensor, platform::CPUPlace(), &temp); vec_epxand_shape.push_back(*temp.data()); - } else { + } +#ifdef PADDLE_WITH_ASCEND_CL + else if (platform::is_npu_place(tensor->place())) { // NOLINT + framework::Tensor temp; + TensorCopySync(*tensor, platform::CPUPlace(), &temp); + vec_epxand_shape.push_back(*temp.data()); + } +#endif + else { // NOLINT vec_epxand_shape.push_back(*tensor->data()); } } diff --git a/paddle/fluid/operators/expand_v2_op_npu.cc b/paddle/fluid/operators/expand_v2_op_npu.cc new file mode 100644 index 00000000000000..85fe86a9e606f3 --- /dev/null +++ b/paddle/fluid/operators/expand_v2_op_npu.cc @@ -0,0 +1,191 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the Licnse. 
*/ + +#include "paddle/fluid/operators/expand_v2_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +template +class ExpandV2NPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* X = ctx.Input("X"); + auto* Out = ctx.Output("Out"); + + auto in_dims = X->dims(); + auto expand_shape = get_expand_shape(ctx); + auto vec_in_dims = framework::vectorize(in_dims); + auto diff = expand_shape.size() - vec_in_dims.size(); + vec_in_dims.insert(vec_in_dims.begin(), diff, 1); + std::vector final_expand_shape(vec_in_dims.size()); + for (size_t i = 0; i < vec_in_dims.size(); ++i) { + PADDLE_ENFORCE_NE(expand_shape[i], 0, + platform::errors::InvalidArgument( + "The expanded size cannot be zero.")); + if (i < diff) { // expand_shape = [3,4,-1,-1], X = [10,2] --> + // final_expand_shape = [3,4,10,2] + PADDLE_ENFORCE_GT( + expand_shape[i], 0, + platform::errors::InvalidArgument( + "The expanded size (%d) for non-existing dimensions must be " + "positive for expand_v2 op.", + expand_shape[i])); + final_expand_shape[i] = expand_shape[i]; + } else if (expand_shape[i] > 0) { // expand_shape = [3,4,10,4], X = + // [10,1] --> final_expand_shape = + // [3,4,10,4] + if (vec_in_dims[i] != 1) { + PADDLE_ENFORCE_EQ( + vec_in_dims[i], expand_shape[i], + platform::errors::InvalidArgument( + "The value (%d) of the non-singleton dimension does not match" + " the corresponding value (%d) in shape for expand_v2 op.", + vec_in_dims[i], expand_shape[i])); + final_expand_shape[i] = expand_shape[i]; + } else { + final_expand_shape[i] = expand_shape[i]; + } + } else { // expand_shape = [3,4,-1,-1], X = [10,2] --> final_expand_shape + // = [3,4,10,2] + PADDLE_ENFORCE_EQ( + expand_shape[i], -1, + platform::errors::InvalidArgument( + "When the value in shape is negative for expand_v2 op, " + "only -1 is supported, but the value received is %d.", + expand_shape[i])); + final_expand_shape[i] = vec_in_dims[i]; + } + } + + framework::NPUAttributeMap attr_input = {{"shape", final_expand_shape}}; + + auto rank = X->dims().size(); + + PADDLE_ENFORCE_GE( + rank, 1, + platform::errors::InvalidArgument( + "The rank of the input 'X' for expand_v2_npu op must be positive, " + "but the value received is %d.", + rank)); + PADDLE_ENFORCE_LE( + rank, MAX_RANK_SUPPORTED, + platform::errors::InvalidArgument( + "The rank of the input 'X' for expand_v2_npu op must be less than " + "or equal to %d, but the value received is %d.", + MAX_RANK_SUPPORTED, rank)); + auto shape_size = final_expand_shape.size(); + PADDLE_ENFORCE_GE( + shape_size, rank, + platform::errors::InvalidArgument( + "The number (%d) of elements of 'shape' for expand_v2_npu op must " + "be " + "greater than or equal to the rank (%d) of the input 'X'.", + shape_size, rank)); + PADDLE_ENFORCE_LE(shape_size, MAX_RANK_SUPPORTED, + platform::errors::InvalidArgument( + "The number (%d) of elements of 'shape' for " + "expand_v2_npu op must be " + "less than or equal to %d.", + shape_size, MAX_RANK_SUPPORTED)); + + framework::DDim out_dims = framework::make_ddim(final_expand_shape); + Out->Resize(out_dims); + Out->mutable_data(ctx.GetPlace()); + + const auto& runner = NpuOpRunner("ExpandD", {*X}, {*Out}, attr_input); + auto stream = + ctx.template device_context() + .stream(); + runner.Run(stream); + } +}; + +template +class ExpandV2NPUGradKernel : public framework::OpKernel { + public: + void Compute(const 
framework::ExecutionContext& ctx) const override { + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + dx->mutable_data(ctx.GetPlace()); + + auto stream = + ctx.template device_context() + .stream(); + + // case 1: reduce dout dims to dx dims + // For example: [2, 120] --> [120] + auto reduce_ndim = dout->dims().size() - dx->dims().size(); + std::vector axes; + for (auto i = 0; i < reduce_ndim; ++i) { + axes.push_back(i); + } + + Tensor tmp_dout(dout->type()); + Tensor reduced_dout(dx->type()); + tmp_dout.ShareDataWith(*dout); + if (axes.size() != 0) { + std::vector reduced_dout_dims; + for (auto i = reduce_ndim; i < dout->dims().size(); ++i) { + reduced_dout_dims.push_back(dout->dims()[i]); + } + tmp_dout.Resize(framework::make_ddim(reduced_dout_dims)); + reduced_dout.Resize(framework::make_ddim(reduced_dout_dims)); + reduced_dout.mutable_data(ctx.GetPlace()); + const auto& runner = NpuOpRunner("ReduceSumD", {*dout}, {reduced_dout}, + {{"axes", axes}, {"keep_dims", false}}); + runner.Run(stream); + tmp_dout = reduced_dout; + } + + // case 2: reduce axis of dout in which dim is 1 + // For example: [12, 140] --> [1, 140] + + // case 3: copy dout to dx when shape is totally same, and dim in dx != 1 + // For example: [2, 10, 5] --> [2, 10, 5] + axes.clear(); + for (auto i = 0; i < dx->dims().size(); ++i) { + if (dx->dims()[i] == 1) { + axes.push_back(i); + } + } + if (axes.size() != 0) { + const auto& runner = NpuOpRunner("ReduceSumD", {tmp_dout}, {*dx}, + {{"axes", axes}, {"keep_dims", true}}); + runner.Run(stream); + } else { + framework::TensorCopySync(tmp_dout, ctx.GetPlace(), dx); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_NPU_KERNEL( + expand_v2, + ops::ExpandV2NPUKernel, + ops::ExpandV2NPUKernel, + ops::ExpandV2NPUKernel); + +REGISTER_OP_NPU_KERNEL( + expand_v2_grad, + ops::ExpandV2NPUGradKernel, + ops::ExpandV2NPUGradKernel, + ops::ExpandV2NPUGradKernel); diff --git a/python/paddle/fluid/tests/unittests/npu/test_expand_v2_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_expand_v2_op_npu.py new file mode 100755 index 00000000000000..d48d2a8430134a --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_expand_v2_op_npu.py @@ -0,0 +1,277 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
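+#
+# Coverage sketch: the cases below exercise the NPU expand_v2 kernel with the
+# target shape passed as an attribute list, as a list of one-element shape
+# tensors, and as a single Shape tensor. The float16 and int32 cases only run
+# the forward check (they set no_need_check_grad), and TestExpandV2API checks
+# the static-graph paddle.expand API on NPUPlace(0).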
+ +from __future__ import print_function +import unittest +import sys +import numpy as np +sys.path.append("..") +from op_test import OpTest +import paddle.fluid as fluid +from paddle.fluid import Program, program_guard +import paddle + +paddle.enable_static() +np.random.seed(10) + + +# CANN Op Support X: float16, float32, int32, int8 ,uint8 +# Situation 1: shape is a list(without tensor) +class TestExpandV2NPUOpRank1(OpTest): + def setUp(self): + self.set_npu() + self.place = paddle.NPUPlace(0) + self.op_type = "expand_v2" + self.dtype = np.float32 + self.init_data() + + self.inputs = {'X': np.random.random(self.ori_shape).astype(self.dtype)} + self.attrs = {'shape': self.shape} + output = np.tile(self.inputs['X'], self.expand_times) + self.outputs = {'Out': output} + + def set_npu(self): + self.__class__.use_npu = True + + def init_data(self): + self.ori_shape = [100] + self.shape = [100] + self.expand_times = [1] + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ['X'], 'Out') + + +class TestExpandV2OpRank2_DimExpanding(TestExpandV2NPUOpRank1): + def init_data(self): + self.ori_shape = [120] + self.shape = [2, 120] + self.expand_times = [2, 1] + + +class TestExpandV2OpRank2(TestExpandV2NPUOpRank1): + def init_data(self): + self.ori_shape = [1, 140] + self.shape = [12, 140] + self.expand_times = [12, 1] + + +class TestExpandV2OpRank3_Corner(TestExpandV2NPUOpRank1): + def init_data(self): + self.ori_shape = (2, 10, 5) + self.shape = (2, 10, 5) + self.expand_times = (1, 1, 1) + + +class TestExpandV2OpRank4(TestExpandV2NPUOpRank1): + def init_data(self): + self.ori_shape = (2, 4, 5, 7) + self.shape = (-1, -1, -1, -1) + self.expand_times = (1, 1, 1, 1) + + +class TestExpandV2OpRank5(TestExpandV2NPUOpRank1): + def init_data(self): + self.ori_shape = (2, 4, 1, 15) + self.shape = (2, -1, 4, -1) + self.expand_times = (1, 1, 4, 1) + + +class TestExpandV2OpRank6(TestExpandV2NPUOpRank1): + def init_data(self): + self.ori_shape = (4, 1, 30) + self.shape = (2, -1, 4, 30) + self.expand_times = (2, 1, 4, 1) + + +# Situation 2: shape is a list(with tensor) +class TestExpandV2OpNPURank1_tensor_attr(OpTest): + def setUp(self): + self.set_npu() + self.place = paddle.NPUPlace(0) + self.op_type = "expand_v2" + self.init_data() + self.dtype = np.float32 + expand_shapes_tensor = [] + for index, ele in enumerate(self.expand_shape): + expand_shapes_tensor.append(("x" + str(index), np.ones( + (1)).astype('int32') * ele)) + + self.inputs = { + 'X': np.random.random(self.ori_shape).astype(self.dtype), + 'expand_shapes_tensor': expand_shapes_tensor, + } + self.attrs = {"shape": self.infer_expand_shape} + output = np.tile(self.inputs['X'], self.expand_times) + self.outputs = {'Out': output} + + def set_npu(self): + self.__class__.use_npu = True + + def init_data(self): + self.ori_shape = [100] + self.expand_times = [1] + self.expand_shape = [100] + self.infer_expand_shape = [-1] + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ['X'], 'Out') + + +class TestExpandV2OpRank2_Corner_tensor_attr( + TestExpandV2OpNPURank1_tensor_attr): + def init_data(self): + self.ori_shape = [12, 14] + self.expand_times = [1, 1] + self.expand_shape = [12, 14] + self.infer_expand_shape = [12, -1] + + +# Situation 3: shape is a tensor +class TestExpandV2NPUOpRank1_tensor(OpTest): + def setUp(self): + self.set_npu() + self.place = paddle.NPUPlace(0) 
+ self.op_type = "expand_v2" + self.init_data() + self.dtype = np.float32 + + self.inputs = { + 'X': np.random.random(self.ori_shape).astype(self.dtype), + 'Shape': np.array(self.expand_shape).astype("int32"), + } + self.attrs = {} + output = np.tile(self.inputs['X'], self.expand_times) + self.outputs = {'Out': output} + + def set_npu(self): + self.__class__.use_npu = True + + def init_data(self): + self.ori_shape = [100] + self.expand_times = [2, 1] + self.expand_shape = [2, 100] + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ['X'], 'Out') + + +# Situation 4: input x is float16 +# skip grad check for float16 +class TestExpandV2OpFloat(OpTest): + def setUp(self): + self.set_npu() + self.place = paddle.NPUPlace(0) + self.op_type = "expand_v2" + self.dtype = np.float16 + self.ori_shape = (2, 4, 20) + self.inputs = {'X': np.random.random(self.ori_shape).astype(self.dtype)} + self.attrs = {'shape': [2, 4, 20]} + output = np.tile(self.inputs['X'], (1, 1, 1)) + self.outputs = {'Out': output} + + def set_npu(self): + self.__class__.use_npu = True + self.__class__.no_need_check_grad = True + + def test_check_output(self): + self.check_output_with_place(self.place) + + +# Situation 5: input x is int32 +# skip grad check for int32 +class TestExpandV2OpInteger(OpTest): + def setUp(self): + self.set_npu() + self.place = paddle.NPUPlace(0) + self.op_type = "expand_v2" + self.inputs = { + 'X': np.random.randint( + 10, size=(2, 4, 20)).astype("int32") + } + self.attrs = {'shape': [2, 4, 20]} + output = np.tile(self.inputs['X'], (1, 1, 1)) + self.outputs = {'Out': output} + + def set_npu(self): + self.__class__.use_npu = True + self.__class__.no_need_check_grad = True + + def test_check_output(self): + self.check_output_with_place(self.place) + + +class TestExpandV2Error(unittest.TestCase): + def test_errors(self): + with program_guard(Program(), Program()): + x1 = fluid.create_lod_tensor( + np.array([[-1]]), [[1]], paddle.NPUPlace(0)) + shape = [2, 2] + self.assertRaises(TypeError, paddle.tensor.expand, x1, shape) + x2 = fluid.layers.data(name='x2', shape=[2], dtype="uint8") + self.assertRaises(TypeError, paddle.tensor.expand, x2, shape) + x3 = fluid.layers.data(name='x3', shape=[2], dtype="bool") + x3.stop_gradient = False + self.assertRaises(ValueError, paddle.tensor.expand, x3, shape) + + +# Test python API +class TestExpandV2API(unittest.TestCase): + def test_static(self): + with fluid.program_guard(fluid.Program(), fluid.Program()): + input = np.random.random([12, 14]).astype("float32") + x = fluid.layers.data( + name='x', + shape=[12, 14], + append_batch_size=False, + dtype="float32") + + positive_2 = fluid.layers.fill_constant([1], "int32", 12) + expand_shape = fluid.layers.data( + name="expand_shape", + shape=[2], + append_batch_size=False, + dtype="int32") + + out_1 = paddle.expand(x, shape=[12, 14]) + out_2 = paddle.expand(x, shape=[positive_2, 14]) + out_3 = paddle.expand(x, shape=expand_shape) + + g0 = fluid.backward.calc_gradient(out_2, x) + + exe = fluid.Executor(place=paddle.NPUPlace(0)) + res_1, res_2, res_3 = exe.run(fluid.default_main_program(), + feed={ + "x": input, + "expand_shape": + np.array([12, 14]).astype("int32") + }, + fetch_list=[out_1, out_2, out_3]) + + assert np.array_equal(res_1, np.tile(input, (1, 1))) + assert np.array_equal(res_2, np.tile(input, (1, 1))) + assert np.array_equal(res_3, np.tile(input, (1, 1))) + + +if __name__ == "__main__": + unittest.main() From 
cfa691337a866b2a77321253bf859a2f2cf2e6bf Mon Sep 17 00:00:00 2001 From: wuhuachaocoding <77733235+wuhuachaocoding@users.noreply.github.com> Date: Thu, 12 Aug 2021 10:52:31 +0800 Subject: [PATCH 020/126] [NPU] Support npu kernel for smooth_l1_loss op (#34674) --- .../fluid/operators/smooth_l1_loss_op_npu.cc | 203 ++++++++++++++++++ .../npu/test_smooth_l1_loss_op_npu.py | 147 +++++++++++++ 2 files changed, 350 insertions(+) create mode 100644 paddle/fluid/operators/smooth_l1_loss_op_npu.cc create mode 100644 python/paddle/fluid/tests/unittests/npu/test_smooth_l1_loss_op_npu.py diff --git a/paddle/fluid/operators/smooth_l1_loss_op_npu.cc b/paddle/fluid/operators/smooth_l1_loss_op_npu.cc new file mode 100644 index 00000000000000..b5a04ce2cabd38 --- /dev/null +++ b/paddle/fluid/operators/smooth_l1_loss_op_npu.cc @@ -0,0 +1,203 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/smooth_l1_loss_op.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +template +class SmoothL1LossNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* in_x = context.Input("X"); + auto* in_y = context.Input("Y"); + auto* inside_weight = context.Input("InsideWeight"); + auto* outside_weight = context.Input("OutsideWeight"); + auto* out_diff = context.Output("Diff"); + auto* out_loss = context.Output("Out"); + out_diff->mutable_data(context.GetPlace()); + out_loss->mutable_data(context.GetPlace()); + + auto sigma = context.Attr("sigma"); + T sigma2 = 1.0 / (sigma * sigma); + bool has_weight = (inside_weight != nullptr) && (outside_weight != nullptr); + // out_diff = in_x - in_y + auto stream = + context.template device_context() + .stream(); + const auto& runner1 = NpuOpRunner("Sub", {*in_x, *in_y}, {*out_diff}, {}); + runner1.Run(stream); + + Tensor no_reduce_loss(in_x->type()); + no_reduce_loss.Resize(in_x->dims()); + no_reduce_loss.mutable_data(context.GetPlace()); + // multiply inside weight before get the loss + if (has_weight) { + Tensor tmp_diff(out_diff->type()); + tmp_diff.Resize(out_diff->dims()); + tmp_diff.mutable_data(context.GetPlace()); + const auto& runner2 = + NpuOpRunner("Mul", {*out_diff, *inside_weight}, {tmp_diff}, {}); + runner2.Run(stream); + framework::TensorCopy( + tmp_diff, context.GetPlace(), + context.template device_context(), + out_diff); + + Tensor tmp_x(in_x->type()); + tmp_x.Resize(in_x->dims()); + tmp_x.mutable_data(context.GetPlace()); + + Tensor tmp_y(in_y->type()); + tmp_y.Resize(in_y->dims()); + tmp_y.mutable_data(context.GetPlace()); + + // mul input and inside_weight + const auto& runner_x = + NpuOpRunner("Mul", {*in_x, *inside_weight}, {tmp_x}, {}); + runner_x.Run(stream); + const auto& runner_y = + NpuOpRunner("Mul", {*in_y, *inside_weight}, {tmp_y}, {}); + runner_y.Run(stream); + const auto& runner3 = 
NpuOpRunner("SmoothL1Loss", {tmp_x, tmp_y}, + {no_reduce_loss}, {{"sigma", sigma2}}); + runner3.Run(stream); + } else { + const auto& runner3 = NpuOpRunner("SmoothL1Loss", {*in_x, *in_y}, + {no_reduce_loss}, {{"sigma", sigma2}}); + runner3.Run(stream); + } + + // multiply outside weight and loss + // reduceSum because the output'shape must be [B,1] + if (has_weight) { + Tensor tmp_loss(no_reduce_loss.type()); + tmp_loss.Resize(no_reduce_loss.dims()); + tmp_loss.mutable_data(context.GetPlace()); + const auto& runner4 = + NpuOpRunner("Mul", {no_reduce_loss, *outside_weight}, {tmp_loss}, {}); + runner4.Run(stream); + const auto& runner5 = + NpuOpRunner("ReduceSumD", {tmp_loss}, {*out_loss}, + {{"axes", std::vector{1}}, {"keep_dims", true}}); + runner5.Run(stream); + } else { + const auto& runner5 = + NpuOpRunner("ReduceSumD", {no_reduce_loss}, {*out_loss}, + {{"axes", std::vector{1}}, {"keep_dims", true}}); + runner5.Run(stream); + } + } +}; + +template +class SmoothL1LossGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* inside_weight = context.Input("InsideWeight"); + auto* outside_weight = context.Input("OutsideWeight"); + auto* diff = context.Input("Diff"); + auto* og = context.Input(framework::GradVarName("Out")); + auto* outx_grad = context.Output(framework::GradVarName("X")); + auto* outy_grad = context.Output(framework::GradVarName("Y")); + auto sigma = context.Attr("sigma"); + T sigma2 = 1.0 / (sigma * sigma); + bool has_weight = (inside_weight != nullptr) && (outside_weight != nullptr); + + auto stream = + context.template device_context() + .stream(); + + // diff == in_x - in_y == diff - 0 + Tensor tmp_zero(diff->type()); + tmp_zero.Resize(diff->dims()); + tmp_zero.mutable_data(context.GetPlace()); + const auto& runner_zero = NpuOpRunner("ZerosLike", {*diff}, {tmp_zero}, {}); + runner_zero.Run(stream); + + Tensor grad(diff->type()); + grad.Resize(diff->dims()); + grad.mutable_data(context.GetPlace()); + // broadcast og(output_grad) to adapt to the npu interface + const auto& runner_broad = + NpuOpRunner("BroadcastToD", {*og}, {grad}, + {{"shape", framework::vectorize(diff->dims())}}); + runner_broad.Run(stream); + + Tensor gradient(diff->type()); + gradient.Resize(diff->dims()); + gradient.mutable_data(context.GetPlace()); + // diff == diff - 0 == in_x - in_y + const auto& runner_grad = + NpuOpRunner("SmoothL1LossGrad", {*diff, tmp_zero, grad}, {gradient}, + {{"sigma", sigma2}}); + runner_grad.Run(stream); + + // mul weight and gradient + if (has_weight) { + Tensor weight(inside_weight->type()); + weight.Resize(inside_weight->dims()); + weight.mutable_data(context.GetPlace()); + const auto& runner_weight = + NpuOpRunner("Mul", {*inside_weight, *outside_weight}, {weight}, {}); + runner_weight.Run(stream); + + Tensor tmp_grad(gradient.type()); + tmp_grad.Resize(gradient.dims()); + tmp_grad.mutable_data(context.GetPlace()); + const auto& runner_weight_grad = + NpuOpRunner("Mul", {gradient, weight}, {tmp_grad}, {}); + runner_weight_grad.Run(stream); + + framework::TensorCopy( + tmp_grad, context.GetPlace(), + context.template device_context(), + &gradient); + } + // outx_grad = gradient + if (outx_grad) { + outx_grad->mutable_data(context.GetPlace()); + framework::TensorCopy( + gradient, context.GetPlace(), + context.template device_context(), + outx_grad); + } + + // outy_grad = - gradient + if (outy_grad) { + outy_grad->mutable_data(context.GetPlace()); + Tensor 
coeff(framework::proto::VarType::FP32); + coeff.mutable_data({1}, context.GetPlace()); + FillNpuTensorWithConstant(&coeff, -1); + const auto& runner_y_grad = + NpuOpRunner("Mul", {coeff, gradient}, {*outy_grad}, {}); + runner_y_grad.Run(stream); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_NPU_KERNEL( + smooth_l1_loss, + ops::SmoothL1LossNPUKernel); + +REGISTER_OP_NPU_KERNEL( + smooth_l1_loss_grad, + ops::SmoothL1LossGradNPUKernel); diff --git a/python/paddle/fluid/tests/unittests/npu/test_smooth_l1_loss_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_smooth_l1_loss_op_npu.py new file mode 100644 index 00000000000000..8c20f25061b85b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_smooth_l1_loss_op_npu.py @@ -0,0 +1,147 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid +paddle.enable_static() + + +def smooth_l1_loss_forward(val, sigma2): + abs_val = abs(val) + if abs_val < 1.0 / sigma2: + return 0.5 * val * val * sigma2 + else: + return abs_val - 0.5 / sigma2 + + +class TestSmoothL1LossOp1(OpTest): + def setUp(self): + self.set_npu() + self.place = paddle.NPUPlace(0) + self.op_type = "smooth_l1_loss" + dims = (5, 20) + self.inputs = { + 'X': np.random.random(dims).astype("float32"), + 'Y': np.random.random(dims).astype("float32") + } + sigma = 3.0 + self.attrs = {'sigma': sigma} + sigma2 = sigma * sigma + diff = self.inputs['X'] - self.inputs['Y'] + loss = np.vectorize(smooth_l1_loss_forward)(diff, sigma2).sum(1) + loss = loss.reshape((dims[0], 1)) + self.outputs = { + 'Diff': diff.astype('float32'), + 'Out': loss.astype('float32') + } + + def set_npu(self): + self.__class__.use_npu = True + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad_normal(self): + self.check_grad_with_place( + self.place, ['X', 'Y'], 'Out', max_relative_error=0.02) + + def test_check_grad_ingore_x(self): + self.check_grad_with_place( + self.place, ['Y'], + 'Out', + max_relative_error=0.03, + no_grad_set=set("X")) + + def test_check_grad_ingore_y(self): + self.check_grad_with_place( + self.place, ['X'], + 'Out', + max_relative_error=0.03, + no_grad_set=set('Y')) + + +class TestSmoothL1LossOp2(OpTest): + def setUp(self): + self.set_npu() + self.place = paddle.NPUPlace(0) + self.op_type = "smooth_l1_loss" + dims = (5, 20) + self.inputs = { + 'X': np.random.random(dims).astype("float32"), + 'Y': np.random.random(dims).astype("float32"), + 'InsideWeight': np.random.random(dims).astype("float32"), + 'OutsideWeight': np.random.random(dims).astype("float32") + } + sigma = 3.0 + self.attrs = {'sigma': sigma} + sigma2 = sigma * sigma + diff = self.inputs['X'] - self.inputs['Y'] + diff = diff * self.inputs['InsideWeight'] + loss = 
np.vectorize(smooth_l1_loss_forward)(diff, sigma2) + loss = loss * self.inputs['OutsideWeight'] + loss = loss.sum(1).reshape((dims[0], 1)) + self.outputs = { + 'Diff': diff.astype('float32'), + 'Out': loss.astype('float32') + } + + def set_npu(self): + self.__class__.use_npu = True + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad_normal(self): + self.check_grad_with_place( + self.place, ['X', 'Y'], 'Out', max_relative_error=0.03) + + def test_check_grad_ingore_x(self): + self.check_grad_with_place( + self.place, ['Y'], + 'Out', + max_relative_error=0.03, + no_grad_set=set(['X', 'InsideWeight', 'OutsideWeight'])) + + def test_check_grad_ingore_y(self): + self.check_grad_with_place( + self.place, ['X'], + 'Out', + max_relative_error=0.03, + no_grad_set=set(['Y', 'InsideWeight', 'OutsideWeight'])) + + +class TestSmoothL1LossOpError(unittest.TestCase): + def test_errors(self): + with fluid.program_guard(fluid.Program(), fluid.Program()): + # The input type of accuracy_op must be Variable. + x1 = fluid.create_lod_tensor( + np.array([[-1]]), [[1]], fluid.NPUPlace(0)) + y1 = fluid.create_lod_tensor( + np.array([[-1]]), [[1]], fluid.NPUPlace(0)) + self.assertRaises(TypeError, fluid.layers.smooth_l1, x1, y1) + # The input dtype of accuracy_op must be float32 or float64. + x2 = fluid.layers.data(name='x2', shape=[4], dtype="int32") + y2 = fluid.layers.data(name='x2', shape=[4], dtype="int32") + self.assertRaises(TypeError, fluid.layers.smooth_l1, x2, y2) + + +if __name__ == '__main__': + unittest.main() From 589d13c5e1f29559bbd7744275e0eec19b6ad5e2 Mon Sep 17 00:00:00 2001 From: ShenLiang <1422485404@qq.com> Date: Thu, 12 Aug 2021 11:45:41 +0800 Subject: [PATCH 021/126] [HybridParallel]Add Recompute for PipeLineParallel (#34607) * add recompute for pp * add recompute offload * add recompute partition --- paddle/fluid/pybind/op_function_generator.cc | 1 - python/paddle/distributed/collective.py | 11 +- .../parallel_layers/pp_layers.py | 53 +++- .../meta_parallel/parallel_layers/random.py | 12 + .../fleet/meta_parallel/pipeline_parallel.py | 7 +- .../fleet/meta_parallel/pp_utils/utils.py | 230 +++++++++++++++++- .../unittests/hybrid_parallel_pp_recompute.py | 172 +++++++++++++ ...test_parallel_dygraph_pipeline_parallel.py | 3 + 8 files changed, 476 insertions(+), 13 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/hybrid_parallel_pp_recompute.py diff --git a/paddle/fluid/pybind/op_function_generator.cc b/paddle/fluid/pybind/op_function_generator.cc index d81783c677622a..07a3fc8a8df331 100644 --- a/paddle/fluid/pybind/op_function_generator.cc +++ b/paddle/fluid/pybind/op_function_generator.cc @@ -136,7 +136,6 @@ std::map> op_passing_outs_map = { {"c_reduce_min", {"Out"}}, {"c_reduce_prod", {"Out"}}, {"c_reduce", {"Out"}}, - {"c_allgather", {"Out"}}, {"c_scatter", {"Out"}}, {"barrier", {"Out"}}, {"fake_quantize_dequantize_moving_average_abs_max", diff --git a/python/paddle/distributed/collective.py b/python/paddle/distributed/collective.py index f1dcf55a56a965..e5dfb34f24304d 100644 --- a/python/paddle/distributed/collective.py +++ b/python/paddle/distributed/collective.py @@ -632,14 +632,13 @@ def all_gather(tensor_list, tensor, group=None, use_calc_stream=True): ring_id = 0 if group is None else group.id nranks = _get_global_group().nranks if group is None else group.nranks - op_type = 'c_allgather' - helper = LayerHelper(op_type, **locals()) - out = helper.create_variable_for_type_inference(dtype=tensor.dtype) - if 
in_dygraph_mode(): - _C_ops.c_allgather(tensor, out, 'use_calc_stream', use_calc_stream, - 'ring_id', ring_id, 'nranks', nranks) + out = _C_ops.c_allgather(tensor, 'use_calc_stream', use_calc_stream, + 'ring_id', ring_id, 'nranks', nranks) else: + op_type = 'c_allgather' + helper = LayerHelper(op_type, **locals()) + out = helper.create_variable_for_type_inference(dtype=tensor.dtype) if not isinstance(tensor_list, list): raise ValueError("The type of 'tensor_list' for all_gather " "should be list.") diff --git a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/pp_layers.py b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/pp_layers.py index f546adc65ea714..5ea3659bed1102 100644 --- a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/pp_layers.py +++ b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/pp_layers.py @@ -23,6 +23,7 @@ import paddle from paddle.fluid.dygraph.layers import Layer from ...utils.log_util import logger, layer_to_str +from ..pp_utils.utils import _hp_recompute, _initialize_recompute_setting __all__ = [] @@ -134,7 +135,10 @@ def __init__(self, num_stages=None, topology=None, loss_fn=None, - seg_method="uniform"): + seg_method="uniform", + recompute_interval=0, + recompute_offload=False, + recompute_partition=False): super(PipelineLayer, self).__init__() if num_stages is None and topology is None: raise ValueError("should provide num_stages or topology") @@ -147,6 +151,16 @@ def __init__(self, self.layers = layers self._loss_fn = loss_fn self._topo = topology + self._recompute_interval = recompute_interval + self._recompute_offload = recompute_offload + self._recompute_partition = recompute_partition + + if recompute_interval > 0: + logger.info( + "Start Recompute for PipeLineParallel. recompute_offload: {}, recompute_partition: {}". 
+ format(recompute_offload, recompute_partition)) + _initialize_recompute_setting(recompute_offload, recompute_partition) + world_size = dist.get_world_size() self.global_rank = dist.get_rank() @@ -312,11 +326,44 @@ def _build_layer(self): else: self.run_function.append(layer) + def forward_function(self, start, end): + def execute_func(*x): + if len(x) == 1: + x = x[0] + for idx, layer in enumerate(self.run_function[start:end]): + x = layer(x) + return x + + return execute_func + def forward(self, input): - for layer in self.run_function: - input = layer(input) + if self._recompute_interval == 0: + input = self.forward_function(0, len(self.run_function))(input) + else: + num_layers = len(self.run_function) + for start_idx in range(0, num_layers, self._recompute_interval): + end_idx = min(start_idx + self._recompute_interval, num_layers) + funcs = self.run_function[start_idx:end_idx] + + if not isinstance(input, tuple): + input = (input, ) + + if self._need_recompute(funcs, input): + input = _hp_recompute( + self.forward_function(start_idx, end_idx), *input) + else: + input = self.forward_function(start_idx, end_idx)(*input) + return input + def _need_recompute(self, funcs, inputs): + if not any(input_.stop_gradient == False for input_ in inputs + if isinstance(input_, paddle.Tensor)): + return False + + params = [f.parameters() for f in funcs if isinstance(f, Layer)] + return any(len(list(p)) > 0 for p in params) + def save_state_dict(self, path): if self._topo.get_coord(self.global_rank).data != 0: return diff --git a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/random.py b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/random.py index 70daa3b25365e4..ec80ba71036c06 100644 --- a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/random.py +++ b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/random.py @@ -20,6 +20,9 @@ MODEL_PARALLEL_RNG = 'model_parallel_rng' +# This file is inspired by Megatron to control random states for MP: +# https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/mpu/random.py + class RNGStatesTracker: """ @@ -46,6 +49,15 @@ def add(self, name, seed): self.states_[name] = paddle.get_cuda_rng_state() paddle.set_cuda_rng_state(orig_rng_state) + def get_states_tracker(self): + states = {} + for name in self.states_: + states[name] = self.states_[name] + return states + + def set_states_tracker(self, states): + self.states_ = states + @contextlib.contextmanager def rng_state(self, name=MODEL_PARALLEL_RNG): if name not in self.states_: diff --git a/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py b/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py index 16ea7de2946bfd..fc7b39ede244dc 100644 --- a/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py +++ b/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py @@ -14,7 +14,7 @@ import paddle import paddle.fluid as fluid from .meta_parallel_base import MetaParallelBase -from .pp_utils.utils import is_float_tensor +from .pp_utils.utils import is_float_tensor, _initialize_recompute_hcg from .parallel_layers.pp_layers import PipelineLayer from ..utils.hybrid_parallel_util import broadcast_mp_parameters @@ -48,6 +48,8 @@ def __init__(self, layers, hcg, strategy): p2p.initialize_p2p_groups(hcg) + _initialize_recompute_hcg(hcg) + self.is_first_stage = self.stage_id == 0 self.is_last_stage = (self.stage_id == (self.num_stages - 1)) self.global_rank = self._hcg.get_global_rank() @@ -213,6 +215,9 
@@ def _load_micro_batch(self, cache_id): if self.is_first_stage: assert len(inputs) == 2, "length of input should be 2" if isinstance(inputs[0], tuple): + assert len( + inputs[0] + ) > 1, "If you use tuple for input data, it should have at least two inputs." batch_size = inputs[0][0].shape[0] assert self.micro_batch_size * self.accumulate_steps == batch_size, ( "batch_size needs to be divisible by micro_batch_size. Currently, " diff --git a/python/paddle/distributed/fleet/meta_parallel/pp_utils/utils.py b/python/paddle/distributed/fleet/meta_parallel/pp_utils/utils.py index 8c204820b16615..728080a7cd248e 100644 --- a/python/paddle/distributed/fleet/meta_parallel/pp_utils/utils.py +++ b/python/paddle/distributed/fleet/meta_parallel/pp_utils/utils.py @@ -12,9 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. -import abc +import contextlib + import paddle -from ...utils import log_util as hp_util +from paddle.fluid import core +from paddle import _C_ops +import paddle.distributed as dist +from paddle.autograd import PyLayer +from paddle.fluid import framework +from paddle.distributed.fleet.utils.recompute import check_recompute_necessary, detach_variable +from ..parallel_layers.random import get_rng_state_tracker __all__ = [] @@ -79,3 +86,222 @@ def get_tensor_bytes(tensor): else: raise ValueError("unknown data type: {}".format(tensor.dtype)) return tensor.numel() * elem_size + + +_hcg = None +_recompute_offload = False +_recompute_partition = False + + +def _initialize_recompute_setting(is_offload, is_partition): + global _recompute_offload, _recompute_partition + + _recompute_offload = is_offload + _recompute_partition = is_partition + + +def _initialize_recompute_hcg(hcg): + global _hcg + _hcg = hcg + + +def _all_gather(tensor, group=None, use_calc_stream=True): + """ + The main difference with paddle.distributed.all_gather: + no need to pass in tensor_list, the returned tensor is spliced + """ + if group is not None and not group.is_member(): + return + ring_id = 0 if group is None else group.id + nranks = paddle.distributed.collective._get_global_group( + ).nranks if group is None else group.nranks + return _C_ops.c_allgather(tensor, 'use_calc_stream', use_calc_stream, + 'ring_id', ring_id, 'nranks', nranks) + + +def _split_activation(tensor): + global _hcg + + mp_degree = _hcg.get_model_parallel_world_size() + mp_rank = _hcg.get_model_parallel_rank() + if mp_degree < 2: + return tensor + + tensor_numel = paddle.numel(tensor) + assert tensor_numel != 0, "can't recompute zero element" + assert tensor_numel % mp_degree == 0, "The capacity of the activation () cannot be divisible by mp_degree()".format( + tensor_numel, mp_degree) + + # use inplace operation to save memory + data = tensor.flatten_() + part_size = tensor_numel // mp_degree + start = part_size * mp_rank + end = start + part_size + return data[start:end] + + +def _merge_activation(tensor): + global _hcg + mp_degree = _hcg.get_model_parallel_world_size() + mp_rank = _hcg.get_model_parallel_rank() + mp_group = _hcg.get_model_parallel_group() + if mp_degree < 2: + return tensor + return _all_gather(tensor, group=mp_group) + + +@contextlib.contextmanager +def _swith_rng_state_tracker(rng_state, tracker): + orig_cuda_rng_state = paddle.get_cuda_rng_state() + orig_cuda_rng_tracker = get_rng_state_tracker().get_states_tracker() + + paddle.set_cuda_rng_state(rng_state) + get_rng_state_tracker().set_states_tracker(tracker) + try: + yield + finally: + 
paddle.set_cuda_rng_state(orig_cuda_rng_state) + get_rng_state_tracker().set_states_tracker(orig_cuda_rng_tracker) + + +class _HPRecomputeFunction(PyLayer): + """ + Compared with paddle.distributed.fleet.utils.recompute, there are the following differences: + 1. In order to support PipeLineParallel, the input of recompute is modified to ensure that the input can be tuple type. + 2. Offload support for activation + 3. Support MP segmentation of activation to further reduce cuda memory + 4. Adapt to the random state of MP + """ + + @staticmethod + def forward(ctx, run_function, all_outputs, *args): + check_recompute_necessary(args) + + # store for recomputing + ctx.run_function = run_function + + # store the rng states + ctx.fwd_cuda_rng_state = paddle.get_cuda_rng_state() + ctx.fwd_cuda_rng_state_tracker = get_rng_state_tracker( + ).get_states_tracker() + + # save input for backward + ctx.inputs = [] + ctx.tensor_indices = [] + ctx.tensor_shapes = [] + tensor_inputs = [] + + cur_device = paddle.get_device() + assert 'gpu:' in paddle.get_device( + ), "Recompute with RNG is not support current device: {}.".format( + cur_device) + + # TODO support AMP + tracer = framework._dygraph_tracer() + ctx.is_fw_autocast = tracer._enable_autocast + ctx.amp_white_list, ctx.amp_black_list = tracer._get_amp_op_list() + + with paddle.no_grad(): + outputs = run_function(*args) + + for i, arg in enumerate(args): + if paddle.is_tensor(arg): + state = arg.stop_gradient + if _recompute_partition: + ctx.tensor_shapes.append(arg.shape) + partition = _split_activation(arg.detach()).clone() + # TODO(shenliang03) not use calculate stream to D2H to speed + arg = partition.cpu() if _recompute_offload else partition + else: + arg = arg.cpu() if _recompute_offload else arg + arg.stop_gradient = state + tensor_inputs.append(arg) + ctx.tensor_indices.append(i) + ctx.inputs.append(None) + else: + ctx.inputs.append(arg) + + ctx.save_for_backward(*tensor_inputs) + + if paddle.is_tensor(outputs): + all_outputs += [outputs] + return outputs + else: + all_outputs += outputs + return tuple(outputs) + + @staticmethod + def backward(ctx, *args): + with paddle.fluid.dygraph.guard(): + # Restore inputs + inputs = list(ctx.inputs) + tensor_indices = ctx.tensor_indices + tensor_shapes = ctx.tensor_shapes + tensors = list(ctx.saved_tensor()) + + device_id = dist.ParallelEnv().device_id + for i, idx in enumerate(tensor_indices): + if _recompute_partition: + state = tensors[i].stop_gradient + tensors[i] = _merge_activation(tensors[i]).detach( + ).reshape_(tensor_shapes[i]) + tensors[i].stop_gradient = state + inputs[idx] = tensors[i].cuda( + device_id) if _recompute_offload else tensors[i] + + tracer = framework._dygraph_tracer() + tracer._has_grad = True + + # need restore auto_cast state as well as w/b list + with _swith_rng_state_tracker(ctx.fwd_cuda_rng_state, + ctx.fwd_cuda_rng_state_tracker): + with paddle.amp.auto_cast( + enable=ctx.is_fw_autocast, + custom_white_list=ctx.amp_white_list, + custom_black_list=ctx.amp_black_list): + detached_inputs = detach_variable(tuple(inputs)) + outputs = ctx.run_function(*detached_inputs) + + if isinstance(outputs, core.VarBase): + outputs = (outputs, ) + assert len(outputs) == len(args) + + forward_outputs_with_grad = [] + backward_inputs = [] + + for i in range(len(outputs)): + if isinstance(outputs[i], + core.VarBase) and not outputs[i].stop_gradient: + forward_outputs_with_grad.append(outputs[i]) + backward_inputs.append(args[i]) + + if len(forward_outputs_with_grad) == 0: + raise 
RuntimeError( + "none of output has stop_gradient=False, this recompute() is not necessary" + ) + + # actually backward + paddle.autograd.backward(forward_outputs_with_grad, backward_inputs) + grads = list(inp._grad_ivar() for inp in detached_inputs + if isinstance(inp, core.VarBase)) + return grads + + +def _hp_recompute(function, *args): + # NODTE(shenliang03)The current hybrid parallel recompute has limitations. + # It cannot handle the following situations: + # 1. The calculation output of recompute, there are tensors that do not require gradients. + # 2. The forward output tensor has no gradient. This problem can be solved temporarily by detach(). + # 3. Here, we only use float dtype to distinguish whether a gradient is needed in output tensor + + all_outputs = [] + _HPRecomputeFunction.apply(function, all_outputs, *args) + + if len(all_outputs) == 1: + return all_outputs[0] + else: + for output in all_outputs: + if paddle.is_tensor(output) and not is_float_tensor(output): + output.stop_gradient = True + + return tuple(all_outputs) diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_recompute.py b/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_recompute.py new file mode 100644 index 00000000000000..ebcac70a3b68a3 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_recompute.py @@ -0,0 +1,172 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
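+
+# Trains a small two-stage pipeline-parallel transformer with
+# recompute_interval=1 (offload and partition disabled) for a few steps;
+# the test only verifies that recompute-enabled pipeline training runs,
+# the loss value itself is not asserted yet.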
+ +from __future__ import division +from __future__ import print_function + +import unittest +import paddle +import numpy as np +import random +import paddle.distributed as dist +import paddle.distributed.fleet as fleet +from paddle.fluid import layers +import paddle.nn.functional as F +from paddle.distributed.fleet.meta_parallel import PipelineLayer, LayerDesc +from paddle.fluid.dygraph.layers import Layer +import paddle.nn as nn + + +def set_random_seed(seed, dp_id, rank_id): + """Set random seed for reproducability.""" + random.seed(seed) + np.random.seed(seed + dp_id) + paddle.seed(seed + dp_id) + + +batch_size = 8 +length = 8 +micro_batch_size = 2 +vocab_size = 128 +hidden_size = 16 +d_model = hidden_size +dim_feedforward = 4 * d_model + + +class EmbeddingNet(Layer): + def __init__(self): + super(EmbeddingNet, self).__init__() + self.word_embeddings = nn.Embedding(vocab_size, hidden_size) + self.position_embeddings = nn.Embedding(vocab_size, hidden_size) + + def forward(self, x): + w_emb = self.word_embeddings(x) + p_emb = self.position_embeddings(x) + w_emb = w_emb + p_emb + return w_emb + + +class TransformerNet(Layer): + def __init__(self): + super(TransformerNet, self).__init__() + self.linear1 = nn.Linear(d_model, dim_feedforward) + self.linear2 = nn.Linear(dim_feedforward, d_model) + + self.q_proj = nn.Linear(d_model, d_model) + self.k_proj = nn.Linear(d_model, d_model) + self.v_proj = nn.Linear(d_model, d_model) + + self.norm1 = nn.LayerNorm(d_model, epsilon=1e-5) + + def forward(self, x): + q = self.q_proj(x) + k = self.k_proj(x) + v = self.v_proj(x) + product = layers.matmul(x=q, y=k, transpose_y=True, alpha=d_model**-0.5) + weights = F.softmax(product) + + weights = F.dropout(weights, 0.2) + tgt = layers.matmul(weights, v) + residual = tgt + tgt = self.norm1(tgt) + tgt = residual + tgt + + out = self.linear2(F.gelu(self.linear1(tgt), approximate=True)) + return out + + +class EmbeddingPipe(EmbeddingNet): + def forward(self, x): + return super().forward(x) + + +class TransformerNetPipe(TransformerNet): + def forward(self, x): + output = super().forward(x) + return output + + +class CriterionPipe(Layer): + def __init__(self): + super(CriterionPipe, self).__init__() + + def forward(self, out, label): + loss = out.mean() + return loss + + +class ModelPipe(PipelineLayer): + def __init__(self, topology): + self.descs = [] + self.descs.append(LayerDesc(EmbeddingPipe)) + + for x in range(2): + self.descs.append(LayerDesc(TransformerNetPipe)) + + super().__init__( + layers=self.descs, + loss_fn=CriterionPipe(), + topology=topology, + seg_method="layer:TransformerNetPipe", + recompute_interval=1, + recompute_partition=False, + recompute_offload=False) + + +class TestDistPPTraning(unittest.TestCase): + def setUp(self): + strategy = fleet.DistributedStrategy() + self.model_parallel_size = 1 + self.data_parallel_size = 1 + self.pipeline_parallel_size = 2 + strategy.hybrid_configs = { + "dp_degree": self.data_parallel_size, + "mp_degree": self.model_parallel_size, + "pp_degree": self.pipeline_parallel_size, + } + strategy.pipeline_configs = { + "accumulate_steps": batch_size // micro_batch_size, + "micro_batch_size": micro_batch_size + } + fleet.init(is_collective=True, strategy=strategy) + + def test_pp_model(self): + hcg = fleet.get_hybrid_communicate_group() + word_size = hcg.get_model_parallel_world_size() + dp_id = hcg.get_data_parallel_rank() + pp_id = hcg.get_stage_id() + rank_id = dist.get_rank() + topology = hcg.topology() + set_random_seed(1024, dp_id, rank_id) + + model = 
ModelPipe(topology) + scheduler = paddle.optimizer.lr.PiecewiseDecay( + boundaries=[2], values=[0.001, 0.002], verbose=True) + optimizer = paddle.optimizer.SGD(learning_rate=scheduler, + parameters=model.parameters()) + + model = fleet.distributed_model(model) + optimizer = fleet.distributed_optimizer(optimizer) + + for step_id in range(5): + x_data = np.random.randint(0, vocab_size, size=[batch_size, length]) + x = paddle.to_tensor(x_data) + x.stop_gradient = True + loss = model.train_batch([x, x], optimizer, scheduler) + # TODO(shenliang03) add utest for loss + print("loss: ", loss) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_pipeline_parallel.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_pipeline_parallel.py index 003e0c1685cae7..35fd49dfffff83 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_pipeline_parallel.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_pipeline_parallel.py @@ -39,6 +39,9 @@ def test_hybrid_parallel_transformer(self): def test_hybrid_parallel_transformer(self): self.run_mnist_2gpu('hybrid_parallel_pp_save_load.py') + def test_hybrid_parallel_transformer(self): + self.run_mnist_2gpu('hybrid_parallel_pp_recompute.py') + if __name__ == "__main__": unittest.main() From 0e28c8bb00a782ca876878037dd059aa47df6308 Mon Sep 17 00:00:00 2001 From: zhulei <563755780@qq.com> Date: Thu, 12 Aug 2021 11:52:59 +0800 Subject: [PATCH 022/126] Fix safety-bug of functional.linear (#34696) * Fix safety-bug of functional.linear * Fix safety-bug of functional.linear * Fix safety-bug of functional.linear * Fix safety-bug of functional.linear --- paddle/fluid/operators/math/blas_impl.h | 6 ++++++ python/paddle/fluid/tests/unittests/test_linear.py | 9 +++++++++ 2 files changed, 15 insertions(+) diff --git a/paddle/fluid/operators/math/blas_impl.h b/paddle/fluid/operators/math/blas_impl.h index 55151c5483a38b..cb4044b1b08c7a 100644 --- a/paddle/fluid/operators/math/blas_impl.h +++ b/paddle/fluid/operators/math/blas_impl.h @@ -1041,6 +1041,12 @@ void Blas::BatchedGEMM( CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K, T alpha, const T *A, const T *B, T beta, T *C, int batchCount, int64_t strideA, int64_t strideB) const { + PADDLE_ENFORCE_NOT_NULL( + A, platform::errors::InvalidArgument("Pointer A should not be null.")); + PADDLE_ENFORCE_NOT_NULL( + B, platform::errors::InvalidArgument("Pointer B should not be null.")); + PADDLE_ENFORCE_NOT_NULL( + C, platform::errors::InvalidArgument("Pointer C should not be null.")); #ifdef PADDLE_WITH_MKLML int lda = (transA == CblasNoTrans) ? K : M; int ldb = (transB == CblasNoTrans) ? 
N : K; diff --git a/python/paddle/fluid/tests/unittests/test_linear.py b/python/paddle/fluid/tests/unittests/test_linear.py index 9d07a80da15dbf..2f722fc8005013 100644 --- a/python/paddle/fluid/tests/unittests/test_linear.py +++ b/python/paddle/fluid/tests/unittests/test_linear.py @@ -73,6 +73,15 @@ def test_error(self, place=paddle.CPUPlace()): np.testing.assert_array_almost_equal(res_f, res_nn) np.testing.assert_array_almost_equal(res_nn, res_np) + def test_error_dummy_input(self, place=paddle.CPUPlace()): + with self.assertRaises(ValueError): + x_arr = np.array([], dtype=np.float32) + x = paddle.to_tensor( + np.reshape(x_arr, (0, 4, 4, 4)), dtype='float32') + weight = paddle.zeros([4, 4, 4], dtype='float32') + bias = paddle.to_tensor([], dtype='float32') + paddle.nn.functional.linear(x, weight, bias=bias) + if __name__ == "__main__": unittest.main() From 016cc56d2fd020774356dee8a5adb0e29a94c76a Mon Sep 17 00:00:00 2001 From: Feng Xing <79969986+xingfeng01@users.noreply.github.com> Date: Thu, 12 Aug 2021 12:17:55 +0800 Subject: [PATCH 023/126] transformer c files (#34706) This PR adds fused transformer related files defining c interface including class, function etc.. --- paddle/fluid/operators/fused/CMakeLists.txt | 3 +- .../operators/fused/fused_transformer_op.cc | 161 ++++++++++++++++++ .../operators/fused/fused_transformer_op.cu | 13 ++ .../operators/fused/fused_transformer_op.h | 155 +++++++++++++++++ 4 files changed, 331 insertions(+), 1 deletion(-) create mode 100644 paddle/fluid/operators/fused/fused_transformer_op.cc create mode 100644 paddle/fluid/operators/fused/fused_transformer_op.cu create mode 100644 paddle/fluid/operators/fused/fused_transformer_op.h diff --git a/paddle/fluid/operators/fused/CMakeLists.txt b/paddle/fluid/operators/fused/CMakeLists.txt index 104298e037319c..541e5afdf9b71e 100644 --- a/paddle/fluid/operators/fused/CMakeLists.txt +++ b/paddle/fluid/operators/fused/CMakeLists.txt @@ -15,7 +15,8 @@ register_operators(EXCLUDES fusion_group_op fusion_gru_op fusion_lstm_op - fused_bn_add_activation_op) + fused_bn_add_activation_op + fused_transformer_op) # fusion_gru_op does not have CUDA kernel op_library(fusion_gru_op) diff --git a/paddle/fluid/operators/fused/fused_transformer_op.cc b/paddle/fluid/operators/fused/fused_transformer_op.cc new file mode 100644 index 00000000000000..9e5fc42fc76dd1 --- /dev/null +++ b/paddle/fluid/operators/fused/fused_transformer_op.cc @@ -0,0 +1,161 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +Indicesou may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/fused/fused_transformer_op.h" +#include + +namespace paddle { +namespace operators { + +// constructor and init +template +FusedTransformerEncoderLayer::FusedTransformerEncoderLayer( + int batch_size_, int max_seq_len_, int dim_embed_, int dim_feedforward_, + int num_head_, float dropout_, float act_dropout_, float attn_dropout_, + std::string act_method_, bool normalize_pre_or_post_) { + // configurations + batch_size = batch_size_; + max_seq_len = max_seq_len_; + dim_embed = dim_embed_; + dim_feedforward = dim_feedforward_; + num_head = num_head_; + head_size = dim_embed_ / num_head; + + dropout = dropout_; + act_dropout = act_dropout_; + attn_dropout = attn_dropout_; + + act_method = act_method_; + normalize_pre_or_post = normalize_pre_or_post_; + + // init attn + fused_attn = + new FusedAttention(batch_size, max_seq_len, dim_embed, num_head, + dropout, attn_dropout, normalize_pre_or_post); + + // init ffn + fused_ffn = + new FusedFFN(batch_size, max_seq_len, dim_embed, dim_feedforward_, + act_dropout, act_method, normalize_pre_or_post); +} + +// deconstructor +template +FusedTransformerEncoderLayer::~FusedTransformerEncoderLayer() { + delete fused_attn; + delete fused_ffn; +} + +// compute forward +template +void FusedTransformerEncoderLayer::ComputeForward(T* src, T* output) { + T* output_attn; // todo + + fused_attn->ComputeForward(src, output_attn); + fused_ffn->ComputeForward(output_attn, output); +} + +// compute backward +template +void FusedTransformerEncoderLayer::ComputeBackward() {} + +// constructor and init +template +FusedAttention::FusedAttention(int batch_size_, int max_seq_len_, + int dim_embed_, int num_head_, float dropout_, + float attn_dropout_, + bool normalize_pre_or_post_) { + // configurations + batch_size = batch_size_; + max_seq_len = max_seq_len_; + dim_embed = dim_embed_; + num_head = num_head_; + head_size = dim_embed_ / num_head; + + dropout = dropout_; + attn_dropout = attn_dropout_; + + normalize_pre_or_post = normalize_pre_or_post_; + + // init fmha + fmha = new FusedMHA(); +} + +// compute forward +template +void FusedAttention::ComputeForward(T* src, T* output) {} + +template +FusedAttention::~FusedAttention() { + delete fmha; +} + +// compute backward +template +void FusedAttention::ComputeBackward() {} + +// constructor and init +template +FusedFFN::FusedFFN(int batch_size_, int max_seq_len_, int dim_embed_, + int dim_feedforward_, float act_dropout_, + std::string act_method_, bool normalize_pre_or_post_) { + batch_size = batch_size_; + max_seq_len = max_seq_len_; + dim_embed = dim_embed_; + dim_feedforward = dim_feedforward_; + act_dropout = act_dropout_; + + act_method = act_method_; + normalize_pre_or_post = normalize_pre_or_post_; +} + +template +FusedFFN::~FusedFFN() {} + +// compute forward +template +void FusedFFN::ComputeForward(T* src, T* output) {} + +// compute backward +template +void FusedFFN::ComputeBackward() {} + +// init +template +FusedMHA::FusedMHA(int batch_size_, int max_seq_len_, int dim_embed_, + int num_head_, float dropout_, bool is_test_, + uint64_t seed_, uint64_t* seqlen_, uint64_t* cu_seqlen_) { + batch_size = batch_size_; + max_seq_len = max_seq_len_; + dim_embed = dim_embed_; + num_head = num_head_; + head_size = dim_embed_ / num_head; + + dropout = dropout_; + is_test = is_test_; + seed = seed_; + seqlen = seqlen_; + cu_seqlen = cu_seqlen_; +} + +// compute forward +template +void FusedMHA::ComputeForward(T* output, T* softmax_mask) {} + +// compute backward +template +void 
FusedMHA::ComputeBackward(const T* grad_output, T* softmax_mask, + T* grad_x) {} +} +} \ No newline at end of file diff --git a/paddle/fluid/operators/fused/fused_transformer_op.cu b/paddle/fluid/operators/fused/fused_transformer_op.cu new file mode 100644 index 00000000000000..43bf3acdc6156b --- /dev/null +++ b/paddle/fluid/operators/fused/fused_transformer_op.cu @@ -0,0 +1,13 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +Indicesou may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ \ No newline at end of file diff --git a/paddle/fluid/operators/fused/fused_transformer_op.h b/paddle/fluid/operators/fused/fused_transformer_op.h new file mode 100644 index 00000000000000..2d2d390d243e5a --- /dev/null +++ b/paddle/fluid/operators/fused/fused_transformer_op.h @@ -0,0 +1,155 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +Indicesou may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include + +namespace paddle { +namespace operators { + +template +class FusedMHA { + FusedMHA(int, int, int, int, float, bool, uint64_t, uint64_t*, uint64_t*); + ~FusedMHA(); + + void ComputeForward(T*, T*); + void ComputeBackward(const T*, T*, T*); + + private: + int batch_size; + int max_seq_len; + int dim_embed; + + int num_head; + int head_size; + + float dropout; + + bool is_test; + uint64_t seed; + + int32_t seqlen; + int32_t* cu_seqlen; +}; + +template +class FusedAttention { + public: + FusedAttention(int, int, int, int, float, float, bool); + ~FusedAttention(); + + void ComputeForward(T*, T*); + void ComputeBackward(); + + private: + FusedMHA* fmha; // fused multihead attention + + int batch_size; + int max_seq_len; + int dim_embed; + + int num_head; + int head_size; + + float dropout; + T attn_dropout; + + bool normalize_pre_or_post; + + // weights and bias used in attention + T* fattn_qkv_w; + T* fattn_qkv_b; + T* fattn_o_w; + T* fattn_o_b; + T* fattn_n_w; + T* fattn_n_b; + T* fattn_norm_w; + T* fattn_norm_b; + + T* fattn_grad_qkv_w; + T* fattn_grad_qkv_b; + T* fattn_grad_o_w; + T* fattn_grad_o_b; + T* fattn_grad_n_w; + T* fattn_grad_n_b; + T* fattn_grad_norm_w; + T* fattn_grad_norm_b; +}; + +template +class FusedFFN { + FusedFFN(int, int, int, int, float, std::string, bool); + ~FusedFFN(); + + void ComputeForward(T*, T*); + void ComputeBackward(); + + private: + int batch_size; + int max_seq_len; + int dim_embed; + int dim_feedforward; + + float attn_dropout; + float act_dropout; + + bool normalize_pre_or_post; + + std::string act_method; + + // weights and bias used in ffn + T* fffn_inter_w; + T* fffn_inter_b; + T* fffn_output_w; + T* fffn_output_b; + + T* fffn_grad_inter_w; + T* fffn_grad_inter_b; + T* fffn_grad_output_w; + T* fffn_grad_output_b; +}; + +template +class FusedTransformerEncoderLayer { + public: + FusedTransformerEncoderLayer(int, int, int, int, int, float, float, float, + std::string, bool); + ~FusedTransformerEncoderLayer(); + + void ComputeForward(T* src, T* output); + void ComputeBackward(); + + private: + FusedAttention* fused_attn; + FusedFFN* fused_ffn; + + int batch_size; + int max_seq_len; + int dim_embed; + int dim_feedforward; + + int num_head; + int head_size; + + float dropout; + float attn_dropout; + float act_dropout; + + bool normalize_pre_or_post; + + std::string act_method; +}; +} +} From 6326c3efbec9a024364b1fe4450a48c3eaa63de2 Mon Sep 17 00:00:00 2001 From: Wilber Date: Thu, 12 Aug 2021 13:21:10 +0800 Subject: [PATCH 024/126] [Inference] Inference python api support fp16 (#34676) --- .../fluid/inference/api/analysis_predictor.h | 4 ++ .../inference/api/details/zero_copy_tensor.cc | 7 +++ paddle/fluid/inference/api/paddle_tensor.h | 1 + paddle/fluid/pybind/inference_api.cc | 43 +++++++++++++++++++ 4 files changed, 55 insertions(+) diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index b55d08dda5a4c4..656db31d473812 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -25,12 +25,16 @@ #include "paddle/fluid/inference/api/details/reset_tensor_array.h" #include "paddle/fluid/inference/api/helper.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/string/printf.h" #ifdef PADDLE_WITH_TESTING #include #include #endif +namespace paddle_infer { +using float16 = paddle::platform::float16; +} /// /// \file analysis_predictor.h 
/// diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc index b117a21dea3e65..ff167aa7cf1068 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc @@ -19,9 +19,12 @@ #include "paddle/fluid/inference/api/paddle_tensor.h" #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/float16.h" namespace paddle_infer { +using float16 = paddle::platform::float16; + void Tensor::Reshape(const std::vector &shape) { PADDLE_ENFORCE_EQ( name_.empty(), false, @@ -104,6 +107,8 @@ DataType Tensor::type() const { auto type = tensor->type(); if (type == paddle::framework::proto::VarType::FP32) { return DataType::FLOAT32; + } else if (type == paddle::framework::proto::VarType::FP16) { + return DataType::FLOAT16; } else if (type == paddle::framework::proto::VarType::INT64) { return DataType::INT64; } else if (type == paddle::framework::proto::VarType::INT32) { @@ -261,12 +266,14 @@ template PD_INFER_DECL void Tensor::CopyFromCpu(const int64_t *data); template PD_INFER_DECL void Tensor::CopyFromCpu(const int32_t *data); template PD_INFER_DECL void Tensor::CopyFromCpu(const uint8_t *data); template PD_INFER_DECL void Tensor::CopyFromCpu(const int8_t *data); +template PD_INFER_DECL void Tensor::CopyFromCpu(const float16 *data); template PD_INFER_DECL void Tensor::CopyToCpu(float *data); template PD_INFER_DECL void Tensor::CopyToCpu(int64_t *data); template PD_INFER_DECL void Tensor::CopyToCpu(int32_t *data); template PD_INFER_DECL void Tensor::CopyToCpu(uint8_t *data); template PD_INFER_DECL void Tensor::CopyToCpu(int8_t *data); +template PD_INFER_DECL void Tensor::CopyToCpu(float16 *data); template PD_INFER_DECL float *Tensor::data(PlaceType *place, int *size) const; diff --git a/paddle/fluid/inference/api/paddle_tensor.h b/paddle/fluid/inference/api/paddle_tensor.h index fa3067b62d65a3..1f813d52ef5e76 100644 --- a/paddle/fluid/inference/api/paddle_tensor.h +++ b/paddle/fluid/inference/api/paddle_tensor.h @@ -25,6 +25,7 @@ enum DataType { INT32, UINT8, INT8, + FLOAT16, // TODO(Superjomn) support more data types if needed. }; diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc index ecef0c350b6785..b7cf907b5db614 100644 --- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -34,6 +34,36 @@ namespace py = pybind11; +namespace pybind11 { +namespace detail { + +// Note: use same enum number of float16 in numpy. +// import numpy as np +// print np.dtype(np.float16).num # 23 +constexpr int NPY_FLOAT16_ = 23; +constexpr int NPY_UINT16_ = 4; + +// Note: Since float16 is not a builtin type in C++, we register +// paddle::platform::float16 as numpy.float16. +// Ref: https://github.com/pybind/pybind11/issues/1776 +template <> +struct npy_format_descriptor { + static py::dtype dtype() { + handle ptr = npy_api::get().PyArray_DescrFromType_(NPY_FLOAT16_); + return reinterpret_borrow(ptr); + } + static std::string format() { + // Note: "e" represents float16. + // Details at: + // https://docs.python.org/3/library/struct.html#format-characters. 
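+    // numpy reports float16 as dtype('<f2'), whose type/struct character is "e".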
+ return "e"; + } + static constexpr auto name = _("float16"); +}; + +} // namespace detail +} // namespace pybind11 + namespace paddle { namespace pybind { using paddle::AnalysisPredictor; @@ -126,6 +156,9 @@ py::dtype PaddleDTypeToNumpyDType(PaddleDType dtype) { case PaddleDType::UINT8: dt = py::dtype::of(); break; + case PaddleDType::FLOAT16: + dt = py::dtype::of(); + break; default: PADDLE_THROW(platform::errors::Unimplemented( "Unsupported data type. Now only supports INT32, INT64, UINT8 and " @@ -196,6 +229,10 @@ py::array ZeroCopyTensorToNumpy(ZeroCopyTensor &tensor) { // NOLINT case PaddleDType::FLOAT32: tensor.copy_to_cpu(static_cast(array.mutable_data())); break; + case PaddleDType::FLOAT16: + tensor.copy_to_cpu( + static_cast(array.mutable_data())); + break; case PaddleDType::UINT8: tensor.copy_to_cpu(static_cast(array.mutable_data())); break; @@ -226,6 +263,10 @@ py::array PaddleInferTensorToNumpy(paddle_infer::Tensor &tensor) { // NOLINT case PaddleDType::FLOAT32: tensor.CopyToCpu(static_cast(array.mutable_data())); break; + case PaddleDType::FLOAT16: + tensor.CopyToCpu( + static_cast(array.mutable_data())); + break; case PaddleDType::UINT8: tensor.CopyToCpu(static_cast(array.mutable_data())); break; @@ -642,6 +683,7 @@ void BindZeroCopyTensor(py::module *m) { .def("copy_from_cpu", &ZeroCopyTensorCreate) .def("copy_from_cpu", &ZeroCopyTensorCreate) .def("copy_from_cpu", &ZeroCopyTensorCreate) + .def("copy_from_cpu", &ZeroCopyTensorCreate) .def("copy_to_cpu", &ZeroCopyTensorToNumpy) .def("shape", &ZeroCopyTensor::shape) .def("set_lod", &ZeroCopyTensor::SetLoD) @@ -655,6 +697,7 @@ void BindPaddleInferTensor(py::module *m) { .def("copy_from_cpu", &PaddleInferTensorCreate) .def("copy_from_cpu", &PaddleInferTensorCreate) .def("copy_from_cpu", &PaddleInferTensorCreate) + .def("copy_from_cpu", &PaddleInferTensorCreate) .def("copy_to_cpu", &PaddleInferTensorToNumpy) .def("shape", &paddle_infer::Tensor::shape) .def("set_lod", &paddle_infer::Tensor::SetLoD) From dffb0b223339319b859207c10e389311169d3a18 Mon Sep 17 00:00:00 2001 From: Zhou Wei <1183042833@qq.com> Date: Thu, 12 Aug 2021 14:37:20 +0800 Subject: [PATCH 025/126] fix set_grad_ivar bug of Tensor.backward (#34819) --- .../fluid/imperative/gradient_accumulator.cc | 17 ++-- paddle/fluid/imperative/layer.cc | 87 ++++++++++++++----- paddle/fluid/imperative/layer.h | 11 ++- .../tests/unittests/test_imperative_basic.py | 12 ++- 4 files changed, 94 insertions(+), 33 deletions(-) diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc index 57657941ef83f3..9f08d0b73fc087 100644 --- a/paddle/fluid/imperative/gradient_accumulator.cc +++ b/paddle/fluid/imperative/gradient_accumulator.cc @@ -184,6 +184,12 @@ void TensorAdd(const framework::Variable& src, framework::Variable* dst) { auto data_type = src_tensor.type(); auto place = src_tensor.place(); + PADDLE_ENFORCE_EQ(dst_tensor->type(), data_type, + platform::errors::PreconditionNotMet( + "The data type of source tensor and destination tensor " + "should be equal, Otherwise, the calculation results " + "will be incorrect.")); + #define PADDLE_TENSOR_ADD(cpp_type) \ if (data_type == framework::DataTypeTrait::DataType()) { \ TensorAddFunctor func( \ @@ -422,9 +428,9 @@ void GradientAccumulator::AccumulateGrad() { auto* src = inner_var_->MutableVar(); auto* dst = var_->MutableVar(); if (!var_->IsEmpty()) { - VLOG(6) << "Leaf Gradient Var(" << var_->Name() - << ") has been calculated by previous graph, will accumulate on " - "previous 
graph."; + VLOG(6) << "Leaf Var(" << var_->Name() + << ")'s Gradient has been initizlized, will accumulate on " + "previous gradient."; if (dst->IsType()) { if (src->IsType()) { TensorAdd(*src, dst); @@ -444,8 +450,9 @@ void GradientAccumulator::AccumulateGrad() { "Only support LoDTensor and SelectedRows for gradient var")); } } else { - VLOG(6) << "Leaf Gradient Var(" << var_->Name() - << ") has not been initialized, not accumulate. Just move"; + VLOG(6) + << "Leaf Var(" << var_->Name() + << ")'s Gradient has not been initialized, not accumulate. Just move"; *(dst) = std::move(*src); var_->SetType(inner_var_->Type()); var_->SetDataType(inner_var_->DataType()); diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index 6e28ecd9971abc..53ae5b8127fdba 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -277,32 +277,73 @@ std::shared_ptr VarBase::NewVarBase(const platform::Place& dst_place, } void VarBase::CopyFrom(const VarBase& src, const bool blocking) { - if (SharedVar()->IsEmpty()) { - VLOG(3) << "deep copy Variable from " << src.Name() << " to " << Name(); - SetPersistable(src.Persistable()); + if (src.SharedVar()->IsEmpty()) { + return; + } + + VLOG(3) << "Deep copy Tensor from " << src.Name() << " to " << Name(); + if (Var().IsInitialized()) { + PADDLE_ENFORCE_EQ(DataType(), src.DataType(), + platform::errors::PreconditionNotMet( + "Tensor %s has different data type with Tensor %s, " + "Tensor Copy cannot be performed!", + Name(), src.Name())); + PADDLE_ENFORCE_EQ(Type(), src.Type(), + platform::errors::PreconditionNotMet( + "Tensor %s has different type with Tensor %s, Tensor " + "Copy cannot be performed!", + Name(), src.Name())); + } else { SetDataType(src.DataType()); SetType(src.Type()); - SetOverridedStopGradient(src.OverridedStopGradient()); - if (!src.SharedVar()->IsEmpty()) { - const platform::Place& place = src.Place(); - if (src.Var().IsType()) { - auto& src_tensor = src.Var().Get(); - auto* dst_tensor = MutableVar()->GetMutable(); - dst_tensor->set_lod(src_tensor.lod()); - framework::TensorCopy(src_tensor, place, dst_tensor); - } else if (src.Var().IsType()) { - auto& src_selected_rows = src.Var().Get(); - auto* dst_selected_rows = - MutableVar()->GetMutable(); - dst_selected_rows->set_height(src_selected_rows.height()); - dst_selected_rows->set_rows(src_selected_rows.rows()); - framework::TensorCopy(src_selected_rows.value(), place, - dst_selected_rows->mutable_value()); - } - if (blocking) { - platform::DeviceContextPool::Instance().Get(place)->Wait(); - } + SetPersistable(src.Persistable()); + InnerSetOverridedStopGradient(src.OverridedStopGradient()); + } + + platform::Place place = src.Place(); + if (src.Var().IsType()) { + auto& src_tensor = src.Var().Get(); + auto* dst_tensor = MutableVar()->GetMutable(); + if (dst_tensor && dst_tensor->IsInitialized()) { + PADDLE_ENFORCE_EQ(dst_tensor->dims(), src_tensor.dims(), + platform::errors::PreconditionNotMet( + "Tensor %s has different dims with Tensor %s, " + "Tensor Copy cannot be performed!", + Name(), src.Name())); + PADDLE_ENFORCE_EQ(dst_tensor->lod(), src_tensor.lod(), + platform::errors::PreconditionNotMet( + "Tensor %s has different dims with Tensor %s, " + "Tensor Copy cannot be performed!", + Name(), src.Name())); + place = Place(); + } else { + dst_tensor->set_lod(src_tensor.lod()); + dst_tensor->Resize(src_tensor.dims()); + } + framework::TensorCopy(src_tensor, place, dst_tensor); + } else if (src.Var().IsType()) { + auto& src_selected_rows = 
src.Var().Get(); + auto* dst_selected_rows = + MutableVar()->GetMutable(); + dst_selected_rows->set_height(src_selected_rows.height()); + dst_selected_rows->set_rows(src_selected_rows.rows()); + + auto& src_tensor = src_selected_rows.value(); + auto* dst_tensor = dst_selected_rows->mutable_value(); + if (dst_tensor && dst_tensor->IsInitialized()) { + PADDLE_ENFORCE_EQ(dst_tensor->dims(), src_tensor.dims(), + platform::errors::PreconditionNotMet( + "Tensor %s has different dims with Tensor %s, " + "Tensor Copy cannot be performed!", + Name(), src.Name())); + place = Place(); + } else { + dst_tensor->Resize(src_tensor.dims()); } + framework::TensorCopy(src_tensor, place, dst_tensor); + } + if (blocking) { + platform::DeviceContextPool::Instance().Get(place)->Wait(); } } diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h index 56e16ba199707c..16580627ed1964 100644 --- a/paddle/fluid/imperative/layer.h +++ b/paddle/fluid/imperative/layer.h @@ -110,6 +110,7 @@ class VarBase { void SetGradVarBase(const VarBase& grad_var) { MutableGradVarBase()->CopyFrom(grad_var, true); + MutableGradVarBase()->SharedVar()->SetIsEmpty(false); } const std::shared_ptr& MutableGradVarBase() { @@ -142,6 +143,8 @@ class VarBase { return grad_var_->MutableVar(); } + bool IsLeaf() const { return var_->IsLeaf(); } + void SetOverridedStopGradient(bool stop_gradient) { var_->SetOverridedStopGradient(stop_gradient); if (grad_var_) { @@ -151,10 +154,8 @@ class VarBase { bool OverridedStopGradient() const { return var_->OverridedStopGradient(); } - bool IsLeaf() const { return var_->IsLeaf(); } - void InnerSetOverridedStopGradient(bool stop_gradient) { - if (var_->InnerOverridedStopGradient() == -1) { + if (InnerOverridedStopGradient() == -1) { var_->InnerSetOverridedStopGradient(stop_gradient); if (grad_var_) { grad_var_->InnerSetOverridedStopGradient(stop_gradient); @@ -162,6 +163,10 @@ class VarBase { } } + int InnerOverridedStopGradient() const { + return var_->InnerOverridedStopGradient(); + } + void SetPersistable(bool persistable) { var_->SetPersistable(persistable); } bool Persistable() const { return var_->Persistable(); } diff --git a/python/paddle/fluid/tests/unittests/test_imperative_basic.py b/python/paddle/fluid/tests/unittests/test_imperative_basic.py index 1cdb57c540ac4d..3aed1af59795a6 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_basic.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_basic.py @@ -41,7 +41,6 @@ def forward(self, inputs): class MLP(fluid.Layer): def __init__(self, input_size): super(MLP, self).__init__() - self._linear1 = None self._linear1 = Linear( input_size, 3, @@ -607,12 +606,21 @@ def test_mlp(sort_sum_gradient): mlp2.clear_gradients() self.assertTrue(np.array_equal(clear_loss.grad.numpy(), [1])) - if ((batch_id + 1) % 10) == 0: + if ((batch_id + 1) % 10) % 2 == 0: mlp1.clear_gradients() expected_weight1_grad = 0. expected_bias1_grad = 0. expected_weight2_grad = 0. expected_bias2_grad = 0. + elif ((batch_id + 1) % 10) % 2 == 1: + mlp1.clear_gradients() + mlp1._linear1.weight._set_grad_ivar( + paddle.ones([input_size, 3])) + mlp1._linear2.weight._set_grad_ivar(paddle.ones([3, 4])) + expected_weight1_grad = 1. + expected_bias1_grad = 0. + expected_weight2_grad = 1. + expected_bias2_grad = 0. 
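+                # after clear_gradients(), _set_grad_ivar re-seeds the weight
+                # grads with ones, which the expected_* values above reflect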
with fluid.dygraph.guard(): test_single_api(False) From dc62a227d4b22a1f81e6476b8caaf7ddb8850daa Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Thu, 12 Aug 2021 18:58:51 +0800 Subject: [PATCH 026/126] Revert "[oneDNN] Fix to issue #34554 (#34623)" (#34838) This reverts commit 0a5c99e85892b5c1d6603788a5a635b9528df042. --- .../mkldnn/elementwise_mkldnn_op.h | 19 +- .../mkldnn/elementwise_mul_mkldnn_op.cc | 10 +- .../operators/mkldnn/activation_mkldnn_op.cc | 11 +- .../operators/mkldnn/caching_tests.cmake | 2 +- .../fluid/operators/mkldnn/scale_mkldnn_op.cc | 8 +- .../operators/mkldnn/softmax_mkldnn_op.cc | 105 ++-- .../operators/mkldnn/test_mkldnn_caching.cc | 84 ++-- paddle/fluid/platform/mkldnn_reuse.h | 476 ++++++------------ 8 files changed, 279 insertions(+), 436 deletions(-) diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h index ffcdc079985fa6..ddad70a6a5f31c 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h @@ -47,24 +47,13 @@ class EltwiseMKLDNNKernel : public framework::OpKernel { float scale_o = ctx.Attr("Scale_out"); int axis = ctx.Attr("axis"); - platform::BinaryMKLDNNHandler handler(BINARY_OP, axis, mkldnn_engine, - ctx.GetPlace(), x, y, z, scale_x, - scale_y, scale_o); + platform::BinaryMKLDNNHandler handler( + BINARY_OP, axis, dev_ctx, mkldnn_engine, ctx.GetPlace(), x, y, z, + scale_x, scale_y, scale_o, ctx.OutputName("Out")); const auto src_x_memory = handler.AcquireSrcMemory(x); const auto src_y_memory = handler.AcquireSecondSrcMemory(y); - // (jczaja) For Inplace src and dst should be the same memory object. - // So x should share buffer with z. But UT mechanics is testing inplace - // execution for this op not checking that x can be bradcasted to match in - // shape y tensor. - // This is wrong as when x is to be broadcasted then z(out) will match the - // shape of y which is bigger than x. Hence if x is smaller in shape than z - // and they share a buffer (of - // shape x) then this buffer is not big enough to hold result of elementwise - // operation. - auto dst_memory = (x->numel() == z->numel() && x->IsSharedBufferWith(*z)) - ? 
src_x_memory - : handler.AcquireDstMemory(z); + const auto dst_memory = handler.AcquireDstMemory(z); const auto binary_prim = handler.AcquireForwardPrimitive(); diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc index af4aab8047888a..1c246e8d189370 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc @@ -48,8 +48,9 @@ class EltwiseMulMKLDNNGradKernel : public ElemwiseGradKernel { if (dx) { // dx = dout*y platform::BinaryMKLDNNHandler handler( - dnnl::algorithm::binary_mul, axis, mkldnn_engine, ctx.GetPlace(), - dout, y, dx, 1.0f, 1.0f, 1.0f); + dnnl::algorithm::binary_mul, axis, dev_ctx, mkldnn_engine, + ctx.GetPlace(), dout, y, dx, 1.0f, 1.0f, 1.0f, + ctx.InputName(framework::GradVarName("Out"))); const auto src_dout_memory = handler.AcquireSrcMemory(dout); const auto src_y_memory = handler.AcquireSecondSrcMemory(y); @@ -74,8 +75,9 @@ class EltwiseMulMKLDNNGradKernel : public ElemwiseGradKernel { // Handler is having nullptr passed instead of output tensor as // we want Dst buffer to be allocated by oneDNN not to use Tensor platform::BinaryMKLDNNHandler handler( - dnnl::algorithm::binary_mul, axis, mkldnn_engine, ctx.GetPlace(), - dout, x, nullptr, 1.0f, 1.0f, 1.0f); + dnnl::algorithm::binary_mul, axis, dev_ctx, mkldnn_engine, + ctx.GetPlace(), dout, x, nullptr, 1.0f, 1.0f, 1.0f, + ctx.InputName(framework::GradVarName("Out"))); const auto src_dout_memory = handler.AcquireSrcMemory(dout); const auto src_x_memory = handler.AcquireSecondSrcMemory(x); diff --git a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc index d992890adeec3e..3b92d2e2d88913 100644 --- a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc @@ -79,15 +79,15 @@ void eltwise_forward(const framework::ExecutionContext &ctx, paddle::platform::errors::PreconditionNotMet( "Operator DNNL eletwise_forward must use CPUPlace")); auto &dev_ctx = ctx.template device_context(); - const auto &mkldnn_engine = dev_ctx.GetEngine(); const auto *x = ctx.Input("X"); auto *y = ctx.Output("Out"); bool is_inplaced = x->IsSharedBufferWith(*y); - platform::ActivationMKLDNNHandler handler(algorithm, ctx, mkldnn_engine, - ctx.GetPlace(), x); + platform::ActivationMKLDNNHandler handler(algorithm, ctx, dev_ctx, + ctx.GetPlace(), x, + ctx.InputName("X"), is_inplaced); auto src_memory_p = handler.AcquireSrcMemory(x); auto dst_memory_p = is_inplaced ? 
src_memory_p : handler.AcquireDstMemory(y); @@ -106,14 +106,13 @@ template void eltwise_grad(const framework::ExecutionContext &ctx, mkldnn::algorithm algorithm) { auto &dev_ctx = ctx.template device_context(); - const auto &mkldnn_engine = dev_ctx.GetEngine(); const auto *x = ctx.Input("X"); const auto *diff_y = ctx.Input(framework::GradVarName("Out")); auto *diff_x = ctx.Output(framework::GradVarName("X")); - platform::ActivationMKLDNNHandler handler(algorithm, ctx, mkldnn_engine, - ctx.GetPlace(), x, diff_y); + platform::ActivationMKLDNNHandler handler( + algorithm, ctx, dev_ctx, ctx.GetPlace(), x, diff_y, ctx.InputName("X")); auto src_memory_p = handler.AcquireBackwardSrcMemory(x); auto diff_dst_memory_p = handler.AcquireDiffDstMemory(diff_y); diff --git a/paddle/fluid/operators/mkldnn/caching_tests.cmake b/paddle/fluid/operators/mkldnn/caching_tests.cmake index d7c295672e0021..4130c295b203eb 100644 --- a/paddle/fluid/operators/mkldnn/caching_tests.cmake +++ b/paddle/fluid/operators/mkldnn/caching_tests.cmake @@ -1 +1 @@ -cc_test(test_mkldnn_caching SRCS mkldnn/test_mkldnn_caching.cc DEPS op_registry elementwise_mul_op elementwise_add_op activation_op softmax_op conv_op im2col vol2col softmax scope device_context enforce) +cc_test(test_mkldnn_caching SRCS mkldnn/test_mkldnn_caching.cc DEPS op_registry elementwise_mul_op elementwise_add_op activation_op softmax_op softmax scope device_context enforce) diff --git a/paddle/fluid/operators/mkldnn/scale_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/scale_mkldnn_op.cc index 84ac14d04b85b3..ae17048b5d568b 100644 --- a/paddle/fluid/operators/mkldnn/scale_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/scale_mkldnn_op.cc @@ -29,7 +29,6 @@ class ScaleMKLDNNKernel : public framework::OpKernel { void RunKernel(const framework::ExecutionContext& ctx) const { const auto& dev_ctx = ctx.template device_context(); - const auto& mkldnn_engine = dev_ctx.GetEngine(); auto* x = ctx.Input("X"); auto* out = ctx.Output("Out"); @@ -37,12 +36,11 @@ class ScaleMKLDNNKernel : public framework::OpKernel { bool is_inplaced = x->IsSharedBufferWith(*out); platform::ActivationMKLDNNHandler handler( - mkldnn::algorithm::eltwise_linear, ctx, mkldnn_engine, ctx.GetPlace(), - x); + mkldnn::algorithm::eltwise_linear, ctx, dev_ctx, ctx.GetPlace(), x, + ctx.InputName("X"), is_inplaced); auto src_memory_p = handler.AcquireSrcMemory(x); - auto dst_memory_p = - is_inplaced ? 
src_memory_p : handler.AcquireDstMemory(out); + auto dst_memory_p = handler.AcquireDstMemory(out); auto activation_p = handler.AcquireForwardPrimitive(); auto& astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream(); diff --git a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc index b0f27719bf9adc..e065800e4d1c71 100644 --- a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc @@ -32,56 +32,69 @@ using platform::to_void_cast; template class SoftmaxMKLDNNHandler - : public platform::MKLDNNHandlerNoCachingT { + : public platform::MKLDNNHandlerT { public: - SoftmaxMKLDNNHandler(const mkldnn::engine mkldnn_engine, + SoftmaxMKLDNNHandler(const MKLDNNDeviceContext& dev_ctx, + const mkldnn::engine mkldnn_engine, platform::Place cpu_place, const Tensor* input, - Tensor* output, const int axis) - : platform::MKLDNNHandlerNoCachingT( - mkldnn_engine, cpu_place) { - PADDLE_ENFORCE_EQ( - input->dims(), output->dims(), - platform::errors::InvalidArgument( - "The shape of input and output tensor must be identical.")); - - auto softmax_tz = framework::vectorize(input->dims()); - auto md = memory::desc(softmax_tz, platform::MKLDNNGetDataType(), - input->format()); - - this->AcquireForwardPrimitiveDescriptor(prop_kind::forward_scoring, md, - axis); + Tensor* output, const int axis, + const std::string uniq_name, bool is_inplaced) + : platform::MKLDNNHandlerT( + dev_ctx, mkldnn_engine, cpu_place, + // Softmax may be inplace then uniq_name is no longer unique + is_inplaced ? platform::CreateKey( + dev_ctx, framework::vectorize(input->dims()), + axis, uniq_name) + : platform::CreateKey( + dev_ctx, framework::vectorize(input->dims()), + uniq_name)) { + if (!this->isCached()) { + PADDLE_ENFORCE_EQ( + input->dims(), output->dims(), + platform::errors::InvalidArgument( + "The shape of input and output tensor must be identical.")); + + auto softmax_tz = framework::vectorize(input->dims()); + auto md = memory::desc(softmax_tz, platform::MKLDNNGetDataType(), + input->format()); + + this->AcquireForwardPrimitiveDescriptor(prop_kind::forward_scoring, md, + axis); + } } SoftmaxMKLDNNHandler(const framework::ExecutionContext& ctx, - const mkldnn::engine mkldnn_engine, + const MKLDNNDeviceContext& dev_ctx, platform::Place cpu_place, const Tensor* out, const Tensor* out_grad, Tensor* in_x_grad, const std::string& unique_name) - : platform::MKLDNNHandlerNoCachingT( - mkldnn_engine, cpu_place) { - PADDLE_ENFORCE_EQ(out_grad->dims(), in_x_grad->dims(), - platform::errors::InvalidArgument( - "The shape of softmax_grad's input " - "and output must be identical, but shapes differ, " - "out_grad: %s in_grad: %s", - out_grad->dims(), in_x_grad->dims())); - - auto dims = out_grad->dims(); // input and output share the same shape - const int axis = CanonicalAxis(ctx.Attr("axis"), dims.size()); - auto softmax_tz = framework::vectorize(dims); - - auto data_softmax_md = MKLDNNMemDesc( - softmax_tz, platform::MKLDNNGetDataType(), out->format()); - auto diff_softmax_md = MKLDNNMemDesc( - softmax_tz, platform::MKLDNNGetDataType(), out_grad->format()); - - this->AcquireForwardPrimitiveDescriptor(prop_kind::forward_scoring, - data_softmax_md, axis); - this->AcquireBackwardPrimitiveDescriptor(diff_softmax_md, data_softmax_md, - axis); + : platform::MKLDNNHandlerT( + dev_ctx, dev_ctx.GetEngine(), cpu_place, + platform::CreateKey(dev_ctx, framework::vectorize(out->dims()), + unique_name)) { + if (!this->isBwdCached()) { 
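+      // Primitive descriptors are re-created only on a cache miss; repeated
+      // calls with the same key reuse the objects already stored in the
+      // DeviceContext blob map.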
+ PADDLE_ENFORCE_EQ( + out_grad->dims(), in_x_grad->dims(), + platform::errors::InvalidArgument("The shape of softmax_grad's input " + "and output must be identical.")); + + auto dims = out_grad->dims(); // input and output share the same shape + const int axis = CanonicalAxis(ctx.Attr("axis"), dims.size()); + auto softmax_tz = framework::vectorize(dims); + + auto data_softmax_md = MKLDNNMemDesc( + softmax_tz, platform::MKLDNNGetDataType(), out->format()); + auto diff_softmax_md = MKLDNNMemDesc( + softmax_tz, platform::MKLDNNGetDataType(), out_grad->format()); + + this->AcquireForwardPrimitiveDescriptor(prop_kind::forward_scoring, + data_softmax_md, axis); + this->AcquireBackwardPrimitiveDescriptor(diff_softmax_md, data_softmax_md, + axis); + } } }; @@ -98,8 +111,9 @@ class SoftmaxMKLDNNKernel : public paddle::framework::OpKernel { const int axis = CanonicalAxis(ctx.Attr("axis"), input->dims().size()); - SoftmaxMKLDNNHandler handler(mkldnn_engine, ctx.GetPlace(), input, - output, axis); + SoftmaxMKLDNNHandler handler(dev_ctx, mkldnn_engine, ctx.GetPlace(), + input, output, axis, ctx.OutputName("Out"), + is_inplaced); auto softmax_src_memory_p = handler.AcquireSrcMemory(input); // For Inplace src and and dst are the same memory object @@ -135,12 +149,11 @@ class SoftmaxMKLDNNGradKernel : public paddle::framework::OpKernel { paddle::platform::errors::PreconditionNotMet( "Operator DNNL SoftmaxGrad must use CPUPlace")); auto& dev_ctx = ctx.template device_context(); - const auto& mkldnn_engine = dev_ctx.GetEngine(); const Tensor* output = ctx.Input("Out"); auto* out_grad = ctx.template Input(framework::GradVarName("Out")); auto* in_x_grad = ctx.template Output(framework::GradVarName("X")); - SoftmaxMKLDNNHandler handler(ctx, mkldnn_engine, ctx.GetPlace(), output, + SoftmaxMKLDNNHandler handler(ctx, dev_ctx, ctx.GetPlace(), output, out_grad, in_x_grad, ctx.InputName("Out")); auto dst_memory_p = handler.AcquireDstMemory(output); diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc index 7251653793f899..cad4f47ec14022 100644 --- a/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc +++ b/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc @@ -33,8 +33,6 @@ USE_OP(relu); USE_OP_DEVICE_KERNEL(relu, MKLDNN); USE_OP(softmax); USE_OP_DEVICE_KERNEL(softmax, MKLDNN); -USE_OP(conv2d); -USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, FP32); namespace paddle { namespace operators { @@ -66,19 +64,16 @@ class CacheTester { template void RunOperator(const platform::Place &place, const std::string &op_type, - const framework::DDim &dims, const std::string &first_input) { + const framework::DDim &dims, const std::string &output_name, + bool inplace = false) { framework::Scope scope; std::map num_inputs = {{"softmax", 1}, {"relu", 1}, - {"conv2d", 2}, {"elementwise_add", 2}, {"elementwise_mul", 2}}; - std::string first_input_var_name = (op_type == "conv2d") ? "Input" : "X"; - std::string second_input_var_name = (op_type == "conv2d") ? "Filter" : "Y"; - std::string output_var_name = (op_type == "conv2d") ? "Output" : "Out"; - std::string output_name = "output"; + std::string first_input = inplace == true ? output_name : "x"; std::vector input_names = { {first_input, scope.Var(first_input)->GetMutable()}, @@ -118,40 +113,71 @@ void RunOperator(const platform::Place &place, const std::string &op_type, auto &pool = platform::DeviceContextPool::Instance(); - auto op = - num_inputs[op_type] > 1 - ? 
framework::OpRegistry::CreateOp( - op_type, {{first_input_var_name, {first_input}}, - {second_input_var_name, {"x1"}}}, - {{output_var_name, {output_name}}}, {{"use_mkldnn", {true}}}) - : framework::OpRegistry::CreateOp( - op_type, {{first_input_var_name, {first_input}}}, - {{output_var_name, {output_name}}}, {{"use_mkldnn", {true}}}); + auto op = num_inputs[op_type] > 1 + ? framework::OpRegistry::CreateOp( + op_type, {{"X", {first_input}}, {"Y", {"x1"}}}, + {{"Out", {output_name}}}, {{"use_mkldnn", {true}}}) + : framework::OpRegistry::CreateOp( + op_type, {{"X", {first_input}}}, {{"Out", {output_name}}}, + {{"use_mkldnn", {true}}}); op->Run(scope, place); pool.Get(place)->Wait(); } -TEST(test_conv2d_reuse_cache, cpu_place) { - framework::DDim dims({1, 16, 32, 64}); +TEST(test_softmax_reuse_cache, cpu_place) { + framework::DDim dims({32, 64}); platform::CPUPlace p; CacheTester ct; - RunOperator(p, "conv2d", dims, "input_signal"); - RunOperator(p, "conv2d", dims, "input_signal"); - PADDLE_ENFORCE_EQ(ct.Analyze(9), true, + RunOperator(p, "softmax", dims, "softmax_out"); + RunOperator(p, "softmax", dims, "softmax_out"); + PADDLE_ENFORCE_EQ(ct.Analyze(4), true, platform::errors::InvalidArgument( - "Invalid number of cached oneDNN objects")); + "Wrong number of cached oneDNN objects")); } -TEST(test_conv2d_noreuse_cache, cpu_place) { - framework::DDim dims({1, 16, 32, 64}); +TEST(test_softmax_noreuse_cache, cpu_place) { + framework::DDim dims({32, 64}); platform::CPUPlace p; CacheTester ct; - RunOperator(p, "conv2d", dims, "input_signal"); - RunOperator(p, "conv2d", dims, "input_signal2"); - PADDLE_ENFORCE_EQ(ct.Analyze(18), true, + RunOperator(p, "softmax", dims, "softmax_out"); + RunOperator(p, "softmax", dims, "softmax_out2"); + PADDLE_ENFORCE_EQ(ct.Analyze(8), true, platform::errors::InvalidArgument( - "Invalid number of cached oneDNN objects")); + "Wrong number of cached oneDNN objects")); +} + +TEST(test_softmax_inplace_cache, cpu_place) { + framework::DDim dims({32, 64}); + platform::CPUPlace p; + CacheTester ct; + RunOperator(p, "softmax", dims, "softmax_out"); + RunOperator(p, "softmax", dims, "softmax_out", true); + PADDLE_ENFORCE_EQ(ct.Analyze(7), true, + platform::errors::InvalidArgument( + "Wrong number of cached oneDNN objects")); +} + +TEST(test_relu_inplace_cache, cpu_place) { + framework::DDim dims({32, 64}); + platform::CPUPlace p; + CacheTester ct; + RunOperator(p, "relu", dims, "relu_out"); + RunOperator(p, "relu", dims, "relu_out", true); + PADDLE_ENFORCE_EQ(ct.Analyze(7), true, + platform::errors::InvalidArgument( + "Wrong number of cached oneDNN objects")); +} + +TEST(test_elementwise_add_reuse_cache, cpu_place) { + framework::DDim dims({32, 64}); + platform::CPUPlace p; + CacheTester ct; + RunOperator(p, "elementwise_add", dims, "elementwise_add_out"); + RunOperator(p, "relu", dims, "elementwise_add_out", true); + PADDLE_ENFORCE_EQ(ct.Analyze(8), true, + platform::errors::InvalidArgument( + "Wrong number of cached oneDNN objects")); } } // namespace operators diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index 95b8e0c610b1d4..f63d45d7ff6ae6 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -34,211 +34,6 @@ using framework::Tensor; using user_function = std::function(const float*)>; using memory = mkldnn::memory; -template -class MKLDNNHandlerNoCachingT { - public: - MKLDNNHandlerNoCachingT(mkldnn::engine engine, platform::Place cpu_place) - : engine_(engine), place_(cpu_place), 
fwd_pd_(nullptr), bwd_pd_(nullptr) { - platform::MKLDNNDeviceContext::tls().log_lib_version(); - } - - std::shared_ptr AcquireForwardPrimitive() { - return std::make_shared(*fwd_pd_); - } - - std::shared_ptr AcquireBackwardPrimitive() { - return std::make_shared(*bwd_pd_); - } - - std::shared_ptr AcquireBackwardWeightsPrimitive() { - PADDLE_ENFORCE_NOT_NULL( - bwd_w_pd_, platform::errors::Unavailable("BWD_PD should be set when " - "getting BWD prim .")); - return std::make_shared(*bwd_w_pd_); - } - - std::shared_ptr AcquireSrcMemory( - const framework::Tensor* input) { - const T* input_data = input->data(); - return this->AcquireMemoryFromPrimitive(fwd_pd_->src_desc(), - to_void_cast(input_data)); - } - - template - std::shared_ptr AcquireDstMemory(framework::Tensor* output) { - T_out* ptr = - output->mutable_data(place_, fwd_pd_->dst_desc().get_size()); - return this->AcquireMemoryFromPrimitive(fwd_pd_->dst_desc(), ptr); - } - - template - std::shared_ptr AcquireDstMemory(void) { - return this->AcquireMemoryFromPrimitive(fwd_pd_->dst_desc()); - } - - template - std::shared_ptr AcquireDstMemory( - const framework::Tensor* output) { - const T_out* output_data = output->data(); - return this->AcquireMemoryFromPrimitive(bwd_pd_->dst_desc(), - to_void_cast(output_data)); - } - - std::shared_ptr AcquireDiffDstMemory( - const framework::Tensor* diffdst) { - const T* ptr = diffdst->data(); - return this->AcquireMemoryFromPrimitive(bwd_pd_->diff_dst_desc(), - to_void_cast(ptr)); - } - - std::shared_ptr AcquireDiffSrcMemory( - framework::Tensor* diffsrc) { - T* ptr = - diffsrc->mutable_data(place_, bwd_pd_->diff_src_desc().get_size()); - return this->AcquireMemoryFromPrimitive(bwd_pd_->diff_src_desc(), ptr); - } - - // Buffer of given Tensor is used for oneDNN computation - std::shared_ptr AcquireDiffWeightsMemory( - framework::Tensor* diff_weights) { - PADDLE_ENFORCE_NOT_NULL( - bwd_w_pd_, - platform::errors::Unavailable( - "BWD_W_PD should be set when getting BWD grad of weights.")); - T* ptr = diff_weights->mutable_data( - place_, bwd_w_pd_->diff_weights_desc().get_size()); - return this->AcquireMemoryFromPrimitive(bwd_w_pd_->diff_weights_desc(), - ptr); - } - - // Buffer is allocated by oneDNN to store computation results - std::shared_ptr AcquireDiffWeightsMemory(void) { - PADDLE_ENFORCE_NOT_NULL( - bwd_w_pd_, - platform::errors::Unavailable( - "BWD_W_PD should be set when getting BWD grad of weights.")); - return this->AcquireMemoryFromPrimitive(bwd_w_pd_->diff_weights_desc()); - } - - protected: - // If your primitive descriptor requires attributes, pass them as a - // first argument and paramters to descriptor constructor in the following - // arguments. Otherwise, all arguments will be forwarded to descriptor - // constructor, including the first one. - template - void AcquireForwardPrimitiveDescriptor(Arg&& first_arg, Args&&... args) { - CreateForwardPrimitiveDescriptor(first_arg, std::forward(args)...); - } - - // Using sfinae to specialise variadic function. Workaround for not having - // if constexpr in C++ 11. - template - typename std::enable_if::type, - dnnl::primitive_attr>::value>::type - CreateForwardPrimitiveDescriptor(First&& first, Args&&... args) { - auto fwd_desc = typename TForward::desc(std::forward(args)...); - fwd_pd_ = std::make_shared( - fwd_desc, first, engine_); - } - - template - typename std::enable_if::type, - dnnl::primitive_attr>::value>::type - CreateForwardPrimitiveDescriptor(First&& first, Args&&... 
args) { - auto fwd_desc = typename TForward::desc(std::forward(first), - std::forward(args)...); - fwd_pd_ = - std::make_shared(fwd_desc, engine_); - } - - template - void AcquireBackwardPrimitiveDescriptor(Args&&... args) { - // fwd_pd_ is set during grad by calling - // AcquireForwardPrimitiveDescriptor - PADDLE_ENFORCE_NOT_NULL(fwd_pd_, - platform::errors::Unavailable( - "Get MKLDNN Forward primitive %s failed.")); - auto bwd_desc = typename TBackward::desc(std::forward(args)...); - bwd_pd_ = std::make_shared( - bwd_desc, engine_, *fwd_pd_); - } - - template - void AcquireBackwardWeightsPrimitiveDescriptor(Args&&... args) { - // fwd_pd_ is set during grad by calling - // AcquireForwardPrimitiveDescriptor - PADDLE_ENFORCE_NOT_NULL(fwd_pd_, - platform::errors::Unavailable( - "Get MKLDNN Forward primitive %s failed.")); - auto bwd_desc = - typename TBackward_params::desc(std::forward(args)...); - bwd_w_pd_ = std::make_shared( - bwd_desc, engine_, *fwd_pd_); - } - - std::shared_ptr AcquireMemoryFromPrimitive( - mkldnn::memory::desc md, void* ptr) { - return std::make_shared(md, engine_, ptr); - } - - std::shared_ptr AcquireMemoryFromPrimitive( - mkldnn::memory::desc md) { - return std::make_shared(md, engine_); - } - - void AcquireReorder(const std::shared_ptr& user_memory_p, - const std::shared_ptr& target_memory_p) { - auto reorder_p = - std::make_shared(*user_memory_p, *target_memory_p); - - auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - - platform::RecordEvent record_reorder("int_reorder", - platform::EventRole::kUniqueOp); - reorder_p->execute(astream, {{MKLDNN_ARG_FROM, *user_memory_p}, - {MKLDNN_ARG_TO, *target_memory_p}}); - astream.wait(); - } - - template - std::shared_ptr AcquireMemoryWithReorder( - const mkldnn::memory::desc& user_md, - const mkldnn::memory::desc& target_md, void* ptr, - const std::string& suffix, bool is_persistent = false, - std::function(const F*)> custom_reorder_func = {}) { - std::shared_ptr target_memory_p; - if (custom_reorder_func) { - auto reordered_data = - custom_reorder_func(reinterpret_cast(ptr)); - ptr = reinterpret_cast(reordered_data.get()); - } - auto user_memory_p = std::make_shared(user_md, engine_, ptr); - if (user_md != target_md) { - target_memory_p = std::make_shared(target_md, engine_); - auto reorder_p = - std::make_shared(*user_memory_p, *target_memory_p); - - auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - platform::RecordEvent record_reorder("int_reorder", - platform::EventRole::kUniqueOp); - reorder_p->execute(astream, {{MKLDNN_ARG_FROM, *user_memory_p}, - {MKLDNN_ARG_TO, *target_memory_p}}); - astream.wait(); - } else { - target_memory_p = user_memory_p; - } - return target_memory_p; - } - - mkldnn::engine engine_; - platform::Place place_; - std::shared_ptr fwd_pd_; - std::shared_ptr bwd_pd_; - std::shared_ptr bwd_w_pd_; -}; - template @@ -284,7 +79,7 @@ class MKLDNNHandlerT { std::static_pointer_cast(dev_ctx_.GetBlob(key_p)); if (backward_p == nullptr) { PADDLE_ENFORCE_NOT_NULL(bwd_w_pd_, platform::errors::Unavailable( - "BWD_PD should be set when " + "Error: BWD_PD should be set when " "getting BWD prim witk key: %s .", key_p)); backward_p = std::make_shared(*bwd_w_pd_); @@ -343,7 +138,7 @@ class MKLDNNHandlerT { PADDLE_ENFORCE_NOT_NULL( bwd_w_pd_, platform::errors::Unavailable( - "BWD_W_PD should be set when getting BWD grad of weights.")); + "Error: BWD_W_PD should be set when getting BWD grad of weights.")); T* ptr = diff_weights->mutable_data( place_, 
bwd_w_pd_->diff_weights_desc().get_size()); return this->AcquireMemoryFromPrimitive(bwd_w_pd_->diff_weights_desc(), ptr, @@ -355,7 +150,7 @@ class MKLDNNHandlerT { PADDLE_ENFORCE_NOT_NULL( bwd_w_pd_, platform::errors::Unavailable( - "BWD_W_PD should be set when getting BWD grad of weights.")); + "Error: BWD_W_PD should be set when getting BWD grad of weights.")); return this->AcquireMemoryFromPrimitive(bwd_w_pd_->diff_weights_desc(), "@diff_wei_mem_p"); } @@ -794,70 +589,70 @@ class MKLDNNHandler { }; template -class BinaryMKLDNNHandler - : public platform::MKLDNNHandlerNoCachingT { +class BinaryMKLDNNHandler : public platform::MKLDNNHandlerT { public: BinaryMKLDNNHandler(const dnnl::algorithm algo, const int axis, + const MKLDNNDeviceContext& dev_ctx, const mkldnn::engine engine, platform::Place cpu_place, const Tensor* x, const Tensor* y, Tensor* z, - float scale_x, float scale_y, float scale_z) - : platform::MKLDNNHandlerNoCachingT(engine, cpu_place) { - PADDLE_ENFORCE_EQ( - x->layout(), DataLayout::kMKLDNN, - platform::errors::InvalidArgument( - "Wrong layout set for X tensor. Expected: %d (kMKLDNN), Actual: %d", - DataLayout::kMKLDNN, x->layout())); - PADDLE_ENFORCE_NE(x->format(), MKLDNNMemoryFormat::undef, - platform::errors::InvalidArgument( - "Wrong format set for X tensor : %d (undef)", - static_cast(x->format()))); - - PADDLE_ENFORCE_EQ( - y->layout(), DataLayout::kMKLDNN, - platform::errors::InvalidArgument( - "Wrong layout set for Y tensor. Expected: %d (kMKLDNN), Actual: %d", - DataLayout::kMKLDNN, y->layout())); - PADDLE_ENFORCE_NE(y->format(), MKLDNNMemoryFormat::undef, - platform::errors::InvalidArgument( - "Wrong format set for Y tensor : %d (undef)", - static_cast(y->format()))); - - const auto src_x_tz = framework::vectorize(x->dims()); - const auto src_y_tz = framework::vectorize(y->dims()); - // if output tensor(z) is nullptr then we are computing into oneDNN - // managed buffer - auto rankdiff = x->dims().size() - y->dims().size(); - const auto dst_tz = (z == nullptr) ? (rankdiff > 0 ? src_x_tz : src_y_tz) - : framework::vectorize(z->dims()); - - auto src0_md = dnnl::memory::desc( - src_x_tz, platform::MKLDNNGetDataType(), x->format()); - auto src1_md = dnnl::memory::desc( - src_y_tz, platform::MKLDNNGetDataType(), y->format()); - if (rankdiff > 0) { // Second input is of smaller rank than first - std::vector dims1_ex(rankdiff, 1); - dims1_ex.insert(next(dims1_ex.begin(), (axis == -1 ? rankdiff : axis)), - src_y_tz.begin(), src_y_tz.end()); - src1_md = src1_md.reshape(dims1_ex); - } else if (rankdiff < 0) { // First input is of smaller than second - std::vector dims0_ex(-rankdiff, 1); - dims0_ex.insert(next(dims0_ex.begin(), (axis == -1 ? 
-rankdiff : axis)), - src_x_tz.begin(), src_x_tz.end()); - src0_md = src0_md.reshape(dims0_ex); - } - const auto dst_md = memory::desc(dst_tz, platform::MKLDNNGetDataType(), - MKLDNNMemoryFormat::any); + float scale_x, float scale_y, float scale_z, + const std::string& uniq_name) + : platform::MKLDNNHandlerT( + dev_ctx, engine, cpu_place, + platform::CreateKey(dev_ctx, framework::vectorize(x->dims()), + uniq_name)) { + if (!this->isCached()) { + PADDLE_ENFORCE_EQ( + x->layout(), DataLayout::kMKLDNN, + platform::errors::InvalidArgument("Wrong layout set for X tensor.")); + PADDLE_ENFORCE_NE( + x->format(), MKLDNNMemoryFormat::undef, + platform::errors::InvalidArgument("Wrong format set for X tensor.")); + + PADDLE_ENFORCE_EQ( + y->layout(), DataLayout::kMKLDNN, + platform::errors::InvalidArgument("Wrong layout set for Y tensor.")); + PADDLE_ENFORCE_NE( + y->format(), MKLDNNMemoryFormat::undef, + platform::errors::InvalidArgument("Wrong format set for Y tensor.")); + + const auto src_x_tz = framework::vectorize(x->dims()); + const auto src_y_tz = framework::vectorize(y->dims()); + // if output tensor(z) is nullptr then we are computing into oneDNN + // managed buffer + auto rankdiff = x->dims().size() - y->dims().size(); + const auto dst_tz = (z == nullptr) ? (rankdiff > 0 ? src_x_tz : src_y_tz) + : framework::vectorize(z->dims()); + + auto src0_md = dnnl::memory::desc( + src_x_tz, platform::MKLDNNGetDataType(), x->format()); + auto src1_md = dnnl::memory::desc( + src_y_tz, platform::MKLDNNGetDataType(), y->format()); + if (rankdiff > 0) { // Second input is of smaller rank than first + std::vector dims1_ex(rankdiff, 1); + dims1_ex.insert(next(dims1_ex.begin(), (axis == -1 ? rankdiff : axis)), + src_y_tz.begin(), src_y_tz.end()); + src1_md = src1_md.reshape(dims1_ex); + } else if (rankdiff < 0) { // First input is of smaller than second + std::vector dims0_ex(-rankdiff, 1); + dims0_ex.insert(next(dims0_ex.begin(), (axis == -1 ? -rankdiff : axis)), + src_x_tz.begin(), src_x_tz.end()); + src0_md = src0_md.reshape(dims0_ex); + } + const auto dst_md = memory::desc(dst_tz, platform::MKLDNNGetDataType(), + MKLDNNMemoryFormat::any); - auto attributes = CreateAttributes(algo, scale_x, scale_y, scale_z); - this->AcquireForwardPrimitiveDescriptor(attributes, algo, src0_md, src1_md, - dst_md); + auto attributes = CreateAttributes(algo, scale_x, scale_y, scale_z); + this->AcquireForwardPrimitiveDescriptor(attributes, algo, src0_md, + src1_md, dst_md); + } } std::shared_ptr AcquireSecondSrcMemory( const framework::Tensor* input) { const T* input_data = input->data(); - return this->AcquireMemoryFromPrimitive(this->fwd_pd_->src1_desc(), - to_void_cast(input_data)); + return this->AcquireMemoryFromPrimitive( + this->fwd_pd_->src1_desc(), to_void_cast(input_data), "@src1_mem_p"); } private: @@ -980,95 +775,111 @@ class ReductionMKLDNNHandler template class ActivationMKLDNNHandler - : public MKLDNNHandlerNoCachingT { + : public MKLDNNHandlerT { public: ActivationMKLDNNHandler(mkldnn::algorithm algorithm, const framework::ExecutionContext& ctx, - const mkldnn::engine engine, Place cpu_place, - const framework::Tensor* in_x) - : platform::MKLDNNHandlerNoCachingT(engine, - cpu_place) { - float alpha = ctx.HasAttr("alpha") ? ctx.Attr("alpha") : 0; - float beta = ctx.HasAttr("beta") ? 
ctx.Attr("beta") : 0; - // eltwise_linear means we are in scale op - if (algorithm == mkldnn::algorithm::eltwise_linear) { - bool bias_after_scale = ctx.Attr("bias_after_scale"); - auto* scale_tensor = ctx.Input("ScaleTensor"); - alpha = (scale_tensor == nullptr) ? ctx.Attr("scale") - : (float)*(scale_tensor->data()); - beta = ctx.Attr("bias"); - // if bias_after_scale == true - // out = scale*X + bias - // else - // out = scale*(X + bias) = scale*X + scale*bias - if (!bias_after_scale) beta *= alpha; - } else { - // paddle uses beta but mkldnn uses alpha for swish - if (algorithm == mkldnn::algorithm::eltwise_swish) { - std::swap(alpha, beta); - } else if (algorithm == dnnl::algorithm::eltwise_bounded_relu) { - alpha = ctx.Attr("threshold"); + const MKLDNNDeviceContext& dev_ctx, Place cpu_place, + const framework::Tensor* in_x, + const std::string& unique_name, bool is_inplaced) + : platform::MKLDNNHandlerT( + dev_ctx, dev_ctx.GetEngine(), cpu_place, + is_inplaced ? platform::CreateKey( + dev_ctx, framework::vectorize(in_x->dims()), "a", + algorithm, unique_name) + : platform::CreateKey( + dev_ctx, framework::vectorize(in_x->dims()), "a", + unique_name)) { + if (!this->isCached()) { + float alpha = ctx.HasAttr("alpha") ? ctx.Attr("alpha") : 0; + float beta = ctx.HasAttr("beta") ? ctx.Attr("beta") : 0; + // eltwise_linear means we are in scale op + if (algorithm == mkldnn::algorithm::eltwise_linear) { + bool bias_after_scale = ctx.Attr("bias_after_scale"); + auto* scale_tensor = ctx.Input("ScaleTensor"); + alpha = (scale_tensor == nullptr) ? ctx.Attr("scale") + : (float)*(scale_tensor->data()); + beta = ctx.Attr("bias"); + // if bias_after_scale == true + // out = scale*X + bias + // else + // out = scale*(X + bias) = scale*X + scale*bias + if (!bias_after_scale) beta *= alpha; + } else { + // paddle uses beta but mkldnn uses alpha for swish + if (algorithm == mkldnn::algorithm::eltwise_swish) { + std::swap(alpha, beta); + } else if (algorithm == dnnl::algorithm::eltwise_bounded_relu) { + alpha = ctx.Attr("threshold"); + } } - } - PADDLE_ENFORCE(in_x->dims().size() >= 1 || in_x->dims().size() <= 6, - platform::errors::Unimplemented( - "Input dimension size can be 1, 2, 3, 4, " - "5, or 6, but now the dimension size is", - in_x->dims().size())); + PADDLE_ENFORCE(in_x->dims().size() >= 1 || in_x->dims().size() <= 6, + platform::errors::Unimplemented( + "Input dimension size can be 1, 2, 3, 4, " + "5, or 6, but now the dimension size is", + in_x->dims().size())); - auto src_tz = framework::vectorize(in_x->dims()); - auto src_fmt = src_tz.size() == 2 ? MKLDNNMemoryFormat::nc : in_x->format(); - auto md = - mkldnn::memory::desc(src_tz, platform::MKLDNNGetDataType(), src_fmt); + auto src_tz = framework::vectorize(in_x->dims()); + auto src_fmt = + src_tz.size() == 2 ? MKLDNNMemoryFormat::nc : in_x->format(); + auto md = mkldnn::memory::desc(src_tz, platform::MKLDNNGetDataType(), + src_fmt); - this->AcquireForwardPrimitiveDescriptor(mkldnn::prop_kind::forward_training, - algorithm, md, alpha, beta); + this->AcquireForwardPrimitiveDescriptor( + mkldnn::prop_kind::forward_training, algorithm, md, alpha, beta); + } } ActivationMKLDNNHandler(mkldnn::algorithm algorithm, const framework::ExecutionContext& ctx, - const mkldnn::engine engine, Place cpu_place, - const framework::Tensor* in_x, const Tensor* out_grad) - : platform::MKLDNNHandlerNoCachingT(engine, - cpu_place) { - float alpha = ctx.HasAttr("alpha") ? ctx.Attr("alpha") : 0; - float beta = ctx.HasAttr("beta") ? 
ctx.Attr("beta") : 0; - - // paddle uses beta but mkldnn uses alpha for swish - if (algorithm == mkldnn::algorithm::eltwise_swish) { - std::swap(alpha, beta); - } else if (algorithm == dnnl::algorithm::eltwise_bounded_relu) { - alpha = ctx.Attr("threshold"); - } + const MKLDNNDeviceContext& dev_ctx, Place cpu_place, + const framework::Tensor* in_x, const Tensor* out_grad, + const std::string& unique_name) + : platform::MKLDNNHandlerT( + dev_ctx, dev_ctx.GetEngine(), cpu_place, + platform::CreateKey(dev_ctx, framework::vectorize(in_x->dims()), + "a", unique_name)) { + if (!this->isBwdCached()) { + float alpha = ctx.HasAttr("alpha") ? ctx.Attr("alpha") : 0; + float beta = ctx.HasAttr("beta") ? ctx.Attr("beta") : 0; + + // paddle uses beta but mkldnn uses alpha for swish + if (algorithm == mkldnn::algorithm::eltwise_swish) { + std::swap(alpha, beta); + } else if (algorithm == dnnl::algorithm::eltwise_bounded_relu) { + alpha = ctx.Attr("threshold"); + } - auto diff_dst_tz = framework::vectorize(out_grad->dims()); + auto diff_dst_tz = framework::vectorize(out_grad->dims()); - auto src_fmt = - diff_dst_tz.size() == 2 ? MKLDNNMemoryFormat::nc : in_x->format(); - auto diff_fmt = - diff_dst_tz.size() == 2 ? MKLDNNMemoryFormat::nc : out_grad->format(); + auto src_fmt = + diff_dst_tz.size() == 2 ? MKLDNNMemoryFormat::nc : in_x->format(); + auto diff_fmt = + diff_dst_tz.size() == 2 ? MKLDNNMemoryFormat::nc : out_grad->format(); - auto dims = framework::vectorize(in_x->dims()); - auto diff_dst_md = platform::MKLDNNMemDesc( - dims, platform::MKLDNNGetDataType(), diff_fmt); - auto src_md = platform::MKLDNNMemDesc( - dims, platform::MKLDNNGetDataType(), src_fmt); + auto dims = framework::vectorize(in_x->dims()); + auto diff_dst_md = platform::MKLDNNMemDesc( + dims, platform::MKLDNNGetDataType(), diff_fmt); + auto src_md = platform::MKLDNNMemDesc( + dims, platform::MKLDNNGetDataType(), src_fmt); - this->AcquireForwardPrimitiveDescriptor(mkldnn::prop_kind::forward_training, - algorithm, src_md, alpha, beta); - this->AcquireBackwardPrimitiveDescriptor(algorithm, diff_dst_md, src_md, - alpha, beta); + this->AcquireForwardPrimitiveDescriptor( + mkldnn::prop_kind::forward_training, algorithm, src_md, alpha, beta); + this->AcquireBackwardPrimitiveDescriptor(algorithm, diff_dst_md, src_md, + alpha, beta); + } } std::shared_ptr AcquireBackwardSrcMemory( const framework::Tensor* input) { const T* input_data = input->data(); return this->AcquireMemoryFromPrimitive(this->bwd_pd_->src_desc(), - to_void_cast(input_data)); + to_void_cast(input_data), + "@bwd-src_mem_p"); } }; @@ -1619,6 +1430,11 @@ using ConvMKLDNNHandler = mkldnn::convolution_backward_data, mkldnn::convolution_backward_weights>; +using ConvTransposeMKLDNNHandler = + ConvMKLDNNTemplateHandler; + template static std::shared_ptr SetDstMemory( const framework::ExecutionContext& ctx, framework::Tensor* output, From 572adccdeec5b6531c67e31d95c77e8fe0cdc056 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Thu, 12 Aug 2021 20:11:32 +0800 Subject: [PATCH 027/126] Remove incorrect signal error stack trace (#34842) * remove unmatched signal error stack * fix error writing for cond --- paddle/fluid/platform/enforce.h | 17 ++++++++++++----- paddle/fluid/platform/init.cc | 12 +++++++++++- 2 files changed, 23 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index c63ea3fa8573b8..52be0c805bbd2a 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -270,12 +270,14 @@ inline 
std::string SimplifyDemangleStr(std::string str) { return str; } -inline std::string GetCurrentTraceBackString() { +inline std::string GetCurrentTraceBackString(bool for_signal = false) { std::ostringstream sout; - sout << "\n\n--------------------------------------\n"; - sout << "C++ Traceback (most recent call last):"; - sout << "\n--------------------------------------\n"; + if (!for_signal) { + sout << "\n\n--------------------------------------\n"; + sout << "C++ Traceback (most recent call last):"; + sout << "\n--------------------------------------\n"; + } #if !defined(_WIN32) && !defined(PADDLE_WITH_MUSL) static constexpr int TRACE_STACK_LIMIT = 100; @@ -284,7 +286,12 @@ inline std::string GetCurrentTraceBackString() { auto symbols = backtrace_symbols(call_stack, size); Dl_info info; int idx = 0; - for (int i = size - 1; i >= 0; --i) { + // `for_signal` used to remove the stack trace introduced by + // obtaining the error stack trace when the signal error occurred, + // that is not related to the signal error self, remove it to + // avoid misleading users and developers + int end_idx = for_signal ? 2 : 0; + for (int i = size - 1; i >= end_idx; --i) { if (dladdr(call_stack[i], &info) && info.dli_sname) { auto demangled = demangle(info.dli_sname); std::string path(info.dli_fname); diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc index 2e0ba9d241c72b..3ee5a578601045 100644 --- a/paddle/fluid/platform/init.cc +++ b/paddle/fluid/platform/init.cc @@ -294,7 +294,17 @@ void SignalHandle(const char *data, int size) { // Here does not throw an exception, // otherwise it will casue "terminate called recursively" std::ostringstream sout; - sout << platform::GetCurrentTraceBackString(); + sout << "\n\n--------------------------------------\n"; + sout << "C++ Traceback (most recent call last):"; + sout << "\n--------------------------------------\n"; + auto traceback = platform::GetCurrentTraceBackString(/*for_signal=*/true); + if (traceback.empty()) { + sout + << "No stack trace in paddle, may be caused by external reasons.\n"; + } else { + sout << traceback; + } + sout << "\n----------------------\nError Message " "Summary:\n----------------------\n"; sout << platform::errors::Fatal( From 3f71e8d21c5ebc118d474ea489bf97f9a531422f Mon Sep 17 00:00:00 2001 From: Qi Li Date: Thu, 12 Aug 2021 20:22:56 +0800 Subject: [PATCH 028/126] [NPU] add meshgrid, test=develop (#34576) --- paddle/fluid/operators/meshgrid_op_npu.cc | 84 +++++++ .../unittests/npu/test_meshgrid_op_npu.py | 216 ++++++++++++++++++ 2 files changed, 300 insertions(+) create mode 100644 paddle/fluid/operators/meshgrid_op_npu.cc create mode 100644 python/paddle/fluid/tests/unittests/npu/test_meshgrid_op_npu.py diff --git a/paddle/fluid/operators/meshgrid_op_npu.cc b/paddle/fluid/operators/meshgrid_op_npu.cc new file mode 100644 index 00000000000000..a72c611a658d5f --- /dev/null +++ b/paddle/fluid/operators/meshgrid_op_npu.cc @@ -0,0 +1,84 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the Licnse. */ + +#include "paddle/fluid/operators/meshgrid_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +template +class MeshgridNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto ins = context.MultiInput("X"); + auto outs = context.MultiOutput("Out"); + PADDLE_ENFORCE_EQ( + (ins.size() > 1) && (ins.size() < 7), true, + platform::errors::InvalidArgument( + "Excepted Tensor numbers between 2 and 6, but only received d% .", + ins.size())); + + int64_t size = ins.size(); + std::vector shape(size); + + for (int64_t i = 0; i < size; i++) { + switch (ins[i]->dims().size()) { + case 0: + shape[i] = 1; + break; + case 1: + shape[i] = ins[i]->dims()[0]; + break; + default: + PADDLE_THROW(platform::errors::InvalidArgument( + "Expected scalar or 1D tensor in the tensor list but got tensor " + "%d: ", + i)); + } + } + + for (int64_t i = 0; i < size; i++) { + std::vector view_shape(size, 1); + view_shape[i] = shape[i]; + + framework::DDim out_dims_reshape = framework::make_ddim(view_shape); + framework::Tensor reshape_ins_tensor(ins[i]->type()); + reshape_ins_tensor.ShareDataWith(*ins[i]); + reshape_ins_tensor.Resize(out_dims_reshape); + + framework::DDim out_dims = framework::make_ddim(shape); + outs[i]->Resize(out_dims); + outs[i]->mutable_data(context.GetPlace()); + + auto stream = + context.template device_context() + .stream(); + const auto& runner = NpuOpRunner("BroadcastToD", {reshape_ins_tensor}, + {*(outs[i])}, {{"shape", shape}}); + runner.Run(stream); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL( + meshgrid, ops::MeshgridNPUKernel, + ops::MeshgridNPUKernel, + ops::MeshgridNPUKernel); diff --git a/python/paddle/fluid/tests/unittests/npu/test_meshgrid_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_meshgrid_op_npu.py new file mode 100644 index 00000000000000..216a6418ac65cf --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_meshgrid_op_npu.py @@ -0,0 +1,216 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
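+# The NPU kernel above builds each meshgrid output by reshaping the 1-D
+# input to a broadcast-compatible view and expanding it with BroadcastToD;
+# the tests below check that result against numpy reshape + broadcast_to.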
+ +from __future__ import print_function + +import unittest +import numpy as np +import sys +sys.path.append("..") +from op_test import OpTest, skip_check_grad_ci +import paddle.fluid as fluid +import paddle +from paddle.fluid import compiler, Program, program_guard, core + +paddle.enable_static() + + +class TestMeshgridOp(OpTest): + def setUp(self): + self.set_npu() + self.op_type = "meshgrid" + self.dtype = self.get_dtype() + ins, outs = self.init_test_data() + self.inputs = {'X': [('x%d' % i, ins[i]) for i in range(len(ins))]} + self.outputs = { + 'Out': [('out%d' % i, outs[i]) for i in range(len(outs))] + } + + def set_npu(self): + self.__class__.use_npu = True + self.place = paddle.NPUPlace(0) + + def get_dtype(self): + return "float32" + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + pass + + def init_test_data(self): + self.shape = self.get_x_shape() + ins = [] + outs = [] + for i in range(len(self.shape)): + ins.append(np.random.random((self.shape[i], )).astype(self.dtype)) + + for i in range(len(self.shape)): + out_reshape = [1] * len(self.shape) + out_reshape[i] = self.shape[i] + out_temp = np.reshape(ins[i], out_reshape) + outs.append(np.broadcast_to(out_temp, self.shape)) + return ins, outs + + def get_x_shape(self): + return [100, 200] + + +@skip_check_grad_ci( + reason="The backward test is not supported for float16 type on NPU.") +class TestMeshgridOpFP16(TestMeshgridOp): + def get_dtype(self): + return "float16" + + +class TestMeshgridOp2(TestMeshgridOp): + def get_x_shape(self): + return [100, 300] + + +class TestMeshgridOp3(unittest.TestCase): + def test_api(self): + x = fluid.data(shape=[100], dtype='int32', name='x') + y = fluid.data(shape=[200], dtype='int32', name='y') + + input_1 = np.random.randint(0, 100, [100, ]).astype('int32') + input_2 = np.random.randint(0, 100, [200, ]).astype('int32') + + out_1 = np.reshape(input_1, [100, 1]) + out_1 = np.broadcast_to(out_1, [100, 200]) + out_2 = np.reshape(input_2, [1, 200]) + out_2 = np.broadcast_to(out_2, [100, 200]) + + exe = fluid.Executor(place=fluid.NPUPlace(0)) + grid_x, grid_y = paddle.tensor.meshgrid(x, y) + res_1, res_2 = exe.run(fluid.default_main_program(), + feed={'x': input_1, + 'y': input_2}, + fetch_list=[grid_x, grid_y]) + + self.assertTrue(np.allclose(res_1, out_1)) + self.assertTrue(np.allclose(res_2, out_2)) + + +class TestMeshgridOp4(unittest.TestCase): + def test_list_input(self): + x = fluid.data(shape=[100], dtype='int32', name='x') + y = fluid.data(shape=[200], dtype='int32', name='y') + + input_1 = np.random.randint(0, 100, [100, ]).astype('int32') + input_2 = np.random.randint(0, 100, [200, ]).astype('int32') + + out_1 = np.reshape(input_1, [100, 1]) + out_1 = np.broadcast_to(out_1, [100, 200]) + out_2 = np.reshape(input_2, [1, 200]) + out_2 = np.broadcast_to(out_2, [100, 200]) + + exe = fluid.Executor(place=fluid.NPUPlace(0)) + grid_x, grid_y = paddle.tensor.meshgrid([x, y]) + res_1, res_2 = exe.run(fluid.default_main_program(), + feed={'x': input_1, + 'y': input_2}, + fetch_list=[grid_x, grid_y]) + + self.assertTrue(np.allclose(res_1, out_1)) + self.assertTrue(np.allclose(res_2, out_2)) + + +class TestMeshgridOp5(unittest.TestCase): + def test_tuple_input(self): + x = fluid.data(shape=[100], dtype='int32', name='x') + y = fluid.data(shape=[200], dtype='int32', name='y') + + input_1 = np.random.randint(0, 100, [100, ]).astype('int32') + input_2 = np.random.randint(0, 100, [200, ]).astype('int32') + + out_1 = np.reshape(input_1, [100, 1]) 
+ out_1 = np.broadcast_to(out_1, [100, 200]) + out_2 = np.reshape(input_2, [1, 200]) + out_2 = np.broadcast_to(out_2, [100, 200]) + + exe = fluid.Executor(place=fluid.NPUPlace(0)) + grid_x, grid_y = paddle.tensor.meshgrid((x, y)) + res_1, res_2 = exe.run(fluid.default_main_program(), + feed={'x': input_1, + 'y': input_2}, + fetch_list=[grid_x, grid_y]) + + self.assertTrue(np.allclose(res_1, out_1)) + self.assertTrue(np.allclose(res_2, out_2)) + + +class TestMeshgridOp6(unittest.TestCase): + def test_api_with_dygraph(self): + paddle.disable_static(paddle.NPUPlace(0)) + input_3 = np.random.randint(0, 100, [100, ]).astype('int32') + input_4 = np.random.randint(0, 100, [200, ]).astype('int32') + + out_3 = np.reshape(input_3, [100, 1]) + out_3 = np.broadcast_to(out_3, [100, 200]) + out_4 = np.reshape(input_4, [1, 200]) + out_4 = np.broadcast_to(out_4, [100, 200]) + + tensor_3 = paddle.to_tensor(input_3) + tensor_4 = paddle.to_tensor(input_4) + res_3, res_4 = paddle.tensor.meshgrid(tensor_3, tensor_4) + + self.assertTrue(np.allclose(res_3.numpy(), out_3)) + self.assertTrue(np.allclose(res_4.numpy(), out_4)) + paddle.enable_static() + + +class TestMeshgridOp7(unittest.TestCase): + def test_api_with_dygraph_list_input(self): + paddle.disable_static(paddle.NPUPlace(0)) + input_3 = np.random.randint(0, 100, [100, ]).astype('int32') + input_4 = np.random.randint(0, 100, [200, ]).astype('int32') + + out_3 = np.reshape(input_3, [100, 1]) + out_3 = np.broadcast_to(out_3, [100, 200]) + out_4 = np.reshape(input_4, [1, 200]) + out_4 = np.broadcast_to(out_4, [100, 200]) + + tensor_3 = paddle.to_tensor(input_3) + tensor_4 = paddle.to_tensor(input_4) + res_3, res_4 = paddle.meshgrid([tensor_3, tensor_4]) + + self.assertTrue(np.allclose(res_3.numpy(), out_3)) + self.assertTrue(np.allclose(res_4.numpy(), out_4)) + paddle.enable_static() + + +class TestMeshgridOp8(unittest.TestCase): + def test_api_with_dygraph_tuple_input(self): + paddle.disable_static(paddle.NPUPlace(0)) + input_3 = np.random.randint(0, 100, [100, ]).astype('int32') + input_4 = np.random.randint(0, 100, [200, ]).astype('int32') + + out_3 = np.reshape(input_3, [100, 1]) + out_3 = np.broadcast_to(out_3, [100, 200]) + out_4 = np.reshape(input_4, [1, 200]) + out_4 = np.broadcast_to(out_4, [100, 200]) + + tensor_3 = paddle.to_tensor(input_3) + tensor_4 = paddle.to_tensor(input_4) + res_3, res_4 = paddle.tensor.meshgrid((tensor_3, tensor_4)) + + self.assertTrue(np.allclose(res_3.numpy(), out_3)) + self.assertTrue(np.allclose(res_4.numpy(), out_4)) + paddle.enable_static() + + +if __name__ == '__main__': + unittest.main() From 2164ad6137f6c8b26bd7811b352d74003da7fbbe Mon Sep 17 00:00:00 2001 From: andyjpaddle <87074272+andyjpaddle@users.noreply.github.com> Date: Fri, 13 Aug 2021 11:06:30 +0800 Subject: [PATCH 029/126] [npu]add unsqueeze2_grad,test=develop (#34733) --- paddle/fluid/operators/unsqueeze_op_npu.cc | 16 ++ .../unittests/npu/test_unsqueeze_op_npu.py | 141 ++++++++++++++++++ 2 files changed, 157 insertions(+) create mode 100644 python/paddle/fluid/tests/unittests/npu/test_unsqueeze_op_npu.py diff --git a/paddle/fluid/operators/unsqueeze_op_npu.cc b/paddle/fluid/operators/unsqueeze_op_npu.cc index c3daeffc13d1a7..9469fb3d9cbcda 100644 --- a/paddle/fluid/operators/unsqueeze_op_npu.cc +++ b/paddle/fluid/operators/unsqueeze_op_npu.cc @@ -38,4 +38,20 @@ REGISTER_OP_NPU_KERNEL( ops::UnsqueezeKernel, ops::UnsqueezeKernel, ops::UnsqueezeKernel); +REGISTER_OP_NPU_KERNEL( + unsqueeze_grad, ops::UnsqueezeGradKernel, + ops::UnsqueezeGradKernel, + 
ops::UnsqueezeGradKernel, + ops::UnsqueezeGradKernel, + ops::UnsqueezeGradKernel, + ops::UnsqueezeGradKernel, + ops::UnsqueezeGradKernel); +REGISTER_OP_NPU_KERNEL( + unsqueeze2_grad, ops::Unsqueeze2GradKernel, + ops::Unsqueeze2GradKernel, + ops::Unsqueeze2GradKernel, + ops::Unsqueeze2GradKernel, + ops::Unsqueeze2GradKernel, + ops::Unsqueeze2GradKernel, + ops::Unsqueeze2GradKernel); #endif diff --git a/python/paddle/fluid/tests/unittests/npu/test_unsqueeze_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_unsqueeze_op_npu.py new file mode 100644 index 00000000000000..cebfed1629abac --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_unsqueeze_op_npu.py @@ -0,0 +1,141 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid import Program, program_guard + +paddle.enable_static() + + +# unsqueeze +class TestUnsqueezeOp(OpTest): + def setUp(self): + self.set_npu() + self.op_type = "unsqueeze" + self.place = paddle.NPUPlace(0) + self.init_test_case() + self.x = np.random.random(self.ori_shape).astype("float32") + self.inputs = {"X": OpTest.np_dtype_to_fluid_dtype(self.x)} + self.init_attrs() + self.outputs = {"Out": self.x.reshape(self.new_shape), } + + def set_npu(self): + self.__class__.use_npu = True + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ["X"], "Out") + + def init_test_case(self): + self.ori_shape = (3, 40) + self.axes = (0, 2) + self.new_shape = (1, 3, 1, 40) + + def init_attrs(self): + self.attrs = {"axes": self.axes} + + +class TestUnsqueezeOp1(TestUnsqueezeOp): + def init_test_case(self): + self.ori_shape = (3, 40) + self.axes = (0, -2) + self.new_shape = (1, 3, 1, 40) + + +# No axes input. +class TestUnsqueezeOp2(TestUnsqueezeOp): + def init_test_case(self): + self.ori_shape = (20, 5) + self.axes = () + self.new_shape = (1, 20, 5) + + +# Just part of axes be squeezed. 
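+# axes=(1, -1) inserts size-1 dims at position 1 and at the end:
+# (6, 5, 1, 4) -> (6, 1, 5, 1, 4, 1).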
+class TestUnsqueezeOp3(TestUnsqueezeOp): + def init_test_case(self): + self.ori_shape = (6, 5, 1, 4) + self.axes = (1, -1) + self.new_shape = (6, 1, 5, 1, 4, 1) + + +# unsqueeze 2 +class TestUnsqueeze2Op(OpTest): + def setUp(self): + self.set_npu() + self.op_type = "unsqueeze2" + self.place = paddle.NPUPlace(0) + self.init_test_case() + self.x = np.random.random(self.ori_shape).astype("float32") + self.inputs = {"X": OpTest.np_dtype_to_fluid_dtype(self.x)} + self.init_attrs() + self.outputs = { + "Out": self.x.reshape(self.new_shape), + "XShape": np.random.random(self.ori_shape).astype("float32") + } + + def set_npu(self): + self.__class__.use_npu = True + + def test_check_output(self): + self.check_output_with_place(self.place, no_check_set=['XShape']) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ["X"], "Out") + + def init_test_case(self): + self.ori_shape = (3, 40) + self.axes = (0, 2) + self.new_shape = (1, 3, 1, 40) + + def init_attrs(self): + self.attrs = {"axes": self.axes} + + +# Correct: There is mins axis. +class TestUnsqueeze2Op1(TestUnsqueeze2Op): + def init_test_case(self): + self.ori_shape = (20, 5) + self.axes = (0, -2) + self.new_shape = (1, 20, 1, 5) + + +# Correct: No axes input. +class TestUnsqueeze2Op2(TestUnsqueeze2Op): + def init_test_case(self): + self.ori_shape = (20, 5) + self.axes = () + self.new_shape = (1, 20, 5) + + +# Correct: Just part of axes be squeezed. +class TestUnsqueeze2Op3(TestUnsqueeze2Op): + def init_test_case(self): + self.ori_shape = (6, 5, 1, 4) + self.axes = (1, -1) + self.new_shape = (6, 1, 5, 1, 4, 1) + + +if __name__ == "__main__": + unittest.main() From e92f0388c11b2bfee0bea35ab2ada16fb6cef9ca Mon Sep 17 00:00:00 2001 From: Baibaifan <39549453+Baibaifan@users.noreply.github.com> Date: Fri, 13 Aug 2021 11:21:35 +0800 Subject: [PATCH 030/126] add retry for gethostbyname (#34855) --- .../operators/collective/gen_hccl_id_op_helper.cc | 12 +++++++++++- paddle/fluid/platform/flags.cc | 14 ++++++++++++++ paddle/fluid/platform/gen_comm_id_helper.cc | 13 ++++++++++++- python/paddle/fluid/__init__.py | 2 ++ 4 files changed, 39 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/collective/gen_hccl_id_op_helper.cc b/paddle/fluid/operators/collective/gen_hccl_id_op_helper.cc index 15940a76f71105..e1a0c7fd29506b 100644 --- a/paddle/fluid/operators/collective/gen_hccl_id_op_helper.cc +++ b/paddle/fluid/operators/collective/gen_hccl_id_op_helper.cc @@ -34,6 +34,8 @@ limitations under the License. */ #include "paddle/fluid/platform/hccl_helper.h" #endif +DECLARE_int32(get_host_by_name_time); + namespace paddle { namespace operators { @@ -226,7 +228,15 @@ static int ConnectAddr(const std::string& ep, const char* head) { char* ip = NULL; struct hostent* hp = NULL; - hp = gethostbyname(host.c_str()); + // sleep for get_host_by_name_time seconds. 
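+  // Each failed lookup sleeps 2 seconds, so about
+  // FLAGS_get_host_by_name_time / 2 attempts are made before the
+  // PADDLE_ENFORCE_NOT_NULL below reports the unresolved host.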
+ for (int i = 0; 2 * i < FLAGS_get_host_by_name_time; i++) { + hp = gethostbyname(host.c_str()); + if (hp != NULL) { + break; + } + std::this_thread::sleep_for(std::chrono::seconds(2)); + LOG(WARNING) << "gethostbyname " << host.c_str() << " error!"; + } PADDLE_ENFORCE_NOT_NULL(hp, platform::errors::InvalidArgument( "Fail to get host by name %s.", host)); diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc index ae4a7b8b67263e..33d9c6efef852d 100644 --- a/paddle/fluid/platform/flags.cc +++ b/paddle/fluid/platform/flags.cc @@ -606,3 +606,17 @@ DEFINE_bool(check_kernel_launch, false, #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) DEFINE_bool(conv2d_disable_cudnn, false, "Disable cudnn in conv2d"); #endif + +/** + * Distributed related FLAG + * Name: FLAGS_get_host_by_name_time + * Since Version: 2.2.0 + * Value Range: int32, default=120 + * Example: + * Note: Get host by name time. + */ +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_XPU) || \ + defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_HIP) +DEFINE_int32(get_host_by_name_time, 120, + "The maximum time for get host by name time"); +#endif diff --git a/paddle/fluid/platform/gen_comm_id_helper.cc b/paddle/fluid/platform/gen_comm_id_helper.cc index 73bc2c41a0bc9c..e9fe2a38c6c43c 100644 --- a/paddle/fluid/platform/gen_comm_id_helper.cc +++ b/paddle/fluid/platform/gen_comm_id_helper.cc @@ -37,6 +37,8 @@ limitations under the License. */ #include "paddle/fluid/platform/collective_helper.h" #endif +DECLARE_int32(get_host_by_name_time); + namespace paddle { namespace platform { @@ -236,7 +238,16 @@ static int ConnectAddr(const std::string& ep, const CommHead head) { char* ip = NULL; struct hostent* hp = NULL; - hp = gethostbyname(host.c_str()); + + // sleep for get_host_by_name_time seconds. 
+ for (int i = 0; 2 * i < FLAGS_get_host_by_name_time; i++) { + hp = gethostbyname(host.c_str()); + if (hp != NULL) { + break; + } + std::this_thread::sleep_for(std::chrono::seconds(2)); + LOG(WARNING) << "gethostbyname " << host.c_str() << " error!"; + } PADDLE_ENFORCE_NOT_NULL(hp, platform::errors::InvalidArgument( "Fail to get host by name %s.", host)); diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index fcb2641710facb..5d1274a1f05324 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -236,6 +236,7 @@ def __bootstrap__(): 'local_exe_sub_scope_limit', 'gpu_memory_limit_mb', 'conv2d_disable_cudnn', + 'get_host_by_name_time', ] if core.is_compiled_with_npu(): @@ -246,6 +247,7 @@ def __bootstrap__(): 'reallocate_gpu_memory_in_mb', 'gpu_memory_limit_mb', 'npu_config_path', + 'get_host_by_name_time', ] core.init_gflags(["--tryfromenv=" + ",".join(read_env_flags)]) From 507ea06f9b66630385ab96dec144631492877353 Mon Sep 17 00:00:00 2001 From: ShenLiang <1422485404@qq.com> Date: Fri, 13 Aug 2021 14:01:51 +0800 Subject: [PATCH 031/126] [Bug-Fix]fix bug of py36 import utils (#34873) * fix bug of py36 import --- .../paddle/distributed/fleet/meta_parallel/pp_utils/utils.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/python/paddle/distributed/fleet/meta_parallel/pp_utils/utils.py b/python/paddle/distributed/fleet/meta_parallel/pp_utils/utils.py index 728080a7cd248e..fc1fc4f992e36e 100644 --- a/python/paddle/distributed/fleet/meta_parallel/pp_utils/utils.py +++ b/python/paddle/distributed/fleet/meta_parallel/pp_utils/utils.py @@ -17,10 +17,9 @@ import paddle from paddle.fluid import core from paddle import _C_ops -import paddle.distributed as dist from paddle.autograd import PyLayer from paddle.fluid import framework -from paddle.distributed.fleet.utils.recompute import check_recompute_necessary, detach_variable +from ...utils.recompute import check_recompute_necessary, detach_variable from ..parallel_layers.random import get_rng_state_tracker __all__ = [] @@ -239,7 +238,7 @@ def backward(ctx, *args): tensor_shapes = ctx.tensor_shapes tensors = list(ctx.saved_tensor()) - device_id = dist.ParallelEnv().device_id + device_id = paddle.distributed.ParallelEnv().device_id for i, idx in enumerate(tensor_indices): if _recompute_partition: state = tensors[i].stop_gradient From 17a99760d13a49ebd37b629f36136b7da6cb8571 Mon Sep 17 00:00:00 2001 From: ronnywang <524019753@qq.com> Date: Fri, 13 Aug 2021 01:10:24 -0500 Subject: [PATCH 032/126] fix npu_finalize (#34857) --- paddle/fluid/pybind/pybind.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 589ea088a6cf03..7137115ac0a396 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -2221,6 +2221,7 @@ All parameter, weight, gradient are variables in Paddle. 
auto &pool = platform::DeviceContextPool::Instance(); auto devices = platform::GetSelectedNPUDevices(); for (size_t i = 0; i < devices.size(); ++i) { + platform::NPUDeviceGuard guard(devices[i]); pool.Get(platform::NPUPlace(devices[i]))->Wait(); } platform::AclInstance::Instance().Finalize(); From 5b86b9996001a7365511c09a6cba9cd5003df167 Mon Sep 17 00:00:00 2001 From: Qi Li Date: Fri, 13 Aug 2021 14:17:25 +0800 Subject: [PATCH 033/126] [NPU] fix bce_loss_npu, test=develop (#34876) --- paddle/fluid/operators/bce_loss_op_npu.cc | 7 ++++--- .../paddle/fluid/tests/unittests/npu/test_bce_loss_npu.py | 4 ++-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/operators/bce_loss_op_npu.cc b/paddle/fluid/operators/bce_loss_op_npu.cc index f6b0f7b3fbbd53..3136c02af41e60 100644 --- a/paddle/fluid/operators/bce_loss_op_npu.cc +++ b/paddle/fluid/operators/bce_loss_op_npu.cc @@ -34,8 +34,9 @@ class BCELossNPUKernel : public framework::OpKernel { ctx.template device_context() .stream(); - const auto& runner = NpuOpRunner("BinaryCrossEntropy", {*x, *labels}, - {*out}, {{"reduction", "none"}}); + const auto& runner = + NpuOpRunner("BinaryCrossEntropy", {*x, *labels}, {*out}, + {{"reduction", static_cast("none")}}); runner.Run(stream); } }; @@ -57,7 +58,7 @@ class BCELossGradNPUKernel : public framework::OpKernel { const auto& runner = NpuOpRunner("BinaryCrossEntropyGrad", {*x, *labels, *dout}, {*dx}, - {{"reduction", "none"}}); + {{"reduction", static_cast("none")}}); runner.Run(stream); } }; diff --git a/python/paddle/fluid/tests/unittests/npu/test_bce_loss_npu.py b/python/paddle/fluid/tests/unittests/npu/test_bce_loss_npu.py index 16db952533437c..7c3d32647aea9d 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_bce_loss_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_bce_loss_npu.py @@ -96,7 +96,7 @@ def test_dygraph_layer(place, label_np, reduction='mean', weight_np=None): - paddle.disable_static() + paddle.disable_static(place) if weight_np is not None: weight = paddle.to_tensor(weight_np) bce_loss = paddle.nn.loss.BCELoss(weight=weight, reduction=reduction) @@ -113,7 +113,7 @@ def test_dygraph_functional(place, label_np, reduction='mean', weight_np=None): - paddle.disable_static() + paddle.disable_static(place) input = paddle.to_tensor(input_np) label = paddle.to_tensor(label_np) From 8bc4d854c816e1271fecfedb34c8a7b113564aa1 Mon Sep 17 00:00:00 2001 From: Zhou Wei <1183042833@qq.com> Date: Fri, 13 Aug 2021 15:12:02 +0800 Subject: [PATCH 034/126] Support sccache distributed storage on windows (#34879) --- paddle/scripts/paddle_build.bat | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat index a70c9ca9963035..a5b5a3705e514b 100644 --- a/paddle/scripts/paddle_build.bat +++ b/paddle/scripts/paddle_build.bat @@ -161,13 +161,23 @@ if "%WITH_SCCACHE%"=="ON" ( del D:\sccache\sccache_log.txt cmd /C sccache -V || call :install_sccache sccache --stop-server 2> NUL + + :: Localy storage on windows if not exist D:\sccache mkdir D:\sccache set SCCACHE_DIR=D:\sccache\.cache - :: sccache will shut down if a source file takes more than 10 mins to compile + + :: Sccache will shut down if a source file takes more than 10 mins to compile set SCCACHE_IDLE_TIMEOUT=0 - set SCCACHE_CACHE_SIZE=30G + set SCCACHE_CACHE_SIZE=100G set SCCACHE_ERROR_LOG=D:\sccache\sccache_log.txt set SCCACHE_LOG=quiet + + :: Distributed storage on windows + set SCCACHE_ENDPOINT=s3.bj.bcebos.com + set 
SCCACHE_BUCKET=paddle-windows + set SCCACHE_S3_KEY_PREFIX=sccache/ + set SCCACHE_S3_USE_SSL=true + sccache --start-server sccache -z goto :CASE_%1 From ac56d54e0cf2646697510a143cc55f8941e35b0a Mon Sep 17 00:00:00 2001 From: Hao Lin Date: Fri, 13 Aug 2021 17:33:41 +0800 Subject: [PATCH 035/126] Add EmptyGradOpMaker CI Approval (#34810) * Add EmptyGradOpMaker CI Approval, test=develop * Fix typo in echo_line --- tools/check_file_diff_approvals.sh | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tools/check_file_diff_approvals.sh b/tools/check_file_diff_approvals.sh index 1928dda447ac6f..f65a65c8b75b18 100644 --- a/tools/check_file_diff_approvals.sh +++ b/tools/check_file_diff_approvals.sh @@ -228,6 +228,12 @@ if [ "${INVALID_PADDLE_CHECK}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then check_approval 1 6836917 47554610 22561442 fi +EMPTY_GRAD_OP_REGISTERED=`echo $ALL_ADDED_LINES |grep -zoE "REGISTER_OP_WITHOUT_GRADIENT\([^;.]*\)[;\s]" || echo $ALL_ADDED_LINES |grep -zoE "[[:graph:]]*EmptyGradOpMaker<[[:graph:]]*>" || true` +if [ "${EMPTY_GRAD_OP_REGISTERED}" != "" ] && [ "${GIT_PT_ID}" != "" ]; then + echo_line="You must have one RD (phlrain, XiaoguangHu01, kolinwei or JiabinYang) approval for the usage of REGISTER_OP_WITHOUT_GRADIENT or EmptyGradOpMaker.\nThe code that do not meet the specification are as follows:\n${EMPTY_GRAD_OP_REGISTERED}\n" + check_approval 1 43953930 46782768 22165420 22361972 +fi + ALL_CHANGE_FILES=`git diff --numstat upstream/$BRANCH | awk '{print $3}' | grep ".py"` ALL_OPTEST_BAN_DYGRAPH_MESSAGE="" for CHANGE_FILE in ${ALL_CHANGE_FILES}; do From f421741cfa5ce53470540c02ff20408f4d9ca82e Mon Sep 17 00:00:00 2001 From: Zeng Jinle <32832641+sneaxiy@users.noreply.github.com> Date: Fri, 13 Aug 2021 17:54:41 +0800 Subject: [PATCH 036/126] fix generator thread safety bug (#34888) --- paddle/fluid/framework/generator.cc | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/framework/generator.cc b/paddle/fluid/framework/generator.cc index 737dbafb64cb21..4b64722a7abf5a 100644 --- a/paddle/fluid/framework/generator.cc +++ b/paddle/fluid/framework/generator.cc @@ -156,17 +156,15 @@ uint64_t Generator::Random64() { std::pair Generator::IncrementOffset( uint64_t increament_offset) { - uint64_t cur_offset = this->state_.thread_offset; #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) std::lock_guard lock(this->mu_); - + uint64_t cur_offset = this->state_.thread_offset; this->state_.thread_offset += increament_offset; - + return std::make_pair(this->state_.current_seed, cur_offset); #else PADDLE_THROW(platform::errors::PermissionDenied( "Increment Offset only support in CUDA place")); #endif - return std::make_pair(this->state_.current_seed, cur_offset); } void Generator::SetIsInitPy(bool is_init_py) { From fc6b4a506dda4237be46aa06a85bb17e15ffdd96 Mon Sep 17 00:00:00 2001 From: zyfncg Date: Fri, 13 Aug 2021 19:11:52 +0800 Subject: [PATCH 037/126] Bug fix : Can't load multiple modules of custom c++ op (#34505) * Fix a bug : can't load more than one custom op module * Fix a bug : can't load more than one custom op module * add test for load multiple modules of custom c++ op * add config for Coverage CI --- paddle/fluid/framework/custom_operator.cc | 8 +++++++- .../fluid/tests/custom_op/test_custom_relu_op_jit.py | 11 +++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/framework/custom_operator.cc b/paddle/fluid/framework/custom_operator.cc index 7fef165f373969..19e661587716b3 100644 --- 
a/paddle/fluid/framework/custom_operator.cc +++ b/paddle/fluid/framework/custom_operator.cc @@ -517,6 +517,12 @@ void RegisterOperatorWithMetaInfo( auto& base_op_meta = op_meta_infos.front(); auto op_name = OpMetaInfoHelper::GetOpName(base_op_meta); + + if (OpInfoMap::Instance().Has(op_name)) { + LOG(WARNING) << "Operator (" << op_name << ")has been registered."; + return; + } + auto& op_inputs = OpMetaInfoHelper::GetInputs(base_op_meta); auto& op_outputs = OpMetaInfoHelper::GetOutputs(base_op_meta); auto& op_attrs = OpMetaInfoHelper::GetAttrs(base_op_meta); @@ -867,7 +873,7 @@ void RegisterOperatorWithMetaInfoMap( // load op api void LoadOpMetaInfoAndRegisterOp(const std::string& dso_name) { void* handle = paddle::platform::dynload::GetOpDsoHandle(dso_name); - + VLOG(1) << "load custom_op lib: " << dso_name; typedef OpMetaInfoMap& get_op_meta_info_map_t(); auto* get_op_meta_info_map = detail::DynLoad(handle, "PD_GetOpMetaInfoMap"); diff --git a/python/paddle/fluid/tests/custom_op/test_custom_relu_op_jit.py b/python/paddle/fluid/tests/custom_op/test_custom_relu_op_jit.py index 0f7ba84ffc147b..052fe8b156a53c 100644 --- a/python/paddle/fluid/tests/custom_op/test_custom_relu_op_jit.py +++ b/python/paddle/fluid/tests/custom_op/test_custom_relu_op_jit.py @@ -130,6 +130,17 @@ def test_exception(self): str(e)) self.assertTrue(caught_exception) + def test_load_multiple_module(self): + custom_module = load( + name='custom_conj_jit', + sources=['custom_conj_op.cc'], + extra_include_paths=paddle_includes, # add for Coverage CI + extra_cxx_cflags=extra_cc_args, # test for cc flags + extra_cuda_cflags=extra_nvcc_args, # test for nvcc flags + verbose=True) + custom_conj = custom_module.custom_conj + self.assertIsNotNone(custom_conj) + if __name__ == '__main__': unittest.main() From ff4bdac31b5b6b1f4ea801f157c98e63b40ec750 Mon Sep 17 00:00:00 2001 From: zyfncg Date: Fri, 13 Aug 2021 19:14:59 +0800 Subject: [PATCH 038/126] fix a bug of slice by none index (#34877) --- paddle/fluid/pybind/imperative.cc | 23 +++++++++++++++++++ .../fluid/tests/unittests/test_var_base.py | 5 +++- 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index b540d459c26294..0b6af3b542395d 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -921,6 +921,29 @@ void BindImperative(py::module *m_ptr) { axis -= len; } + // Deal with cases that there are more than one + // prefix none index, For example: + // [None, None, :, :, None] + // the none_axes int the return of ParseIndexingSlice is: + // [0, 0, 2 ] + // according to the interface of "unsqueeze2", + // we should convert it to: + // [0, 0, 4 ] + int prefix_zero_cnt = 0; + for (const auto &axis : none_axes) { + if (axis == 0) { + prefix_zero_cnt++; + } else { + break; + } + } + if (prefix_zero_cnt > 0) { + int none_axes_num = static_cast(none_axes.size()); + for (int i = prefix_zero_cnt; i < none_axes_num; ++i) { + none_axes[i] += prefix_zero_cnt; + } + } + imperative::NameVarBaseMap ins = {{"X", {out}}}; framework::AttributeMap attrs = {{"axes", none_axes}}; auto new_out = std::shared_ptr( diff --git a/python/paddle/fluid/tests/unittests/test_var_base.py b/python/paddle/fluid/tests/unittests/test_var_base.py index 4b52cfceabf853..cdf34c27c0a345 100644 --- a/python/paddle/fluid/tests/unittests/test_var_base.py +++ b/python/paddle/fluid/tests/unittests/test_var_base.py @@ -711,6 +711,7 @@ def _test_none_index(self): var_tensor[None, 2, None, 1].numpy(), 
var_tensor[None].numpy(), var_tensor[0, 0, None, 0, 0, None].numpy(), + var_tensor[None, None, 0, ..., None].numpy(), var_tensor[0, 1:10:2, None, None, ...].numpy(), ] @@ -724,11 +725,13 @@ def _test_none_index(self): self.assertTrue(np.array_equal(var[7], np_value[None])) self.assertTrue( np.array_equal(var[8], np_value[0, 0, None, 0, 0, None])) + self.assertTrue( + np.array_equal(var[9], np_value[None, None, 0, ..., None])) # TODO(zyfncg) there is a bug of dimensions when slice step > 1 and # indexs has int type # self.assertTrue( - # np.array_equal(var[9], np_value[0, 1:10:2, None, None, ...])) + # np.array_equal(var[10], np_value[0, 1:10:2, None, None, ...])) def _test_for_var(self): np_value = np.random.random((30, 100, 100)).astype('float32') From 8c8667f0b027a19d494b614c42d4e52a4e985560 Mon Sep 17 00:00:00 2001 From: Tongxin Bai Date: Fri, 13 Aug 2021 20:18:18 +0800 Subject: [PATCH 039/126] New Einsum API (#33821) * OP dot: refactor CPU kernels and get better loop performance. * Minor fix on code format. * Fixed minor errors. * Add new API: einsum * Update the Einsum unit test. One case failed with matmul_v2, where the dtype is int64: a = np.arange(2 * 3 * 1).reshape(2, 3, 1) b = np.arange(1) paddle.einsum("...i, ...i", a, b) * Test cases in test_einsum test floating point dtypes only. As of now Paddle only supports float/double dtypes in matmul, which is one of building blocks of this Einsum implementation. We decide not to test einsum against other dtypes. * Polish format. * More formatting. * Format... * Einsum: improve test coverage. * Einsum: bug fixes and more testcases for testing error messages * Einsum: fix format.. * Einsum: fixed typo and format. * Einsum: format again... * Einsum: applied suggested changes. * Einsum API: improve API documentation. * Einsum API: apply suggested changes. * Einsum API: Add dygraph only note. * Einsum API: Add dygraph only note. * Einsum API: fixed unittest. 
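
A quick usage sketch of the new API in dygraph mode (illustrative only; the
equations below are drawn from the cases covered by the docstring and unit
tests in this patch, and the tensors are arbitrary random inputs):

    import paddle
    x = paddle.rand([4])
    A = paddle.rand([4, 5])
    B = paddle.rand([5, 3])
    paddle.einsum('i,i->', x, x)       # dot product
    paddle.einsum('ij,jk->ik', A, B)   # matrix multiply
    paddle.einsum('ij->ji', A)         # transpose
    paddle.einsum('i,j->ij', x, x)     # outer product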
--- paddle/fluid/platform/dynload/mklml.h | 3 +- python/paddle/__init__.py | 3 + .../fluid/tests/unittests/test_einsum.py | 394 ++++++++ python/paddle/tensor/__init__.py | 2 + python/paddle/tensor/einsum.py | 940 ++++++++++++++++++ 5 files changed, 1341 insertions(+), 1 deletion(-) create mode 100644 python/paddle/fluid/tests/unittests/test_einsum.py create mode 100644 python/paddle/tensor/einsum.py diff --git a/paddle/fluid/platform/dynload/mklml.h b/paddle/fluid/platform/dynload/mklml.h index c3c8788c578bca..11208289165935 100644 --- a/paddle/fluid/platform/dynload/mklml.h +++ b/paddle/fluid/platform/dynload/mklml.h @@ -106,7 +106,8 @@ extern void* mklml_dso_handle; __macro(vmsErf); \ __macro(vmdErf); \ __macro(MKL_Free_Buffers); \ - __macro(MKL_Set_Num_Threads) + __macro(MKL_Set_Num_Threads); \ + __macro(MKL_Get_Max_Threads); MKLML_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MKLML_WRAP); diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index f8bdf97c8e32a2..f72fb6c1806b10 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -238,6 +238,8 @@ from .tensor.to_string import set_printoptions # noqa: F401 +from .tensor.einsum import einsum # noqa: F401 + from .framework.random import seed # noqa: F401 from .framework.random import get_cuda_rng_state # noqa: F401 from .framework.random import set_cuda_rng_state # noqa: F401 @@ -509,4 +511,5 @@ 'standard_normal', 'diagonal', 'broadcast_tensors', + 'einsum' ] diff --git a/python/paddle/fluid/tests/unittests/test_einsum.py b/python/paddle/fluid/tests/unittests/test_einsum.py new file mode 100644 index 00000000000000..39bf9b926b1487 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_einsum.py @@ -0,0 +1,394 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import numpy as np +import contextlib +import unittest +import paddle +from paddle.fluid import core + + +class TestErrors(unittest.TestCase): + def setUp(self): + pass + + def test_diagonalize_errors(self): + a = np.arange(4 * 3 * 4 * 4).reshape(4, 3, 4, 4).astype('float') + a = paddle.to_tensor(a) + with self.assertRaisesRegex(AssertionError, ( + 'Diagonal and trace not implemented yet.')): + paddle.einsum('...ii->...i', a) + with self.assertRaisesRegex(AssertionError, ( + 'Diagonal and trace not implemented yet.')): + paddle.einsum('i...i', a) + with self.assertRaisesRegex(AssertionError, ( + 'Diagonal and trace not implemented yet.')): + paddle.einsum('i...i->i...', a) + + def test_param_errors(self): + a = np.arange(4 * 3 * 4 * 4).reshape(4, 3, 4, 4).astype('float') + a = paddle.to_tensor(a) + with self.assertRaisesRegex(AssertionError, + ('At least one operand is expected.')): + paddle.einsum('ijk') + with self.assertRaisesRegex(AssertionError, ( + 'Invalid equation: multiple `->` were found.')): + paddle.einsum('i -> j -> k', a) + with self.assertRaisesRegex(AssertionError, ( + "Invalid equation: the number of operands is 2, " + "but found 3 segments in the label equation.")): + paddle.einsum('i,j,k', a, a) + with self.assertRaisesRegex(AssertionError, ( + "Invalid equation: the number of operands is 2, " + "but found 1 segments in the label equation.")): + paddle.einsum('ij -> k', a, a) + with self.assertRaisesRegex(AssertionError, ( + "Invalid equation: the number of operands is 1, " + "but found 2 segments in the label equation.")): + paddle.einsum('i, -> k', a) + with self.assertRaisesRegex(AssertionError, ( + "Invalid equation: the label string '' misses dimensions.")): + paddle.einsum('->', a) + with self.assertRaisesRegex(AssertionError, ( + "Invalid equation: the label string 'i' misses dimensions.")): + paddle.einsum('i', a) + with self.assertRaisesRegex(AssertionError, ( + "Invalid equation: _ is not a valid label, " + "which should be letters.")): + paddle.einsum('i_', a) + with self.assertRaisesRegex(AssertionError, ( + "Invalid equation: `.` is found outside of an ellipsis.")): + paddle.einsum('i..j', a) + with self.assertRaisesRegex(AssertionError, ( + "Invalid equation: `.` is found outside of an ellipsis.")): + paddle.einsum('...k...', a) + with self.assertRaisesRegex(AssertionError, ( + "Invalid equation: missing ellipsis in output labels.")): + paddle.einsum('i...->i', a) + with self.assertRaisesRegex(AssertionError, ( + "Invalid equation: duplicate output labels are found.")): + paddle.einsum('i...->i...i', a) + with self.assertRaisesRegex(AssertionError, ( + "Invalid operands: label i " + "corresponds to non-broadcastable dimensions.")): + paddle.einsum('ij...,ji...', a, a) + + +class TestEinsum(unittest.TestCase): + @classmethod + def setUpClass(cls): + np.random.seed(12345) + + cls.TEST_SAMPLES = { + "x": np.random.rand(5), + "y": np.random.rand(7), + "A": np.random.rand(4, 5), + "B": np.random.rand(2, 5), + "C": np.random.rand(3, 7), + "D": np.random.rand(3, 4, 5), + "E": np.random.rand(3, 5, 2), + "F": np.random.rand(2, 4, 5, 3), + "G": np.random.rand(4, 2, 5), + "H": np.random.rand(3, 2, 4), + "I": np.random.rand(2, 2), + "J": np.random.rand(1, 3, 5), + "K": np.random.rand(1, 2, 3, 4), + } + + def _get_place(self, force_to_use_cpu=False): + if force_to_use_cpu: + return core.CPUPlace() + else: + if core.is_compiled_with_cuda(): + return core.CUDAPlace(0) + return core.CPUPlace() + + def check_output_equal(self, actual, expect, rtol=1.e-5, atol=1.e-8): + 
error_msg = 'Output has diff at place:{}. \nExpect: {} \nBut Got: {} in class {}' + self.assertTrue( + np.allclose( + actual, expect, rtol=rtol, atol=atol), + error_msg.format(paddle.get_device(), expect, actual, + self.__class__.__name__)) + + def setUp(self): + self.sample = {"paradigm": "i->", "data": ["x"]} + + def test_forward(self): + operands = [ + TestEinsum.TEST_SAMPLES[operand] for operand in self.sample["data"] + ] + expected_result = np.einsum(self.sample["paradigm"], *operands) + equation = self.sample["paradigm"] + + with paddle.fluid.dygraph.guard( + self._get_place(force_to_use_cpu=False)): + pd_operands = [paddle.to_tensor(operand) for operand in operands] + result = paddle.einsum(equation, *pd_operands) + self.check_output_equal(result.numpy(), expected_result) + + with paddle.fluid.dygraph.guard(self._get_place(force_to_use_cpu=True)): + pd_operands = [paddle.to_tensor(operand) for operand in operands] + result = paddle.einsum(equation, *pd_operands) + self.check_output_equal(result.numpy(), expected_result) + + +class TestEinsumVectorDot(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "i,i->", "data": ["x", "x"]} + + +class TestEinsumVectorMul(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "i,i->i", "data": ["x", "x"]} + + +class TestEinsumVectorOuter(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "i,j->ij", "data": ["x", "y"]} + + +class TestEinsumMatrixTranspose(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "ij->ji", "data": ["A"]} + + +class TestEinsumMatrixRowSum(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "ij->j", "data": ["A"]} + + +class TestEinsumMatrixColSum(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "ij->i", "data": ["A"]} + + +class TestEinsumMatrixEleMul(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "ij,ij->ij", "data": ["A", "A"]} + + +class TestEinsumMatrixVecMul(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "ij,j->i", "data": ["A", "x"]} + + +class TestEinsumMatrixMul(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "ij,kj->ik", "data": ["A", "B"]} + + +class TestEinsumMatrixOuter(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "ij,kl->ijkl", "data": ["A", "C"]} + + +class TestEinsumTensorBMM(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "bij,bjk->bik", "data": ["D", "E"]} + + +class TestEinsumTensorContract1(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "ijk,jk->i", "data": ["D", "A"]} + + +class TestEinsumTensorContract2(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "ijk,lk->ijl", "data": ["D", "B"]} + + +class TestEinsumTensorContract3(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "abcd,dfg->abcfg", "data": ["F", "D"]} + + +class TestEinsumTensorContract4(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "ijk,jk->ik", "data": ["D", "A"]} + + +class TestEinsumTensorContract5(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "ijk,jk->ij", "data": ["D", "A"]} + + +class TestEinsumTensorContract6(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "ik, ijk->j", "data": ["A", "G"]} + + +class TestEinsumTensorContract7(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "ijk, ik->jk", "data": ["G", "A"]} + + +class TestEinsumEllipsis1(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "i...->...", "data": ["G"]} + + +class TestEinsumEllipsis2(TestEinsum): + def setUp(self): + self.sample = {"paradigm": 
"ij,...i->j...", "data": ["A", "H"]} + + +class TestEinsumEllipsis3(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "k...,jk", "data": ["F", "I"]} + + +class TestEinsumTestEinsumBilinear(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "bn,anm,bm->ba", "data": ["B", "E", "I"]} + + +class TestEinsumTestEinsumOthers1(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "ijkl, lmn->kmn", "data": ["F", "H"]} + + +class TestEinsumTestEinsumOthers2(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "ijkl, lmn->ijn", "data": ["F", "H"]} + + +class TestEinsumBatch1(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "blq,bhlk->bhlqk", "data": ["J", "K"]} + + +class TestNumpyTests(unittest.TestCase): + def setUp(self): + pass + + def _get_place(self, force_to_use_cpu=False): + if force_to_use_cpu: + return core.CPUPlace() + else: + if core.is_compiled_with_cuda(): + return core.CUDAPlace(0) + return core.CPUPlace() + + def check_output_equal(self, actual, expect, rtol=1.e-5, atol=1.e-8): + error_msg = 'Output has diff at place:{}. \nExpect: {} \nBut Got: {} in class {}' + self.assertTrue( + np.allclose( + actual, expect, rtol=rtol, atol=atol), + error_msg.format(paddle.get_device(), expect, actual, + self.__class__.__name__)) + + def check_output(self, eqn, *ops): + expect = np.einsum(eqn, *ops) + with paddle.fluid.dygraph.guard( + self._get_place(force_to_use_cpu=False)): + pd_operands = [paddle.to_tensor(op) for op in ops] + actual = paddle.einsum(eqn, *pd_operands) + self.check_output_equal(actual.numpy(), expect) + + def test_sums(self): + for n in range(1, 17): + a = np.arange(n).astype('float') + self.check_output("i->", a) + + for n in range(1, 17): + a = np.arange(2 * 3 * n).reshape(2, 3, n).astype('float') + self.check_output("...i->...", a) + + for n in range(1, 17): + a = np.arange(2 * n).reshape(2, n).astype('float') + self.check_output("i...->...", a) + + for n in range(1, 17): + a = np.arange(2 * 3 * n).reshape(2, 3, n).astype('float') + self.check_output("i...->...", a) + + for n in range(1, 17): + a = np.arange(3 * n).reshape(3, n).astype('float') + b = np.arange(2 * 3 * n).reshape(2, 3, n).astype('float') + self.check_output("..., ...", a, b) + + for n in range(1, 17): + a = np.arange(2 * 3 * n).reshape(2, 3, n).astype('float') + b = np.arange(n).astype('float') + self.check_output("...i, ...i", a, b) + + for n in range(1, 11): + a = np.arange(n * 3 * 2).reshape(n, 3, 2).astype('float') + b = np.arange(n).astype('float') + self.check_output("i..., i...", a, b) + + for n in range(1, 17): + a = (np.arange(3) + 1).astype('float') + b = (np.arange(n) + 1).astype('float') + self.check_output("i,j", a, b) + + for n in range(1, 17): + a = np.arange(4 * n).reshape(4, n).astype('float') + b = np.arange(n).astype('float') + self.check_output("ij, j", a, b) + + for n in range(1, 17): + a = np.arange(4 * n).reshape(4, n).astype('float') + b = np.arange(n).astype('float') + self.check_output("ji,j", a.T, b.T) + + for n in range(1, 17): + a = np.arange(4 * n).reshape(4, n).astype('float') + b = np.arange(n * 6).reshape(n, 6).astype('float') + self.check_output("ij,jk", a, b) + + a = np.arange(12).reshape(3, 4).astype('float') + b = np.arange(20).reshape(4, 5).astype('float') + c = np.arange(30).reshape(5, 6).astype('float') + self.check_output("ij,jk,kl", a, b, c) + + a = np.arange(60).reshape(3, 4, 5).astype('float') + b = np.arange(24).reshape(4, 3, 2).astype('float') + self.check_output("ijk, jil -> kl", a, b) + + for n in range(1, 25): + a 
= np.arange(n).astype('float') + self.check_output("...,...", a, a) + self.check_output("i,i", a, a) + + p = np.ones((10, 2)).astype('float') + q = np.ones((1, 2)).astype('float') + self.check_output('ij,ij->j', p, q) + + x = np.array([2., 3.]).astype('float') + y = np.array([4.]).astype('float') + self.check_output("i, i", x, y) + + p = np.ones((1, 5)) / 2 + q = np.ones((5, 5)) / 2 + self.check_output("...ij,...jk->...ik", p, p) + self.check_output("...ij,...jk->...ik", p, q) + + x = np.eye(2).astype('float') + y = np.ones(2).astype('float') + self.check_output("ji,i->", x, y) + self.check_output("i,ij->", y, x) + self.check_output("ij,i->", x, y) + + def test_large_nops(self): + a = np.arange(4 * 3 * 1 * 4).reshape(4, 3, 1, 4).astype('float') + self.check_output('a...b,b...c,c...d', a, a, a) + self.check_output('a...b,b...c,c...a', a, a, a) + self.check_output('a...b,b...c,c...a', a, a, a) + self.check_output('...ab,...ba,...ab,...ab', a, a, a, a) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/tensor/__init__.py b/python/paddle/tensor/__init__.py index 2d4c97212be83d..1c6996bcad6e5c 100755 --- a/python/paddle/tensor/__init__.py +++ b/python/paddle/tensor/__init__.py @@ -204,6 +204,8 @@ from .array import array_write # noqa: F401 from .array import create_array # noqa: F401 +from .einsum import einsum # noqa: F401 + #this list used in math_op_patch.py for _binary_creator_ tensor_method_func = [ #noqa 'matmul', diff --git a/python/paddle/tensor/einsum.py b/python/paddle/tensor/einsum.py new file mode 100644 index 00000000000000..b6b0a9b1e7fe33 --- /dev/null +++ b/python/paddle/tensor/einsum.py @@ -0,0 +1,940 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools +import re + +from ..fluid.layers import reshape, transpose +from .linalg import matmul +from .manipulation import squeeze, unsqueeze +from .math import multiply +from .math import sum as paddle_sum + +from paddle.common_ops_import import dygraph_only + +__all__ = [] + + +def parse_op_labels(labelstr, operand): + ''' + Parse labels for an input operand. + + Parameters + ---------- + labelstr: + the input label string + operand: + the input operand + + Returns + ------- + the input operand's full label string in which all anonymous dimensions are + labeled in dots. + ''' + # Sanity checks + for c in labelstr.replace('.', ''): + assert c.isalpha(), ( + f"Invalid equation: {c} is not a valid label, which should be letters." + ) + + assert labelstr.replace('...', '', 1).find('.') == -1, ( + f"Invalid equation: `.` is found outside of an ellipsis.") + + # Check shape. Note, in Paddle a tensor rank is always nonzero + ndims = len(operand.shape) + assert ndims > 0 + + full_labelstr = labelstr.replace('...', '.' 
* (ndims - len(labelstr) + 3)) + + assert len(full_labelstr) == ndims, ( + f"Invalid equation: the label string '{labelstr}' misses dimensions.") + + return full_labelstr + + +def parse_labels(labelstr, operands): + ''' + Parse label strings for all input operands. + + Parameters + ---------- + labelstr: + The equation's label string + operands: + The input operands + + Returns + ------- + list of full label strings for all input operands + ''' + + nop_labels = labelstr.split(',') + assert len(nop_labels) == len(operands), ( + f"Invalid equation: the number of operands is {len(operands)}, " + f"but found {len(nop_labels)} segments in the label equation.") + + return list(map(parse_op_labels, nop_labels, operands)) + + +def validate_rhs(rhs, input_labels, n_bcast_dims): + ''' + Check whether the equation's right hand side is valid + ''' + # Sanity check. + if n_bcast_dims > 0: + assert '...' in rhs, ( + f"Invalid equation: missing ellipsis in output labels.") + + rhs = rhs.replace('...', '') + rhs_set = set(rhs) + + # Hidden assumption: availble labels don't include '.' + assert '.' not in input_labels + + # Verify that output labels all come from the set of input labels + non_input_labels = rhs_set.difference(input_labels) + assert not non_input_labels, ( + f"Invalid equation: " + f"output label {sorted(non_input_labels)} not used by any input.") + # Verify that output labels are not duplicate + assert len(rhs) == len(rhs_set), ( + f"Invalid equation: duplicate output labels are found.") + + +# ''' +# Tests if the two operands can perform a broadcast operation on the given ranges of dimensions. +# We follow the Numpy broadcasting convention which states that, by lining up the shape arrays +# starting from the right most dimension, all the aligned dimensions either have equal sizes or +# one of them is sized one. +# Parameters +# ---------- +# args: +# *args unpacks into operand one's axes range, shape, operand two's axes range, shape +# f: +# if available, is used as a callback for postprocessing the aligned operand dimensions. +# ''' +# xran, xshape, yran, yshape = args +# +# xran_inv, yran_inv = xran[::-1], yran[::-1] +# +# for xi, yi in zip(xran_inv, yran_inv): +# xs, ys = xshape[xi], yshape[yi] +# cond = xs == ys or xs == 1 or ys == 1 +# if not cond: +# return False +# +# if not f: +# return True +# +# # Apply the callback to each aligned dimension pair +# for xi, yi in zip(xran_inv, yran_inv): +# f(xi, yi) + + +def build_view(in_labels, out_labels): + ''' + Build an inverse map of dimension indices. Three conditions must hold for + the result to be meaningful. + First, no duplicate letter labels in each label string. + Second, the number of dots in dimout_labels >= that in in_labels. + Third, dots are contiguous in each label string. + + Parameters + ---------- + in_labels: + The dimension labels to map to + out_labels: + The dimension labels to map from + + Returns + ------- + The inverse map from out_labels to in_labels. The length of the inverse map equals that of + out_labels. -1 is filled if there's no matching intput dimension for a specific label. 
+ + Examples + -------- + in_labels = 'ij..', out_labels = '..ji' + inv_map = [2, 3, 1, 0] + in_labels = 'ij..', out_labels = '..kji' + inv_map = [2, 3, -1, 1, 0] + ''' + + inv_map = [-1] * len(out_labels) + + # First build the broadcast dimension mapping + # Find the broadcast index range in out_labels + r = re.search(r'\.+', out_labels) + if r: + start, end = r.start(), r.end() + s = re.search(r'\.+', in_labels) + # fill the broadcast dimension indices from right to left. + if s: + for ax, dim in zip( + range(start, end)[::-1], range(s.start(), s.end())[::-1]): + inv_map[ax] = dim + + # Now work on non-broadcast dimensions + if r: + it = itertools.chain(range(start), range(end, len(out_labels))) + else: + it = iter(range(len(out_labels))) + + for i in it: + inv_map[i] = in_labels.find(out_labels[i]) + + return inv_map + + +def build_global_view(nop_labels, rhs, n_bcast_dims): + ''' + Build the global view, which is a layout of all dimension labels + plus an index table that maps from the layout to the dimensions + in each operand. In the global view, the dimensions are arranged + such that output ones are put on the left and contraction ones + are put on the right. + + Parameters + ---------- + nop_labels: + The input full label strings of all input operands + rhs: + The equation right hand side + n_bcast_dims: + The maxium number of broadcast dimensions + + Returns + ------- + A tuple of g_labels, g_view, g_nout, g_count + g_labels: + the layout of all labels in a string + g_view: + the index table + g_nout: + the number of output dimensions + g_count: + the counter array for dimension contractions + ''' + # Put all labels in alphabetical order + concat = sorted(''.join(nop_labels).replace('.', '')) + labels, count = [], [] + for a, b in zip(['.'] + concat, concat): + if a != b: + labels.append(b) + count.append(1) + else: + count[-1] += 1 + + if rhs != None: + validate_rhs(rhs, labels, n_bcast_dims) + g_labels_out = rhs.replace('...', '.' * n_bcast_dims) + else: + g_labels_out = '.' * n_bcast_dims + ''.join( + l for l, c in zip(labels, count) if c == 1) + + for i in range(len(count))[::-1]: + if labels[i] in g_labels_out: + labels.pop(i) + count.pop(i) + + g_labels_sum = ''.join(labels) + g_labels = g_labels_out + g_labels_sum + g_view = list(map(lambda i: build_view(i, g_labels), nop_labels)) + g_nout = len(g_labels_out) + g_count = count + + return g_labels, g_view, g_nout, g_count + + +def build_global_shape(g_view, g_labels, op_shapes): + ''' + The global shape is the shape of all dimensions rearranged and broadcasting + to the global view. It's a reference data structure for einsum planning. + + Parameters + ---------- + g_view: + the global view + op_shapes: + the shapes of the all operands + + Returns + ------- + g_shape: + the global shape vector + g_masks: + list of shape masks for each operand. 
A dimension's shape mask is a boolean + indicating whether its size > 1, in other words, it's not squeezable + ''' + view_shapes = [] + g_masks = [] + + for view, op_shape in zip(g_view, op_shapes): + view_shapes.append([op_shape[dim] if dim > -1 else 1 for dim in view]) + + g_shape = [set(sizes_per_ax) - {1} for sizes_per_ax in zip(*view_shapes)] + + non_bcastable = [ax for ax, sizes in enumerate(g_shape) if len(sizes) > 1] + + assert not non_bcastable, ( + f"Invalid operands: label {g_labels[non_bcastable[0]]} " + f"corresponds to non-broadcastable dimensions.") + + g_shape = [sizes.pop() if len(sizes) > 0 else 1 for sizes in g_shape] + + g_masks = [[s > 1 for s in view_shape] for view_shape in view_shapes] + + return g_shape, g_masks + + +def dim_strides(shape): + ''' + Returns the dimension strides for a tensor shape + ''' + strides = [] + stride = 1 + for size in shape[::-1]: + strides.append(stride) + stride = stride * size + return strides + + +def create_view(operand, *view_def): + ''' + Create and materialize a view. + + Parameters + ---------- + operand: + the base tensor operand + view_def: + include two lists which define the view's dimension sizes and strides + ''' + assert False, f'Diagonal and trace not implemented yet.' + view_shape, view_strides = view_def + return operand.create_view(view_shape, view_strides) + + +def has_duplicated_labels(labels): + ''' + Returns True if there is any duplicate label. + ''' + labels = labels.replace('.', '') + return len(labels) > len(set(labels)) + + +def diagonalize(labels, operand): + ''' + Merges dimensions with duplicate labels. + + For those dimensions with duplicate labels, merge them into one dimension + which represents the diagonal elements. That requires the duplicate labeled + dimensions equal sized. The order of dimensions is kept unchanged up to + the left-most appearance of each label. + + Examples + -------- + 'ijj...i' would be merged into 'ij...' + ''' + if not has_duplicated_labels(labels): + return labels, operand + + strides = dim_strides(operand.shape) + shape = operand.shape + new_labels = [] + new_shape = [] + new_strides = [] + + for ax, l in enumerate(labels): + if l == '.' 
or l not in new_labels: + # not duplicate + new_labels.append(l) + new_strides.append(strides[ax]) + new_shape.append(shape[ax]) + else: + # duplicate label + diag_ax = new_labels.index(l) + new_strides[diag_ax] += strides[ax] + + # Call framework API to build a new tensor + new_op = create_view(operand, new_shape, new_strides) + return new_labels, new_op + + +def prod(iter, default=1): + if len(iter): + res = 1 + for s in iter: + res *= s + return res + return default + + +def plan_reduce(plan, op, reduce_dims, keepdim): + ''' + Add reduce to the plan + ''' + varname = f'op{op}' + + f = lambda var, dims: paddle_sum(var, dims, keepdim=keepdim) + step = f, [varname], varname, reduce_dims + plan.add_step(step) + + +def plan_scalar_prod(plan, op1, op2): + varnames = [f'op{op1}', f'op{op2}'] + f = lambda var1, var2: paddle_sum(var1) * var2 + step = f, varnames, varnames[1] + plan.add_step(step) + + +def plan_matmul(plan, g_view, op1, op2, g_op_masks, g_shape, I, J1, J2, K): + ''' + plan matmul + ''' + # Transpose and re-shape op1 and op2 in I, J1, K and I, J2, K + # Then apply matmul(x, y, transpose_x=False, tranpose_y=True) + var1, var2 = f'op{op1}', f'op{op2}' + + op1_view, op2_view = [g_view[op] for op in (op1, op2)] + + # Note, I may index into -1 + I1_dims = [op1_view[ax] for ax in I if op1_view[ax] >= 0] + I2_dims = [op2_view[ax] for ax in I if op2_view[ax] >= 0] + J1_dims = [op1_view[ax] for ax in J1] + J2_dims = [op2_view[ax] for ax in J2] + K1_dims = [op1_view[ax] for ax in K] + K2_dims = [op2_view[ax] for ax in K] + + op1_mask, op2_mask = [g_op_masks[op] for op in (op1, op2)] + op1_vshape = [s if m else 1 for s, m in zip(g_shape, op1_mask)] + op2_vshape = [s if m else 1 for s, m in zip(g_shape, op2_mask)] + + I1_shape, J1_shape, K1_shape = [[op1_vshape[ax] for ax in axes] + for axes in (I, J1, K)] + I2_shape, J2_shape, K2_shape = [[op2_vshape[ax] for ax in axes] + for axes in (I, J2, K)] + + K1_size, J1_size, J2_size = prod(K1_shape), prod(J1_shape), prod(J2_shape) + + perm1 = I1_dims + J1_dims + K1_dims + perm2 = I2_dims + J2_dims + K2_dims + + if any(i != dim for i, dim in enumerate(perm1)): + # print(f'perm1: {perm1}') + step = transpose, [var1], var1, perm1 + plan.add_step(step) + + if any(i != dim for i, dim in enumerate(perm2)): + # print(f'perm2: {perm2}') + step = transpose, [var2], var2, perm2 + plan.add_step(step) + + # In case of no K... dimensions, do a broadcast + if not K: + # unsqueeze operands include J1...J2... dimensions + if J2: + fill_start = len(I2_dims) + len(J1) + fill_end = fill_start + len(J2) + fill = list(range(fill_start, fill_end)) + step = unsqueeze, [var1], var1, fill + plan.add_step(step) + if J1: + fill_start = len(I2_dims) + fill_end = fill_start + len(J1) + fill = list(range(fill_start, fill_end)) + step = unsqueeze, [var2], var2, fill + plan.add_step(step) + # make broadcast + step = multiply, [var1, var2], var2 + plan.add_step(step) + # K... are there, let's reason about I... and J... + # In case I... and J... are empty, do the vector-vector version of matmul + elif not I and not J1 and not J2: + # merge K dimensions + if len(K) > 1: + for var in var1, var2: + step = reshape, [var], var, [K1_size] + plan.add_step(step) + # Build vector-vector matmul + step = matmul, [var1, var2], var2 + plan.add_step(step) + # General case, there are K... and some I... and J..., the actual operation will be + # matrix-vector or matrix-matrix multiplies, depending on the operands' shapes. 
+ else: + # Merge J dims and K dims by reshaping + merged_shape1 = I1_shape + [J1_size] + [K1_size] + merged_shape2 = I2_shape + [J2_size] + [K1_size] + + step = reshape, [var1], var1, merged_shape1 + plan.add_step(step) + step = reshape, [var2], var2, merged_shape2 + plan.add_step(step) + + # Matmul + step = matmul, [var1, var2], var2, False, True + plan.add_step(step) + + # The result shape is in I..., J1, J2. Let's reshape back to known dimensions + # Note, this is static deduction, not by reading the tensor shape at runtime + result_shape = [1] * len(I) + for i, ax in enumerate(I): + result_shape[i] = max(op1_vshape[ax], op2_vshape[ax]) + if J1: + result_shape += J1_shape + if J2: + result_shape += J2_shape + + # Need a scalar dimension somehow + if result_shape: + step = reshape, [var2], var2, result_shape + plan.add_step(step) + + # Wrap up, updating auxiliary data + # Updating g_mask for I and J axes + for i, ax in enumerate(I + J1 + J2): + op2_mask[ax] = (result_shape[i] > 1) + + for ax in K: + op2_mask[ax] = False + + for ax in range(len(op2_view)): + op2_view[ax] = -1 + dim = 0 + for ax in I + J1 + J2: + op2_view[ax], dim = dim, dim + 1 + + +def plan_summation(plan, g_view, op1, op2, g_op_masks, g_shape, g_count, + n_bcast): + ''' + Plan various kinds of summation + ''' + op1_view, op2_view = g_view[op1], g_view[op2] + op1_mask, op2_mask = g_op_masks[op1], g_op_masks[op2] + + ndim = len(op1_view) + nout = ndim - len(g_count) + + count = [0] * nout + g_count + + I, K, J1, J2 = list(range(n_bcast)), [], [], [] + + for ax, dim1, dim2 in zip( + range(n_bcast, ndim), op1_view[n_bcast:], op2_view[n_bcast:]): + + if (dim1 != -1) != (dim2 != -1): + if dim1 != -1: + J1.append(ax) + else: + J2.append(ax) + elif dim1 != -1: + fold = int(op1_mask[ax]) + int(op2_mask[ax]) + if ax >= nout and fold == count[ax]: + # Ready to fold the dimensions + K.append(ax) + count[ax] -= fold + else: + I.append(ax) + count[ax] -= max(fold - 1, 0) + + # Update g_count + g_count[:] = count[nout:] + + # Now it's OK to merge the K dims as the same shape holds + # print(f'I: {I} J1: {J1} J2: {J2} K: {K}') + plan_matmul(plan, g_view, op1, op2, g_op_masks, g_shape, I, J1, J2, K) + + +def rearrange(axes): + perm, fill = [], [] + for ax, dim in enumerate(axes): + if dim < 0: + fill.append(ax) + else: + perm.append(dim) + # Trivial permutation returns [] + if all(i == dim for i, dim in enumerate(perm)): + perm = [] + + return perm, fill + + +def plan_broadcast(plan, operands, nop_axes): + ''' + Plan broadcast across + ''' + nop = len(operands) + varnames = [f'op{i}' for i in range(nop)] + + for i, op_axes in zip(range(nop), nop_axes): + # Re-arrange the dimesions according to the global layout + perm, fill = rearrange(op_axes) + var = varnames[i] + if perm: + step = transpose, [var], var, perm + plan.add_step(step) + if fill: + step = unsqueeze, [var], var, fill + plan.add_step(step) + + def f(*args): + expr = ' * '.join(varnames) + return eval(expr, dict(zip(varnames, args))) + + step = f, varnames, None + plan.add_step(step) + + +class Plan: + def __init__(self): + self.env = {} + self.steps = [] + + def add_step(self, step): + self.steps.append(step) + + def get_var(self, varname): + return self.env[varname] if varname in self.env else None + + def set_var(self, varname, var): + self.env[varname] = var + + def show(self): + res = None + for f, in_varnames, out_varname, *args in self.steps: + print(repr((out_varname, f, *in_varnames, *args))) + return res + + def execute(self): + res = None + for f, in_varnames, 
out_varname, *args in self.steps: + res = f(*map(self.get_var, in_varnames), *args) + if out_varname: + self.set_var(out_varname, res) + return res + + +def plan_einsum(operands, g_view, g_shape, g_op_masks, g_count, n_bcast): + ''' + Plans the actual execution steps. + Results + ------- + the execution plan + ''' + nop = len(operands) + ndim = len(g_view[0]) + nout = ndim - len(g_count) + + # Initialize a plan with an environment + plan = Plan() + op_names = [f'op{i}' for i in range(nop)] + list(map(plan.set_var, op_names, operands)) + + # In case no dimensions to combine, do broadcast straight across + if not g_count: + plan_broadcast(plan, operands, g_view) + return plan + + # Down count axis >= nout and degenerate dimensions (masked is not set) + for view, mask in zip(g_view, g_op_masks): + down_count = [ + 1 if (dim > -1 and not masked) else 0 + for dim, masked in zip(view[nout:], mask[nout:]) + ] + for i, d in enumerate(down_count): + g_count[i] -= d + + # Reduce any dimension for which g_mask is set and g_count == 1 + for i, view, mask in zip(range(nop), g_view, g_op_masks): + to_reduce = [] + for dim, masked, count in zip(view[nout:], mask[nout:], g_count): + to_reduce.append(dim if (masked and count == 1) else -1) + + reduce_dims = list(filter(lambda x: x > -1, to_reduce)) + if reduce_dims: + plan_reduce(plan, i, reduce_dims, keepdim=True) + + # Unset mask and decrease g_count for the reduced dimensions + for i, d in enumerate(to_reduce): + ax = i + nout + mask[ax] = mask[ax] and (d == -1) + g_count[i] -= 0 if d == -1 else 1 + + # Plan the summations over the operand sequence + for i in range(nop): + # plan a single step + + if i == 0: + continue + + # We'd like to arrange the dimensions in the following way: + # [I... J... K...] + # [I... J... K...] + # where + # I... are aligned and not to be combined immediately + # J... are not aligned and not to be combined immediately + # K... are aligned and should be immediately combined + # At this point the non-trivial broadcast dimensinos in K are already reduced + # and removed. That means all K dimensions are aligned and their sizes are not 1. + # We then inspect the layout of I,J,K plus the above observation to make + # specializatoin decisions. The current strategy is set as follows: + # (1) if I... J... K... are all empty, it's multiplying a scalar + # (2) if K... are empty, better use a broadcast + # (3) if I... J... empty and K... not empty, a vector-vector multiply (or a dot) + # (4) Elsewise, either I... or J... not empty, and K... 
not empty, use a general matmul + + # Resolve the summation kind: dot, matmul or * + if not any(g_op_masks[i - 1]): + # op1 is a scalar + plan_scalar_prod(plan, i - 1, i) + else: + plan_summation(plan, g_view, i - 1, i, g_op_masks, g_shape, g_count, + n_bcast) + + # for ax, dim in enumerate(g_view[nop-1][:nout]): + # assert dim == ax + assert all(not masked for masked in g_op_masks[nop - 1][nout:]) + + view = g_view[-1] + if any(ax != dim for ax, dim in enumerate(view[:nout])): + perm = [dim for dim in view if dim >= 0] + varname = f'op{nop-1}' + step = transpose, [varname], varname, perm + plan.add_step(step) + dim = 0 + for ax, d in enumerate(view): + if d != -1: + view[ax], dim = dim, dim + 1 + + squeeze_dims = [dim for dim in view[nout:] if dim != -1] + if squeeze_dims: + # plan_reduce(plan, nop-1, reduce_dims, keepdim=False) + varname = f'op{nop-1}' + step = squeeze, [varname], varname, squeeze_dims + plan.add_step(step) + + return plan + + +@dygraph_only +def einsum(equation, *operands): + r""" + einsum(equation, *operands) + + The current version of this API should be used in dygraph only mode. + + Einsum offers a tensor operation API which allows using the Einstein summation + convention or Einstain notation. It takes as input one or multiple tensors and + produces as output one tensor. + + Einsum is able to perform a variety of tensor operations. Following lists a few: + + - for single operand + - trace + - diagonal + - transpose + - sum + - for double operands + - dot + - outer + - broadcasting and elementwise multiply + - matrix multiply + - batched matrix multiply + - for many operads + - broadcasting multiply + - chained matrix multiply + + **The summation notation** + + - The tensor dimensions are labeled using uncased English letters. E.g., `ijk` + relates to a three dimensional tensor whose dimensions are labeled i, j, and k. + - The equation is `,` separated into terms, each being a distinct input's + dimension label string. + - Ellipsis `...` enables broadcasting by automatically converting the unlabeled + dimensions into broadcasting dimensions. + - Singular labels are called free labels, duplicate are dummy labels. Dummy labeled + dimensions will be reduced and removed in the output. + - Output labels can be explicitly specified on the right hand side of `->` or omitted. + In the latter case, the output labels will be inferred from the input labels. + - Inference of output labels + - Broadcasting label `...`, if present, is put on the leftmost position. + - Free labels are reordered alphabetically and put after `...`. + - On explicit output labels + - If broadcasting is enabled, then `...` must be present. + - The output labels can be an empty, an indication to output as a scalar + the sum over the original output. + - Non-input labels are invalid. + - Duplicate labels are invalid. + - For any dummmy label which is present for the output, it's promoted to + a free label. + - For any free label which is not present for the output, it's lowered to + a dummy label. + - Examples + - '...ij, ...jk',where i and k are free labels, j is dummy. The output label + string is '...ik' + - 'ij -> i', where i is a free label and j is a dummy label. + - '...ij, ...jk -> ...ijk',where i, j and k are all free labels. + - '...ij, ...jk -> ij', an invalid equation since `...` is not present for + the output. + + **The summation rule** + + The summation procedure can be outlined as follows, although the actual steps taken + may vary significantly due to implementation specific optimization. 
+ + - Step 1: preparation for broadcasting, that is, transposing and unsqueezing + the input operands to have each resulting dimension identically labeled across + all the input operands. + - Step 2: broadcasting multiply all the resulting operands from step 1. + - Step 3: reducing dummy labeled dimensions. + - Step 4: transposing the result tensor to match the output labels. + + **On trace and diagonal** + + The trace and diagonal are planned yet unimplemented features. + + Args: + equation (`str`): + The summation terms using the Einstein summation notation. + operands (`list|Tensor`): + The input tensors over which to compute the Einstein summation. The number of + operands should equal the number of input terms in the equation. + + Returns: + result (`Tensor`): the result tensor. + + Examples: + .. code-block:: python + + import paddle + paddle.seed(102) + x = paddle.rand([4]) + y = paddle.rand([5]) + + # sum + print(paddle.einsum('i->', x)) + # Tensor(shape=[], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # 1.95791852) + + # dot + print(paddle.einsum('i,i->', x, x)) + # Tensor(shape=[1], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # [1.45936954]) + + # outer + print(paddle.einsum("i,j->ij", x, y)) + # Tensor(shape=[4, 5], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # [[0.00079869, 0.00120950, 0.00136844, 0.00187187, 0.00192194], + # [0.23455200, 0.35519385, 0.40186870, 0.54970956, 0.56441545], + # [0.11773264, 0.17828843, 0.20171674, 0.27592498, 0.28330654], + # [0.32897076, 0.49817693, 0.56364071, 0.77099484, 0.79162055]]) + + A = paddle.rand([2, 3, 2]) + B = paddle.rand([2, 2, 3]) + + # transpose + print(paddle.einsum('ijk->kji', A)) + # Tensor(shape=[2, 3, 2], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # [[[0.95649719, 0.49684682], + # [0.80071914, 0.46258664], + # [0.49814570, 0.33383518]], + # + # [[0.07637714, 0.29374704], + # [0.51470858, 0.51907635], + # [0.99066722, 0.55802226]]]) + + # batch matrix multiplication + print(paddle.einsum('ijk, ikl->ijl', A,B)) + # Tensor(shape=[2, 3, 3], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # [[[0.32172769, 0.50617385, 0.41394392], + # [0.51736701, 0.49921003, 0.38730967], + # [0.69078457, 0.42282537, 0.30161136]], + # + # [[0.32043904, 0.18164253, 0.27810261], + # [0.50226176, 0.24512935, 0.39881429], + # [0.51476848, 0.23367381, 0.39229113]]]) + + # Ellipsis transpose + print(paddle.einsum('...jk->...kj', A)) + # Tensor(shape=[2, 2, 3], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # [[[0.95649719, 0.80071914, 0.49814570], + # [0.07637714, 0.51470858, 0.99066722]], + # + # [[0.49684682, 0.46258664, 0.33383518], + # [0.29374704, 0.51907635, 0.55802226]]]) + + # Ellipsis batch matrix multiplication + print(paddle.einsum('...jk, ...kl->...jl', A,B)) + # Tensor(shape=[2, 3, 3], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # [[[0.32172769, 0.50617385, 0.41394392], + # [0.51736701, 0.49921003, 0.38730967], + # [0.69078457, 0.42282537, 0.30161136]], + # + # [[0.32043904, 0.18164253, 0.27810261], + # [0.50226176, 0.24512935, 0.39881429], + # [0.51476848, 0.23367381, 0.39229113]]]) + """ + + nop = len(operands) + assert nop > 0, "At least one operand is expected." + + # Part the equation to left hand side and right hand side + lhs, *rhs = equation.lower().replace(' ', '').split('->') + assert len(rhs) < 2, "Invalid equation: multiple `->` were found." 
+ + # Note, we distinguish between 'ij->' and 'ij' by setting rhs to '' and None + rhs = rhs[0] if rhs else None + + # Parse labels for each operand and count the number of occurrences for each alphabet label + nop_labels = parse_labels(lhs, operands) + + # Diagonalize the operands which have duplicate labels + nop_labels, operands = list(zip(*map(diagonalize, nop_labels, operands))) + + # To handle broadcasting, we should first know how many dimensions are there + # We need to use that number to generate output labels + # e.g. 1 for ['ij', 'i.', '.k'] + n_bcast_dims = max(map(lambda s: s.count('.'), nop_labels)) + + # Build the data structures for planning. It's helpful to think of all the operands + # broadcasting together from a global view. In this view, dimensions from multiple + # operands are mapped to the same position if they are labeled uniquely. Broadcasting + # dimensions are mapped to adjacent positions with the right bound fixed. Subject to + # each operand, the map is injective but for all operands the map is on-to. + # g_labels: + # The labels of the global view + # g_view: + # Includes a list of maps from each operand's dimensions to the global view's dimensions + # which we refer to as ax or axes in the code to distinguish from operand's dims + # g_shape: + # The shape of the global view. The size of each dimension is what the aligned dimensions + # should broadcast to + # g_nout: + # Number of output axes + # g_op_masks + # A list of masks that specify each operand's non-trivial dimensions + # g_count + # Counting how many non-trivial dimensions remain for each ax + + g_labels, g_view, g_nout, g_count = build_global_view(nop_labels, rhs, + n_bcast_dims) + g_shape, g_op_masks = build_global_shape(g_view, g_labels, + [op.shape for op in operands]) + + # Now we're ready to build up an execution plan + args = operands, g_view, g_shape, g_op_masks, g_count, n_bcast_dims + plan = plan_einsum(*args) + result = plan.execute() + + return result From 2cd05d5ddcaa1942f70af10089b88e572091f5ce Mon Sep 17 00:00:00 2001 From: WangXi Date: Sat, 14 Aug 2021 07:52:33 +0800 Subject: [PATCH 040/126] [hybrid] refine pipeline stage and mp send/recv check (#34870) --- python/paddle/fluid/optimizer.py | 68 +++++++++++++++----------------- 1 file changed, 31 insertions(+), 37 deletions(-) diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 7ad94f4be3eb2f..3cb6d24c86faf2 100755 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -4397,6 +4397,10 @@ def _is_loss_grad_op(self, op): return op_role & int(self._op_role.Backward) and op_role & int( self._op_role.Loss) + def _is_forward_op(self, op): + return self._op_role_key in op.attr_names and ( + int(op.attr(self._op_role_key)) == int(self._op_role.Forward)) + def _is_backward_op(self, op): return self._op_role_key in op.attr_names and ( int(op.attr(self._op_role_key)) & int(self._op_role.Backward)) @@ -4705,10 +4709,6 @@ def _check_validation(self, block): int(self._op_role.Optimize), int(self._op_role.Backward) | int(self._op_role.Loss), ] - pre_stage_id = None - decrease_flag = False - in_optimize = False - in_forward = True for op in block.ops: if not op._has_kernel(op.type): assert op.type == "conditional_block" and ( @@ -4724,10 +4724,6 @@ def _check_validation(self, block): op_role, op.type, valid_op_role_value) - if int(op_role) == int(self._op_role.Optimize): - in_optimize = True - if int(op_role) == int(self._op_role.Backward): - in_forward = False assert 
op.has_attr(self._op_device_key), ( "op ({}) has no {} attribute.".format(op.type, @@ -4739,7 +4735,6 @@ def _check_validation(self, block): if device == f"{self._device}:all": continue dev_type = device.split(':')[0] - stage_id = int(device.split(':')[1]) assert dev_type == "gpu" or dev_type == 'npu', ( "Now only gpu and npu devices are supported " "for pipeline parallelism.") @@ -4747,26 +4742,6 @@ def _check_validation(self, block): if device not in device_list: device_list.append(device) - if not in_optimize: - if pre_stage_id is not None: - interval = stage_id - pre_stage_id - assert abs(interval) <= 1, \ - "The stage interval of two consecutive ops in the pipeline must be < = 1," \ - "but the interval of op={} and prev op is {}".format(op, interval) - # stage must be in order, such as Forward(0 1 2 3 4), Backward(4 3 2 1 0) - # if stage is unordered, such as Forward(0 1 2 3 4 3 4), will report error - if in_forward: - assert interval >= 0, \ - "Pipeline stage must be sequential increment in Forward, prev_stage={}, " \ - "please check the stage of op={}".format(pre_stage_id, op) - else: - # FIXME(wangxi): recompute check failed - pass - #assert interval <=0, \ - # "Pipeline stage must be sequential decrement in Backward, prev_stage={}, " \ - # "please check the stage of op={}".format(pre_stage_id, op) - pre_stage_id = stage_id - return device_list def _insert_sendrecv_ops_for_boundaries(self, block): @@ -4820,6 +4795,25 @@ def _insert_sendrecv_ops_for_boundaries(self, block): device_type = cur_device.split(':')[0] + ':' + def _check_stage(cur_id, prev_id): + # check send/recv stage valid + is_forward = self._is_forward_op(op) + is_backward = self._is_backward_op(op) + assert is_forward or is_backward, \ + 'send/recv in pipeline should only be inserted in forward or backward,' \ + 'please check the op_role of op={}'.format(op) + + if is_forward: + assert prev_id < cur_id, \ + "In forward, send/recv can only be passed forward, but now " \ + "prev_stage={} great than cur_stage={}, please check op_device of op={}".format( + prev_id, cur_id, op) + elif is_backward: + assert prev_id > cur_id, \ + "In backward, send/recv can only be passed backward, but now " \ + "prev_stage={} less than cur_stage={}, please check op_device of op={}".format( + prev_id, cur_id, op) + def _insert_send_recv(cur_id, prev_id): cur_dev = device_type + str(cur_id) prev_dev = device_type + str(prev_id) @@ -4890,9 +4884,9 @@ def _insert_send_recv(cur_id, prev_id): var_shape[0] = self.micro_batch_size if var_shape[ 0] < 0 else var_shape[0] - numel = np.prod(var.shape) - assert numel % self.mp_degree == 0, \ - "The numel={} must be divisible by mp_degree={}".format(numel, self.mp_degree) + numel = np.prod(var_shape) + use_mp = (self.mp_degree > 1) and ( + numel % self.mp_degree == 0) if 'subprog' in var.name: # For recompute, if the checkpoints var is layer_norm_6.tmp_2 @@ -4919,6 +4913,8 @@ def _insert_send_recv(cur_id, prev_id): extra_index_info['index'] += 1 return + _check_stage(cur_id, prev_id) + block._insert_op_without_sync( index=index + extra_index_info['index'], type='c_sync_calc_stream', @@ -4931,8 +4927,7 @@ def _insert_send_recv(cur_id, prev_id): extra_index_info['index'] += 1 block._insert_op_without_sync( index=index + extra_index_info['index'], - type='send_v2' - if self.mp_degree == 1 else 'partial_send', + type='send_v2' if not use_mp else 'partial_send', inputs={'X': var}, attrs={ self._op_device_key: prev_dev, @@ -4968,8 +4963,7 @@ def _insert_send_recv(cur_id, prev_id): extra_index_info['index'] += 
1 block._insert_op_without_sync( index=index + extra_index_info['index'], - type='recv_v2' - if self.mp_degree == 1 else 'partial_recv', + type='recv_v2' if not use_mp else 'partial_recv', outputs={'Out': [var]}, attrs={ 'out_shape': var_shape, @@ -4984,7 +4978,7 @@ def _insert_send_recv(cur_id, prev_id): 'id': self.mp_rank, }) extra_index_info['index'] += 1 - if self.mp_degree > 1: + if use_mp: block._insert_op_without_sync( index=index + extra_index_info['index'], type='partial_allgather', From 7316018d372d8bfdf0bd5f4f5784d817ad8ff66a Mon Sep 17 00:00:00 2001 From: ronnywang <524019753@qq.com> Date: Sun, 15 Aug 2021 21:01:01 -0500 Subject: [PATCH 041/126] [NPU] add p_norm_op_npu (#34695) * add p_norm_op_npu * remove p_norm_grad op * update --- paddle/fluid/operators/p_norm_op_npu.cc | 92 ++++++++++ .../tests/unittests/npu/test_p_norm_op_npu.py | 160 ++++++++++++++++++ 2 files changed, 252 insertions(+) create mode 100644 paddle/fluid/operators/p_norm_op_npu.cc create mode 100644 python/paddle/fluid/tests/unittests/npu/test_p_norm_op_npu.py diff --git a/paddle/fluid/operators/p_norm_op_npu.cc b/paddle/fluid/operators/p_norm_op_npu.cc new file mode 100644 index 00000000000000..3c5d1a36e9c273 --- /dev/null +++ b/paddle/fluid/operators/p_norm_op_npu.cc @@ -0,0 +1,92 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/p_norm_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +template +class PnormNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in_x = ctx.Input("X"); + auto* out_norm = ctx.Output("Out"); + out_norm->mutable_data(ctx.GetPlace()); + + float porder = ctx.Attr("porder"); + int axis = ctx.Attr("axis"); + bool keepdim = ctx.Attr("keepdim"); + + auto xdim = in_x->dims(); + if (axis < 0) axis = xdim.size() + axis; + + auto stream = + ctx.template device_context() + .stream(); + + int p = 0; + bool combine_op = + !(porder == 0 || porder == INFINITY || porder == -INFINITY); + if (porder == INFINITY) { + p = INT_MAX; + } else if (porder == -INFINITY) { + p = INT_MIN; + } else { + p = static_cast(porder); + float t = 0; + float diff = abs(std::modf(porder, &t)); + if (diff < 1e-5) { + combine_op = false; + } + } + + if (!combine_op) { + const auto& runner = NpuOpRunner("LpNorm", {*in_x}, {*out_norm}, + {{"p", p}, + {"axes", std::vector({axis})}, + {"keep_dims", keepdim}}); + runner.Run(stream); + } else { + Tensor tmp_x; + tmp_x.mutable_data(xdim, ctx.GetPlace()); + + const auto& power_runner1 = + NpuOpRunner("Power", {*in_x}, {tmp_x}, + {{"power", porder}, {"scale", 1.0f}, {"shift", 0.0f}}); + power_runner1.Run(stream); + + const auto& reduce_runner = NpuOpRunner( + "ReduceSumD", {tmp_x}, {*out_norm}, + {{"axes", std::vector({axis})}, {"keep_dims", keepdim}}); + reduce_runner.Run(stream); + + const auto& power_runner2 = NpuOpRunner( + "Power", {*out_norm}, {*out_norm}, + {{"power", 1 / porder}, {"scale", 1.0f}, {"shift", 0.0f}}); + power_runner2.Run(stream); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL( + p_norm, ops::PnormNPUKernel, + ops::PnormNPUKernel); diff --git a/python/paddle/fluid/tests/unittests/npu/test_p_norm_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_p_norm_op_npu.py new file mode 100644 index 00000000000000..9f990c0e29f6eb --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_p_norm_op_npu.py @@ -0,0 +1,160 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
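# (editorial note, not part of the original patch) the `p_norm` helper imported
# below from test_norm_all is used as the numpy reference for these tests; the
# p-norm it models reduces along `axis` as
#     out = (sum(|x| ** porder)) ** (1.0 / porder)    for finite porder != 0
# with porder=inf giving max(|x|), porder=-inf giving min(|x|), and porder=0
# counting the non-zero entries along that axis.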
+ +import sys +import unittest +import numpy as np +sys.path.append("..") + +import paddle +from op_test import OpTest +from test_norm_all import p_norm + +paddle.enable_static() + + +class TestPnormOp(OpTest): + def set_npu(self): + self.__class__.use_npu = True + self.__class__.no_need_check_grad = True + + def setUp(self): + self.set_npu() + self.op_type = "p_norm" + self.init_test_case() + x = (np.random.random(self.shape) + 0.5).astype(self.dtype) + norm = p_norm(x, self.axis, self.porder, self.keepdim) + self.inputs = {'X': x} + self.attrs = { + 'epsilon': self.epsilon, + 'axis': self.axis, + 'keepdim': self.keepdim, + 'porder': float(self.porder) + } + self.outputs = {'Out': norm} + self.gradient = self.calc_gradient() + + def test_check_output(self): + if self.dtype == "float16": + self.check_output_with_place(paddle.NPUPlace(0), atol=5e-3) + else: + self.check_output_with_place(paddle.NPUPlace(0)) + + def init_test_case(self): + self.shape = [2, 3, 4, 5] + self.axis = 1 + self.epsilon = 1e-12 + self.porder = 2.0 + self.keepdim = False + self.init_dtype() + + def init_dtype(self): + self.dtype = "float32" + + def calc_gradient(self): + self.attrs = { + 'epsilon': self.epsilon, + 'axis': self.axis, + 'keepdim': self.keepdim, + 'porder': float(self.porder) + } + x = self.inputs["X"] + porder = self.attrs["porder"] + axis = self.attrs["axis"] + if porder == 0: + grad = np.zeros(x.shape).astype(x.dtype) + elif porder in [float("inf"), float("-inf")]: + norm = p_norm(x, axis=axis, porder=porder, keepdims=True) + x_abs = np.abs(x) + grad = np.sign(x) + grad[x_abs != norm] = 0.0 + else: + norm = p_norm(x, axis=axis, porder=porder, keepdims=True) + grad = np.power(norm, 1 - porder) * np.power( + np.abs(x), porder - 1) * np.sign(x) + + numel = 1 + for s in x.shape: + numel *= s + numel /= x.shape[axis] + return [grad.astype(x.dtype) * 1 / numel] + + +class TestPnormOp2(TestPnormOp): + def init_test_case(self): + self.shape = [3, 20, 3] + self.axis = 2 + self.epsilon = 1e-12 + self.porder = 2.0 + self.keepdim = True + self.init_dtype() + + +class TestPnormOp3(TestPnormOp): + def init_test_case(self): + self.shape = [3, 20, 3] + self.axis = 2 + self.epsilon = 1e-12 + self.porder = np.inf + self.keepdim = True + self.init_dtype() + + +class TestPnormOp4(TestPnormOp3): + def init_test_case(self): + self.shape = [3, 20, 3] + self.axis = 2 + self.epsilon = 1e-12 + self.porder = -np.inf + self.keepdim = True + self.init_dtype() + + +class TestPnormOp5(TestPnormOp3): + def init_test_case(self): + self.shape = [3, 20, 3] + self.axis = 2 + self.epsilon = 1e-12 + self.porder = 0 + self.keepdim = True + self.init_dtype() + + +class TestPnormOpfp16(TestPnormOp): + def init_dtype(self): + self.dtype = "float16" + + +class TestPnormOp2fp16(TestPnormOp2): + def init_dtype(self): + self.dtype = "float16" + + +class TestPnormOp3fp16(TestPnormOp3): + def init_dtype(self): + self.dtype = "float16" + + +class TestPnormOp4fp16(TestPnormOp4): + def init_dtype(self): + self.dtype = "float16" + + +class TestPnormOp5fp16(TestPnormOp5): + def init_dtype(self): + self.dtype = "float16" + + +if __name__ == "__main__": + unittest.main() From 8fb17fc72b228b7916c0aaab43a0537f3d4e20f0 Mon Sep 17 00:00:00 2001 From: tianshuo78520a <707759223@qq.com> Date: Mon, 16 Aug 2021 10:41:29 +0800 Subject: [PATCH 042/126] Op-benchmark CI cpu and gpu (#34631) * notest;pm-op-benchmark * notest;pm-op-benchmark * notest;pm-op-benchmark * notest;pm-op-benchmark * notest;pm-op-benchmark * notest;pm-op-benchmark * notest;test=op_benchmark * 
notest;test=op_benchmark * notest;op_benchmark * notest;op_benchmark * notest;op_benchmark * notest;op_benchmark * notest;op_benchmark * notest;op_benchmark * notest;test=op_benchmark * notest;op_benchmark * notest;op_benchmark * notest;op_benchmark * fix * fix --- tools/test_ci_op_benchmark.sh | 314 ++++++++++++++++++++++++++++++++++ 1 file changed, 314 insertions(+) create mode 100644 tools/test_ci_op_benchmark.sh diff --git a/tools/test_ci_op_benchmark.sh b/tools/test_ci_op_benchmark.sh new file mode 100644 index 00000000000000..ff1024ba948ed8 --- /dev/null +++ b/tools/test_ci_op_benchmark.sh @@ -0,0 +1,314 @@ +#!/bin/bash + +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set +ex + +[ -z "$PADDLE_ROOT" ] && PADDLE_ROOT=$(cd $(dirname ${BASH_SOURCE[0]})/.. && pwd) + +# PR modify op source files +CHANGE_OP_FILES=() + +# ops that will run benchmark test +declare -A CHANGE_OP_MAP + +# ops that benchmark repo has +declare -A BENCHMARK_OP_MAP + +# searched header files +declare -A INCLUDE_SEARCH_MAP + +function LOG { + echo "[$0:${BASH_LINENO[0]}] $*" >&2 +} + +# Limit cu file directory +function match_cu_file_directory { + local sub_dir cu_file_dir + cu_file_dir=$(dirname ${1}) + for sub_dir in "" "/elementwise" "/reduce_ops" + do + [ "${cu_file_dir}" == "paddle/fluid/operators${sub_dir}" ] && return 0 + done + return 1 +} + +# Load op files by header file +function load_CHANGE_OP_FILES_by_header_file { + local change_file + for change_file in $(grep -rl "${1}" paddle/fluid/operators) + do + if [[ "$change_file" =~ "_op.cu" ]] + then + # match cu file directory limit + match_cu_file_directory $change_file || continue + LOG "[INFO] Found \"${1}\" include by \"${change_file}\"." + CHANGE_OP_FILES[${#CHANGE_OP_FILES[@]}]="$change_file" + elif [[ "$change_file" =~ ".h" ]] + then + [ -n "${INCLUDE_SEARCH_MAP[$change_file]}" ] && continue + LOG "[INFO] Found \"${1}\" include by \"${change_file}\", keep searching." + INCLUDE_SEARCH_MAP[$change_file]="searched" + load_CHANGE_OP_FILES_by_header_file $change_file + fi + done +} + +# Load op files that PR changes +function load_CHANGE_OP_FILES { + local sub_dir change_file + # TODO(Avin0323): Need to filter the files added by the new OP. + for change_file in $(git diff --name-only origin/develop) + do + # match directory limit + [[ "$change_file" =~ "paddle/fluid/operators/" ]] || continue + # match file name limit + if [[ "$change_file" =~ "_op.cu" ]] + then + # match cu file directory limit + match_cu_file_directory $change_file || continue + LOG "[INFO] Found \"${change_file}\" changed." + CHANGE_OP_FILES[${#CHANGE_OP_FILES[@]}]="$change_file" + elif [[ "$change_file" =~ ".h" ]] + then + LOG "[INFO] Found \"${change_file}\" changed, keep searching." + INCLUDE_SEARCH_MAP[${change_file}]="searched" + load_CHANGE_OP_FILES_by_header_file $change_file + fi + done + [ ${#CHANGE_OP_FILES[@]} -eq 0 ] && LOG "[INFO] No op to test, skip this ci." 
&& \ + echo "cpu_benchmark=ON" >${cfs_dir}/op_benchmark/${AGILE_PULL_ID}/${AGILE_REVISION}/pass.txt && \ + exit 0 +} + +# Clone benchmark repo +function prepare_benchmark_environment { + LOG "[INFO] Clone benchmark repo ..." + git clone https://github.com/PaddlePaddle/benchmark.git + [ $? -ne 0 ] && LOG "[FATAL] Clone benchmark repo fail." && exit -1 + LOG "[INFO] Collect api info ..." + python benchmark/api/deploy/collect_api_info.py \ + --test_module_name tests_v2 \ + --info_file api_info.txt >& 2 + [ $? -ne 0 ] && LOG "[FATAL] Collect api info fail." && exit -1 + [ ! -f benchmark/ci/scripts/op_benchmark.config ] && LOG "[FATAL] Missing op_benchmark.config!" && exit -1 +} + +# Load unique op name from CHANGE_OP_FILES +function load_CHANGE_OP_MAP { + local op_name change_file change_file_name + source benchmark/ci/scripts/op_benchmark.config + for change_file in ${CHANGE_OP_FILES[@]} + do + change_file_name=${change_file#*paddle/fluid/operators/} + if [ -n "${PADDLE_FILENAME_OP_MAP[$change_file_name]}" ] + then + for op_name in ${PADDLE_FILENAME_OP_MAP[$change_file_name]} + do + LOG "[INFO] Load op: \"${op_name}\"." + CHANGE_OP_MAP[${op_name}]="$change_file" + done + else + op_name=${change_file_name##*/} + op_name=${op_name%_cudnn_op*} + op_name=${op_name%_op*} + [ -n "${SKIP_OP_MAP[$op_name]}" ] && continue + LOG "[INFO] Load op: \"${op_name}\"." + CHANGE_OP_MAP[${op_name}]="$change_file" + fi + done +} + +# Load ops that will run benchmark test +function load_BENCHMARK_OP_MAP { + local line op_name api_name + source benchmark/ci/scripts/op_benchmark.config + for line in $(cat api_info.txt) + do + api_name=${line%%,*} + if [ -n "${BENCHMARK_APINAME_OP_MAP[$api_name]}" ] + then + op_name=${BENCHMARK_APINAME_OP_MAP[$api_name]} + else + op_name=$api_name + fi + if [ -n "${CHANGE_OP_MAP[$op_name]}" ] + then + LOG "[INFO] Load benchmark settings with op \"${op_name}\"." + BENCHMARK_OP_MAP[$op_name]=$line + fi + done +} + +# compile and install paddlepaddle +function compile_install_paddlepaddle { + LOG "[INFO] Compiling install package ..." + export WITH_GPU=ON + export WITH_AVX=ON + export WITH_MKL=ON + export RUN_TEST=OFF + export WITH_PYTHON=ON + export WITH_TESTING=OFF + export BUILD_TYPE=Release + export CUDA_ARCH_NAME=${CUDA_ARCH_NAME:-Auto} + export WITH_DISTRIBUTE=OFF + export CMAKE_BUILD_TYPE=Release + [ -d build ] && rm -rf build + bash paddle/scripts/paddle_build.sh build_only $(nproc) + [ $? -ne 0 ] && LOG "[FATAL] compile fail." && exit 7 + LOG "[INFO] Build fineshed" + mkdir -p build_whl/${branch_name} && cp build/python/dist/paddlepaddle_gpu-0.0.0-cp37-cp37m-linux_x86_64.whl build_whl/${branch_name}/ +} + +function build_whl { + for branch_name in "develop" "test" + do + git checkout ${branch_name} + [ $? -ne 0 ] && LOG "[FATAL] Missing branch ${branch_name}." && exit 7 + LOG "[INFO] Now branch name is ${branch_name}." 
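# (editorial note, not part of the original patch) each iteration compiles the
# branch checked out above and, via compile_install_paddlepaddle, stashes the
# wheel under build_whl/${branch_name}/ so that run_op_benchmark_test can later
# install the develop baseline and the PR branch (presumably "test") in turn
# for the speed/accuracy comparison.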
+ compile_install_paddlepaddle + done +} + +# run op benchmark test +function run_op_benchmark_test { + [ ${#BENCHMARK_OP_MAP[*]} -eq 0 ] && return + local logs_dir op_name branch_name api_info_file + [ -z "$VISIBLE_DEVICES" ] && export VISIBLE_DEVICES=0 + [ "$BENCHMARK_PRINT_FAIL_LOG" != "1" ] && export BENCHMARK_PRINT_FAIL_LOG=1 + api_info_file="$(pwd)/api_info.txt" + [ -f "$api_info_file" ] && rm -f $api_info_file + for api_info in ${BENCHMARK_OP_MAP[*]} + do + echo "$api_info" >> $api_info_file + done + # install tensorflow for testing accuary + pip install tensorflow==2.3.0 tensorflow-probability + for branch_name in "develop" "test" + do + LOG "[INFO] Uninstall Paddle ..." + pip uninstall -y paddlepaddle paddlepaddle_gpu + LOG "[INFO] Install Paddle ..." + pip install build_whl/${branch_name}/paddlepaddle_gpu-0.0.0-cp37-cp37m-linux_x86_64.whl + logs_dir="$(pwd)/logs-${branch_name}" + [ -d $logs_dir ] && rm -rf $logs_dir/* || mkdir -p $logs_dir + pushd benchmark/api > /dev/null + bash deploy/main_control.sh tests_v2 \ + tests_v2/configs \ + $logs_dir \ + $VISIBLE_DEVICES \ + "gpu" \ + "both" \ + $api_info_file \ + "paddle" + popd > /dev/null + done +} + +# check benchmark result +function check_op_benchmark_result { + local logs_dir api_info_file check_status_code + # default 3 times + [ -z "${RETRY_TIMES}" ] && RETRY_TIMES=3 + logs_dir=$(pwd)/logs-test_pr + api_info_file=$(pwd)/api_info.txt + for retry_time in $(seq 0 ${RETRY_TIMES}) + do + if [ $retry_time -gt 0 ]; then + # run op benchmark speed test + # there is no need to recompile and install paddle + LOG "[INFO] retry ${retry_time} times ..." + pushd benchmark/api > /dev/null + bash deploy/main_control.sh tests_v2 \ + tests_v2/configs \ + ${logs_dir} \ + $VISIBLE_DEVICES \ + "gpu" \ + "speed" \ + ${api_info_file} \ + "paddle" + popd > /dev/null + fi + # check current result and update the file to benchmark test + python ${PADDLE_ROOT}/tools/check_op_benchmark_result.py \ + --develop_logs_dir $(pwd)/logs-develop \ + --pr_logs_dir $(pwd)/logs-test_pr \ + --api_info_file ${api_info_file} + check_status_code=$? + # TODO(Avin0323): retry only if the performance check fails + [ $check_status_code -eq 0 ] && break + done + return $check_status_code +} + +# diff benchmakr result and miss op +function summary_problems { + local op_name exit_code + exit_code=0 + if [ ${#BENCHMARK_OP_MAP[*]} -ne 0 ] + then + check_op_benchmark_result + exit_code=$? + fi + for op_name in ${!CHANGE_OP_MAP[@]} + do + if [ -z "${BENCHMARK_OP_MAP[$op_name]}" ] + then + exit_code=8 + LOG "[ERROR] Missing test script of \"${op_name}\"(${CHANGE_OP_MAP[$op_name]}) in benchmark." + fi + done + if [ $exit_code -ne 0 ]; then + LOG "[INFO] See https://github.com/PaddlePaddle/Paddle/wiki/PR-CI-OP-benchmark-Manual for details." + LOG "[INFO] Or you can apply for one RD (Avin0323(Recommend), Xreki, luotao1) approval to pass this PR." + exit $exit_code + fi +} + + +function cpu_op_benchmark { + LOG "[INFO] Start run op benchmark cpu test ..." + load_CHANGE_OP_FILES + build_whl + LOG "[INFO] Op benchmark run success and no error!" + exit 0 +} + + +function gpu_op_benchmark { + LOG "[INFO] Start run op benchmark gpu test ..." + load_CHANGE_OP_FILES + prepare_benchmark_environment + load_CHANGE_OP_MAP + load_BENCHMARK_OP_MAP + run_op_benchmark_test + summary_problems + LOG "[INFO] Op benchmark run success and no error!" 
+ exit 0 +} + + +case $1 in + cpu_op_benchmark) + cpu_op_benchmark + ;; + gpu_op_benchmark) + gpu_op_benchmark + ;; + *) + cpu_op_benchmark + gpu_op_benchmark + ;; +esac From 34d188bff15148ab1ae94b95cb22f60a28610a29 Mon Sep 17 00:00:00 2001 From: tianshuo78520a <707759223@qq.com> Date: Mon, 16 Aug 2021 10:48:07 +0800 Subject: [PATCH 043/126] Check whl size (#34767) --- paddle/scripts/paddle_build.sh | 42 +++++++++++++++++++++++++++++----- 1 file changed, 36 insertions(+), 6 deletions(-) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 1eee270c8dc46f..1d3504556fc3d8 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -689,18 +689,18 @@ function get_precision_ut_mac() { on_precision=1 re=$(cat ut_list|awk -F ' ' '{print }' | awk 'BEGIN{ all_str=""}{if (all_str==""){all_str=$1}else{all_str=all_str"$|^"$1}} END{print "^"all_str"$"}') UT_list_prec_1='ut_list_prec2' - for case in $UT_list; do - flag=$(echo $case|grep -oE $re) + for ut_case in $UT_list; do + flag=$(echo $ut_case|grep -oE $re) if [ -n "$flag" ];then if [ -z "$UT_list_prec" ];then - UT_list_prec="^$case$" + UT_list_prec="^$ut_case$" elif [[ "${#UT_list_prec}" -gt 10000 ]];then - UT_list_prec_1="$UT_list_prec_1|^$case$" + UT_list_prec_1="$UT_list_prec_1|^$ut_case$" else - UT_list_prec="$UT_list_prec|^$case$" + UT_list_prec="$UT_list_prec|^$ut_case$" fi else - echo ${case} "won't run in PRECISION_TEST mode." + echo ${ut_case} "won't run in PRECISION_TEST mode." fi done fi @@ -722,6 +722,32 @@ function fetch_upstream_develop_if_not_exist() { fi } +function check_whl_size() { + if [ ! "${pr_whl_size}" ];then + echo "pr whl size not found " + exit 1 + fi + + set +x + dev_whl_size=`du -m ${PADDLE_ROOT}/build/python/dist/*.whl|awk '{print $1}'` + echo "dev_whl_size: ${dev_whl_size}" + + whldiffSize=`expr ${pr_whl_size} - ${dev_whl_size}` + if [ ${whldiffSize} -gt 10 ] ; then + approval_line=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000` + APPROVALS=`echo ${approval_line}|python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 22334008 22361972` + echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}" + if [ "${APPROVALS}" == "FALSE" ]; then + echo "==========================================================================================" + echo "This PR make the release paddlepaddle whl size growth exceeds 10 M." + echo "Then you must have one RD (jim19930609 (Recommend) or JiabinYang) approval for this PR\n" + echo "==========================================================================================" + exit 6 + fi + fi + set -x +} + function generate_upstream_develop_api_spec() { fetch_upstream_develop_if_not_exist cur_branch=`git branch | grep \* | cut -d ' ' -f2` @@ -730,6 +756,9 @@ function generate_upstream_develop_api_spec() { cmake_gen $1 build $2 cp ${PADDLE_ROOT}/python/requirements.txt /tmp + pr_whl_size=`du -m ${PADDLE_ROOT}/build/python/dist/*.whl|awk '{print $1}'` + echo "pr_whl_size: ${pr_whl_size}" + git checkout $cur_branch generate_api_spec "$1" "DEV" @@ -2234,6 +2263,7 @@ function main() { example_code=$? 
summary_check_problems $check_style_code $[${example_code_gpu} + ${example_code}] "$check_style_info" "${example_info_gpu}\n${example_info}" assert_api_spec_approvals + check_whl_size ;; build) cmake_gen ${PYTHON_ABI:-""} From 28279f6f664586baf47a3b9ee91bb4ad834bf5a3 Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Mon, 16 Aug 2021 10:50:40 +0800 Subject: [PATCH 044/126] [NPU] remove npu int64 kernel for increment op (#34909) --- paddle/fluid/operators/increment_op_npu.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/paddle/fluid/operators/increment_op_npu.cc b/paddle/fluid/operators/increment_op_npu.cc index 35ebe92b364d3c..cdd82b55b7e81e 100644 --- a/paddle/fluid/operators/increment_op_npu.cc +++ b/paddle/fluid/operators/increment_op_npu.cc @@ -64,6 +64,5 @@ REGISTER_OP_NPU_KERNEL( ops::IncrementalNPUKernel, ops::IncrementalNPUKernel, ops::IncrementalNPUKernel, - ops::IncrementalNPUKernel, ops::IncrementalNPUKernel) From e84b2e9b0564b79fd2c4c0f72379b27317343b95 Mon Sep 17 00:00:00 2001 From: Zhanlue Yang Date: Mon, 16 Aug 2021 11:08:13 +0800 Subject: [PATCH 045/126] Add bcast semantics checks at C++ level to BroadcastTensorsOp (#34874) --- .../fluid/operators/broadcast_tensors_op.cc | 9 ++++ .../unittests/test_broadcast_tensors_op.py | 42 +++++++++++++++++++ 2 files changed, 51 insertions(+) diff --git a/paddle/fluid/operators/broadcast_tensors_op.cc b/paddle/fluid/operators/broadcast_tensors_op.cc index 074607e05ea7d5..bd85c0029da518 100644 --- a/paddle/fluid/operators/broadcast_tensors_op.cc +++ b/paddle/fluid/operators/broadcast_tensors_op.cc @@ -38,6 +38,7 @@ class BroadcastTensorsOp : public framework::OperatorWithKernel { int target_rank = 0; const auto& input_dims = ctx->GetInputsDim("X"); + // 1. Find Output rank = max(Inputs rank) for (const auto& input_ddim : input_dims) { target_rank = std::max(target_rank, input_ddim.size()); @@ -64,6 +65,14 @@ class BroadcastTensorsOp : public framework::OperatorWithKernel { dim_size = input_ddim[axis]; } + if (target_dim_size != 1 && dim_size != 1 && + target_dim_size != dim_size) { + PADDLE_THROW(platform::errors::InvalidArgument( + "BroadcastTensorsOp inputs does not satisfy bcast semantics," + "Please check axis = %d in reverse order", + index)); + } + // We performed bcast semantics check at python level // So input tensors should all have legal shape target_dim_size = std::max(target_dim_size, dim_size); diff --git a/python/paddle/fluid/tests/unittests/test_broadcast_tensors_op.py b/python/paddle/fluid/tests/unittests/test_broadcast_tensors_op.py index 602c5bae8f86e6..f60e4067a09e51 100644 --- a/python/paddle/fluid/tests/unittests/test_broadcast_tensors_op.py +++ b/python/paddle/fluid/tests/unittests/test_broadcast_tensors_op.py @@ -192,5 +192,47 @@ def test_bcast_semantics(): self.assertRaises(TypeError, test_bcast_semantics) +class TestRaiseBroadcastTensorsErrorDyGraph(unittest.TestCase): + def test_errors(self): + def test_type(): + inputs = [ + paddle.to_tensor( + np.ones( + shape=[1, 1, 1, 1], dtype='float32', name="x4")), + paddle.to_tensor( + np.ones( + shape=[1, 4, 1, 1], dtype='float64', name="x5")) + ] + paddle.broadcast_tensors(inputs) + + def test_dtype(): + inputs = [ + paddle.to_tensor( + np.ones( + shape=[1, 1, 1, 1], dtype='int8', name="x6")), + paddle.to_tensor( + np.ones( + shape=[1, 4, 1, 1], dtype='int8', name="x7")) + ] + paddle.broadcast_tensors(inputs) + + def test_bcast_semantics(): + inputs = [ + paddle.to_tensor( + np.ones( + shape=[1, 3, 1, 1], dtype='float32', name="x9")), + paddle.to_tensor( + np.ones( + 
shape=[1, 8, 1, 1], dtype='float32', name="x10")) + ] + paddle.broadcast_tensors(inputs) + + paddle.disable_static() + self.assertRaises(TypeError, test_type) + self.assertRaises(TypeError, test_dtype) + self.assertRaises(TypeError, test_bcast_semantics) + paddle.enable_static() + + if __name__ == '__main__': unittest.main() From ad6c3b9222b85c0aa69e6ee79b843ebf9b465a24 Mon Sep 17 00:00:00 2001 From: shangliang Xu Date: Mon, 16 Aug 2021 11:15:25 +0800 Subject: [PATCH 046/126] [dev] fix dice_loss bug (#34757) * fix dice_loss bug --- python/paddle/fluid/layers/nn.py | 25 ++++++-- .../tests/unittests/test_nn_dice_loss.py | 63 +++++++++++++++++++ 2 files changed, 83 insertions(+), 5 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/test_nn_dice_loss.py diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index dc1e56f13f3b1d..656f1efe493dea 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -7105,11 +7105,11 @@ def dice_loss(input, label, epsilon=0.00001, name=None): Parameters: - input (Tensor): Tensor, rank>=2, shape is :math:`[N_1, N_2, ..., N_D]`, where :math:`N_1` is - the batch_size, :math:`N_D` is 1. It is usually the output predictions of sigmoid activation. - The data type can be float32 or float64. - label (Tensor): Tensor, the groud truth with the same rank as input, shape is :math:`[N_1, N_2, ..., N_D]`. - where :math:`N_1` is the batch_size, :math:`N_D` is 1. The data type can be float32 or float64. + input (Tensor): Tensor, rank>=2, shape is :math:`[N_1, N_2, ..., N_k, D]`, where :math:`N_1` is + the batch_size, :math:`D` is the number of categories. It is usually the output + predictions of sigmoid activation. The data type can be float32 or float64. + label (Tensor): Tensor, the groud truth with the same rank as input, shape is :math:`[N_1, N_2, ..., N_k, 1]`. + where :math:`N_1` is the batch_size. The data type can be int32 or int64. epsilon (float): The epsilon will be added to the numerator and denominator. If both input and label are empty, it makes sure dice is 1. Default: 0.00001 @@ -7131,6 +7131,21 @@ def dice_loss(input, label, epsilon=0.00001, name=None): predictions = F.softmax(x) loss = F.dice_loss(input=predictions, label=label) """ + assert input.dtype in (paddle.float32, paddle.float64) + assert label.dtype in (paddle.int32, paddle.int64) + assert len(input.shape) >= 2, \ + "The rank of input should be greater than or equal to 2." + assert len(input.shape) == len(label.shape), ( + "The rank of input and label should be equal, " + "but received input: %d, label: %d." % + (len(input.shape), len(label.shape))) + assert label.shape[-1] == 1, ("The last dimension of label should be 1, " + "but received %d." % label.shape[-1]) + assert input.shape[:-1] == label.shape[:-1], ( + "All dimensions should be equal except the last one.") + assert input.numel() > 0 and label.numel() > 0, \ + "Any dimension of input and label cannot be equal to 0." + label = one_hot(label, depth=input.shape[-1]) reduce_dim = list(range(1, len(input.shape))) inse = reduce_sum(input * label, dim=reduce_dim) diff --git a/python/paddle/fluid/tests/unittests/test_nn_dice_loss.py b/python/paddle/fluid/tests/unittests/test_nn_dice_loss.py new file mode 100644 index 00000000000000..316063767771f9 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_nn_dice_loss.py @@ -0,0 +1,63 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import paddle +import paddle.fluid.layers.nn as nn + +num_classes = 4 +eps = 1e-6 + + +class TestDiceLossValue(unittest.TestCase): + def test_dice_loss(self): + input_ = paddle.rand([2, 3, num_classes]) + label_ = paddle.randint(0, num_classes, [2, 3, 1], dtype=paddle.int64) + + input_np, label_np = input_.numpy(), label_.numpy() + eye_np = np.eye(num_classes) + label_np = np.float32(eye_np[np.squeeze(label_np)]) + input_np = np.reshape(input_np, [2, -1]) + label_np = np.reshape(label_np, [2, -1]) + intersection_np = np.sum(input_np * label_np, axis=-1) + union_np = input_np.sum(-1) + label_np.sum(-1) + dice_np = np.mean(1 - 2 * intersection_np / (union_np + eps)) + dice_paddle = nn.dice_loss(input_, label_, eps) + self.assertTrue(np.isclose(dice_np, dice_paddle.numpy()).all()) + + +class TestDiceLossInvalidInput(unittest.TestCase): + def test_error(self): + def test_invalid_dtype(): + input_ = paddle.rand([2, 3, num_classes], dtype=paddle.float32) + label_ = paddle.randint( + 0, num_classes, [2, 3, 1], dtype=paddle.int64) + nn.dice_loss(input_, label_.astype(paddle.float32)) + + self.assertRaises(AssertionError, test_invalid_dtype) + + def test_zero_shape_input(): + input_ = paddle.rand([0, 3, num_classes], dtype=paddle.float32) + label_ = paddle.randint( + 0, num_classes, [0, 3, 1], dtype=paddle.int64) + nn.dice_loss(input_, label_) + + self.assertRaises(AssertionError, test_zero_shape_input) + + +if __name__ == "__main__": + unittest.main() From fd92d949c48137d13d0c4aa1f0dfcf806ebedc4a Mon Sep 17 00:00:00 2001 From: zyfncg Date: Mon, 16 Aug 2021 11:32:56 +0800 Subject: [PATCH 047/126] Support npu op hard_swish and hard_swish_grad (#34608) * Support NPU OP hard_swish and hard_swish_grad * Support NPU OP hard_swish and hard_swish_grad * add the unittest to compare the result between npu ans cpu * format the prompt of exception * replace Min and Max op by ClipByValue op * fix the precision problem for fp16 * Using HardtanhGrad to improve performace --- paddle/fluid/operators/activation_op_npu.cc | 156 ++++++++++++++++++ .../unittests/npu/test_hard_swish_op_npu.py | 126 ++++++++++++++ 2 files changed, 282 insertions(+) create mode 100644 python/paddle/fluid/tests/unittests/npu/test_hard_swish_op_npu.py diff --git a/paddle/fluid/operators/activation_op_npu.cc b/paddle/fluid/operators/activation_op_npu.cc index 5cf70cc391d8ff..8f6af4260dcc96 100755 --- a/paddle/fluid/operators/activation_op_npu.cc +++ b/paddle/fluid/operators/activation_op_npu.cc @@ -16,6 +16,7 @@ limitations under the Licnse. 
*/ #include #include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/activation_op.h" #include "paddle/fluid/operators/npu_op_runner.h" @@ -388,6 +389,155 @@ class SigmoidGradNPUKernel : public framework::OpKernel { } }; +// HardSwish = min(max(0, x+offset), threshold) * x / scale +template +class HardSwishNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); + + float threshold = ctx.Attr("threshold"); + float scale = ctx.Attr("scale"); + float offset = ctx.Attr("offset"); + + auto place = ctx.GetPlace(); + + out->mutable_data(place); + + auto stream = + ctx.template device_context() + .stream(); + + Tensor tensor_offset(x->type()); + tensor_offset.mutable_data({1}, place); + FillNpuTensorWithConstant(&tensor_offset, static_cast(offset)); + + Tensor add_offset_val(x->type()); + add_offset_val.mutable_data(x->dims(), place); + const auto& runner_add = + NpuOpRunner("AddV2", {*x, tensor_offset}, {add_offset_val}); + runner_add.Run(stream); + + Tensor tensor_threshold(x->type()); + tensor_threshold.mutable_data({1}, place); + FillNpuTensorWithConstant(&tensor_threshold, static_cast(threshold)); + + Tensor tensor_zero(x->type()); + tensor_zero.mutable_data({1}, place); + FillNpuTensorWithConstant(&tensor_zero, static_cast(0.0)); + + Tensor clip_val(x->type()); + clip_val.mutable_data(x->dims(), place); + const auto& runner_clip = NpuOpRunner( + "ClipByValue", {add_offset_val, tensor_zero, tensor_threshold}, + {clip_val}); + runner_clip.Run(stream); + + Tensor tensor_scale_tmp(x->type()); + tensor_scale_tmp.mutable_data({1}, place); + FillNpuTensorWithConstant(&tensor_scale_tmp, static_cast(scale)); + Tensor tensor_scale(x->type()); + tensor_scale.mutable_data(x->dims(), place); + const auto& runner_fill = + NpuOpRunner("FillD", {tensor_scale_tmp}, {tensor_scale}, + {{"dims", framework::vectorize(x->dims())}}); + runner_fill.Run(stream); + + Tensor div_val(x->type()); + div_val.mutable_data(x->dims(), place); + const auto& runner_div = + NpuOpRunner("Div", {clip_val, tensor_scale}, {div_val}); + runner_div.Run(stream); + + const auto& runner_mul = NpuOpRunner("Mul", {*x, div_val}, {*out}); + runner_mul.Run(stream); + } +}; + +template +class HardSwishGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + + float threshold = ctx.Attr("threshold"); + float scale = ctx.Attr("scale"); + float offset = ctx.Attr("offset"); + + auto place = ctx.GetPlace(); + + dx->mutable_data(place); + + auto stream = + ctx.template device_context() + .stream(); + + Tensor tensor_offset(x->type()); + tensor_offset.mutable_data({1}, place); + FillNpuTensorWithConstant(&tensor_offset, static_cast(offset)); + + Tensor add_offset_val(x->type()); + add_offset_val.mutable_data(x->dims(), place); + const auto& runner_add = + NpuOpRunner("AddV2", {*x, tensor_offset}, {add_offset_val}); + runner_add.Run(stream); + + Tensor tmp1(x->type()); + tmp1.mutable_data(x->dims(), place); + const auto& runner_pow1 = NpuOpRunner("Power", {*x}, {tmp1}, + {{"scale", 2.0f}, {"shift", offset}}); + runner_pow1.Run(stream); + + Tensor tmp2(x->type()); + tmp2.mutable_data(x->dims(), place); + 
const auto& runner_ht_grad = + NpuOpRunner("HardtanhGrad", {add_offset_val, tmp1}, {tmp2}, + {{"min_val", 0.0f}, {"max_val", threshold}}); + runner_ht_grad.Run(stream); + + Tensor tmp3(x->type()); + tmp3.mutable_data(x->dims(), place); + const auto& runner_pow2 = NpuOpRunner( + "Power", {tmp2}, {tmp3}, {{"scale", 1.0f / scale}, {"shift", 1.0f}}); + runner_pow2.Run(stream); + + Tensor tensor_threshold_tmp(x->type()); + tensor_threshold_tmp.mutable_data({1}, place); + FillNpuTensorWithConstant(&tensor_threshold_tmp, + static_cast(threshold)); + Tensor tensor_threshold(x->type()); + tensor_threshold.mutable_data(x->dims(), place); + const auto& runner_fill = + NpuOpRunner("FillD", {tensor_threshold_tmp}, {tensor_threshold}, + {{"dims", framework::vectorize(x->dims())}}); + runner_fill.Run(stream); + + Tensor tmp_bool(framework::proto::VarType::BOOL); + tmp_bool.mutable_data(x->dims(), place); + const auto& runner_less = + NpuOpRunner("Less", {add_offset_val, tensor_threshold}, {tmp_bool}); + runner_less.Run(stream); + Tensor tmp4(x->type()); + tmp4.mutable_data(x->dims(), place); + auto dst_dtype = ConvertToNpuDtype(x->type()); + const auto& runner_cast = + NpuOpRunner("Cast", {tmp_bool}, {tmp4}, + {{"dst_type", static_cast(dst_dtype)}}); + runner_cast.Run(stream); + + Tensor tmp5(x->type()); + tmp5.mutable_data(x->dims(), place); + const auto& runner_sub = NpuOpRunner("Sub", {tmp3, tmp4}, {tmp5}); + runner_sub.Run(stream); + + const auto& runner_final = NpuOpRunner("Mul", {tmp5, *dout}, {*dx}); + runner_final.Run(stream); + } +}; + template class HardSigmoidNPUKernel : public framework::OpKernel { public: @@ -677,6 +827,12 @@ REGISTER_OP_NPU_KERNEL( ops::SigmoidGradNPUKernel); +REGISTER_OP_NPU_KERNEL(hard_swish, ops::HardSwishNPUKernel, + ops::HardSwishNPUKernel); + +REGISTER_OP_NPU_KERNEL(hard_swish_grad, ops::HardSwishGradNPUKernel, + ops::HardSwishGradNPUKernel); + REGISTER_OP_NPU_KERNEL( hard_sigmoid, ops::HardSigmoidNPUKernel, diff --git a/python/paddle/fluid/tests/unittests/npu/test_hard_swish_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_hard_swish_op_npu.py new file mode 100644 index 00000000000000..32042ba83a9f77 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_hard_swish_op_npu.py @@ -0,0 +1,126 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid +import paddle.nn.functional as F + + +def ref_hard_swish_grad(x, threshold=6.0, scale=6.0, offset=3.0): + dout = np.full_like(x, fill_value=1. 
/ x.size) + tmp = ((x + offset) < threshold).astype(x.dtype) + dx = dout * (((x + offset) > 0).astype(x.dtype) * + (2 * x + offset) * tmp / scale + 1.0 - tmp) + return dx + + +class TestHardSwishNPU(OpTest): + def setUp(self): + paddle.enable_static() + + self.set_npu() + self.op_type = "hard_swish" + self.place = paddle.NPUPlace(0) + self.init_dtype() + + x = np.random.uniform(-6, 6, [10, 12]).astype(self.dtype) + threshold = 6.0 + scale = 6.0 + offset = 3.0 + #the same with TestAbs + x[np.abs(x + offset) < 0.005] = 0.02 + x[np.abs(x - threshold + offset) < 0.005] = threshold - offset + 0.02 + out = (x * (np.minimum(np.maximum(x + offset, 0.), threshold) / + scale)).astype(self.dtype) + self.x_grad = ref_hard_swish_grad(x, threshold, scale, offset) + + self.inputs = {'X': x} + self.attrs = {'threshold': threshold, 'scale': scale, 'offset': offset} + self.outputs = {'Out': out} + + def set_npu(self): + self.__class__.use_npu = True + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + if self.dtype == np.float16: + return + # There is a problem that precision of grad result using float32 + # can't satisfy the default precision requirement + # when compared with numeric_grads, but the results on + # NPU and CPU are same (verified in TestHardSwishNPUWithCPU) + self.check_grad_with_place( + self.place, ['X'], 'Out', user_defined_grads=[self.x_grad]) + + +class TestHardSwishNPUFp16(TestHardSwishNPU): + def test_check_output(self): + self.check_output_with_place(self.place) + + def init_dtype(self): + self.dtype = np.float16 + + +# test the result of hard_swish and hard_swish_grad on CPU and NPU +class TestHardSwishNPUWithCPU(unittest.TestCase): + def setUp(self): + paddle.disable_static() + + self.place = paddle.NPUPlace(0) + self.dtype = np.float32 + + self.x = np.random.uniform(-6, 10, [8, 15]).astype(self.dtype) + + paddle.set_device('cpu') + + data = paddle.to_tensor(self.x, stop_gradient=False) + y = F.hardswish(data) + y.sum().backward() + + self.out_g = data.grad + self.out_y = y + + def test_check_output_and_grad_npu(self): + paddle.set_device('npu') + + data = paddle.to_tensor(self.x, stop_gradient=False) + y = F.hardswish(data) + y.sum().backward() + + self.assertTrue( + np.allclose(self.out_y.numpy(), y.numpy()), + "Output of NPU HardSwish forward has diff at " + str(self.place) + + "\nExpect " + str(self.out_y) + "\n" + "But Got" + str(y) + + " in class " + self.__class__.__name__ + ".") + self.assertTrue( + np.allclose(self.out_g.numpy(), data.grad.numpy()), + "Output of NPU HardSwish backward has diff at " + str(self.place) + + "\nExpect " + str(self.out_g) + "\n" + "But Got" + str(data.grad) + + " in class " + self.__class__.__name__ + ".") + + +if __name__ == '__main__': + unittest.main() From dc439a129c3205023b973c754a7ccdee4a5f567f Mon Sep 17 00:00:00 2001 From: Zhong Hui Date: Mon, 16 Aug 2021 12:51:39 +0800 Subject: [PATCH 048/126] Enhance tensor shape check for dist op. 
(#34915) --- paddle/fluid/operators/dist_op.cc | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/paddle/fluid/operators/dist_op.cc b/paddle/fluid/operators/dist_op.cc index 0e9bddf01e82f8..0cb8c29d114512 100644 --- a/paddle/fluid/operators/dist_op.cc +++ b/paddle/fluid/operators/dist_op.cc @@ -27,6 +27,20 @@ class DistOp : public framework::OperatorWithKernel { OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Dist"); OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "Dist"); OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Dist"); + + auto x_dims = ctx->GetInputDim("X"); + auto y_dims = ctx->GetInputDim("Y"); + + PADDLE_ENFORCE_NE(framework::product(x_dims), 0, + platform::errors::InvalidArgument( + "The Input(X) has not been initialized properly. The " + "shape of Input(X) = [%s].", + x_dims)); + PADDLE_ENFORCE_NE(framework::product(y_dims), 0, + platform::errors::InvalidArgument( + "The Input(Y) has not been initialized properly. The " + "shape of Input(Y) = [%s].", + y_dims)); ctx->SetOutputDim("Out", {1}); } }; From b0cb4148551b0b1657160594ddbab9d429422cf7 Mon Sep 17 00:00:00 2001 From: Guoxia Wang Date: Mon, 16 Aug 2021 13:56:22 +0800 Subject: [PATCH 049/126] support margin loss (arcface, cosface, sphereface) for single GPU and cross GPUs (#34247) * support margin loss (arcface, cosface, sphereface) --- .../operators/margin_cross_entropy_op.cc | 203 ++++++++ .../operators/margin_cross_entropy_op.cu | 483 ++++++++++++++++++ .../fluid/operators/margin_cross_entropy_op.h | 41 ++ .../fluid/tests/unittests/CMakeLists.txt | 3 + .../parallel_margin_cross_entropy.py | 188 +++++++ .../unittests/test_margin_cross_entropy_op.py | 385 ++++++++++++++ .../test_parallel_margin_cross_entropy.py | 29 ++ python/paddle/nn/functional/__init__.py | 2 + python/paddle/nn/functional/loss.py | 262 ++++++++++ tools/static_mode_white_list.py | 1 + 10 files changed, 1597 insertions(+) create mode 100644 paddle/fluid/operators/margin_cross_entropy_op.cc create mode 100644 paddle/fluid/operators/margin_cross_entropy_op.cu create mode 100644 paddle/fluid/operators/margin_cross_entropy_op.h create mode 100644 python/paddle/fluid/tests/unittests/parallel_margin_cross_entropy.py create mode 100644 python/paddle/fluid/tests/unittests/test_margin_cross_entropy_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_parallel_margin_cross_entropy.py diff --git a/paddle/fluid/operators/margin_cross_entropy_op.cc b/paddle/fluid/operators/margin_cross_entropy_op.cc new file mode 100644 index 00000000000000..94c58fa6970d90 --- /dev/null +++ b/paddle/fluid/operators/margin_cross_entropy_op.cc @@ -0,0 +1,203 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/margin_cross_entropy_op.h" + +namespace paddle { +namespace operators { + +class MarginCrossEntropyOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("Logits"), "Input", "Logits", + "MarginCrossEntropyOp"); + OP_INOUT_CHECK(ctx->HasInput("Label"), "Input", "Label", + "MarginCrossEntropyOp"); + + OP_INOUT_CHECK(ctx->HasOutput("Softmax"), "Output", "Softmax", + "MarginCrossEntropyOp"); + OP_INOUT_CHECK(ctx->HasOutput("Loss"), "Output", "Loss", + "MarginCrossEntropyOp"); + + auto logits_dims = ctx->GetInputDim("Logits"); + auto labels_dims = ctx->GetInputDim("Label"); + + auto logits_rank = logits_dims.size(); + auto axis = logits_rank - 1; + for (int i = 0; i < logits_rank; i++) { + if (i != axis) { + if (ctx->IsRuntime() || (logits_dims[i] > 0 && labels_dims[i] > 0)) { + PADDLE_ENFORCE_EQ(logits_dims[i], labels_dims[i], + platform::errors::InvalidArgument( + "Input(Logits) and Input(Label) should in " + "same shape in dimensions except axis.")); + } + } + } + + if (labels_dims.size() > 1) { + PADDLE_ENFORCE_EQ( + labels_dims[logits_rank - 1], 1UL, + platform::errors::InvalidArgument( + "the last dimension of Input(Label) should be 1." + "But received: the last dimension of Input(Label) is [%d]," + "the last dimension is [%d]", + labels_dims[logits_rank - 1], logits_rank - 1)); + } + + ctx->SetOutputDim("Softmax", logits_dims); + + logits_dims[axis] = 1; + ctx->SetOutputDim("Loss", logits_dims); + + ctx->ShareLoD("Logits", /*->*/ "Softmax"); + ctx->ShareLoD("Logits", /*->*/ "Loss"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "Logits"), + ctx.device_context()); + } +}; + +class MarginCrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("Logits", + "(Tensor, default: Tensor), The input tensor of unscaled " + "log probabilities, whose dimension :attr:`axis` should be scaled " + "by softmax."); + AddInput( + "Label", + "(Tensor) The input tensor of groud truth label. Label is a " + "Tensor in same shape with Input(Logits) except the shape in " + "dimension :attr:`axis` as 1."); + AddOutput( + "Softmax", + "(Tensor, default: Tensor), A tensor in same shape with " + "Input(Logits). " + "The outputs value of softmax activation by given the input batch, " + "which will be used in backward calculation."); + AddOutput("Loss", + "(Tensor, default: Tensor), A tensor in same shape with " + "Input(Logits) " + "except the shape in dimension :attr:`axis` as 1. 
The cross " + "entropy loss."); + AddAttr("return_softmax", + "(bool default false) A flag to indicate " + "whether to return softmax.") + .SetDefault(false); + AddAttr("ring_id", "(int default 0) nccl communication ring id.") + .SetDefault(0); + AddAttr("rank", "(int default 0) rank id for MarginCrossEntropy.") + .SetDefault(0); + AddAttr("nranks", "(int default 1) nranks id for MarginCrossEntropy.") + .SetDefault(1); + AddAttr("margin1", "(float default 1.0) margin1 for MarginLoss.") + .SetDefault(1.0); + AddAttr("margin2", "(float default 0.5) margin2 for MarginLoss.") + .SetDefault(0.5); + AddAttr("margin3", "(float default 0.0) margin3 for MarginLoss.") + .SetDefault(0.0); + AddAttr("scale", "(float default 64.0) scale for MarginLoss.") + .SetDefault(64.0); + AddComment(R"DOC( +MarginCrossEntropy Operator +.. math:: + + L=-\frac{1}{N}\sum^N_{i=1}\log\frac{e^{s(cos(m_{1}\theta_{y_i}+m_{2})-m_{3})}}{e^{s(cos(m_{1}\theta_{y_i}+m_{2})-m_{3})}+\sum^n_{j=1,j\neq y_i} e^{scos\theta_{y_i}}} + +where the :math: `\theta_{y_i}` is the angle between the feature :math: `x` and +the representation of class :math: `i`. The details of ArcFace loss +could be referred to https://arxiv.org/abs/1801.07698. + +Note that the Op supports model parallel and single GPU. And Logits.shape[-1] can be different each rank. + +)DOC"); + } +}; + +class MarginCrossEntropyOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Loss")), true, + platform::errors::InvalidArgument( + "Input(Loss@Grad) should not be null.")); + PADDLE_ENFORCE_EQ(ctx->HasInput("Softmax"), true, + platform::errors::InvalidArgument( + "Input(Softmax) should be not null.")); + PADDLE_ENFORCE_EQ( + ctx->HasInput("Label"), true, + platform::errors::InvalidArgument("Input(Label) should be not null.")); + + PADDLE_ENFORCE_EQ(ctx->HasOutput(framework::GradVarName("Logits")), true, + platform::errors::InvalidArgument( + "Output(Logits@Grad) should be not null.")); + + ctx->SetOutputDim(framework::GradVarName("Logits"), + ctx->GetInputDim("Softmax")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType( + ctx, framework::GradVarName("Loss")), + ctx.device_context()); + } +}; + +template +class MarginCrossEntropyOpGradMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType("margin_cross_entropy_grad"); + + op->SetInput("Softmax", this->Output("Softmax")); + op->SetInput("Logits", this->Input("Logits")); + op->SetInput("Label", this->Input("Label")); + op->SetInput(framework::GradVarName("Loss"), this->OutputGrad("Loss")); + op->SetAttrMap(this->Attrs()); + op->SetOutput(framework::GradVarName("Logits"), this->InputGrad("Logits")); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OPERATOR( + margin_cross_entropy, ops::MarginCrossEntropyOp, + ops::MarginCrossEntropyOpMaker, + ops::MarginCrossEntropyOpGradMaker, + ops::MarginCrossEntropyOpGradMaker); + +REGISTER_OPERATOR(margin_cross_entropy_grad, ops::MarginCrossEntropyOpGrad); + +REGISTER_OP_CPU_KERNEL(margin_cross_entropy, + 
ops::MarginCrossEntropyOpCPUKernel, + ops::MarginCrossEntropyOpCPUKernel, + ops::MarginCrossEntropyOpCPUKernel); diff --git a/paddle/fluid/operators/margin_cross_entropy_op.cu b/paddle/fluid/operators/margin_cross_entropy_op.cu new file mode 100644 index 00000000000000..ccdba43b0542dc --- /dev/null +++ b/paddle/fluid/operators/margin_cross_entropy_op.cu @@ -0,0 +1,483 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_WITH_HIP +#include +namespace cub = hipcub; +#else +#include +#endif + +#include +#include "paddle/fluid/operators/amp/fp16_type_traits.h" +#include "paddle/fluid/operators/margin_cross_entropy_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/softmax_impl.h" +#include "paddle/fluid/operators/reduce_ops/reduce_functor_op.h" +#include "paddle/fluid/operators/reduce_ops/reduce_op.h" +#include "paddle/fluid/string/string_helper.h" + +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/nccl_helper.h" +#endif + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +static constexpr int kNumCUDAThreads = 512; +static constexpr int kNumMaxinumNumBlocks = 4096; + +static inline int NumBlocks(const int N) { + return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, + kNumMaxinumNumBlocks); +} + +void GetClassInterval(const gpuStream_t& stream, const platform::Place& place, + const platform::DeviceContext& ctx, const int rid, + const int rank, const int nranks, const int D, + Tensor* class_interval) { + std::vector shard_dim_vec(nranks + 1, 0); + shard_dim_vec[rank + 1] = D; + if (nranks <= 1) { + framework::TensorFromVector(shard_dim_vec, ctx, class_interval); + return; + } + +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + Tensor num_classes_per_device; + framework::TensorFromVector(shard_dim_vec, ctx, &num_classes_per_device); + int* num_classes_per_device_ptr = num_classes_per_device.data(); + + const auto& comm = platform::NCCLCommContext::Instance().Get(rid, place); + // use global calculate stream + const auto calcu_stream = + static_cast( + platform::DeviceContextPool::Instance().Get(place)) + ->stream(); + + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( + num_classes_per_device_ptr, num_classes_per_device_ptr, + num_classes_per_device.numel(), + platform::ToNCCLDataType(num_classes_per_device.type()), ncclSum, + comm->comm(), calcu_stream)); + + auto class_interval_ptr = + class_interval->mutable_data({nranks + 1}, place); + size_t cub_temp_storage_bytes = 0; + cub::DeviceScan::InclusiveSum( + nullptr, cub_temp_storage_bytes, nullptr, nullptr, nranks + 1, stream); + auto cub_temp_storage = memory::Alloc(place, cub_temp_storage_bytes); + cub::DeviceScan::InclusiveSum( + cub_temp_storage->ptr(), cub_temp_storage_bytes, + num_classes_per_device_ptr, class_interval_ptr, nranks + 1, stream); + return; +#endif +} + 
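// (editorial note, not part of the original patch) the kernel below adds the
// combined angular margin to the ground-truth logit: writing theta = acos(x)
// for the cosine logit x, the positive entry becomes
//     cos(margin1 * theta + margin2) - margin3
// so margin1 acts as a SphereFace-style multiplicative angle margin, margin2
// as an ArcFace-style additive angle margin, and margin3 as a CosFace-style
// additive cosine margin (margin1=1, margin2=0, margin3=0 leaves x unchanged).
// After scaling by `scale` = s, the per-sample loss computed by the kernels
// further below is
//     -log( e^{s*(cos(m1*theta_yi + m2) - m3)} /
//           (e^{s*(cos(m1*theta_yi + m2) - m3)} + sum_{j != yi} e^{s*cos(theta_j)}) )
// where the denominator sum runs over the cosine logits of all other classes.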
+template +__global__ void AddMarginToPositiveLogitsKernel( + T* logit, const IndexT* label, const float margin1, const float margin2, + const float margin3, const int rank, const int nranks, const int64_t N, + const int64_t D, const int* class_interval_ptr) { + using MPType = typename details::MPTypeTrait::Type; + int start_index = class_interval_ptr[rank]; + int end_index = class_interval_ptr[rank + 1]; + int num_classes = class_interval_ptr[nranks]; + CUDA_KERNEL_LOOP(i, N) { + auto real_label = label[i]; + PADDLE_ENFORCE((real_label < num_classes) && (real_label >= 0), + "The index is out of bounds, " + "please check whether the value of label and " + "input meet the number of class. It should " + "be less than [%d], but received [%d]", + num_classes, real_label); + + if (real_label >= start_index && real_label < end_index) { + int64_t offset = i * D + real_label - start_index; + if (fabs(margin1 - 1.0) > 1e-8 || fabs(margin2) > 1e-8) { + MPType x = static_cast(logit[offset]); + MPType theta = acos(x); + if (fabs(margin1 - 1.0) > 1e-8) { + theta *= static_cast(margin1); + } + if (fabs(margin2) > 1e-8) { + theta += static_cast(margin2); + } + logit[offset] = static_cast(cos(theta)); + } + if (fabs(margin3) > 1e-8) { + MPType y = static_cast(logit[offset]); + y -= static_cast(margin3); + logit[offset] = static_cast(y); + } + } + } +} + +static __device__ __forceinline__ platform::float16 exp_on_device( + platform::float16 x) { + return ::Eigen::numext::exp(x); +} +static __device__ __forceinline__ float exp_on_device(float x) { + return expf(x); +} +static __device__ __forceinline__ double exp_on_device(double x) { + return exp(x); +} +static __device__ __forceinline__ platform::float16 log_on_device( + platform::float16 x) { + return ::Eigen::numext::log(x); +} +static __device__ __forceinline__ float log_on_device(float x) { + return logf(x); +} +static __device__ __forceinline__ double log_on_device(double x) { + return log(x); +} + +template +struct ExpLogitTransformer { + HOSTDEVICE explicit inline ExpLogitTransformer(int n) {} + + HOSTDEVICE inline Ty operator()(const Tx& x) const { + return static_cast(exp_on_device(x)); + } +}; + +template +struct ExpAndSum { + using Transformer = ExpLogitTransformer; + + inline Ty initial() { return static_cast(0.0f); } + + __device__ __forceinline__ Ty operator()(const Ty& a, const Ty& b) const { + return b + a; + } +}; + +template +__global__ void ScaleLogitKernel(T* logits, const float scale, const int64_t N, + const int64_t D) { + CUDA_KERNEL_LOOP(i, N * D) { logits[i] *= static_cast(scale); } +} + +template +__global__ void LogitsMinusMaxKernel(T* logits, const T* logits_max_per_row, + const int64_t N, const int64_t D) { + CUDA_KERNEL_LOOP(i, N * D) { + auto row = i / D; + logits[i] -= logits_max_per_row[row]; + } +} + +template +__global__ void LogitsMinusLogSumKernel(T* logits, const T* logits_sum_per_row, + const int64_t N, const int64_t D) { + CUDA_KERNEL_LOOP(i, N * D) { + auto row = i / D; + logits[i] -= log_on_device(logits_sum_per_row[row]); + } +} + +template +__global__ void HardLabelSoftmaxWithCrossEntropyKernel( + T* loss, T* log_softmax, const IndexT* labels, const int rank, + const int64_t N, const int64_t D, const int* class_interval_ptr) { + int start_index = class_interval_ptr[rank]; + CUDA_KERNEL_LOOP(i, N * D) { + auto row = i / D; + auto col = i % D; + if ((col + start_index) == labels[row]) { + auto softmax = log_softmax[i]; + loss[row] = -softmax; + log_softmax[i] = exp_on_device(softmax); + } else { + log_softmax[i] = 
exp_on_device(log_softmax[i]); + } + } +} + +template +__global__ void CalculateGrad(T* logits_grad, const T* loss_grad, + const T* logits, const IndexT* labels, + const float margin1, const float margin2, + const float scale, const int rank, + const int64_t N, const int64_t D, + const int* class_interval_ptr) { + using MPType = typename details::MPTypeTrait::Type; + int start_index = class_interval_ptr[rank]; + CUDA_KERNEL_LOOP(i, N * D) { + auto row = i / D; + auto col = i % D; + if ((col + start_index) == labels[row]) { + logits_grad[i] = (logits_grad[i] - static_cast(1.0)) * loss_grad[row]; + if (fabs(margin1 - 1.0) > 1e-8 || fabs(margin2) > 1e-8) { + MPType dout = static_cast(logits_grad[i]); + MPType one = static_cast(1.0f); + MPType x = static_cast(logits[i]); + MPType m1 = static_cast(margin1); + MPType m2 = static_cast(margin2); + + MPType d = m1 * sin(m1 * acos(x) + m2) / sqrt(one - x * x); + logits_grad[i] = static_cast(dout * d); + } + } else { + logits_grad[i] *= loss_grad[row]; + } + if (fabs(scale - 1.0) > 1e-8) { + logits_grad[i] *= static_cast(scale); + } + } +} + +template +class MarginCrossEntropyOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const Tensor* logits = ctx.Input("Logits"); + const Tensor* labels = ctx.Input("Label"); + Tensor* softmax = ctx.Output("Softmax"); + Tensor* loss = ctx.Output("Loss"); + + const int rid = ctx.Attr("ring_id"); + const int nranks = ctx.Attr("nranks"); + const int rank = ctx.Attr("rank"); + + const float margin1 = ctx.Attr("margin1"); + const float margin2 = ctx.Attr("margin2"); + const float margin3 = ctx.Attr("margin3"); + const float scale = ctx.Attr("scale"); + + const auto& place = ctx.GetPlace(); + auto& dev_ctx = ctx.template device_context(); + +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + platform::NCCLComm* comm; + gpuStream_t stream; + if (nranks > 1) { + comm = platform::NCCLCommContext::Instance().Get(rid, place); + + // use global calculate stream + stream = static_cast( + platform::DeviceContextPool::Instance().Get(place)) + ->stream(); + } +#endif + + // allocate memory on device. + T* softmax_ptr = softmax->mutable_data(place); + T* loss_ptr = loss->mutable_data(place); + + const auto& logits_dims = logits->dims(); + const auto& labels_dims = labels->dims(); + + const int axis = logits_dims.size() - 1; + const int N = SizeToAxis(axis, logits_dims); + const int D = SizeFromAxis(axis, logits_dims); + + int blocks = NumBlocks(N); + int threads = kNumCUDAThreads; + const auto& label_type = labels->type(); + + // copy logits to softmax variable since we can't modify logits, + // and it also be used when calculate grad + framework::TensorCopy(*logits, ctx.GetPlace(), ctx.device_context(), + softmax); + + Tensor softmax_2d; + softmax_2d.ShareDataWith(*softmax).Resize({N, D}); + T* logits_ptr = softmax_2d.data(); + + Tensor class_interval; + GetClassInterval(dev_ctx.stream(), place, ctx.cuda_device_context(), rid, + rank, nranks, D, &class_interval); + + // step 1, preprocess logits + // add margin for positive elements + // theta = acos(x_i) + // (cos(m1 * theta + m2) - m3) + // save match_logits, used for gradient computation. 
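+    // The transform applied below is
+    //     cos(margin1 * acos(x) + margin2) - margin3,
+    // which reduces to ArcFace for (m1 = 1, m2 > 0, m3 = 0), CosFace for
+    // (m1 = 1, m2 = 0, m3 > 0) and SphereFace for (m1 > 1, m2 = 0, m3 = 0).
+    // As a rough NumPy-style sketch of the per-row update (illustrative
+    // only, the names below are not part of this operator):
+    //     theta = np.arccos(logits[i, label[i]])
+    //     logits[i, label[i]] = np.cos(margin1 * theta + margin2) - margin3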
+ if (label_type == framework::proto::VarType::INT32) { + typedef int32_t LabelT; + AddMarginToPositiveLogitsKernel< + T><<>>( + logits_ptr, labels->data(), margin1, margin2, margin3, rank, + nranks, N, D, class_interval.data()); + } else if (label_type == framework::proto::VarType::INT64) { + typedef int64_t LabelT; + AddMarginToPositiveLogitsKernel< + T><<>>( + logits_ptr, labels->data(), margin1, margin2, margin3, rank, + nranks, N, D, class_interval.data()); + } + + // scale by s + ScaleLogitKernel<<>>( + logits_ptr, scale, N, D); + + // step 2, obtain logit_max + Tensor logits_max; + logits_max = + ctx.AllocateTmpTensor({N, 1}, dev_ctx); + T* logits_max_buff = logits_max.mutable_data(place); + TensorReduceFunctorImpl(softmax_2d, &logits_max, {1}, + dev_ctx.stream()); + +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + if (nranks > 1) { + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( + logits_max_buff, logits_max_buff, logits_max.numel(), + platform::ToNCCLDataType(logits_max.type()), ncclMax, comm->comm(), + stream)); + } +#endif + + // step 3, logit - logit_max + LogitsMinusMaxKernel<<>>( + logits_ptr, logits_max_buff, N, D); + + // step 4, sum(exp(logit - logit_max)) + Tensor sum_exp_logits; + sum_exp_logits = + ctx.AllocateTmpTensor({N, 1}, dev_ctx); + T* sum_exp_logits_buff = sum_exp_logits.mutable_data(place); + TensorReduceFunctorImpl(softmax_2d, &sum_exp_logits, {1}, + dev_ctx.stream()); + +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + if (nranks > 1) { + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( + sum_exp_logits_buff, sum_exp_logits_buff, sum_exp_logits.numel(), + platform::ToNCCLDataType(sum_exp_logits.type()), ncclSum, + comm->comm(), stream)); + } +#endif + + // step 5, (logit - logit_max) - log(sum(exp(logit - logit_max))) + LogitsMinusLogSumKernel< + T><<>>( + logits_ptr, sum_exp_logits_buff, N, D); + + // step 6, prob = exp((logit - logit_max) - log(sum(exp(logit - + // logit_max)))) + // loss = -((logit_i - logit_max) - log(sum(exp(logit - logit_max)))) + math::SetConstant()(dev_ctx, loss, + static_cast(0.0)); + if (label_type == framework::proto::VarType::INT32) { + typedef int32_t LabelT; + HardLabelSoftmaxWithCrossEntropyKernel< + T, LabelT><<>>( + loss_ptr, logits_ptr, labels->data(), rank, N, D, + class_interval.data()); + } else if (label_type == framework::proto::VarType::INT64) { + typedef int64_t LabelT; + HardLabelSoftmaxWithCrossEntropyKernel< + T, LabelT><<>>( + loss_ptr, logits_ptr, labels->data(), rank, N, D, + class_interval.data()); + } + +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + if (nranks > 1) { + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( + loss_ptr, loss_ptr, loss->numel(), + platform::ToNCCLDataType(loss->type()), ncclSum, comm->comm(), + stream)); + } +#endif + } +}; + +template +class MarginCrossEntropyGradCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* labels = context.Input("Label"); + const Tensor* logits = context.Input("Logits"); + const Tensor* softmax = context.Input("Softmax"); + + const Tensor* loss_grad = + context.Input(framework::GradVarName("Loss")); + Tensor* logit_grad = + context.Output(framework::GradVarName("Logits")); + + const bool return_softmax = context.Attr("return_softmax"); + + const int rid = context.Attr("ring_id"); + const int nranks = context.Attr("nranks"); + const int rank = context.Attr("rank"); + + const float margin1 = 
context.Attr("margin1"); + const float margin2 = context.Attr("margin2"); + const float margin3 = context.Attr("margin3"); + const float scale = context.Attr("scale"); + + auto& dev_ctx = + context.template device_context(); + + const auto sofrmax_dims = softmax->dims(); + const int axis = sofrmax_dims.size() - 1; + const int N = SizeToAxis(axis, sofrmax_dims); + const int D = SizeFromAxis(axis, sofrmax_dims); + + if (return_softmax) { + framework::TensorCopy(*softmax, context.GetPlace(), + context.device_context(), logit_grad); + } else { + logit_grad->ShareDataWith(*softmax); + } + + int blocks = NumBlocks(N * D); + int threads = kNumCUDAThreads; + const auto& label_type = labels->type(); + + Tensor class_interval; + GetClassInterval(dev_ctx.stream(), context.GetPlace(), + context.cuda_device_context(), rid, rank, nranks, D, + &class_interval); + + if (label_type == framework::proto::VarType::INT32) { + typedef int32_t LabelT; + CalculateGrad<<>>( + logit_grad->data(), loss_grad->data(), logits->data(), + labels->data(), margin1, margin2, scale, rank, N, D, + class_interval.data()); + } else if (label_type == framework::proto::VarType::INT64) { + typedef int64_t LabelT; + CalculateGrad<<>>( + logit_grad->data(), loss_grad->data(), logits->data(), + labels->data(), margin1, margin2, scale, rank, N, D, + class_interval.data()); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL(margin_cross_entropy, + ops::MarginCrossEntropyOpCUDAKernel, + ops::MarginCrossEntropyOpCUDAKernel, + ops::MarginCrossEntropyOpCUDAKernel); + +REGISTER_OP_CUDA_KERNEL(margin_cross_entropy_grad, + ops::MarginCrossEntropyGradCUDAKernel, + ops::MarginCrossEntropyGradCUDAKernel, + ops::MarginCrossEntropyGradCUDAKernel); diff --git a/paddle/fluid/operators/margin_cross_entropy_op.h b/paddle/fluid/operators/margin_cross_entropy_op.h new file mode 100644 index 00000000000000..fe0dab5d47d35a --- /dev/null +++ b/paddle/fluid/operators/margin_cross_entropy_op.h @@ -0,0 +1,41 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/softmax.h" +#include "paddle/fluid/operators/softmax_op.h" + +namespace paddle { +namespace operators { + +template +class MarginCrossEntropyOpCPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_THROW(platform::errors::Unavailable( + "Do not support margin_cross_entropy for cpu kernel " + "now.")); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 007221ca4f9ca3..9d8b5fb699e33a 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -28,6 +28,7 @@ list(APPEND DIST_TEST_OPS test_parallel_dygraph_pipeline_parallel) list(APPEND DIST_TEST_OPS test_parallel_dygraph_tensor_parallel) list(APPEND DIST_TEST_OPS test_parallel_dygraph_sharding_parallel) list(APPEND DIST_TEST_OPS test_parallel_dygraph_mp_layers) +list(APPEND DIST_TEST_OPS test_parallel_margin_cross_entropy) set(MIXED_DIST_TEST_OPS ${DIST_TEST_OPS}) #remove distribute unittests. list(APPEND MIXED_DIST_TEST_OPS test_dgc_op) @@ -195,6 +196,7 @@ if ((NOT WITH_GPU) AND (NOT WITH_ROCM)) LIST(REMOVE_ITEM TEST_OPS test_mixed_precision) LIST(REMOVE_ITEM TEST_OPS test_fleet_base_single) LIST(REMOVE_ITEM TEST_OPS test_dygraph_recompute) + LIST(REMOVE_ITEM TEST_OPS test_parallel_margin_cross_entropy) elseif(WITH_GPU) if (${CUDNN_VERSION} VERSION_LESS 7100) LIST(REMOVE_ITEM TEST_OPS test_conv2d_fusion_op) @@ -906,6 +908,7 @@ if(WITH_DISTRIBUTE AND WITH_GPU AND WITH_NCCL) set_tests_properties(test_parallel_dygraph_tensor_parallel PROPERTIES TIMEOUT 200) set_tests_properties(test_parallel_dygraph_sharding_parallel PROPERTIES TIMEOUT 120) set_tests_properties(test_parallel_dygraph_mp_layers PROPERTIES TIMEOUT 120) + set_tests_properties(test_parallel_margin_cross_entropy PROPERTIES TIMEOUT 120) if(${NCCL_VERSION} VERSION_GREATER_EQUAL 2212) set_tests_properties(test_parallel_dygraph_sparse_embedding PROPERTIES TIMEOUT 120) set_tests_properties(test_parallel_dygraph_transformer PROPERTIES TIMEOUT 120) diff --git a/python/paddle/fluid/tests/unittests/parallel_margin_cross_entropy.py b/python/paddle/fluid/tests/unittests/parallel_margin_cross_entropy.py new file mode 100644 index 00000000000000..475a26ee94f372 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/parallel_margin_cross_entropy.py @@ -0,0 +1,188 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
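+
+# NOTE: this script is launched on multiple GPUs by
+# test_parallel_margin_cross_entropy.py. It shards the class dimension across
+# cards, rebuilds the full logits on every rank, applies the ArcFace-style
+# margin to them by hand, and checks that the loss, softmax and input
+# gradients of paddle.nn.functional.margin_cross_entropy match
+# softmax_with_cross_entropy computed on those margined logits.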
+ +from __future__ import division +from __future__ import print_function + +import unittest + +import paddle +import numpy as np +import random +import paddle.distributed as dist +import paddle.fluid as fluid +import paddle.distributed.fleet as fleet +from paddle import framework + + +def set_random_seed(seed): + """Set random seed for reproducability.""" + random.seed(seed) + np.random.seed(seed) + paddle.seed(seed) + fleet.meta_parallel.model_parallel_random_seed(seed) + + +class TestParallelMarginSoftmaxCrossEntropyOp(unittest.TestCase): + def setUp(self): + strategy = fleet.DistributedStrategy() + fleet.init(is_collective=True, strategy=strategy) + + def test_parallel_margin_softmax_cross_entropy(self): + margin1s = [1.0, 1.0, 1.35] + margin2s = [0.5, 0.0, 0.0] + margin3s = [0.0, 0.35, 0.0] + scales = [64.0, 64.0, 64.0] + + rank_id = dist.get_rank() + num_trainer = dist.get_world_size() + batch_size = 2 + feature_length = 4 + seed = 1025 + set_random_seed(seed) + paddle.seed(rank_id * 10) + random.seed(seed) + np.random.seed(seed) + + check_group = dist.new_group(list(range(num_trainer))) + for dtype in ('float32', 'float64'): + + num_class_per_cards = [[4, 8], [2, 2], [4, 2], [3, 9]] + for num_class_per_card in num_class_per_cards: + + num_class = np.sum(num_class_per_card) + for margin1, margin2, margin3, scale in zip(margin1s, margin2s, + margin3s, scales): + + for _ in range(5): + np_label = np.random.randint(0, num_class, + (batch_size, )) + label = paddle.to_tensor(np_label, dtype="int64") + + input = paddle.randn( + shape=[batch_size, feature_length], dtype=dtype) + input.stop_gradient = False + input_l2 = paddle.sqrt( + paddle.sum( + paddle.square(input), axis=1, keepdim=True)) + norm_input = paddle.divide(input, input_l2) + + weight = paddle.randn( + shape=[ + feature_length, num_class_per_card[rank_id] + ], + dtype=dtype) + weight.stop_gradient = False + weight_l2 = paddle.sqrt( + paddle.sum( + paddle.square(weight), axis=0, keepdim=True)) + norm_weight = paddle.divide(weight, weight_l2) + + data = paddle.matmul(norm_input, norm_weight) + data.stop_gradient = False + + sta = np.sum( + num_class_per_card[:rank_id]) if rank_id > 0 else 0 + end = np.sum(num_class_per_card[:rank_id + 1]) + + integral_data = np.zeros( + (batch_size, num_class), dtype=dtype) + integral_data[:, sta:end] = data.clone().detach().numpy( + ) + integral_data = paddle.to_tensor( + integral_data, dtype=dtype) + + paddle.distributed.all_reduce( + integral_data, + op=paddle.distributed.ReduceOp.SUM, + group=check_group) + integral_data = integral_data.detach().clone() + integral_data.stop_gradient = False + + # add arcface margin to logit + theta = paddle.acos(integral_data) + one_hot_label = paddle.nn.functional.one_hot( + label, num_classes=num_class) + one_hot_label.stop_gradient = False + + if margin1 != 1.0: + theta = margin1 * theta + if margin2 != 0.0: + theta = theta + margin2 + margin_cos = paddle.cos(theta) + if margin3 != 0.0: + margin_cos = margin_cos - margin3 + diff = one_hot_label * (margin_cos - integral_data) + arc_data = (integral_data + diff) * scale + + loss_a, softmax_a = paddle.nn.functional.margin_cross_entropy( + data, + label, + margin1=margin1, + margin2=margin2, + margin3=margin3, + scale=scale, + group=check_group, + return_softmax=True, + reduction=None) + loss_b, softmax_b = paddle.nn.functional.softmax_with_cross_entropy( + logits=arc_data, + label=paddle.reshape(label, (-1, 1)), + return_softmax=True) + + np.testing.assert_allclose( + loss_a.numpy(), loss_b.numpy(), 
rtol=1e-5) + + integral_prob = np.zeros( + (batch_size, num_class), dtype=dtype) + integral_prob[:, sta:end] = softmax_a.clone().detach( + ).numpy() + integral_prob = paddle.to_tensor( + integral_prob, dtype=dtype) + paddle.distributed.all_reduce( + integral_prob, + op=paddle.distributed.ReduceOp.SUM, + group=check_group) + integral_prob = integral_prob.detach().clone() + integral_prob.stop_gradient = False + + np.testing.assert_allclose( + integral_prob.numpy(), + softmax_b.numpy(), + rtol=1e-5, + atol=1e-6) + + loss_a = loss_a.sum() / batch_size + loss_b = loss_b.sum() / batch_size + loss_a.backward() + loss_b.backward() + + integral_grad = np.zeros( + (batch_size, num_class), dtype=dtype) + integral_grad[:, sta:end] = data.grad.clone().detach() + integral_grad = paddle.to_tensor( + integral_grad, dtype=dtype) + paddle.distributed.all_reduce( + integral_grad, + op=paddle.distributed.ReduceOp.SUM, + group=check_group) + + np.testing.assert_allclose( + integral_data.grad.numpy(), + integral_grad.numpy(), + rtol=1e-5) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_margin_cross_entropy_op.py b/python/paddle/fluid/tests/unittests/test_margin_cross_entropy_op.py new file mode 100644 index 00000000000000..85d74f379814cd --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_margin_cross_entropy_op.py @@ -0,0 +1,385 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
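+
+# NOTE: the NumPy reference below applies the margin
+# cos(margin1 * acos(x) + margin2) - margin3 at the label position of each
+# row, multiplies the result by the scale, and uses a numerically stable
+# log-softmax to produce the expected Loss and Softmax outputs of the
+# margin_cross_entropy op.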
+ +from __future__ import print_function + +import unittest +import numpy as np +from op_test import OpTest +import math +import random +import paddle +import paddle.fluid as fluid +from paddle.fluid import core +from paddle.fluid import Program, program_guard + + +def stable_softmax_comm(x): + shiftx = (x - np.max(x)) + deno = np.log(np.sum(np.exp(shiftx))) + comm = shiftx - deno + return comm + + +def margin_cross_entropy(logits, + label, + axis, + margin1, + margin2, + margin3, + scale, + reduction=None): + one_hot_label = np.zeros_like(logits, dtype=logits.dtype) + for i, lb in enumerate(label): + one_hot_label[i, lb] = 1.0 + + # add arcface margin to logit + theta = np.arccos(logits) + if margin1 != 1.0: + theta = margin1 * theta + if margin2 != 0.0: + theta = theta + margin2 + margin_cos = np.cos(theta) + if margin3 != 0.0: + margin_cos = margin_cos - margin3 + diff = one_hot_label * (margin_cos - logits) + arc_logits = (logits + diff) * scale + + comm = np.apply_along_axis(stable_softmax_comm, axis, arc_logits) + loss = (-one_hot_label * comm).sum(axis=axis, keepdims=True) + softmax = np.exp(comm) + if reduction == 'mean': + loss = np.mean(loss) + elif reduction == 'sum': + loss = np.sum(loss) + return loss, softmax + + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestMarginCrossEntropyOp(OpTest): + def initParams(self): + self.op_type = "margin_cross_entropy" + self.axis = -1 + self.batch_dim = 5 + self.feat_dim = 41 + self.num_class = 37 + + def init_loss_params(self): + self.margin1 = 1.0 + self.margin2 = 0.5 + self.margin3 = 0.0 + self.scale = 2.0 + + def init_dtype(self): + self.dtype = np.float64 + + def setUp(self): + self.initParams() + self.init_loss_params() + self.init_dtype() + + datas = np.random.uniform( + -0.99, 0.99, [self.batch_dim, self.feat_dim]).astype(self.dtype) + datas = datas / np.sqrt(np.sum(np.square(datas), axis=1, keepdims=True)) + weights = np.random.uniform( + -0.99, 0.99, [self.feat_dim, self.num_class]).astype(self.dtype) + weights = weights / np.sqrt( + np.sum(np.square(weights), axis=0, keepdims=True)) + logits = np.matmul(datas, weights) + + labels = np.random.randint( + 0, self.num_class, (self.batch_dim, ), dtype="int64") + + loss, softmax = margin_cross_entropy(logits, labels, self.axis, + self.margin1, self.margin2, + self.margin3, self.scale) + + self.inputs = {"Logits": logits, "Label": labels} + self.outputs = { + "Softmax": softmax.astype(self.dtype), + "Loss": loss.astype(self.dtype) + } + self.attrs = { + 'margin1': self.margin1, + 'margin2': self.margin2, + 'margin3': self.margin3, + 'scale': self.scale, + } + + def test_check_output(self): + self.check_output_with_place(core.CUDAPlace(0), atol=1e-5) + + def test_check_grad(self): + self.check_grad_with_place(core.CUDAPlace(0), ["Logits"], "Loss") + + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestMarginCrossEntropyOpFP32(TestMarginCrossEntropyOp): + def init_dtype(self): + self.dtype = np.float32 + + def test_check_grad(self): + self.check_grad_with_place( + core.CUDAPlace(0), ["Logits"], + "Loss", + numeric_grad_delta=5e-2, + max_relative_error=5e-2) + + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestMarginCrossEntropyOpFP16(TestMarginCrossEntropyOp): + def init_dtype(self): + self.dtype = np.float16 + + def test_check_output(self): + self.check_output_with_place(core.CUDAPlace(0), atol=5e-2) + + def test_check_grad(self): + 
self.check_grad_with_place( + core.CUDAPlace(0), ["Logits"], + "Loss", + numeric_grad_delta=6e-1, + max_relative_error=6e-1) + + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestMarginCrossEntropyOpCosFace(TestMarginCrossEntropyOp): + def init_loss_params(self): + self.margin1 = 1.0 + self.margin2 = 0.0 + self.margin3 = 0.35 + self.scale = 2.0 + + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestMarginCrossEntropyOpSphereFace(TestMarginCrossEntropyOp): + def init_loss_params(self): + self.margin1 = 1.35 + self.margin2 = 0.0 + self.margin3 = 0.0 + self.scale = 2.0 + + +class TestMarginCrossEntropyOpCPU(TestMarginCrossEntropyOp): + def test_check_output(self): + try: + self.check_output_with_place(core.CPUPlace(), atol=1e-5) + except RuntimeError: + pass + + def test_check_grad(self): + try: + self.check_grad_with_place(core.CPUPlace(), ["Logits"], "Loss") + except RuntimeError: + pass + + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestMarginCrossEntropyOpV2(unittest.TestCase): + def setUp(self): + self.initParams() + np.random.seed(self.seed) + paddle.framework.random._manual_program_seed(self.seed) + self.places = [] + if core.is_compiled_with_cuda(): + self.places.append(paddle.fluid.CUDAPlace(0)) + + def initParams(self): + self.seed = 2021 + self.axis = -1 + self.batch_dim = 5 + self.feat_dim = 41 + self.num_class = 37 + self.init_loss_params() + self.init_dtype() + self.init_reduction() + + def init_loss_params(self): + self.margin1 = 1.0 + self.margin2 = 0.5 + self.margin3 = 0.0 + self.scale = 2.0 + + def init_dtype(self): + self.dtype = np.float64 + + def init_reduction(self): + self.reduction = None + + def test_static(self): + for place in self.places: + self.check_static_result(place=place) + + def check_static_result(self, place): + with program_guard(Program(), Program()): + datas = np.random.uniform( + -0.99, 0.99, [self.batch_dim, self.feat_dim]).astype(self.dtype) + datas = datas / np.sqrt( + np.sum(np.square(datas), axis=1, keepdims=True)) + weights = np.random.uniform( + -0.99, 0.99, [self.feat_dim, self.num_class]).astype(self.dtype) + weights = weights / np.sqrt( + np.sum(np.square(weights), axis=0, keepdims=True)) + + logits_np = np.matmul(datas, weights) + labels_np = np.random.randint( + 0, self.num_class, (self.batch_dim, ), dtype="int64") + + loss_np, softmax_np = margin_cross_entropy( + logits_np, labels_np, self.axis, self.margin1, self.margin2, + self.margin3, self.scale, self.reduction) + + logits = paddle.static.data( + name='logits', + shape=[self.batch_dim, self.num_class], + dtype=self.dtype) + label = paddle.static.data( + name='label', shape=[self.batch_dim], dtype="int64") + loss, softmax = paddle.nn.functional.margin_cross_entropy( + logits, + label, + margin1=self.margin1, + margin2=self.margin2, + margin3=self.margin3, + scale=self.scale, + return_softmax=True, + reduction=self.reduction) + + exe = paddle.fluid.Executor(place) + [loss_res, softmax_res] = exe.run( + paddle.fluid.default_main_program(), + feed={'logits': logits_np, + 'label': labels_np}, + fetch_list=[loss, softmax]) + np.testing.assert_allclose(loss_res, loss_np) + np.testing.assert_allclose(softmax_res, softmax_np) + + def test_dynamic(self): + for place in self.places: + self.check_dynamic_result(place=place) + + def check_dynamic_result(self, place): + with paddle.fluid.dygraph.guard(place): + datas = np.random.uniform( + -0.99, 0.99, 
[self.batch_dim, self.feat_dim]).astype(self.dtype) + datas = datas / np.sqrt( + np.sum(np.square(datas), axis=1, keepdims=True)) + weights = np.random.uniform( + -0.99, 0.99, [self.feat_dim, self.num_class]).astype(self.dtype) + weights = weights / np.sqrt( + np.sum(np.square(weights), axis=0, keepdims=True)) + + logits_np = np.matmul(datas, weights) + labels_np = np.random.randint( + 0, self.num_class, (self.batch_dim, ), dtype="int64") + + loss_np, softmax_np = margin_cross_entropy( + logits_np, labels_np, self.axis, self.margin1, self.margin2, + self.margin3, self.scale, self.reduction) + + logits = paddle.to_tensor(logits_np, dtype=self.dtype) + labels = paddle.to_tensor(labels_np, dtype="int64") + + loss, softmax = paddle.nn.functional.margin_cross_entropy( + logits, + labels, + margin1=self.margin1, + margin2=self.margin2, + margin3=self.margin3, + scale=self.scale, + return_softmax=True, + reduction=self.reduction) + + loss_res = loss.numpy() + softmax_res = softmax.numpy() + np.testing.assert_allclose(loss_res, loss_np) + np.testing.assert_allclose(softmax_res, softmax_np) + + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestMarginCrossEntropyOpV3(TestMarginCrossEntropyOpV2): + def init_reduction(self): + self.reduction = 'mean' + + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestMarginCrossEntropyOpV4(TestMarginCrossEntropyOpV2): + def init_reduction(self): + self.reduction = 'sum' + + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestMarginCrossEntropyOpAPIError(unittest.TestCase): + def setUp(self): + self.initParams() + np.random.seed(self.seed) + paddle.framework.random._manual_program_seed(self.seed) + self.places = [] + if core.is_compiled_with_cuda(): + self.places.append(paddle.fluid.CUDAPlace(0)) + + def initParams(self): + self.seed = 2021 + self.axis = -1 + self.batch_dim = 10 + self.feat_dim = 41 + self.num_class = 37 + self.init_loss_params() + self.init_dtype() + + def init_loss_params(self): + self.margin1 = 1.0 + self.margin2 = 0.5 + self.margin3 = 0.0 + self.scale = 2.0 + + def init_dtype(self): + self.dtype = np.float64 + + def test_dynamic_errors(self): + def test_dim(): + for place in self.places: + with paddle.fluid.dygraph.guard(place): + labels_np = np.random.randint( + 0, self.num_class, (self.batch_dim, 2), dtype="int64") + logits_np = np.random.uniform( + -0.99, 0.99, + [self.batch_dim, self.num_class]).astype(self.dtype) + labels = paddle.to_tensor(labels_np) + logits = paddle.to_tensor(logits_np) + + loss, softmax = paddle.nn.functional.margin_cross_entropy( + logits, + labels, + margin1=self.margin1, + margin2=self.margin2, + margin3=self.margin3, + scale=self.scale, + return_softmax=True, + reduction=None) + + self.assertRaises(ValueError, test_dim) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_margin_cross_entropy.py b/python/paddle/fluid/tests/unittests/test_parallel_margin_cross_entropy.py new file mode 100644 index 00000000000000..1b24889830ad87 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_parallel_margin_cross_entropy.py @@ -0,0 +1,29 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import paddle.fluid as fluid + +from test_parallel_dygraph_dataparallel import TestMultipleGpus + + +class TestParallelMarginSoftmaxWithCrossEntropy(TestMultipleGpus): + def test_parallel_margin_cross_entropy(self): + self.run_mnist_2gpu('parallel_margin_cross_entropy.py') + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/nn/functional/__init__.py b/python/paddle/nn/functional/__init__.py index f3d9f9dde11a4e..04e0b7c140d7fa 100644 --- a/python/paddle/nn/functional/__init__.py +++ b/python/paddle/nn/functional/__init__.py @@ -79,6 +79,7 @@ from .loss import sigmoid_focal_loss # noqa: F401 from .loss import smooth_l1_loss # noqa: F401 from .loss import softmax_with_cross_entropy # noqa: F401 +from .loss import margin_cross_entropy # noqa: F401 from .loss import square_error_cost # noqa: F401 from .loss import ctc_loss # noqa: F401 from .norm import batch_norm # noqa: F401 @@ -185,6 +186,7 @@ 'sigmoid_focal_loss', 'smooth_l1_loss', 'softmax_with_cross_entropy', + 'margin_cross_entropy', 'square_error_cost', 'ctc_loss', 'affine_grid', diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index ef2bfb3b8e0d3a..d7b781c84767f2 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -1092,6 +1092,268 @@ def ctc_loss(log_probs, return loss_out +def margin_cross_entropy(logits, + label, + margin1=1.0, + margin2=0.5, + margin3=0.0, + scale=64.0, + group=None, + return_softmax=False, + reduction='mean'): + """ + .. math:: + + L=-\\frac{1}{N}\sum^N_{i=1}\log\\frac{e^{s(cos(m_{1}\\theta_{y_i}+m_{2})-m_{3})}}{e^{s(cos(m_{1}\\theta_{y_i}+m_{2})-m_{3})}+\sum^n_{j=1,j\\neq y_i} e^{scos\\theta_{y_i}}} + + where the :math:`\\theta_{y_i}` is the angle between the feature :math:`x` and + the representation of class :math:`i`. The details of ArcFace loss + could be referred to https://arxiv.org/abs/1801.07698. + + .. hint:: + The API supports model parallel and single GPU. And logits.shape[-1] can be different at each rank. + + Args: + logits (Tensor): shape[N, local_num_classes], the output of the normalized X multiply the normalized W. + The logits is shard_logits when using model parallel. + label (Tensor): shape[N] or shape[N, 1], the groud truth label. + margin1 (float, optional): m1 of margin loss, default value is `1.0`. + margin2 (float, optional): m2 of margin loss, default value is `0.5`. + margin3 (float, optional): m3 of margin loss, default value is `0.0`. + scale (float, optional): s of margin loss, default value is `64.0`. + group (Group, optional): The abstract representation of group, see paddle.distributed.collective.Group. + Default `None`. + return_softmax (bool, optional): Whether return softmax probability. Default value is `False`. + reduction (str, optional): The candicates are ``'none'`` | ``'mean'`` | ``'sum'``. + If :attr:`reduction` is ``'mean'``, return the average of loss; + If :attr:`reduction` is ``'sum'``, return the sum of loss; + If :attr:`reduction` is ``'none'``, no reduction will be applied. 
+ Default value is `'mean'`. + + Returns: + ``Tensor`` or Tuple of two ``Tensor`` : Return the cross entropy loss if \ + `return_softmax` is False, otherwise the tuple \ + (loss, softmax), softmax is shard_softmax when \ + using model parallel, otherwise softmax is in \ + the same shape with input logits. If ``reduction == None``, \ + the shape of loss is ``[N, 1]``, otherwise the shape is ``[1]``. + + Examples: + + .. code-block:: python + + # required: gpu + # Single GPU + import paddle + m1 = 1.0 + m2 = 0.5 + m3 = 0.0 + s = 64.0 + batch_size = 2 + feature_length = 4 + num_classes = 4 + + label = paddle.randint(low=0, high=num_classes, shape=[batch_size], dtype='int64') + + X = paddle.randn( + shape=[batch_size, feature_length], + dtype='float64') + X_l2 = paddle.sqrt(paddle.sum(paddle.square(X), axis=1, keepdim=True)) + X = paddle.divide(X, X_l2) + + W = paddle.randn( + shape=[feature_length, num_classes], + dtype='float64') + W_l2 = paddle.sqrt(paddle.sum(paddle.square(W), axis=0, keepdim=True)) + W = paddle.divide(W, W_l2) + + logits = paddle.matmul(X, W) + loss, softmax = paddle.nn.functional.margin_cross_entropy( + logits, label, margin1=m1, margin2=m2, margin3=m3, scale=s, return_softmax=True, reduction=None) + + print(logits) + print(label) + print(loss) + print(softmax) + + #Tensor(shape=[2, 4], dtype=float64, place=CUDAPlace(0), stop_gradient=True, + # [[ 0.85204151, -0.55557678, 0.04994566, 0.71986042], + # [-0.20198586, -0.35270476, -0.55182702, 0.09749021]]) + #Tensor(shape=[2], dtype=int64, place=CUDAPlace(0), stop_gradient=True, + # [2, 3]) + #Tensor(shape=[2, 1], dtype=float64, place=CUDAPlace(0), stop_gradient=True, + # [[82.37059586], + # [12.13448420]]) + #Tensor(shape=[2, 4], dtype=float64, place=CUDAPlace(0), stop_gradient=True, + # [[0.99978819, 0.00000000, 0.00000000, 0.00021181], + # [0.99992995, 0.00006468, 0.00000000, 0.00000537]]) + + .. 
code-block:: python + + # required: distributed + # Multi GPU, test_margin_cross_entropy.py + import paddle + import paddle.distributed as dist + strategy = dist.fleet.DistributedStrategy() + dist.fleet.init(is_collective=True, strategy=strategy) + rank_id = dist.get_rank() + m1 = 1.0 + m2 = 0.5 + m3 = 0.0 + s = 64.0 + batch_size = 2 + feature_length = 4 + num_class_per_card = [4, 8] + num_classes = paddle.sum(paddle.to_tensor(num_class_per_card)) + + label = paddle.randint(low=0, high=num_classes.item(), shape=[batch_size], dtype='int64') + label_list = [] + dist.all_gather(label_list, label) + label = paddle.concat(label_list, axis=0) + + X = paddle.randn( + shape=[batch_size, feature_length], + dtype='float64') + X_list = [] + dist.all_gather(X_list, X) + X = paddle.concat(X_list, axis=0) + X_l2 = paddle.sqrt(paddle.sum(paddle.square(X), axis=1, keepdim=True)) + X = paddle.divide(X, X_l2) + + W = paddle.randn( + shape=[feature_length, num_class_per_card[rank_id]], + dtype='float64') + W_l2 = paddle.sqrt(paddle.sum(paddle.square(W), axis=0, keepdim=True)) + W = paddle.divide(W, W_l2) + + logits = paddle.matmul(X, W) + loss, softmax = paddle.nn.functional.margin_cross_entropy( + logits, label, margin1=m1, margin2=m2, margin3=m3, scale=s, return_softmax=True, reduction=None) + + print(logits) + print(label) + print(loss) + print(softmax) + + # python -m paddle.distributed.launch --gpus=0,1 test_margin_cross_entropy.py + ## for rank0 input + #Tensor(shape=[4, 4], dtype=float64, place=CUDAPlace(0), stop_gradient=True, + # [[ 0.32888934, 0.02408748, -0.02763289, 0.18173063], + # [-0.52893978, -0.10623845, -0.21596515, -0.06432517], + # [-0.00536345, -0.03924667, 0.66735314, -0.28640926], + # [-0.09907366, -0.48534973, -0.10365338, -0.39472322]]) + #Tensor(shape=[4], dtype=int64, place=CUDAPlace(0), stop_gradient=True, + # [11, 1 , 10, 11]) + + ## for rank1 input + #Tensor(shape=[4, 8], dtype=float64, place=CUDAPlace(1), stop_gradient=True, + # [[ 0.68654754, 0.28137170, 0.69694954, -0.60923933, -0.57077653, 0.54576703, -0.38709028, 0.56028204], + # [-0.80360371, -0.03042448, -0.45107338, 0.49559349, 0.69998950, -0.45411693, 0.61927630, -0.82808600], + # [ 0.11457570, -0.34785879, -0.68819499, -0.26189226, -0.48241491, -0.67685711, 0.06510185, 0.49660849], + # [ 0.31604851, 0.52087884, 0.53124749, -0.86176582, -0.43426329, 0.34786144, -0.10850784, 0.51566383]]) + #Tensor(shape=[4], dtype=int64, place=CUDAPlace(1), stop_gradient=True, + # [11, 1 , 10, 11]) + + ## for rank0 output + #Tensor(shape=[4, 1], dtype=float64, place=CUDAPlace(0), stop_gradient=True, + # [[38.96608230], + # [81.28152394], + # [69.67229865], + # [31.74197251]]) + #Tensor(shape=[4, 4], dtype=float64, place=CUDAPlace(0), stop_gradient=True, + # [[0.00000000, 0.00000000, 0.00000000, 0.00000000], + # [0.00000000, 0.00000000, 0.00000000, 0.00000000], + # [0.00000000, 0.00000000, 0.99998205, 0.00000000], + # [0.00000000, 0.00000000, 0.00000000, 0.00000000]]) + ## for rank1 output + #Tensor(shape=[4, 1], dtype=float64, place=CUDAPlace(1), stop_gradient=True, + # [[38.96608230], + # [81.28152394], + # [69.67229865], + # [31.74197251]]) + #Tensor(shape=[4, 8], dtype=float64, place=CUDAPlace(1), stop_gradient=True, + # [[0.33943993, 0.00000000, 0.66051859, 0.00000000, 0.00000000, 0.00004148, 0.00000000, 0.00000000], + # [0.00000000, 0.00000000, 0.00000000, 0.00000207, 0.99432097, 0.00000000, 0.00567696, 0.00000000], + # [0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.00001795], + # 
[0.00000069, 0.33993085, 0.66006319, 0.00000000, 0.00000000, 0.00000528, 0.00000000, 0.00000000]]) + """ + + assert reduction in ['mean', 'sum', 'none', None] + if group is not None and not group.is_member(): + return + + ring_id = 0 if group is None else group.id + rank = 0 + nranks = 1 + if core.is_compiled_with_dist(): + parallel_env = paddle.distributed.ParallelEnv() + global_rank = parallel_env.rank + rank = global_rank if group is None else group.get_group_rank( + global_rank) + nranks = parallel_env.world_size if group is None else group.nranks + + input_dims = len(list(logits.shape)) + label_dims = len(list(label.shape)) + if input_dims - 1 != label_dims and input_dims != label_dims: + raise ValueError( + 'Expected nput_dims - 1 = label_dims or input_dims == label_dims\ + (got nput_dims{}, label_dims{})'.format(input_dims, label_dims)) + if input_dims - 1 == label_dims: + label = paddle.unsqueeze(label, axis=-1) + + if in_dygraph_mode(): + softmax, loss = core.ops.margin_cross_entropy( + logits, label, 'ring_id', ring_id, 'rank', rank, 'nranks', nranks, + 'margin1', margin1, 'margin2', margin2, 'margin3', margin3, 'scale', + scale, 'return_softmax', return_softmax) + if reduction == 'mean': + loss = paddle.mean(loss) + elif reduction == 'sum': + loss = paddle.sum(loss) + if not return_softmax: + return loss + else: + return loss, softmax + + op_type = 'margin_cross_entropy' + helper = LayerHelper(op_type, **locals()) + softmax = helper.create_variable_for_type_inference(dtype=logits.dtype) + loss = helper.create_variable_for_type_inference(dtype=logits.dtype) + + check_variable_and_dtype(logits, 'logits', + ['float16', 'float32', 'float64'], + 'margin_cross_entropy') + check_variable_and_dtype(label, 'label', ['int32', 'int64'], + 'margin_cross_entropy') + + helper.append_op( + type=op_type, + inputs={'Logits': logits, + 'Label': label}, + outputs={'Softmax': softmax, + 'Loss': loss}, + attrs={ + 'return_softmax': return_softmax, + 'ring_id': ring_id, + 'rank': rank, + 'nranks': nranks, + 'margin1': margin1, + 'margin2': margin2, + 'margin3': margin3, + 'scale': scale, + }) + + if reduction == 'mean': + loss = paddle.mean(loss) + elif reduction == 'sum': + loss = paddle.sum(loss) + + if not return_softmax: + return loss + else: + return loss, softmax + + @deprecated( since="2.0.0", update_to="paddle.nn.functional.cross_entropy", diff --git a/tools/static_mode_white_list.py b/tools/static_mode_white_list.py index 616d5ae280ad1a..d2f95c235b04c1 100644 --- a/tools/static_mode_white_list.py +++ b/tools/static_mode_white_list.py @@ -719,4 +719,5 @@ 'test_sgd_op_bf16', 'test_marker_op', 'test_c_embedding_op', + 'test_margin_cross_entropy_op', ] From e29c2d12cd8fb67354f052d0d81b8c9f20699e35 Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Mon, 16 Aug 2021 14:24:14 +0800 Subject: [PATCH 050/126] [amp] dygraph amp support param_group (#34899) * dygraph amp support param_group * remove unused code * fix doc --- python/paddle/amp/grad_scaler.py | 43 ++++++++++++++ .../paddle/fluid/dygraph/amp/loss_scaler.py | 17 ++++-- .../test_imperative_auto_mixed_precision.py | 59 ++++++++++++++----- 3 files changed, 100 insertions(+), 19 deletions(-) diff --git a/python/paddle/amp/grad_scaler.py b/python/paddle/amp/grad_scaler.py index 18c436a0bb95f7..5c3b575f2f069e 100644 --- a/python/paddle/amp/grad_scaler.py +++ b/python/paddle/amp/grad_scaler.py @@ -146,6 +146,49 @@ def minimize(self, optimizer, *args, **kwargs): """ return super(GradScaler, self).minimize(optimizer, *args, **kwargs) + def step(self, 
optimizer): + """ + This function is similar as `optimizer.step()`, which performs parameters updating. + + If the scaled gradients of parameters contains NAN or INF, the parameters updating is skipped. + Otherwise, it first unscales the scaled gradients of parameters, then updates the parameters. + + Args: + optimizer(Optimizer): The optimizer used to update parameters. + + Examples: + .. code-block:: python + + # required: gpu + import paddle + model = paddle.nn.Conv2D(3, 2, 3, bias_attr=True) + optimizer = paddle.optimizer.SGD(learning_rate=0.01, parameters=model.parameters()) + scaler = paddle.amp.GradScaler(init_loss_scaling=1024) + data = paddle.rand([10, 3, 32, 32]) + with paddle.amp.auto_cast(): + conv = model(data) + loss = paddle.mean(conv) + scaled = scaler.scale(loss) # scale the loss + scaled.backward() # do backward + scaler.step(optimizer) + optimizer.clear_grad() + """ + if not self._enable: + return optimizer.step() + + # unscale the grad + self._unscale(optimizer) + + if self._found_inf: + self._cache_founf_inf = True + else: + optimizer.step() + self._cache_founf_inf = False + + if self._use_dynamic_loss_scaling: + # uopdate the scale + self._update() + def is_enable(self): """ Enable loss scaling or not. diff --git a/python/paddle/fluid/dygraph/amp/loss_scaler.py b/python/paddle/fluid/dygraph/amp/loss_scaler.py index 2065bec8af3bc4..a9fe2c9f3ed7b0 100644 --- a/python/paddle/fluid/dygraph/amp/loss_scaler.py +++ b/python/paddle/fluid/dygraph/amp/loss_scaler.py @@ -212,10 +212,19 @@ def minimize(self, optimizer, *args, **kwargs): def _unscale(self, optimizer): if not self._enable: return - param_grads = [ - param._grad_ivar() for param in optimizer._parameter_list - if param._grad_ivar() is not None - ] + + if getattr(optimizer, '_param_groups', None) and isinstance( + optimizer._param_groups[0], dict): + param_grads = [] + for group in optimizer._param_groups: + for param in group['params']: + if param._grad_ivar() is not None: + param_grads.append(param._grad_ivar()) + else: + param_grads = [ + param._grad_ivar() for param in optimizer._parameter_list + if param._grad_ivar() is not None + ] _C_ops.check_finite_and_unscale(param_grads, self._scale, param_grads, self._found_inf) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py b/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py index 17d50ed8c19de0..330c4c5ffec3d9 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py @@ -19,6 +19,9 @@ import six from test_imperative_resnet import ResNet, BottleneckBlock, ConvBNLayer, train_parameters, optimizer_setting +if fluid.core.is_compiled_with_cuda(): + fluid.set_flags({"FLAGS_cudnn_deterministic": True}) + class SimpleConv(fluid.dygraph.Layer): def __init__(self, @@ -373,8 +376,6 @@ def train_resnet(self, return dy_out, dy_param_value, dy_grad_value def test_with_state_dict(self): - if fluid.core.is_compiled_with_cuda(): - fluid.set_flags({"FLAGS_cudnn_deterministic": True}) with fluid.dygraph.guard(): out_use_state_dict = self.train_resnet( enable_amp=True, use_data_loader=True, use_save_load=True) @@ -390,18 +391,43 @@ class TestResnet2(unittest.TestCase): Use paddle-2.0 API """ - def train_resnet(self, enable_amp=True, use_data_loader=False): + def train_resnet(self, + enable_amp=True, + use_data_loader=False, + use_param_group=False): seed = 90 batch_size = train_parameters["batch_size"] - 
batch_num = 1 + batch_num = 10 paddle.seed(seed) paddle.framework.random._manual_program_seed(seed) resnet = ResNet(use_cudnn=True) - optimizer = optimizer_setting( - train_parameters, parameter_list=resnet.parameters()) + + if use_param_group: + conv_params = resnet.conv.parameters() + other_params = [] + for p in resnet.parameters(): + contains = False + for q in conv_params: + if p is q: + contains = True + if not contains: + other_params.append(p) + # NOTE(zhiqiu): The Membership test operations(in / not in) calls "is" and "equal", + # see details: https://docs.python.org/3/reference/expressions.html#membership-test-operations. + # So do not use other_params = [p for p in resnet.parameters() if p not in conv_params] + optimizer = paddle.optimizer.Momentum(parameters=[{ + 'params': conv_params, + 'learning_rate': 0.01 + }, { + 'params': other_params, + 'learning_rate': 0.001 + }]) + else: + optimizer = paddle.optimizer.SGD(parameters=resnet.parameters()) + np.random.seed(seed) train_reader = paddle.batch( paddle.dataset.flowers.train(use_xmap=False), batch_size=batch_size) @@ -456,7 +482,7 @@ def train_resnet(self, enable_amp=True, use_data_loader=False): scaled_loss = scaler.scale(avg_loss) scaled_loss.backward() - scaler.minimize(optimizer, scaled_loss) + scaler.step(optimizer) dy_grad_value = {} for param in resnet.parameters(): @@ -475,22 +501,27 @@ def train_resnet(self, enable_amp=True, use_data_loader=False): return dy_out, dy_param_value, dy_grad_value def test_resnet(self): - if fluid.core.is_compiled_with_cuda(): - fluid.set_flags({"FLAGS_cudnn_deterministic": True}) with fluid.dygraph.guard(): out_fp32 = self.train_resnet(enable_amp=False) out_amp = self.train_resnet(enable_amp=True) print(out_fp32[0], out_amp[0]) - self.assertTrue(np.allclose(out_fp32[0], out_amp[0], atol=1.e-2)) + self.assertTrue(np.allclose(out_fp32[0], out_amp[0], atol=1.e-5)) def test_with_data_loader(self): - if fluid.core.is_compiled_with_cuda(): - fluid.set_flags({"FLAGS_cudnn_deterministic": True}) with fluid.dygraph.guard(): out_fp32 = self.train_resnet(enable_amp=False, use_data_loader=True) out_amp = self.train_resnet(enable_amp=True, use_data_loader=True) print(out_fp32[0], out_amp[0]) - self.assertTrue(np.allclose(out_fp32[0], out_amp[0], atol=1.e-2)) + self.assertTrue(np.allclose(out_fp32[0], out_amp[0], atol=1.e-5)) + + def test_param_group(self): + with fluid.dygraph.guard(): + out_fp32 = self.train_resnet( + enable_amp=False, use_data_loader=True, use_param_group=True) + out_amp = self.train_resnet( + enable_amp=True, use_data_loader=True, use_param_group=True) + print(out_fp32[0], out_amp[0]) + self.assertTrue(np.allclose(out_fp32[0], out_amp[0], atol=1.e-5)) class TestResnet(unittest.TestCase): @@ -566,8 +597,6 @@ def train_resnet(self, enable_amp=True): return dy_out, dy_param_value, dy_grad_value def test_resnet(self): - if fluid.core.is_compiled_with_cuda(): - fluid.set_flags({"FLAGS_cudnn_deterministic": True}) out_fp32 = self.train_resnet(enable_amp=False) out_amp = self.train_resnet(enable_amp=True) print(out_fp32[0], out_amp[0]) From 875cfd571291c79b45db2a341deacbb965d8f273 Mon Sep 17 00:00:00 2001 From: duanboqiang Date: Mon, 16 Aug 2021 14:40:49 +0800 Subject: [PATCH 051/126] add unique_consecutive_op (#34334) * add unique_consecutive_op * add unique_consecutive_op * add unique_consecutive_op * add unique_consecutive_op * add unique_consecutive_op * add unique_consecutive_op * add unique_consecutive_op * add unique_consecutive_op * remove unity build * add unique_consecutive op * 
add unique_consecutive op * add enable static * add noqa * add space line * add default case. * add comma * add space line * modify unique_consecutive unittest * optimize ut coverage * rebase develop * improve coverage * update en docs * update en docs * update en docs * update en docs * update en docs * update en doc --- .../fluid/operators/unique_consecutive_op.cc | 142 ++++++ .../fluid/operators/unique_consecutive_op.cu | 424 ++++++++++++++++++ .../fluid/operators/unique_consecutive_op.h | 268 +++++++++++ paddle/fluid/pybind/op_function_generator.cc | 1 + python/paddle/__init__.py | 2 + .../unittests/test_unique_consecutive_op.py | 238 ++++++++++ python/paddle/tensor/__init__.py | 2 + python/paddle/tensor/manipulation.py | 106 +++++ 8 files changed, 1183 insertions(+) create mode 100644 paddle/fluid/operators/unique_consecutive_op.cc create mode 100644 paddle/fluid/operators/unique_consecutive_op.cu create mode 100644 paddle/fluid/operators/unique_consecutive_op.h create mode 100644 python/paddle/fluid/tests/unittests/test_unique_consecutive_op.py diff --git a/paddle/fluid/operators/unique_consecutive_op.cc b/paddle/fluid/operators/unique_consecutive_op.cc new file mode 100644 index 00000000000000..464660d80be019 --- /dev/null +++ b/paddle/fluid/operators/unique_consecutive_op.cc @@ -0,0 +1,142 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/unique_consecutive_op.h" +#include "paddle/fluid/framework/op_version_registry.h" + +namespace paddle { +namespace operators { + +class UniqueConsecutiveOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "unique_consecutive"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", + "unique_consecutive"); + + auto in_dims = ctx->GetInputDim("X"); + bool return_inverse = ctx->Attrs().Get("return_inverse"); + bool return_counts = ctx->Attrs().Get("return_counts"); + auto axis_vec = ctx->Attrs().Get>("axis"); + if (return_inverse) { + OP_INOUT_CHECK(ctx->HasOutput("Index"), "Output", "Index", + "unique_consecutive"); + } + if (return_counts) { + OP_INOUT_CHECK(ctx->HasOutput("Counts"), "Output", "Counts", + "unique_consecutive"); + } + + if (axis_vec.empty()) { + ctx->SetOutputDim("Out", {-1}); + if (return_inverse) { + ctx->SetOutputDim("Index", {framework::product(in_dims)}); + } + } else { + int axis = axis_vec[0]; + if (axis < 0) { + axis += in_dims.size(); + } + PADDLE_ENFORCE_LT( + axis, in_dims.size(), + platform::errors::InvalidArgument("The axis(%d) should be less than " + "the dimension size(%d) of x.", + axis, in_dims.size())); + auto out_dims = in_dims; + out_dims[axis] = -1; + ctx->SetOutputDim("Out", out_dims); + if (return_inverse) { + ctx->SetOutputDim("Index", {in_dims[axis]}); + } + } + if (return_counts) { + ctx->SetOutputDim("Counts", {-1}); + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); + } +}; + +class UniqueConsecutiveOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "The input tensor of unique_consecutive op."); + AddAttr("dtype", + "(int, default 5(FP32)) " + "data type for output index") + .SetDefault(framework::proto::VarType::FP32); + + AddOutput("Out", "A unique consecutive subsequence for input tensor."); + AddOutput("Index", + "The indices for where elements in the original input ended up " + "in the returned unique tensor.") + .AsDispensable(); + AddOutput("Counts", "The counts for each unique element.").AsDispensable(); + AddAttr( + "return_inverse", + "If True, also return the indices for where elements" + " in the original input ended up in the returned unique tensor.") + .SetDefault(false); + AddAttr("return_counts", + "If True, also return the counts for each unique element.") + .SetDefault(false); + AddAttr>( + "axis", + "The axis to apply unique. If None, the input will be flattened.") + .SetDefault({}); + AddComment(R"DOC( + This function is different from paddle.unique() in the sense that this + function only eliminates consecutive duplicate values. 
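+    For example, given a 1-D input [1, 1, 2, 2, 3, 1, 1, 2], the output is
+    [1, 2, 3, 1, 2], the inverse index is [0, 0, 1, 1, 2, 3, 3, 4] and the
+    counts are [2, 2, 1, 2, 1].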
+)DOC"); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(unique_consecutive, ops::UniqueConsecutiveOp, + ops::UniqueConsecutiveOpMaker); +REGISTER_OP_CPU_KERNEL( + unique_consecutive, + ops::UniqueConsecutiveKernel, + ops::UniqueConsecutiveKernel, + ops::UniqueConsecutiveKernel, + ops::UniqueConsecutiveKernel); +REGISTER_OP_VERSION(unique_consecutive) + .AddCheckpoint( + R"ROC( + Upgrade unique_consecutive, add 2 outputs [Indices, Counts] and 3 attribute + [return_inverse, return_counts, axis]. + )ROC", + paddle::framework::compatible::OpVersionDesc() + .NewOutput("Counts", "The counts for each unique element.") + .NewAttr("return_inverse", + "If True, also return the indices for where elements" + " in the original input ended up in the returned unique " + "tensor.", + false) + .NewAttr("return_counts", + "If True, also return the counts for each unique element.", + false) + .NewAttr("axis", + "The axis to apply unique. If None, the input will be " + "flattened.", + std::vector{})); diff --git a/paddle/fluid/operators/unique_consecutive_op.cu b/paddle/fluid/operators/unique_consecutive_op.cu new file mode 100644 index 00000000000000..1f0023c467c01c --- /dev/null +++ b/paddle/fluid/operators/unique_consecutive_op.cu @@ -0,0 +1,424 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/tensor_util.h" // TensorToVector() +#include "paddle/fluid/operators/unique_consecutive_op.h" // TransComute() + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +// Binary function 'equal_to' +template +struct BinaryEqual { + int64_t col; + const InT* in_trans_data; + + BinaryEqual(int64_t _col, const InT* _in_trans_data) + : col(_col), in_trans_data(_in_trans_data) {} + + __device__ bool operator()(int64_t a, int64_t b) const { + for (int64_t i = 0; i < col; ++i) { + InT lhs = in_trans_data[i + a * col]; + InT rhs = in_trans_data[i + b * col]; + if (lhs != rhs) { + return false; + } + } + return true; + } +}; + +// Binary function 'not_equal_to' +template +struct BinaryNotEqual { + int64_t col; + const InT* in_trans_data; + + BinaryNotEqual(int64_t _col, const InT* _in_trans_data) + : col(_col), in_trans_data(_in_trans_data) {} + + __device__ bool operator()(int64_t a, int64_t b) const { + for (int64_t i = 0; i < col; ++i) { + InT lhs = in_trans_data[i + a * col]; + InT rhs = in_trans_data[i + b * col]; + if (lhs != rhs) { + return true; + } + } + return false; + } +}; + +// index_select() function for Tensor +template +void IndexSelect(const framework::ExecutionContext& context, + const Tensor& input, const Tensor& index, Tensor* output, + int dim) { + auto input_dim = input.dims(); + auto input_dim_size = input_dim.size(); + auto output_dim = output->dims(); + + auto slice_size = 1; + for (auto i = dim + 1; i < input_dim_size; i++) { + slice_size *= input_dim[i]; + } + + auto input_width = slice_size * input_dim[dim]; + auto output_width = slice_size * output_dim[dim]; + + auto outer_nums = 1; + for (auto i = 0; i < dim; i++) { + outer_nums *= input_dim[i]; + } + + auto index_size = index.dims()[0]; + + std::vector input_vec; + std::vector index_vec; + TensorToVector(input, context.device_context(), &input_vec); + TensorToVector(index, context.device_context(), &index_vec); + std::vector out_vec(output->numel()); + + for (int i = 0; i < index_size; i++) { + PADDLE_ENFORCE_GE( + index_vec[i], 0, + platform::errors::InvalidArgument( + "Variable value (index) of OP(index_select) " + "expected >= 0 and < %ld, but got %ld. Please check input " + "value.", + input_dim[dim], index_vec[i])); + PADDLE_ENFORCE_LT( + index_vec[i], input_dim[dim], + platform::errors::InvalidArgument( + "Variable value (index) of OP(index_select) " + "expected >= 0 and < %ld, but got %ld. Please check input " + "value.", + input_dim[dim], index_vec[i])); + } + + for (auto i = 0; i < outer_nums; i++) { + auto input_start_offset = i * input_width; + auto output_start_offset = i * output_width; + + for (auto j = 0; j < index_size; j++) { + IndexT index_value = index_vec[j]; + for (auto k = 0; k < slice_size; k++) { + out_vec[output_start_offset + j * slice_size + k] = + input_vec[input_start_offset + index_value * slice_size + k]; + } + } + } + output->mutable_data(context.GetPlace()); + framework::TensorFromVector(out_vec, context.device_context(), output); + output->Resize(output_dim); +} + +// The core logic of computing Unique Consecutive for a flattend Tensor +template +static void UniqueConsecutiveFlattendCUDATensor( + const framework::ExecutionContext& context, const Tensor& in, Tensor* out, + bool return_inverse, bool return_counts, equal_T equal, + not_equal_T not_equal, int64_t num_input) { + // 0. 
Prepration + Tensor in_hat; + framework::TensorCopy(in, context.GetPlace(), &in_hat); + auto in_data_hat = in_hat.mutable_data(context.GetPlace()); + + Tensor sorted_indices; + sorted_indices.Resize(framework::make_ddim({num_input})); + auto sorted_indices_data = + sorted_indices.mutable_data(context.GetPlace()); + thrust::sequence(thrust::device, sorted_indices_data, + sorted_indices_data + num_input); + // 1. Calculate op result: 'out' + Tensor range; + range.Resize(framework::make_ddim({num_input + 1})); + auto range_data_ptr = range.mutable_data(context.GetPlace()); + thrust::sequence(thrust::device, range_data_ptr, + range_data_ptr + num_input + 1); + framework::TensorCopy(in_hat, context.GetPlace(), out); + int num_out; + auto out_data = out->mutable_data(context.GetPlace()); + num_out = thrust::unique_by_key(thrust::device, out_data, + out_data + num_input, range_data_ptr, equal) + .first - + out_data; + out->Resize(framework::make_ddim({num_out})); + + // 2. Calculate inverse index: 'inverse' + if (return_inverse) { + Tensor* inverse = context.Output("Index"); + inverse->Resize(framework::make_ddim({num_input})); + auto inverse_data = inverse->mutable_data(context.GetPlace()); + Tensor inv_loc; + inv_loc.Resize(framework::make_ddim({num_input})); + auto inv_loc_data_ptr = inv_loc.mutable_data(context.GetPlace()); + thrust::adjacent_difference(thrust::device, in_data_hat, + in_data_hat + num_input, inv_loc_data_ptr, + not_equal); + thrust::device_ptr inv_loc_data_dev(inv_loc_data_ptr); + inv_loc_data_dev[0] = 0; // without device_ptr, segmentation fault + thrust::inclusive_scan(thrust::device, inv_loc_data_ptr, + inv_loc_data_ptr + num_input, inv_loc_data_ptr); + thrust::scatter(thrust::device, inv_loc_data_ptr, + inv_loc_data_ptr + num_input, sorted_indices_data, + inverse_data); + } + // 3. Calculate 'counts' + if (return_counts) { + Tensor* counts = context.Output("Counts"); + counts->Resize(framework::make_ddim({num_out})); + auto count_data = counts->mutable_data(context.GetPlace()); + // init 'count_data' as 0 + thrust::fill(thrust::device, count_data, count_data + num_out, 0); + thrust::device_ptr range_data_ptr_dev(range_data_ptr); + range_data_ptr_dev[num_out] = num_input; + thrust::adjacent_difference(thrust::device, range_data_ptr + 1, + range_data_ptr + num_out + 1, count_data); + } +} + +// The logic of compute unique with axis required, it's a little different +// from above function +template +static void ComputeUniqueConsecutiveDims( + const framework::ExecutionContext& context, Tensor* sorted_indices, + IndexT* sorted_indices_data, Tensor* out, bool return_inverse, + bool return_counts, equal_T equal, not_equal_T not_equal, int64_t row) { + // 1. inverse indices: 'inverse' + Tensor* inverse = context.Output("Index"); + inverse->Resize(framework::make_ddim({row})); + auto inverse_data = inverse->mutable_data(context.GetPlace()); + Tensor inv_loc; + inv_loc.Resize(framework::make_ddim({row})); + auto inv_loc_data_ptr = inv_loc.mutable_data(context.GetPlace()); + thrust::adjacent_difference(thrust::device, sorted_indices_data, + sorted_indices_data + row, inv_loc_data_ptr, + not_equal); + thrust::device_ptr inv_loc_data_dev(inv_loc_data_ptr); + inv_loc_data_dev[0] = 0; + thrust::inclusive_scan(thrust::device, inv_loc_data_ptr, + inv_loc_data_ptr + row, inv_loc_data_ptr); + thrust::scatter(thrust::device, inv_loc_data_ptr, inv_loc_data_ptr + row, + sorted_indices_data, inverse_data); + + // 2. 
sorted indices + Tensor range; + range.Resize(framework::make_ddim({row + 1})); + auto range_data_ptr = range.mutable_data(context.GetPlace()); + thrust::sequence(thrust::device, range_data_ptr, range_data_ptr + row + 1); + int num_out; + num_out = + thrust::unique_by_key(thrust::device, sorted_indices_data, + sorted_indices_data + row, range_data_ptr, equal) + .first - + sorted_indices_data; + thrust::device_ptr range_data_ptr_dev(range_data_ptr); + range_data_ptr_dev[num_out] = row; + sorted_indices->Resize(framework::make_ddim({num_out})); + + // 3. counts: 'counts' + Tensor* counts = context.Output("Counts"); + counts->Resize(framework::make_ddim({num_out})); + auto count_data = counts->mutable_data(context.GetPlace()); + thrust::fill(thrust::device, count_data, count_data + row, 0); + thrust::adjacent_difference(thrust::device, range_data_ptr + 1, + range_data_ptr + row + 1, count_data); +} + +// Calculate unique consecutive when 'axis' is set +template +static void UniqueConsecutiveDimsCUDATensor( + const framework::ExecutionContext& context, const Tensor& in, Tensor* out, + bool return_inverse, bool return_counts, int axis) { + // 1. Transpose & reshape + // Transpose tensor: eg. axis=1, [dim0, dim1, dim2] -> [dim1, dim0, dim2] + std::vector permute(in.dims().size()); + std::iota(permute.begin(), permute.end(), 0); + permute[axis] = 0; + permute[0] = axis; + std::vector in_trans_dims_vec(framework::vectorize(in.dims())); + in_trans_dims_vec[axis] = in.dims()[0]; + in_trans_dims_vec[0] = in.dims()[axis]; + framework::Tensor in_trans; + framework::DDim in_trans_dims = framework::make_ddim(in_trans_dims_vec); + in_trans.Resize(in_trans_dims); + in_trans.mutable_data(context.GetPlace()); + auto& dev_ctx = context.cuda_device_context(); + TransCompute(in.dims().size(), // num of dims + dev_ctx, // device + in, // original Tensor + &in_trans, // Tensor after reshape + permute); // index of axis + + // Reshape tensor: eg. [dim1, dim0, dim2] -> [dim1, dim0*dim2] + framework::DDim in_trans_flat_dims = + framework::flatten_to_2d(in_trans_dims, 1); + in_trans.Resize(in_trans_flat_dims); + + // now 'in_trans' is 2D + int64_t col = in_trans.dims()[1]; + int64_t row = in_trans.dims()[0]; + const InT* in_trans_data = in_trans.data(); + + Tensor sorted_indices; + sorted_indices.Resize(framework::make_ddim({row})); + auto sorted_indices_data = + sorted_indices.mutable_data(context.GetPlace()); + + // 2. Calculate 'inverse', 'counts' + // Init index + thrust::sequence(thrust::device, sorted_indices_data, + sorted_indices_data + row); + ComputeUniqueConsecutiveDims( + context, &sorted_indices, sorted_indices_data, out, return_inverse, + return_counts, BinaryEqual(col, in_trans_data), + BinaryNotEqual(col, in_trans_data), row); + + // 3. 
Select indices and reshape back to get 'out' + Tensor out_trans; + std::vector out_trans_dims_vec = in_trans_dims_vec; + out_trans_dims_vec[0] = sorted_indices.numel(); + out_trans.Resize(framework::make_ddim(out_trans_dims_vec)); + out_trans.mutable_data(context.GetPlace()); + + IndexSelect(context, in_trans, sorted_indices, &out_trans, 0); + + std::swap(out_trans_dims_vec[0], out_trans_dims_vec[axis]); + out->Resize(framework::make_ddim(out_trans_dims_vec)); + out->mutable_data(context.GetPlace()); + std::vector out_trans_unbind = Unbind(out_trans); + math::ConcatFunctor concat_functor; + concat_functor(dev_ctx, out_trans_unbind, 0, &out_trans); + TransCompute(out_trans.dims().size(), dev_ctx, out_trans, + out, permute); +} + +// functor for processing a flattend Tensor +template +struct UniqueConsecutiveFlattendCUDAFunctor { + const framework::ExecutionContext& ctx_; + const Tensor& in_; + Tensor* out_; + const bool return_inverse_; + const bool return_counts_; + + UniqueConsecutiveFlattendCUDAFunctor( + const framework::ExecutionContext& context, const Tensor& in, Tensor* out, + bool return_inverse, bool return_counts) + : ctx_(context), + in_(in), + out_(out), + return_inverse_(return_inverse), + return_counts_(return_counts) {} + + template + void apply() const { + UniqueConsecutiveFlattendCUDATensor( + ctx_, in_, out_, return_inverse_, return_counts_, + thrust::equal_to(), thrust::not_equal_to(), in_.numel()); + } +}; + +// functor for processing a multi-dimentional Tensor +template +struct UniqueConsecutiveDimsCUDAFunctor { + const framework::ExecutionContext& ctx_; + const Tensor& in_; + Tensor* out_; + const int axis_; + const bool return_inverse_; + const bool return_counts_; + + UniqueConsecutiveDimsCUDAFunctor(const framework::ExecutionContext& context, + const Tensor& in, Tensor* out, + const int axis, bool return_inverse, + bool return_counts) + : ctx_(context), + in_(in), + out_(out), + axis_(axis), + return_inverse_(return_inverse), + return_counts_(return_counts) {} + + template + void apply() const { + UniqueConsecutiveDimsCUDATensor( + ctx_, in_, out_, return_inverse_, return_counts_, axis_); + } +}; + +// Unique_Consecutive_op CUDA implementation. +template +class UniqueConsecutiveKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* x = context.Input("X"); + auto* out = context.Output("Out"); + auto data_type = static_cast( + context.Attr("dtype")); + if (data_type == framework::proto::VarType::INT32) { + PADDLE_ENFORCE_LE( + x->numel() + 1, INT_MAX, + platform::errors::InvalidArgument( + "The number of elements in Input(X) should be less than or " + "equal to INT_MAX, but received num is %d. Please set `dtype` to " + "int64.", + x->numel())); + } + + std::vector axis_vec = context.Attr>("axis"); + bool return_inverse = context.Attr("return_inverse"); + bool return_counts = context.Attr("return_counts"); + + // if 'axis' is not required, flatten the Tensor. + if (axis_vec.empty()) { + framework::VisitDataTypeTiny( + data_type, + UniqueConsecutiveFlattendCUDAFunctor( + context, *x, out, return_inverse, return_counts)); + } else { + // 'axis' is required. 
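+ // only the first value of the 'axis' attribute takes effect, since a single axis is supported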
+ int axis = axis_vec[0]; + framework::VisitDataTypeTiny( + data_type, + UniqueConsecutiveDimsCUDAFunctor( + context, *x, out, axis, return_inverse, return_counts)); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + unique_consecutive, + ops::UniqueConsecutiveKernel, + ops::UniqueConsecutiveKernel, + ops::UniqueConsecutiveKernel, + ops::UniqueConsecutiveKernel); diff --git a/paddle/fluid/operators/unique_consecutive_op.h b/paddle/fluid/operators/unique_consecutive_op.h new file mode 100644 index 00000000000000..e6cb5dafe34330 --- /dev/null +++ b/paddle/fluid/operators/unique_consecutive_op.h @@ -0,0 +1,268 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/concat_and_split.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/transpose_op.h" +#include "paddle/fluid/operators/unique_op.h" + +namespace paddle { +namespace operators { +template +static void UniqueConsecutiveFlattendTensor( + const framework::ExecutionContext& context, const framework::Tensor& in, + framework::Tensor* out, bool return_inverse, bool return_counts) { + const InT* in_data = in.data(); + std::vector out_vec(in.numel()); + std::vector inverse_vec(in.numel()); + std::vector counts_vec(in.numel()); + memcpy(out_vec.data(), in_data, in.numel() * sizeof(InT)); + InT* p = out_vec.data(); + int64_t last = 0; + IndexT* q = counts_vec.data(); + for (int64_t i = 0; i < in.numel(); i++) { + if (in_data[i] != *p) { + *(++p) = in_data[i]; + if (return_counts) { + *(q++) = i - last; + last = i; + } + } + if (return_inverse) { + inverse_vec[i] = p - out_vec.data(); + } + } + + int64_t output_size = p - out_vec.data() + 1; + if (return_counts) { + *q = in.numel() - last; + counts_vec.resize(output_size); + } + out_vec.resize(output_size); + + out->Resize(framework::make_ddim({output_size})); + auto* out_data = out->mutable_data(context.GetPlace()); + std::copy(out_vec.begin(), out_vec.end(), out_data); + + if (return_inverse) { + auto* inverse = context.Output("Index"); + inverse->Resize(framework::make_ddim({in.numel()})); + auto* inverse_data = inverse->mutable_data(context.GetPlace()); + std::copy(inverse_vec.begin(), inverse_vec.end(), inverse_data); + } + + if (return_counts) { + auto* count = context.Output("Counts"); + count->Resize(framework::make_ddim({out->numel()})); + auto* counts_data = count->mutable_data(context.GetPlace()); + std::copy(counts_vec.begin(), counts_vec.end(), counts_data); + } +} + +template +static ForwardIt UniqueConsecutiveDimImpl( + const framework::ExecutionContext& context, ForwardIt first, ForwardIt last, + const std::vector& sorted_indices_vec, + std::vector* inverse_vec, std::vector* counts_vec) { + if (first == last) { + return last; + } + + 
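+ // the first row always starts a new group: it maps to output index 0 and its count starts at 1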
(*inverse_vec)[sorted_indices_vec[0]] = 0; + (*counts_vec)[0] = 1; + + ForwardIt begin = first; + ForwardIt result = first; + + while (++first != last) { + int64_t idx_first = std::distance(begin, first); + int64_t idx_result = std::distance(begin, result); + if (!Equal(*result, *first)) { + if (++result != first) { + *result = std::move(*first); + } + idx_result += 1; + } + (*inverse_vec)[sorted_indices_vec[idx_first]] = idx_result; + (*counts_vec)[idx_result] += 1; + } + return ++result; +} + +template +static void UniqueConsecutiveDim(const framework::ExecutionContext& context, + const framework::Tensor& in, + framework::Tensor* out, bool return_inverse, + bool return_counts, int axis) { + // transpose tensor: eg. axis=1, [dim0, dim1, dim2] -> [dim1, dim0, dim2] + std::vector permute(in.dims().size()); + std::iota(permute.begin(), permute.end(), 0); + permute[axis] = 0; + permute[0] = axis; + std::vector in_trans_dims_vec(framework::vectorize(in.dims())); + in_trans_dims_vec[axis] = in.dims()[0]; + in_trans_dims_vec[0] = in.dims()[axis]; + framework::Tensor in_trans; + framework::DDim in_trans_dims = framework::make_ddim(in_trans_dims_vec); + in_trans.Resize(in_trans_dims); + in_trans.mutable_data(context.GetPlace()); + auto& dev_ctx = context.template device_context(); + TransCompute(in.dims().size(), dev_ctx, in, &in_trans, + permute); + // reshape tensor: eg. [dim1, dim0, dim2] -> [dim1, dim0*dim2] + framework::DDim in_trans_flat_dims = + framework::flatten_to_2d(in_trans_dims, 1); + in_trans.Resize(in_trans_flat_dims); + + std::vector sorted_indices_vec(in_trans.dims()[0]); + std::iota(sorted_indices_vec.begin(), sorted_indices_vec.end(), 0); + int64_t col = in_trans.dims()[1]; + const InT* in_trans_data = in_trans.data(); + + // sort tensor according to indices + framework::Tensor input_sorted; + input_sorted.Resize(in_trans_dims); + input_sorted.mutable_data(context.GetPlace()); + InT* input_sorted_data = input_sorted.data(); + for (size_t i = 0; i < sorted_indices_vec.size(); ++i) { + memcpy(input_sorted_data + i * col, + in_trans_data + static_cast(sorted_indices_vec[i]) * col, + col * sizeof(InT)); + } + std::vector input_unbind = Unbind(input_sorted); + std::vector inverse_vec(sorted_indices_vec.size(), 0); + std::vector counts_vec(sorted_indices_vec.size(), 0); + auto last = + UniqueConsecutiveDimImpl::iterator, InT>( + context, input_unbind.begin(), input_unbind.end(), sorted_indices_vec, + &inverse_vec, &counts_vec); + input_unbind.erase(last, input_unbind.end()); + counts_vec.erase(counts_vec.begin() + input_unbind.size(), counts_vec.end()); + + math::ConcatFunctor concat_functor; + framework::Tensor out_trans; + std::vector out_trans_dims_vec = in_trans_dims_vec; + out_trans_dims_vec[0] = input_unbind.size(); + out_trans.Resize(framework::make_ddim(out_trans_dims_vec)); + out_trans.mutable_data(context.GetPlace()); + std::swap(out_trans_dims_vec[0], out_trans_dims_vec[axis]); + out->Resize(framework::make_ddim(out_trans_dims_vec)); + out->mutable_data(context.GetPlace()); + concat_functor(dev_ctx, input_unbind, 0, &out_trans); + TransCompute(out_trans.dims().size(), dev_ctx, out_trans, + out, permute); + if (return_inverse) { + auto* inverse = context.Output("Index"); + framework::TensorFromVector(inverse_vec, context.device_context(), inverse); + } + if (return_counts) { + auto* count = context.Output("Counts"); + framework::TensorFromVector(counts_vec, context.device_context(), count); + } +} + +template +struct UniqueConsecutiveFlattendTensorFunctor { + const 
framework::ExecutionContext& ctx_; + const framework::Tensor& in_; + framework::Tensor* out_; + const bool return_inverse_; + const bool return_counts_; + + UniqueConsecutiveFlattendTensorFunctor( + const framework::ExecutionContext& context, const framework::Tensor& in, + framework::Tensor* out, bool return_inverse, bool return_counts) + : ctx_(context), + in_(in), + out_(out), + return_inverse_(return_inverse), + return_counts_(return_counts) {} + + template + void apply() const { + UniqueConsecutiveFlattendTensor( + ctx_, in_, out_, return_inverse_, return_counts_); + } +}; + +template +struct UniqueConsecutiveDimFunctor { + const framework::ExecutionContext& ctx_; + const framework::Tensor& in_; + framework::Tensor* out_; + const int axis_; + const bool return_inverse_; + const bool return_counts_; + UniqueConsecutiveDimFunctor(const framework::ExecutionContext& context, + const framework::Tensor& in, + framework::Tensor* out, const int axis, + bool return_inverse, bool return_counts) + : ctx_(context), + in_(in), + out_(out), + axis_(axis), + return_inverse_(return_inverse), + return_counts_(return_counts) {} + + template + void apply() const { + UniqueConsecutiveDim( + ctx_, in_, out_, return_inverse_, return_counts_, axis_); + } +}; +template +class UniqueConsecutiveKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* x = context.Input("X"); + auto* out = context.Output("Out"); + auto data_type = static_cast( + context.Attr("dtype")); + if (data_type == framework::proto::VarType::INT32) { + PADDLE_ENFORCE_LE( + x->numel(), INT_MAX, + platform::errors::InvalidArgument( + "The number of elements in Input(X) should be less than or " + "equal to INT_MAX, but received num is %d. 
Please set `dtype` to " + "int64.", + x->numel())); + } + std::vector axis_vec = context.Attr>("axis"); + bool return_inverse = context.Attr("return_inverse"); + bool return_counts = context.Attr("return_counts"); + + if (axis_vec.empty()) { + framework::VisitDataTypeTiny( + data_type, UniqueConsecutiveFlattendTensorFunctor( + context, *x, out, return_inverse, return_counts)); + } else { + int axis = axis_vec[0]; + framework::VisitDataTypeTiny( + data_type, + UniqueConsecutiveDimFunctor( + context, *x, out, axis, return_inverse, return_counts)); + } + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/pybind/op_function_generator.cc b/paddle/fluid/pybind/op_function_generator.cc index 07a3fc8a8df331..dc27befd26cda8 100644 --- a/paddle/fluid/pybind/op_function_generator.cc +++ b/paddle/fluid/pybind/op_function_generator.cc @@ -86,6 +86,7 @@ std::map> op_outs_map = { {"Y", "MeanOut", "VarianceOut", "SavedMean", "SavedVariance", "ReserveSpace"}}, {"unique", {"Out", "Index", "Indices", "Counts"}}, + {"unique_consecutive", {"Out", "Index", "Counts"}}, {"generate_proposals", {"RpnRois", "RpnRoiProbs", "RpnRoisNum"}}, {"collect_fpn_proposals", {"FpnRois", "RoisNum"}}, {"matrix_nms", {"Out", "Index", "RoisNum"}}, diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index f72fb6c1806b10..907a667cb6ba78 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -142,6 +142,7 @@ from .tensor.manipulation import stack # noqa: F401 from .tensor.manipulation import strided_slice # noqa: F401 from .tensor.manipulation import unique # noqa: F401 +from .tensor.manipulation import unique_consecutive # noqa: F401 from .tensor.manipulation import unsqueeze # noqa: F401 from .tensor.manipulation import unsqueeze_ # noqa: F401 from .tensor.manipulation import unstack # noqa: F401 @@ -470,6 +471,7 @@ 'randn', 'strided_slice', 'unique', + 'unique_consecutive', 'set_cuda_rng_state', 'set_printoptions', 'std', diff --git a/python/paddle/fluid/tests/unittests/test_unique_consecutive_op.py b/python/paddle/fluid/tests/unittests/test_unique_consecutive_op.py new file mode 100644 index 00000000000000..a12f1aaff45969 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_unique_consecutive_op.py @@ -0,0 +1,238 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +from op_test import OpTest +import paddle.fluid.core as core + +import paddle +import paddle.fluid as fluid +import paddle.fluid.framework as framework + + +def reference_unique_consecutive(X, return_inverse=False, return_counts=False): + """ + Reference unique_consecutive implementation using python. + Args: + x(Tensor): the input tensor, it's data type should be float32, float64, int32, int64. + return_inverse(bool, optional): If True, also return the indices for where elements in + the original input ended up in the returned unique consecutive tensor. 
Default is False. + return_counts(bool, optional): If True, also return the counts for each unique consecutive element. + """ + X = list(X) + counts_vec = [1] * len(X) + i = 0 + counts = 1 + last = 0 + inverse_vec = [0] * len(X) + inverse_vec[last] = i + cnt = 0 + while i < len(X) - 1: + if X[i] == X[i + 1]: + if return_counts: + counts_vec[cnt] += 1 + del X[i] + else: + i += 1 + cnt += 1 + if return_inverse: + last += 1 + inverse_vec[last] = i + if return_counts: + counts_vec = counts_vec[:len(X)] + if return_inverse and return_counts: + return X, np.array(inverse_vec), np.array(counts_vec) + elif return_counts: + return X, np.array(counts_vec) + elif return_inverse: + return X, np.array(inverse_vec) + else: + return X + + +class TestUniqueConsecutiveOp(OpTest): + """case 1""" + + def config(self): + self.x_size = 100 + self.x_range = 20 + self.return_inverse = False + self.return_counts = False + + def init_kernel_type(self): + self.dtype = "float32" if core.is_compiled_with_rocm() else "float64" + + def setUp(self): + self.init_kernel_type() + self.config() + self.op_type = "unique_consecutive" + x = np.random.randint(self.x_range, size=self.x_size).astype(self.dtype) + result = reference_unique_consecutive(x, self.return_inverse, + self.return_counts) + out = reference_unique_consecutive(x) + out = np.array(out).astype(self.dtype) + self.inputs = {'X': x, } + self.attrs = {'dtype': int(core.VarDesc.VarType.INT32)} + self.outputs = {'Out': out, } + + def test_check_output(self): + self.check_output() + + +class TestUniqueConsecutiveOp2(TestUniqueConsecutiveOp): + """case 2""" + + def config(self): + self.x_size = 100 + self.x_range = 20 + self.return_inverse = True + self.return_counts = False + + def setUp(self): + self.init_kernel_type() + self.config() + self.op_type = "unique_consecutive" + x = np.random.randint(self.x_range, size=self.x_size).astype(self.dtype) + result, inverse = reference_unique_consecutive(x, self.return_inverse, + self.return_counts) + result = np.array(result).astype(self.dtype) + inverse = inverse.astype(self.dtype) + self.inputs = {'X': x, } + self.attrs = { + 'return_inverse': self.return_inverse, + 'dtype': int(core.VarDesc.VarType.INT32) + } + self.outputs = {'Out': result, 'Index': inverse} + + +class TestUniqueConsecutiveOp3(TestUniqueConsecutiveOp): + """case 3""" + + def config(self): + self.x_size = 100 + self.x_range = 20 + self.return_inverse = False + self.return_counts = True + + def setUp(self): + self.init_kernel_type() + self.config() + self.op_type = "unique_consecutive" + x = np.random.randint(self.x_range, size=self.x_size).astype(self.dtype) + result, counts = reference_unique_consecutive(x, self.return_inverse, + self.return_counts) + result = np.array(result).astype(self.dtype) + counts = counts.astype(self.dtype) + self.inputs = {'X': x, } + self.attrs = { + 'return_counts': self.return_counts, + 'dtype': int(core.VarDesc.VarType.INT32) + } + self.outputs = {'Out': result, 'Counts': counts} + + +class TestUniqueConsecutiveOp4(TestUniqueConsecutiveOp): + """case 4""" + + def config(self): + self.x_size = 100 + self.x_range = 20 + self.return_inverse = True + self.return_counts = True + + def setUp(self): + self.init_kernel_type() + self.config() + self.op_type = "unique_consecutive" + x = np.random.randint(self.x_range, size=self.x_size).astype(self.dtype) + result, inverse, counts = reference_unique_consecutive( + x, self.return_inverse, self.return_counts) + result = np.array(result).astype(self.dtype) + inverse = 
inverse.astype(self.dtype) + counts = counts.astype(self.dtype) + self.inputs = {'X': x, } + self.attrs = { + 'return_inverse': self.return_inverse, + 'return_counts': self.return_counts, + 'dtype': int(core.VarDesc.VarType.INT32) + } + self.outputs = {'Out': result, 'Index': inverse, 'Counts': counts} + + +class TestUniqueConsecutiveAPI(unittest.TestCase): + def setUp(self): + self.places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + self.places.append(fluid.CUDAPlace(0)) + + def check_static_result(self, place): + with fluid.program_guard(fluid.Program(), fluid.Program()): + paddle.enable_static() + input_x = fluid.data(name="input_x", shape=[100, ], dtype="float32") + result = paddle.unique_consecutive(input_x) + x_np = np.random.randint(20, size=100).astype("float32") + exe = fluid.Executor(place) + fetches = exe.run(fluid.default_main_program(), + feed={"input_x": x_np}, + fetch_list=[result]) + + def test_static(self): + for place in self.places: + self.check_static_result(place=place) + + def test_dygraph(self): + for place in self.places: + with fluid.dygraph.guard(place): + input_x = np.random.randint(20, size=100).astype("float64") + x = paddle.to_tensor(input_x) + result = paddle.unique_consecutive(x) + + +class TestUniqueConsecutiveCase2API(unittest.TestCase): + def setUp(self): + self.places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + self.places.append(fluid.CUDAPlace(0)) + + def check_static_result(self, place): + with fluid.program_guard(fluid.Program(), fluid.Program()): + paddle.enable_static() + input_x = fluid.data(name="input_x", shape=[100, ], dtype="float32") + result, inverse, counts = paddle.unique_consecutive( + input_x, return_inverse=True, return_counts=True) + x_np = np.random.randint(20, size=100).astype("float32") + exe = fluid.Executor(place) + fetches = exe.run(fluid.default_main_program(), + feed={"input_x": x_np}, + fetch_list=[result]) + + def test_static(self): + for place in self.places: + self.check_static_result(place=place) + + def test_dygraph(self): + for place in self.places: + with fluid.dygraph.guard(place): + input_x = np.random.randint(20, size=100).astype("float64") + x = paddle.to_tensor(input_x) + result, inverse, counts = paddle.unique_consecutive( + x, return_inverse=True, return_counts=True) + + +if __name__ == "__main__": + paddle.enable_static() + unittest.main() diff --git a/python/paddle/tensor/__init__.py b/python/paddle/tensor/__init__.py index 1c6996bcad6e5c..bcb508d11922fc 100755 --- a/python/paddle/tensor/__init__.py +++ b/python/paddle/tensor/__init__.py @@ -88,6 +88,7 @@ from .manipulation import stack # noqa: F401 from .manipulation import strided_slice # noqa: F401 from .manipulation import unique # noqa: F401 +from .manipulation import unique_consecutive # noqa: F401 from .manipulation import unsqueeze # noqa: F401 from .manipulation import unsqueeze_ # noqa: F401 from .manipulation import unstack # noqa: F401 @@ -333,6 +334,7 @@ 'strided_slice', 'transpose', 'unique', + 'unique_consecutive', 'unsqueeze', 'unsqueeze_', 'unstack', diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py index 434069fe74bce6..4b84401aa09458 100644 --- a/python/paddle/tensor/manipulation.py +++ b/python/paddle/tensor/manipulation.py @@ -717,6 +717,112 @@ def squeeze_(x, axis=None, name=None): return out +def unique_consecutive(x, + return_inverse=False, + return_counts=False, + axis=None, + dtype="int64", + name=None): + r""" + Eliminates all but the first element from every 
consecutive group of equivalent elements. + + .. note:: This function is different from :func:`paddle.unique` in the sense that this function + only eliminates consecutive duplicate values. This semantics is similar to `std::unique` in C++. + + Args: + x(Tensor): the input tensor, it's data type should be float32, float64, int32, int64. + return_inverse(bool, optional): If True, also return the indices for where elements in + the original input ended up in the returned unique consecutive tensor. Default is False. + return_counts(bool, optional): If True, also return the counts for each unique consecutive element. + Default is False. + axis(int, optional): The axis to apply unique consecutive. If None, the input will be flattened. + Default is None. + dtype(np.dtype|str, optional): The data type `inverse` tensor: int32 or int64. + Default: int64. + name(str, optional): Name for the operation. For more information, please refer to + :ref:`api_guide_Name`. Default is None. + + Returns: + tuple: (out, inverse, counts). `out` is the unique consecutive tensor for `x`. `inverse` is provided only if `return_inverse` is True. `counts` is provided only if `return_counts` is True. + + Example: + .. code-block:: python + + import paddle + + x = paddle.to_tensor([1, 1, 2, 2, 3, 1, 1, 2]) + output = paddle.unique_consecutive(x) # + np_output = output.numpy() # [1 2 3 1 2] + _, inverse, counts = paddle.unique_consecutive(x, return_inverse=True, return_counts=True) + np_inverse = inverse.numpy() # [0 0 1 1 2 3 3 4] + np_counts = inverse.numpy() # [2 2 1 2 1] + + x = paddle.to_tensor([[2, 1, 3], [3, 0, 1], [2, 1, 3], [2, 1, 3]]) + output = paddle.unique_consecutive(x, axis=0) # + np_output = output.numpy() # [2 1 3 0 1 2 1 3 2 1 3] + + x = paddle.to_tensor([[2, 1, 3], [3, 0, 1], [2, 1, 3], [2, 1, 3]]) + output = paddle.unique_consecutive(x, axis=0) # + np_output = output.numpy() + # [[2 1 3] + # [3 0 1] + # [2 1 3]] + """ + + if axis is None: + axis = [] + else: + axis = [axis] + attr_dtype = convert_np_dtype_to_dtype_(dtype) + if in_dygraph_mode(): + out, inverse, counts = core.ops.unique_consecutive( + x, 'dtype', attr_dtype, 'return_inverse', return_inverse, + 'return_counts', return_counts, 'axis', axis) + outs = [out] + if return_inverse: + outs.append(inverse) + if return_counts: + outs.append(counts) + if len(outs) == 1: + return outs[0] + return tuple(outs) + check_variable_and_dtype(x, "input", + ['float32', 'float64', 'int32', 'int64'], + 'unique_consecutive') + check_type(return_inverse, 'return_inverse', bool, 'unique_consecutive') + check_type(return_counts, 'return_counts', bool, 'unique_consecutive') + check_dtype(dtype, 'dtype', ['int32', 'int64'], 'unique_consecutive') + if len(axis) != 0: + check_type(axis[0], 'axis', int, 'unique_consecutive') + helper = LayerHelper('unique_consecutive', **locals()) + attrs = { + 'dtype': attr_dtype, + "return_inverse": return_inverse, + "return_counts": return_counts, + "axis": axis, + } + out = helper.create_variable_for_type_inference( + dtype=x.dtype, stop_gradient=True) + inverse = helper.create_variable_for_type_inference( + dtype=attr_dtype, stop_gradient=True) + counts = helper.create_variable_for_type_inference( + dtype=attr_dtype, stop_gradient=True) + outputs = {"Out": out, "Index": inverse, "Counts": counts} + outs = [out] + if return_inverse: + outs.append(inverse) + if return_counts: + outs.append(counts) + helper.append_op( + type="unique_consecutive", + inputs={"X": x}, + attrs=attrs, + outputs=outputs) + if len(outs) == 1: + return 
outs[0] + return tuple(outs) + + def unique(x, return_index=False, return_inverse=False, From e4e8cc9b33db53a0755f0e0e033a7a8542034804 Mon Sep 17 00:00:00 2001 From: From00 Date: Mon, 16 Aug 2021 14:55:12 +0800 Subject: [PATCH 052/126] [NPU] Support NPU kernel for nearest_interp and nearest_interp_grad op (#34881) * Add NPU kernel for nearest_interp op * Add grad op * Modify codes according to the review comments * Modify codes according to the review comments --- paddle/fluid/operators/interpolate_op_npu.cc | 214 ++++++++ .../npu/test_nearest_interp_op_npu.py | 461 ++++++++++++++++++ 2 files changed, 675 insertions(+) create mode 100755 paddle/fluid/operators/interpolate_op_npu.cc create mode 100755 python/paddle/fluid/tests/unittests/npu/test_nearest_interp_op_npu.py diff --git a/paddle/fluid/operators/interpolate_op_npu.cc b/paddle/fluid/operators/interpolate_op_npu.cc new file mode 100755 index 00000000000000..8d4b1e00c5d89a --- /dev/null +++ b/paddle/fluid/operators/interpolate_op_npu.cc @@ -0,0 +1,214 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/interpolate_op.h" +#include +#include +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { +using Tensor = framework::Tensor; +using DataLayout = framework::DataLayout; + +inline static void CheckArgument(const framework::ExecutionContext& ctx) { + const std::string interp_method = ctx.Attr("interp_method"); + bool align_corners = ctx.Attr("align_corners"); + PADDLE_ENFORCE_EQ( + align_corners, false, + platform::errors::InvalidArgument( + "NPU Interpolate Kernel has diff when align_corners is true.")); + PADDLE_ENFORCE_EQ( + interp_method, "nearest", + platform::errors::InvalidArgument( + "NPU Interpolate Kernel only support nearest interpolotion.")); +} + +inline static void ExtractNCHW(const framework::DDim& dims, + const DataLayout& data_layout, int32_t* n, + int32_t* c, int32_t* h, int32_t* w) { + *n = dims[0]; + if (data_layout == DataLayout::kNCHW) { + *c = dims[1]; + *h = dims[2]; + *w = dims[3]; + } else { // kNHWC + *h = dims[1]; + *w = dims[2]; + *c = dims[3]; + } +} + +static void CalcOutSize(const framework::ExecutionContext& ctx, int32_t in_h, + int32_t in_w, int32_t* out_h, int32_t* out_w) { + // Priority: SizeTensor > OutSize > Scale > scale > out_h & out_w + *out_h = ctx.Attr("out_h"); + *out_w = ctx.Attr("out_w"); + + auto dev_ctx = platform::DeviceContextPool::Instance().Get(ctx.GetPlace()); + auto list_new_size_tensor = ctx.MultiInput("SizeTensor"); + + if (list_new_size_tensor.size() > 0) { + std::vector new_size_h(1); + std::vector new_size_w(1); + framework::TensorToVector(*list_new_size_tensor[0], *dev_ctx, &new_size_h); + framework::TensorToVector(*list_new_size_tensor[1], *dev_ctx, &new_size_w); + *out_h = new_size_h[0]; + *out_w = new_size_w[0]; + } else { + float scale; + auto scale_tensor = ctx.Input("Scale"); + if (scale_tensor != nullptr) { + std::vector scale_data; + 
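+ // copy the device-side scale tensor into a host vector before reading its value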
framework::TensorToVector(*scale_tensor, *dev_ctx, &scale_data); + scale = scale_data[0]; + } else { + scale = ctx.Attr("scale"); + } + + if (scale > 0) { + *out_h = static_cast(in_h * scale); + *out_w = static_cast(in_w * scale); + } + + auto out_size = ctx.Input("OutSize"); + if (out_size != nullptr) { + std::vector out_size_data; + framework::TensorToVector(*out_size, *dev_ctx, &out_size_data); + *out_h = out_size_data[0]; + *out_w = out_size_data[1]; + } + } + + PADDLE_ENFORCE_GT(*out_h, 0, + platform::errors::InvalidArgument( + "out_h in Attr(out_shape) of Op(interpolate) " + "should be greater than 0.")); + PADDLE_ENFORCE_GT(*out_w, 0, + platform::errors::InvalidArgument( + "out_w in Attr(out_shape) of Op(interpolate) " + "should be greater than 0.")); +} + +template +class InterpolateNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + // NOTE(Ruibiao): + // this kernel only support nearest interpolotion for 2D images + // the Ascend 'ResizeNearestNeighborV2' used in this kernle has diff + // when 'align_corners' is 'true' or data type is 'double' + CheckArgument(ctx); + + auto* input = ctx.Input("X"); + framework::DDim input_dims = input->dims(); + + const std::string data_layout_str = + ctx.Attr("data_layout"); // kNCHW or kNHWC + const DataLayout data_layout = + framework::StringToDataLayout(data_layout_str); + + int32_t n, c, h, w, out_h, out_w; + ExtractNCHW(input_dims, data_layout, &n, &c, &h, &w); + CalcOutSize(ctx, h, w, &out_h, &out_w); + + // the 'input' tensor may has no set (or wrong set) of the layout + Tensor input_x(input->type()); + input_x.ShareDataWith(*input); + input_x.set_layout(data_layout); + + auto* output = ctx.Output("Out"); + framework::DDim output_dims; + if (data_layout == DataLayout::kNCHW) { + output_dims = {n, c, out_h, out_w}; + } else { + output_dims = {n, out_h, out_w, c}; + } + output->set_layout(data_layout); + output->mutable_data(output_dims, ctx.GetPlace()); + + NpuOpRunner npu_op_runner; + auto npu_stream = + ctx.template device_context() + .stream(); + npu_op_runner.SetType("ResizeNearestNeighborV2") + .AddInput(input_x) + .AddInput(std::vector{out_h, out_w}) + .AddOutput(*output) + .AddAttr("align_corners", false) + .AddAttr("half_pixel_centers", false) + .Run(npu_stream); + } +}; + +template +class InterpolateGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + // NOTE(Ruibiao): + // this kernel only support nearest interpolotion for 2D images + // the Ascend 'ResizeNearestNeighborV2' used in this kernle has diff + // when 'align_corners' is 'true' or data type is 'double' + CheckArgument(ctx); + + auto* input = ctx.Input("X"); + framework::DDim input_dims = input->dims(); + + const std::string data_layout_str = + ctx.Attr("data_layout"); // kNCHW or kNHWC + const DataLayout data_layout = + framework::StringToDataLayout(data_layout_str); + + int32_t n, c, h, w, out_h, out_w; + ExtractNCHW(input_dims, data_layout, &n, &c, &h, &w); + CalcOutSize(ctx, h, w, &out_h, &out_w); + + // the 'output_grad' tensor may has no set (or wrong set) of the layout + auto* output_grad = ctx.Input(framework::GradVarName("Out")); + Tensor output_grad_tmp(output_grad->type()); + output_grad_tmp.ShareDataWith(*output_grad); + output_grad_tmp.set_layout(data_layout); + + auto* input_grad = ctx.Output(framework::GradVarName("X")); + input_grad->set_layout(data_layout); + framework::DDim input_grad_dims; + if 
(data_layout == DataLayout::kNCHW) { + input_grad_dims = {n, c, h, w}; + } else { + input_grad_dims = {n, h, w, c}; + } + input_grad->mutable_data(input_grad_dims, ctx.GetPlace()); + + NpuOpRunner npu_op_runner; + auto npu_stream = + ctx.template device_context() + .stream(); + npu_op_runner.SetType("ResizeNearestNeighborV2Grad") + .AddInput(output_grad_tmp) + .AddInput(std::vector{h, w}) + .AddOutput(*input_grad) + .AddAttr("align_corners", false) + .AddAttr("half_pixel_centers", false) + .Run(npu_stream); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_NPU_KERNEL(nearest_interp, ops::InterpolateNPUKernel, + ops::InterpolateNPUKernel); +REGISTER_OP_NPU_KERNEL(nearest_interp_grad, + ops::InterpolateGradNPUKernel); diff --git a/python/paddle/fluid/tests/unittests/npu/test_nearest_interp_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_nearest_interp_op_npu.py new file mode 100755 index 00000000000000..c6f85c8dee40ce --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_nearest_interp_op_npu.py @@ -0,0 +1,461 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid.core as core +import paddle.fluid as fluid +from test_nearest_interp_op import nearest_neighbor_interp_np + +paddle.enable_static() + + +class TestNearestInterpOp(OpTest): + def setUp(self): + self.__class__.use_npu = True + self.place = paddle.NPUPlace(0) + + self.out_size = None + self.actual_shape = None + self.data_layout = 'NCHW' + self.init_test_case() + self.op_type = "nearest_interp" + input_np = np.random.random(self.input_shape).astype("float32") + if self.data_layout == "NCHW": + in_h = self.input_shape[2] + in_w = self.input_shape[3] + else: + in_h = self.input_shape[1] + in_w = self.input_shape[2] + + if self.scale > 0: + out_h = int(in_h * self.scale) + out_w = int(in_w * self.scale) + else: + out_h = self.out_h + out_w = self.out_w + + output_np = nearest_neighbor_interp_np( + input_np, out_h, out_w, self.out_size, self.actual_shape, + self.align_corners, self.data_layout) + self.inputs = {'X': input_np} + if self.out_size is not None: + self.inputs['OutSize'] = self.out_size + if self.actual_shape is not None: + self.inputs['OutSize'] = self.actual_shape + self.attrs = { + 'out_h': self.out_h, + 'out_w': self.out_w, + 'scale': self.scale, + 'interp_method': self.interp_method, + 'align_corners': self.align_corners, + 'data_layout': self.data_layout + } + self.outputs = {'Out': output_np} + + def test_check_output(self): + self.__class__.no_need_check_grad = True + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ['X'], 'Out', in_place=True) + + def init_test_case(self): + self.interp_method = 'nearest' + self.input_shape = [2, 3, 4, 5] + 
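+ # out_size set below overrides out_h/out_w, so the expected spatial output is 3 x 3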
self.out_h = 2 + self.out_w = 2 + self.scale = 0. + self.out_size = np.array([3, 3]).astype("int32") + self.align_corners = False + + +class TestNearestNeighborInterpCase1(TestNearestInterpOp): + def init_test_case(self): + self.interp_method = 'nearest' + self.input_shape = [4, 1, 7, 8] + self.out_h = 1 + self.out_w = 1 + self.scale = 0. + self.align_corners = False + + +class TestNearestNeighborInterpCase2(TestNearestInterpOp): + def init_test_case(self): + self.interp_method = 'nearest' + self.input_shape = [3, 3, 9, 6] + self.out_h = 12 + self.out_w = 12 + self.scale = 0. + self.align_corners = False + + def test_check_grad(self): + self.check_grad_with_place( + self.place, ['X'], 'Out', in_place=True, max_relative_error=0.006) + + +class TestNearestNeighborInterpCase3(TestNearestInterpOp): + def init_test_case(self): + self.interp_method = 'nearest' + self.input_shape = [1, 1, 32, 64] + self.out_h = 64 + self.out_w = 32 + self.scale = 0. + self.align_corners = False + + +class TestNearestNeighborInterpCase4(TestNearestInterpOp): + def init_test_case(self): + self.interp_method = 'nearest' + self.input_shape = [4, 1, 7, 8] + self.out_h = 1 + self.out_w = 1 + self.scale = 0. + self.out_size = np.array([2, 2]).astype("int32") + self.align_corners = False + + +class TestNearestNeighborInterpCase5(TestNearestInterpOp): + def init_test_case(self): + self.interp_method = 'nearest' + self.input_shape = [3, 3, 9, 6] + self.out_h = 12 + self.out_w = 12 + self.scale = 0. + self.out_size = np.array([11, 11]).astype("int32") + self.align_corners = False + + +class TestNearestNeighborInterpCase6(TestNearestInterpOp): + def init_test_case(self): + self.interp_method = 'nearest' + self.input_shape = [1, 1, 32, 64] + self.out_h = 64 + self.out_w = 32 + self.scale = 0. + self.out_size = np.array([65, 129]).astype("int32") + self.align_corners = False + + +class TestNearestNeighborInterpSame(TestNearestInterpOp): + def init_test_case(self): + self.interp_method = 'nearest' + self.input_shape = [2, 3, 32, 64] + self.out_h = 32 + self.out_w = 64 + self.scale = 0. + self.align_corners = False + + +class TestNearestNeighborInterpActualShape(TestNearestInterpOp): + def init_test_case(self): + self.interp_method = 'nearest' + self.input_shape = [3, 2, 32, 16] + self.out_h = 64 + self.out_w = 32 + self.scale = 0. + self.out_size = np.array([66, 40]).astype("int32") + self.align_corners = False + + +class TestNearestNeighborInterpDataLayout(TestNearestInterpOp): + def init_test_case(self): + self.interp_method = 'nearest' + self.input_shape = [2, 4, 4, 5] + self.out_h = 2 + self.out_w = 2 + self.scale = 0. 
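+ # with data_layout NHWC (set below), the input shape [2, 4, 4, 5] is [N, H, W, C]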
+ self.out_size = np.array([3, 8]).astype("int32") + self.align_corners = False + self.data_layout = "NHWC" + + +class TestNearestInterpOpUint8(OpTest): + def setUp(self): + self.__class__.use_npu = True + self.place = paddle.NPUPlace(0) + + self.out_size = None + self.actual_shape = None + self.init_test_case() + self.op_type = "nearest_interp" + input_np = np.random.randint( + low=0, high=256, size=self.input_shape).astype("uint8") + + if self.scale > 0: + out_h = int(self.input_shape[2] * self.scale) + out_w = int(self.input_shape[3] * self.scale) + else: + out_h = self.out_h + out_w = self.out_w + + output_np = nearest_neighbor_interp_np(input_np, out_h, out_w, + self.out_size, self.actual_shape, + self.align_corners) + self.inputs = {'X': input_np} + if self.out_size is not None: + self.inputs['OutSize'] = self.out_size + self.attrs = { + 'out_h': self.out_h, + 'out_w': self.out_w, + 'scale': self.scale, + 'interp_method': self.interp_method, + 'align_corners': self.align_corners + } + self.outputs = {'Out': output_np} + + def test_check_output(self): + self.check_output_with_place(self.place) + + def init_test_case(self): + self.interp_method = 'nearest' + self.input_shape = [1, 3, 9, 6] + self.out_h = 10 + self.out_w = 9 + self.scale = 0. + self.align_corners = False + + +class TestNearestNeighborInterpCase1Uint8(TestNearestInterpOpUint8): + def init_test_case(self): + self.interp_method = 'nearest' + self.input_shape = [2, 3, 32, 64] + self.out_h = 80 + self.out_w = 40 + self.scale = 0. + self.align_corners = False + + +class TestNearestNeighborInterpCase2Uint8(TestNearestInterpOpUint8): + def init_test_case(self): + self.interp_method = 'nearest' + self.input_shape = [4, 1, 7, 8] + self.out_h = 5 + self.out_w = 13 + self.scale = 0. + self.out_size = np.array([6, 15]).astype("int32") + self.align_corners = False + + +class TestNearestNeighborInterpScale1(TestNearestInterpOp): + def init_test_case(self): + self.interp_method = 'nearest' + self.input_shape = [3, 2, 7, 5] + self.out_h = 64 + self.out_w = 32 + self.scale = 2. + self.out_size = np.array([66, 40]).astype("int32") + self.align_corners = False + + +class TestNearestNeighborInterpScale2(TestNearestInterpOp): + def init_test_case(self): + self.interp_method = 'nearest' + self.input_shape = [3, 2, 5, 7] + self.out_h = 64 + self.out_w = 32 + self.scale = 1.5 + self.out_size = np.array([66, 40]).astype("int32") + self.align_corners = False + + +class TestNearestNeighborInterpScale3(TestNearestInterpOp): + def init_test_case(self): + self.interp_method = 'nearest' + self.input_shape = [3, 2, 7, 5] + self.out_h = 64 + self.out_w = 32 + self.scale = 1. 
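+ # OutSize still takes priority over the scale of 1.0, giving a 66 x 40 output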
+ self.out_size = np.array([66, 40]).astype("int32") + self.align_corners = False + + +class TestNearestInterpOp_attr_tensor(OpTest): + def setUp(self): + self.__class__.use_npu = True + self.place = paddle.NPUPlace(0) + + self.out_size = None + self.actual_shape = None + self.init_test_case() + self.op_type = "nearest_interp" + self.shape_by_1Dtensor = False + self.scale_by_1Dtensor = False + self.attrs = { + 'interp_method': self.interp_method, + 'align_corners': self.align_corners, + } + + input_np = np.random.random(self.input_shape).astype("float32") + self.inputs = {'X': input_np} + + if self.scale_by_1Dtensor: + self.inputs['Scale'] = np.array([self.scale]).astype("float64") + elif self.scale > 0: + out_h = int(self.input_shape[2] * self.scale) + out_w = int(self.input_shape[3] * self.scale) + self.attrs['scale'] = self.scale + else: + out_h = self.out_h + out_w = self.out_w + + if self.shape_by_1Dtensor: + self.inputs['OutSize'] = self.out_size + elif self.out_size is not None: + size_tensor = [] + for index, ele in enumerate(self.out_size): + size_tensor.append(("x" + str(index), np.ones( + (1)).astype('int32') * ele)) + self.inputs['SizeTensor'] = size_tensor + + self.attrs['out_h'] = self.out_h + self.attrs['out_w'] = self.out_w + output_np = nearest_neighbor_interp_np(input_np, out_h, out_w, + self.out_size, self.actual_shape, + self.align_corners) + self.outputs = {'Out': output_np} + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ['X'], 'Out', in_place=True) + + def init_test_case(self): + self.interp_method = 'nearest' + self.input_shape = [2, 5, 4, 4] + self.out_h = 3 + self.out_w = 3 + self.scale = 0. + self.out_size = [3, 3] + self.align_corners = False + + +# out_size is a tensor list +class TestNearestInterp_attr_tensor_Case1(TestNearestInterpOp_attr_tensor): + def init_test_case(self): + self.interp_method = 'nearest' + self.input_shape = [3, 3, 9, 6] + self.out_h = 12 + self.out_w = 12 + self.scale = 0. + self.out_size = [8, 12] + self.align_corners = False + + +# out_size is a 1-D tensor +class TestNearestInterp_attr_tensor_Case2(TestNearestInterpOp_attr_tensor): + def init_test_case(self): + self.interp_method = 'nearest' + self.input_shape = [3, 2, 32, 16] + self.out_h = 64 + self.out_w = 32 + self.scale = 0. 
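+ # shape_by_1Dtensor is enabled below, so out_size is passed through the single OutSize input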
+ self.out_size = np.array([66, 40]).astype("int32") + self.align_corners = False + self.shape_by_1Dtensor = True + + +# scale is a 1-D tensor +class TestNearestInterp_attr_tensor_Case3(TestNearestInterpOp_attr_tensor): + def init_test_case(self): + self.interp_method = 'nearest' + self.input_shape = [3, 2, 32, 16] + self.out_h = 64 + self.out_w = 32 + self.scale = 2.0 + self.out_size = None + self.align_corners = False + self.scale_by_1Dtensor = True + + +class TestNearestAPI(unittest.TestCase): + def test_case(self): + x = fluid.data(name="x", shape=[2, 3, 6, 6], dtype="float32") + y = fluid.data(name="y", shape=[2, 6, 6, 3], dtype="float32") + + dim = fluid.data(name="dim", shape=[1], dtype="int32") + shape_tensor = fluid.data(name="shape_tensor", shape=[2], dtype="int32") + actual_size = fluid.data(name="actual_size", shape=[2], dtype="int32") + scale_tensor = fluid.data( + name="scale_tensor", shape=[1], dtype="float32") + + out1 = fluid.layers.resize_nearest( + y, out_shape=[12, 12], data_format='NHWC', align_corners=False) + out2 = fluid.layers.resize_nearest( + x, out_shape=[12, dim], align_corners=False) + out3 = fluid.layers.resize_nearest( + x, out_shape=shape_tensor, align_corners=False) + out4 = fluid.layers.resize_nearest( + x, out_shape=[4, 4], actual_shape=actual_size, align_corners=False) + out5 = fluid.layers.resize_nearest( + x, scale=scale_tensor, align_corners=False) + + x_data = np.random.random((2, 3, 6, 6)).astype("float32") + dim_data = np.array([12]).astype("int32") + shape_data = np.array([12, 12]).astype("int32") + actual_size_data = np.array([12, 12]).astype("int32") + scale_data = np.array([2.0]).astype("float32") + + place = paddle.NPUPlace(0) + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + results = exe.run(fluid.default_main_program(), + feed={ + "x": x_data, + "y": np.transpose(x_data, (0, 2, 3, 1)), + "dim": dim_data, + "shape_tensor": shape_data, + "actual_size": actual_size_data, + "scale_tensor": scale_data + }, + fetch_list=[out1, out2, out3, out4, out5], + return_numpy=True) + + expect_res = nearest_neighbor_interp_np( + x_data, out_h=12, out_w=12, align_corners=False) + self.assertTrue( + np.allclose(results[0], np.transpose(expect_res, (0, 2, 3, 1)))) + for i in range(len(results) - 1): + self.assertTrue(np.allclose(results[i + 1], expect_res)) + + +class TestNearestInterpException(unittest.TestCase): + def test_exception(self): + input = fluid.data(name="input", shape=[1, 3, 6, 6], dtype="float32") + + def attr_data_format(): + # for 4-D input, data_format can only be NCHW or NHWC + out = fluid.layers.resize_nearest( + input, out_shape=[4, 8], data_format='NDHWC') + + def attr_scale_type(): + out = fluid.layers.resize_nearest(input, scale='scale') + + def attr_scale_value(): + out = fluid.layers.resize_nearest(input, scale=-0.3) + + self.assertRaises(ValueError, attr_data_format) + self.assertRaises(TypeError, attr_scale_type) + self.assertRaises(ValueError, attr_scale_value) + + +if __name__ == "__main__": + unittest.main() From 3b9f040d019f8f64b84e469a2ec53ca4238257a2 Mon Sep 17 00:00:00 2001 From: Qi Li Date: Mon, 16 Aug 2021 15:13:21 +0800 Subject: [PATCH 053/126] [NPU] add nearest_interp_v2 and nearest_interp_v2_grad, test=develop (#34769) --- paddle/fluid/operators/interpolate_v2_op.h | 6 + .../fluid/operators/interpolate_v2_op_npu.cc | 332 ++++++++++++++++ .../npu/test_nearest_interp_v2_op_npu.py | 366 ++++++++++++++++++ 3 files changed, 704 insertions(+) create mode 100644 
paddle/fluid/operators/interpolate_v2_op_npu.cc create mode 100755 python/paddle/fluid/tests/unittests/npu/test_nearest_interp_v2_op_npu.py diff --git a/paddle/fluid/operators/interpolate_v2_op.h b/paddle/fluid/operators/interpolate_v2_op.h index ebab5794edc517..8daf440f60e5f6 100644 --- a/paddle/fluid/operators/interpolate_v2_op.h +++ b/paddle/fluid/operators/interpolate_v2_op.h @@ -58,6 +58,12 @@ inline std::vector get_new_data_from_tensor(const Tensor* new_data_tensor) { TensorCopySync(*new_data_tensor, platform::CPUPlace(), &cpu_starts_tensor); new_data = cpu_starts_tensor.data(); } +#ifdef PADDLE_WITH_ASCEND_CL + if (platform::is_npu_place(new_data_tensor->place())) { + TensorCopySync(*new_data_tensor, platform::CPUPlace(), &cpu_starts_tensor); + new_data = cpu_starts_tensor.data(); + } +#endif vec_new_data = std::vector(new_data, new_data + new_data_tensor->numel()); return vec_new_data; } diff --git a/paddle/fluid/operators/interpolate_v2_op_npu.cc b/paddle/fluid/operators/interpolate_v2_op_npu.cc new file mode 100644 index 00000000000000..d893fbd0196289 --- /dev/null +++ b/paddle/fluid/operators/interpolate_v2_op_npu.cc @@ -0,0 +1,332 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the Licnse. */ + +#include "paddle/fluid/operators/interpolate_v2_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using DataLayout = framework::DataLayout; + +template +class InterpolateV2NPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("X"); + auto* output = ctx.Output("Out"); + + auto input_dims = input->dims(); + PADDLE_ENFORCE_EQ(input_dims.size(), 4UL, + platform::errors::External( + "NPU Interpolate Kernel only support 4-D Tensor.")); + + const std::string data_layout_str = ctx.Attr("data_layout"); + const DataLayout data_layout = + framework::StringToDataLayout(data_layout_str); + int n, c, in_d, in_h, in_w; + ExtractNCDWH(input_dims, data_layout, &n, &c, &in_d, &in_h, &in_w); + + PADDLE_ENFORCE_EQ( + input->layout(), data_layout, + platform::errors::InvalidArgument( + "Interpolate OP's input tensor layout should equal to attr " + "data_layout, but got tensor layout <%s>, attr layout <%s>", + framework::DataLayoutToString(input->layout()), data_layout_str)); + PADDLE_ENFORCE_EQ( + output->layout(), data_layout, + platform::errors::InvalidArgument( + "Interpolate OP's output tensor layout should equal to attr " + "data_layout, but got tensor layout <%s>, attr layout <%s>", + framework::DataLayoutToString(output->layout()), data_layout_str)); + + auto interp_method = ctx.Attr("interp_method"); + bool align_corners = ctx.Attr("align_corners"); + + // To-do(qili93): need to support align_corners = true case, try ReSizeD + PADDLE_ENFORCE_EQ( + align_corners, false, + platform::errors::InvalidArgument( + "NPU Interpolate Kernel has diff when align_corners is true.")); + + int out_h = 
ctx.Attr("out_h"); + int out_w = ctx.Attr("out_w"); + float scale_h = -1; + float scale_w = -1; + + // Priority: SizeTensor > OutSize > Scale > scale > out_h & out_w + auto list_new_shape_tensor = + ctx.MultiInput("SizeTensor"); + if (list_new_shape_tensor.size() > 0) { + std::vector output_h(1); + std::vector output_w(1); + auto dev_ctx = + platform::DeviceContextPool::Instance().Get(ctx.GetPlace()); + framework::TensorToVector(*list_new_shape_tensor[0], *dev_ctx, &output_h); + framework::TensorToVector(*list_new_shape_tensor[1], *dev_ctx, &output_w); + out_h = output_h[0]; + out_w = output_w[0]; + } else if (ctx.HasInput("OutSize")) { + auto out_size = ctx.Input("OutSize"); + auto out_size_data = get_new_data_from_tensor(out_size); + out_h = out_size_data[0]; + out_w = out_size_data[1]; + } else { + auto scale_tensor = ctx.Input("Scale"); + auto scale = ctx.Attr>("scale"); + if (scale_tensor != nullptr) { + auto scale_data = get_new_data_from_tensor(scale_tensor); + if (scale_data.size() > 1) { + scale_h = scale_data[0]; + scale_w = scale_data[1]; + } else { + scale_h = scale_data[0]; + scale_w = scale_data[0]; + } + PADDLE_ENFORCE_EQ( + scale_w > 0, true, + platform::errors::InvalidArgument( + "The scale_w in input 'Scale' Tensor of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); + PADDLE_ENFORCE_EQ( + scale_h > 0, true, + platform::errors::InvalidArgument( + "The scale_h in input 'Scale' Tensor of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_h)); + } else { + if (scale.size() > 1) { + scale_h = scale[0]; + scale_w = scale[1]; + + PADDLE_ENFORCE_EQ( + scale_w > 0, true, + platform::errors::InvalidArgument( + "The scale_w in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); + PADDLE_ENFORCE_EQ( + scale_h > 0, true, + platform::errors::InvalidArgument( + "The scale_h in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_h)); + } + } + if (scale_h > 0. && scale_w > 0.) 
{ + out_h = static_cast(in_h * scale_h); + out_w = static_cast(in_w * scale_w); + } + } + PADDLE_ENFORCE_GT(out_h, 0, + platform::errors::InvalidArgument( + "out_h in Attr(out_shape) of Op(interpolate) " + "should be greater than 0.")); + PADDLE_ENFORCE_GT(out_w, 0, + platform::errors::InvalidArgument( + "out_w in Attr(out_shape) of Op(interpolate) " + "should be greater than 0.")); + framework::DDim dim_out; + if (data_layout == DataLayout::kNCHW) { + dim_out = {n, c, out_h, out_w}; + } else { + dim_out = {n, out_h, out_w, c}; + } + output->mutable_data(dim_out, ctx.GetPlace()); + + if (in_h == out_h && in_w == out_w) { + framework::TensorCopy(*input, ctx.GetPlace(), output); + return; + } + + auto stream = + ctx.template device_context() + .stream(); + + NpuOpRunner runner; + // To-do(qili93): need to support bilineare, try ResizeD + if ("nearest" == interp_method) { + runner.SetType("ResizeNearestNeighborV2") + .AddInput(*input) + .AddInput(std::vector{out_h, out_w}) + .AddOutput(*output) + .AddAttr("align_corners", align_corners) + .AddAttr("half_pixel_centers", false); + } + runner.Run(stream); + } +}; + +template +class InterpolateV2NPUGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("X"); + auto* output_grad = ctx.Input(framework::GradVarName("Out")); + auto* input_grad = ctx.Output(framework::GradVarName("X")); + + const std::string data_layout_str = ctx.Attr("data_layout"); + const DataLayout data_layout = + framework::StringToDataLayout(data_layout_str); + int n, c, in_d, in_h, in_w; + ExtractNCDWH(input->dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); + + PADDLE_ENFORCE_EQ( + input->layout(), data_layout, + platform::errors::InvalidArgument( + "Interpolate OP's input tensor layout should equal to attr " + "data_layout, but got tensor layout <%s>, attr layout <%s>", + framework::DataLayoutToString(input->layout()), data_layout_str)); + PADDLE_ENFORCE_EQ(output_grad->layout(), data_layout, + platform::errors::InvalidArgument( + "Interpolate OP's output_grad tensor layout should " + "equal to attr data_layout, but got tensor layout is " + "<%s>, and attr layout is <%s>", + framework::DataLayoutToString(output_grad->layout()), + data_layout_str)); + PADDLE_ENFORCE_EQ(input_grad->layout(), data_layout, + platform::errors::InvalidArgument( + "Interpolate OP's input_grad tensor layout should " + "equal to attr data_layout, but got tensor layout is " + "<%s>, and attr layout is <%s>", + framework::DataLayoutToString(input_grad->layout()), + data_layout_str)); + + auto interp_method = ctx.Attr("interp_method"); + bool align_corners = ctx.Attr("align_corners"); + + // To-do(qili93): need to support align_corners = true case, try ReSizeD + PADDLE_ENFORCE_EQ( + align_corners, false, + platform::errors::InvalidArgument( + "NPU Interpolate Kernel has diff when align_corners is true.")); + + int out_h = ctx.Attr("out_h"); + int out_w = ctx.Attr("out_w"); + float scale_h = -1; + float scale_w = -1; + + // Priority: SizeTensor > OutSize > Scale > scale > out_h & out_w + auto list_new_size_tensor = ctx.MultiInput("SizeTensor"); + if (list_new_size_tensor.size() > 0) { + std::vector output_h(1); + std::vector output_w(1); + auto dev_ctx = + platform::DeviceContextPool::Instance().Get(ctx.GetPlace()); + framework::TensorToVector(*list_new_size_tensor[0], *dev_ctx, &output_h); + framework::TensorToVector(*list_new_size_tensor[1], *dev_ctx, &output_w); + out_h = output_h[0]; + out_w = output_w[0]; 
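+        // Note: the out_h / out_w recomputed in this grad kernel only drive the
+        // size-unchanged fast path further below (TensorCopy when in_h == out_h
+        // && in_w == out_w); ResizeNearestNeighborV2Grad itself is fed {in_h, in_w}.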
+ } else if (ctx.HasInput("OutSize")) { + auto out_size = ctx.Input("OutSize"); + auto out_size_data = get_new_data_from_tensor(out_size); + out_h = out_size_data[0]; + out_w = out_size_data[1]; + } else { + auto scale_tensor = ctx.Input("Scale"); + auto scale = ctx.Attr>("scale"); + if (scale_tensor != nullptr) { + auto scale_data = get_new_data_from_tensor(scale_tensor); + if (scale_data.size() > 1) { + scale_h = scale_data[0]; + scale_w = scale_data[1]; + } else { + scale_w = scale_data[0]; + scale_h = scale_data[0]; + } + PADDLE_ENFORCE_EQ( + scale_w > 0, true, + platform::errors::InvalidArgument( + "The scale_w in input 'Scale' Tensor of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); + PADDLE_ENFORCE_EQ( + scale_h > 0, true, + platform::errors::InvalidArgument( + "The scale_h in input 'Scale' Tensor of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_h)); + } else { + if (scale.size() > 1) { + scale_h = scale[0]; + scale_w = scale[1]; + PADDLE_ENFORCE_EQ( + scale_w > 0, true, + platform::errors::InvalidArgument( + "The scale_w in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); + PADDLE_ENFORCE_EQ( + scale_h > 0, true, + platform::errors::InvalidArgument( + "The scale_h in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_h)); + } + } + if (scale_h > 0. && scale_w > 0.) { + out_h = static_cast(in_h * scale_h); + out_w = static_cast(in_w * scale_w); + } + } + + framework::DDim dim_grad; + if (data_layout == DataLayout::kNCHW) { + dim_grad = {n, c, in_h, in_w}; + } else { + dim_grad = {n, in_h, in_w, c}; + } + + input_grad->mutable_data(dim_grad, ctx.GetPlace()); + + if (in_h == out_h && in_w == out_w) { + framework::TensorCopy(*output_grad, ctx.GetPlace(), input_grad); + return; + } + + auto stream = + ctx.template device_context() + .stream(); + + NpuOpRunner runner; + // To-do(qili93): need to support bilineare, try ResizeGradD + if ("nearest" == interp_method) { + runner.SetType("ResizeNearestNeighborV2Grad") + .AddInput(*output_grad) + .AddInput(std::vector{in_h, in_w}) + .AddOutput(*input_grad) + .AddAttr("align_corners", align_corners) + .AddAttr("half_pixel_centers", false); + } + runner.Run(stream); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL( + nearest_interp_v2, + ops::InterpolateV2NPUKernel, + ops::InterpolateV2NPUKernel); + +REGISTER_OP_NPU_KERNEL( + nearest_interp_v2_grad, + ops::InterpolateV2NPUGradKernel, + ops::InterpolateV2NPUGradKernel); diff --git a/python/paddle/fluid/tests/unittests/npu/test_nearest_interp_v2_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_nearest_interp_v2_op_npu.py new file mode 100755 index 00000000000000..f3df1fca30749e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_nearest_interp_v2_op_npu.py @@ -0,0 +1,366 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import sys +sys.path.append("..") +from op_test import OpTest +import paddle.fluid.core as core +import paddle.fluid as fluid +import paddle.nn as nn +import paddle +from paddle.nn.functional import interpolate + +from test_nearest_interp_v2_op import nearest_neighbor_interp_np + +paddle.enable_static() + + +class TestNearestInterpOp(OpTest): + def set_npu(self): + self.__class__.use_npu = True + self.place = paddle.NPUPlace(0) + + def setUp(self): + self.set_npu() + self.out_size = None + self.actual_shape = None + self.data_layout = 'NCHW' + self.init_test_case() + self.op_type = "nearest_interp_v2" + input_np = np.random.random(self.input_shape).astype("float32") + + if self.data_layout == "NCHW": + in_h = self.input_shape[2] + in_w = self.input_shape[3] + else: + in_h = self.input_shape[1] + in_w = self.input_shape[2] + scale_h = 0 + scale_w = 0 + if self.scale: + if isinstance(self.scale, float) or isinstance(self.scale, int): + if self.scale > 0: + scale_h = scale_w = float(self.scale) + if isinstance(self.scale, list) and len(self.scale) == 1: + scale_w = scale_h = self.scale[0] + elif isinstance(self.scale, list) and len(self.scale) > 1: + scale_w = self.scale[1] + scale_h = self.scale[0] + output_h = int(in_h * scale_h) + output_w = int(in_w * scale_w) + else: + output_h = self.out_h + output_w = self.out_w + + output_np = nearest_neighbor_interp_np( + input_np, output_h, output_w, scale_h, scale_w, self.out_size, + self.actual_shape, self.align_corners, self.data_layout) + self.inputs = {'X': input_np} + if self.out_size is not None: + self.inputs['OutSize'] = self.out_size + if self.actual_shape is not None: + self.inputs['OutSize'] = self.actual_shape + self.attrs = { + 'out_h': self.out_h, + 'out_w': self.out_w, + 'interp_method': self.interp_method, + 'align_corners': self.align_corners, + 'data_layout': self.data_layout + } + if self.scale: + if isinstance(self.scale, float) or isinstance(self.scale, int): + if self.scale > 0: + self.scale = [self.scale] + if isinstance(self.scale, list) and len(self.scale) == 1: + self.scale = [self.scale[0], self.scale[0]] + self.attrs['scale'] = self.scale + self.outputs = {'Out': output_np} + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place( + self.place, ['X'], 'Out', in_place=True, max_relative_error=0.006) + + def init_test_case(self): + self.interp_method = 'nearest' + self.input_shape = [2, 3, 4, 5] + self.out_h = 2 + self.out_w = 2 + self.scale = 0. + self.out_size = np.array([3, 3]).astype("int32") + self.align_corners = False + + +class TestNearestNeighborInterpCase1(TestNearestInterpOp): + def init_test_case(self): + self.interp_method = 'nearest' + self.input_shape = [4, 1, 7, 8] + self.out_h = 1 + self.out_w = 1 + self.scale = 0. 
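+        # scale = 0. is falsy, so setUp() ignores it and uses the out_h / out_w set above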
+ self.align_corners = False + + +class TestNearestNeighborInterpCase2(TestNearestInterpOp): + def init_test_case(self): + self.interp_method = 'nearest' + self.input_shape = [3, 3, 9, 6] + self.out_h = 12 + self.out_w = 12 + self.scale = 0. + self.align_corners = False + + +class TestNearestNeighborInterpCase3(TestNearestInterpOp): + def init_test_case(self): + self.interp_method = 'nearest' + self.input_shape = [1, 1, 32, 64] + self.out_h = 64 + self.out_w = 32 + self.scale = 0. + self.align_corners = False + + +class TestNearestNeighborInterpCase4(TestNearestInterpOp): + def init_test_case(self): + self.interp_method = 'nearest' + self.input_shape = [4, 1, 7, 8] + self.out_h = 1 + self.out_w = 1 + self.scale = 0. + self.out_size = np.array([2, 2]).astype("int32") + self.align_corners = False + + +class TestNearestNeighborInterpCase5(TestNearestInterpOp): + def init_test_case(self): + self.interp_method = 'nearest' + self.input_shape = [3, 3, 9, 6] + self.out_h = 12 + self.out_w = 12 + self.scale = 0. + self.out_size = np.array([11, 11]).astype("int32") + self.align_corners = False + + +class TestNearestNeighborInterpCase6(TestNearestInterpOp): + def init_test_case(self): + self.interp_method = 'nearest' + self.input_shape = [1, 1, 32, 64] + self.out_h = 64 + self.out_w = 32 + self.scale = 0. + self.out_size = np.array([65, 129]).astype("int32") + self.align_corners = False + + +class TestNearestNeighborInterpSame(TestNearestInterpOp): + def init_test_case(self): + self.interp_method = 'nearest' + self.input_shape = [2, 3, 32, 64] + self.out_h = 32 + self.out_w = 64 + self.scale = 0. + self.align_corners = False + + +class TestNearestNeighborInterpActualShape(TestNearestInterpOp): + def init_test_case(self): + self.interp_method = 'nearest' + self.input_shape = [3, 2, 32, 16] + self.out_h = 64 + self.out_w = 32 + self.scale = 0. + self.out_size = np.array([66, 40]).astype("int32") + self.align_corners = False + + +class TestNearestNeighborInterpScale1(TestNearestInterpOp): + def init_test_case(self): + self.interp_method = 'nearest' + self.input_shape = [3, 2, 7, 5] + self.out_h = 64 + self.out_w = 32 + self.scale = 2. 
+ self.out_size = None + self.align_corners = False + + +class TestNearestNeighborInterpScale2(TestNearestInterpOp): + def init_test_case(self): + self.interp_method = 'nearest' + self.input_shape = [3, 2, 5, 7] + self.out_h = 64 + self.out_w = 32 + self.scale = 1.5 + self.out_size = None + self.align_corners = False + + +class TestNearestNeighborInterpScale3(TestNearestInterpOp): + def init_test_case(self): + self.interp_method = 'nearest' + self.input_shape = [3, 2, 7, 5] + self.out_h = 64 + self.out_w = 32 + self.scale = [2.0, 3.0] + self.out_size = None + self.align_corners = False + + +class TestNearestInterpOp_attr_tensor(OpTest): + def set_npu(self): + self.__class__.use_npu = True + self.place = paddle.NPUPlace(0) + + def setUp(self): + self.set_npu() + self.out_size = None + self.actual_shape = None + self.init_test_case() + self.op_type = "nearest_interp_v2" + self.shape_by_1Dtensor = False + self.scale_by_1Dtensor = False + self.attrs = { + 'interp_method': self.interp_method, + 'align_corners': self.align_corners, + } + + input_np = np.random.random(self.input_shape).astype("float32") + self.inputs = {'X': input_np} + + if self.scale_by_1Dtensor: + self.inputs['Scale'] = np.array([self.scale]).astype("float32") + elif self.scale: + if isinstance(self.scale, float) or isinstance(self.scale, int): + if self.scale > 0: + scale_h = scale_w = float(self.scale) + if isinstance(self.scale, list) and len(self.scale) == 1: + scale_w = scale_h = self.scale[0] + elif isinstance(self.scale, list) and len(self.scale) > 1: + scale_w = self.scale[1] + scale_h = self.scale[0] + out_h = int(self.input_shape[2] * scale_h) + out_w = int(self.input_shape[3] * scale_w) + else: + out_h = self.out_h + out_w = self.out_w + + if self.shape_by_1Dtensor: + self.inputs['OutSize'] = self.out_size + elif self.out_size is not None: + size_tensor = [] + for index, ele in enumerate(self.out_size): + size_tensor.append(("x" + str(index), np.ones( + (1)).astype('int32') * ele)) + self.inputs['SizeTensor'] = size_tensor + + self.attrs['out_h'] = self.out_h + self.attrs['out_w'] = self.out_w + if self.scale: + if isinstance(self.scale, float) or isinstance(self.scale, int): + if self.scale > 0: + self.scale = [self.scale] + if isinstance(self.scale, list) and len(self.scale) == 1: + self.scale = [self.scale[0], self.scale[0]] + self.attrs['scale'] = self.scale + output_np = nearest_neighbor_interp_np(input_np, out_h, out_w, 0, 0, + self.out_size, self.actual_shape, + self.align_corners) + self.outputs = {'Out': output_np} + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ['X'], 'Out', in_place=True) + + def init_test_case(self): + self.interp_method = 'nearest' + self.input_shape = [2, 5, 4, 4] + self.out_h = 3 + self.out_w = 3 + self.scale = 0. + self.out_size = [3, 3] + self.align_corners = False + + +# out_size is a tensor list +class TestNearestInterp_attr_tensor_Case1(TestNearestInterpOp_attr_tensor): + def init_test_case(self): + self.interp_method = 'nearest' + self.input_shape = [3, 3, 9, 6] + self.out_h = 12 + self.out_w = 12 + self.scale = 0. + self.out_size = [8, 12] + self.align_corners = False + + +# out_size is a 1-D tensor +class TestNearestInterp_attr_tensor_Case2(TestNearestInterpOp_attr_tensor): + def init_test_case(self): + self.interp_method = 'nearest' + self.input_shape = [3, 2, 32, 16] + self.out_h = 64 + self.out_w = 32 + self.scale = 0. 
+ self.out_size = np.array([66, 40]).astype("int32") + self.align_corners = False + self.shape_by_1Dtensor = True + + +# scale is a 1-D tensor +class TestNearestInterp_attr_tensor_Case3(TestNearestInterpOp_attr_tensor): + def init_test_case(self): + self.interp_method = 'nearest' + self.input_shape = [3, 2, 32, 16] + self.out_h = 64 + self.out_w = 32 + self.scale = 2.0 + self.out_size = None + self.align_corners = False + self.scale_by_1Dtensor = True + + +class TestNearestInterpOpAPI_dy(unittest.TestCase): + def test_case(self): + import paddle + if core.is_compiled_with_npu(): + place = core.NPUPlace(0) + else: + place = core.CPUPlace() + with fluid.dygraph.guard(place): + input_data = np.random.random((2, 3, 6, 6)).astype("float32") + scale_np = np.array([2, 2]).astype("int64") + input_x = paddle.to_tensor(input_data) + scale = paddle.to_tensor(scale_np) + expect_res = nearest_neighbor_interp_np( + input_data, out_h=12, out_w=12, align_corners=False) + out = interpolate( + x=input_x, + scale_factor=scale, + mode="nearest", + align_corners=False) + self.assertTrue(np.allclose(out.numpy(), expect_res)) + + +if __name__ == "__main__": + unittest.main() From f6d8ab549a16cc23a06d8bc6577013f1568f2df6 Mon Sep 17 00:00:00 2001 From: zhangchunle Date: Mon, 16 Aug 2021 15:20:21 +0800 Subject: [PATCH 054/126] fix iscan bug in test file (#34912) --- .../slim/tests/test_imperative_out_scale.py | 2 +- .../fluid/incubate/fleet/base/fleet_base.py | 9 - .../tests/unittests/dist_sharding_save.py | 2 +- .../mkldnn/test_reshape_mkldnn_op.py | 2 +- .../unittests/test_dist_fleet_grad_clip.py | 2 +- .../tests/unittests/test_fleet_base_3.py | 2 +- .../unittests/test_fleet_rolemaker_new.py | 258 ------------------ .../fluid/tests/unittests/test_layers.py | 8 +- .../fluid/tests/unittests/test_matmul_op.py | 10 +- ...test_parallel_dygraph_pipeline_parallel.py | 4 +- .../fluid/tests/unittests/test_reduce_op.py | 18 -- .../fluid/tests/unittests/test_sum_op.py | 3 - 12 files changed, 16 insertions(+), 304 deletions(-) diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_out_scale.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_out_scale.py index 6cc58a38f227a5..c4318b8bf8ef62 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_imperative_out_scale.py +++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_out_scale.py @@ -35,7 +35,7 @@ from paddle.fluid.log_helper import get_logger from paddle.fluid.dygraph import nn -from imperative_test_utils import fix_model_dict, train_lenet, ImperativeLenet +from imperative_test_utils import fix_model_dict, train_lenet paddle.enable_static() diff --git a/python/paddle/fluid/incubate/fleet/base/fleet_base.py b/python/paddle/fluid/incubate/fleet/base/fleet_base.py index 105180030ace82..6aa4fcf45c7926 100644 --- a/python/paddle/fluid/incubate/fleet/base/fleet_base.py +++ b/python/paddle/fluid/incubate/fleet/base/fleet_base.py @@ -26,15 +26,6 @@ from paddle.fluid.contrib.mixed_precision.decorator import OptimizerWithMixedPrecision from . import mode - -class Mode: - """ - There are various mode for fleet, each of them is designed for different model. 
- """ - PS = 1 - COLLECTIVE = 2 - - __all__ = ['Fleet', 'DistributedOptimizer'] __all__ += mode.__all__ diff --git a/python/paddle/fluid/tests/unittests/dist_sharding_save.py b/python/paddle/fluid/tests/unittests/dist_sharding_save.py index 99b2dcb97d1fbd..7d3d934cb458f1 100755 --- a/python/paddle/fluid/tests/unittests/dist_sharding_save.py +++ b/python/paddle/fluid/tests/unittests/dist_sharding_save.py @@ -16,7 +16,7 @@ import paddle import paddle.fluid as fluid -from test_dist_base import TestDistRunnerBase, runtime_main +from test_dist_base import TestDistRunnerBase from dist_mnist import cnn_model # from paddle.fluid.incubate.fleet.collective import fleet import paddle.distributed.fleet as fleet diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_reshape_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_reshape_mkldnn_op.py index 13894215866386..a28827207ee832 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_reshape_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_reshape_mkldnn_op.py @@ -85,7 +85,7 @@ def set_outputs(self): 'XShape': np.random.random(self.ori_shape).astype("float32") } - def init_data(self): + def init_data1(self): self.ori_shape = (6, 20) self.new_shape = (0, -1, 20) self.actual_shape = (2, 3, 20) diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_grad_clip.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_grad_clip.py index 7c0eb83aeced6d..7807646dca3a55 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_grad_clip.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_grad_clip.py @@ -46,7 +46,7 @@ def test_dist_train(self): self.check_with_place( "dist_fleet_ctr.py", delta=1e-5, check_error_log=True) - def _setup_config(self): + def _setup_config1(self): self._sync_mode = False self._grad_clip_mode = 2 diff --git a/python/paddle/fluid/tests/unittests/test_fleet_base_3.py b/python/paddle/fluid/tests/unittests/test_fleet_base_3.py index 6be05f436328e0..8dcacafabbbf2f 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_base_3.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_base_3.py @@ -21,7 +21,7 @@ paddle.enable_static() -class TestFleetBase(unittest.TestCase): +class TestFleetBase_1(unittest.TestCase): def setUp(self): os.environ["POD_IP"] = "127.0.0.1" os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001" diff --git a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_new.py b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_new.py index a3c38c462cd23d..5e8be9a852273e 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_new.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_new.py @@ -268,264 +268,6 @@ def test_fs_gloo3(self): self.case(role, "server") self.clean(tmp) - def test_fs_gloo4(self): - plats = platform.platform() - if 'Linux' not in plats: - print("skip gloo UT on MacOS/Win") - return - - os.environ["TRAINING_ROLE"] = "WORKER" - os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001" - os.environ["POD_IP"] = "127.0.0.1" - os.environ["PADDLE_PORT"] = "36001" - - os.environ["SYS_JOB_ID"] = "gloo_for_cluster" - os.environ["PADDLE_WITH_GLOO"] = "1" - os.environ["PADDLE_GLOO_RENDEZVOUS"] = "3" - os.environ["PADDLE_GLOO_HTTP_ENDPOINT"] = "127.0.0.1:30019" - - role = role_maker.PaddleCloudRoleMaker(is_collecitve=True) - role._generate_role() - import time - time.sleep(3) - - def test_fs_gloo5(self): - plats = platform.platform() - if 'Linux' not in plats: - print("skip gloo UT on MacOS/Win") - 
return - - tmp = self.mkdir() - - os.environ["TRAINING_ROLE"] = "PSERVER" - os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001" - os.environ["POD_IP"] = "127.0.0.1" - os.environ["PADDLE_PORT"] = "36001" - os.environ["PADDLE_TRAINERS_NUM"] = "0" - - os.environ["SYS_JOB_ID"] = "gloo_for_cluster" - os.environ["PADDLE_WITH_GLOO"] = "2" - os.environ["PADDLE_GLOO_RENDEZVOUS"] = "2" - os.environ["PADDLE_GLOO_FS_PATH"] = tmp - - role = role_maker.PaddleCloudRoleMaker() - role._generate_role() - self.case(role, "server") - self.case(role, "all") - self.clean(tmp) - - def test_fs_gloo6(self): - plats = platform.platform() - if 'Linux' not in plats: - print("skip gloo UT on MacOS/Win") - return - - tmp = self.mkdir() - - os.environ["TRAINING_ROLE"] = "PSERVER" - os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001" - os.environ["POD_IP"] = "127.0.0.1" - os.environ["PADDLE_PORT"] = "36001" - os.environ["PADDLE_TRAINERS_NUM"] = "0" - - os.environ["SYS_JOB_ID"] = "gloo_for_cluster" - - os.environ["PADDLE_WITH_GLOO"] = "2" - os.environ["PADDLE_GLOO_RENDEZVOUS"] = "1" - os.environ["PADDLE_GLOO_FS_NAME"] = "NULL" - os.environ["PADDLE_GLOO_FS_UGI"] = "NULL" - os.environ["PADDLE_GLOO_FS_PATH"] = tmp - - role = role_maker.PaddleCloudRoleMaker() - role._generate_role() - self.case(role, "server") - self.case(role, "all") - self.clean(tmp) - - def test_fs_gloo7(self): - plats = platform.platform() - if 'Linux' not in plats: - print("skip gloo UT on MacOS/Win") - return - - os.environ["TRAINING_ROLE"] = "PSERVER" - os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001" - os.environ["POD_IP"] = "127.0.0.1" - os.environ["PADDLE_PORT"] = "36001" - os.environ["PADDLE_TRAINERS_NUM"] = "0" - - os.environ["SYS_JOB_ID"] = "gloo_for_cluster" - - os.environ["PADDLE_WITH_GLOO"] = "1" - os.environ["PADDLE_GLOO_RENDEZVOUS"] = "5" - - role = role_maker.PaddleCloudRoleMaker() - self.assertRaises(ValueError, role._generate_role) - - def test_fs_gloo8(self): - plats = platform.platform() - if 'Linux' not in plats: - print("skip gloo UT on MacOS/Win") - return - - tmp = self.mkdir() - - os.environ["TRAINING_ROLE"] = "PSERVER" - os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001" - os.environ["POD_IP"] = "127.0.0.1" - os.environ["PADDLE_PORT"] = "36001" - os.environ["PADDLE_TRAINERS_NUM"] = "0" - - os.environ["SYS_JOB_ID"] = "gloo_for_cluster" - - os.environ["PADDLE_WITH_GLOO"] = "2" - os.environ["PADDLE_GLOO_RENDEZVOUS"] = "1" - os.environ["PADDLE_GLOO_FS_NAME"] = "NULL" - os.environ["PADDLE_GLOO_FS_UGI"] = "NULL" - os.environ["PADDLE_GLOO_FS_PATH"] = tmp - - def net(): - x = paddle.fluid.layers.data(name='x', shape=[13], dtype='float32') - y_predict = paddle.fluid.layers.fc(input=x, size=1, act=None) - y = paddle.fluid.layers.data(name='y', shape=[1], dtype='float32') - cost = paddle.fluid.layers.square_error_cost( - input=y_predict, label=y) - avg_cost = paddle.fluid.layers.mean(cost) - return avg_cost - - from paddle.distributed import fleet - - role = role_maker.PaddleCloudRoleMaker() - fleet.init(role) - avg_cost = net() - - strategy = paddle.distributed.fleet.DistributedStrategy() - strategy.a_sync = False - - optimizer = paddle.optimizer.SGD(0.01) - optimizer = fleet.distributed_optimizer(optimizer, strategy) - optimizer.minimize(avg_cost) - - comm_world = "server" - fleet.util.barrier(comm_world) - - gather = fleet.util.all_gather(1, comm_world) - self.assertEqual(gather[0], 1) - - all_reduce = fleet.util.all_reduce(1, "sum", comm_world) - self.assertEqual(1, all_reduce) - - 
self.clean(tmp) - - -class TestGlooWithCloudRoleMaker(unittest.TestCase): - def setUp(self): - os.environ["PADDLE_TRAINERS_NUM"] = "1" - os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001" - os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001" - os.environ["POD_IP"] = "127.0.0.1" - os.environ["PADDLE_TRAINER_ID"] = "0" - - def case(self, role, comm_world): - role._barrier(comm_world) - - gather = role._all_gather(1, comm_world) - self.assertEqual(gather[0], 1) - - all_reduce = role._all_reduce(1, "sum", comm_world) - self.assertEqual(1, all_reduce) - - def mkdir(self): - tmp = tempfile.mkdtemp() - return tmp - - def clean(self, tmp): - shutil.rmtree(tmp) - - def test_hdfs_gloo(self): - plats = platform.platform() - if 'Linux' not in plats: - print("skip gloo UT on MacOS/Win") - return - - tmp = self.mkdir() - os.environ["TRAINING_ROLE"] = "TRAINER" - os.environ["SYS_JOB_ID"] = "gloo_for_cluster" - os.environ["PADDLE_WITH_GLOO"] = "1" - os.environ["PADDLE_GLOO_RENDEZVOUS"] = "1" - os.environ["PADDLE_GLOO_FS_NAME"] = "NULL" - os.environ["PADDLE_GLOO_FS_UGI"] = "NULL" - os.environ["PADDLE_GLOO_FS_PATH"] = tmp - - role = role_maker.PaddleCloudRoleMaker() - role._generate_role() - self.case(role, "worker") - self.clean(tmp) - - def test_fs_gloo(self): - plats = platform.platform() - if 'Linux' not in plats: - print("skip gloo UT on MacOS/Win") - return - - tmp = self.mkdir() - os.environ["TRAINING_ROLE"] = "TRAINER" - os.environ["SYS_JOB_ID"] = "gloo_for_cluster" - os.environ["PADDLE_WITH_GLOO"] = "1" - os.environ["PADDLE_GLOO_RENDEZVOUS"] = "2" - os.environ["PADDLE_GLOO_FS_PATH"] = tmp - - role = role_maker.PaddleCloudRoleMaker() - role._generate_role() - self.case(role, "worker") - self.clean(tmp) - - def test_fs_gloo2(self): - plats = platform.platform() - if 'Linux' not in plats: - print("skip gloo UT on MacOS/Win") - return - - tmp = self.mkdir() - os.environ["TRAINING_ROLE"] = "PSERVER" - os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001" - os.environ["POD_IP"] = "127.0.0.1" - os.environ["PADDLE_PORT"] = "36001" - - os.environ["SYS_JOB_ID"] = "gloo_for_cluster" - os.environ["PADDLE_WITH_GLOO"] = "1" - os.environ["PADDLE_GLOO_RENDEZVOUS"] = "2" - os.environ["PADDLE_GLOO_FS_PATH"] = tmp - - role = role_maker.PaddleCloudRoleMaker() - role._generate_role() - self.case(role, "server") - self.clean(tmp) - - def test_fs_gloo3(self): - plats = platform.platform() - if 'Linux' not in plats: - print("skip gloo UT on MacOS/Win") - return - - tmp = self.mkdir() - os.environ["TRAINING_ROLE"] = "PSERVER" - os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001" - os.environ["POD_IP"] = "127.0.0.1" - os.environ["PADDLE_PORT"] = "36001" - - os.environ["SYS_JOB_ID"] = "gloo_for_cluster" - os.environ["PADDLE_WITH_GLOO"] = "1" - os.environ["PADDLE_GLOO_RENDEZVOUS"] = "1" - os.environ["PADDLE_GLOO_FS_NAME"] = "NULL" - os.environ["PADDLE_GLOO_FS_UGI"] = "NULL" - os.environ["PADDLE_GLOO_FS_PATH"] = tmp - - role = role_maker.PaddleCloudRoleMaker() - role._generate_role() - self.case(role, "server") - self.clean(tmp) - def test_fs_gloo4(self): plats = platform.platform() if 'Linux' not in plats: diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 5da4a1889b6b43..ad53c815cd1c81 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -1658,20 +1658,20 @@ def body(i): i = layers.fill_constant(shape=[1], dtype='int64', value=0) ten = 
layers.fill_constant(shape=[1], dtype='int64', value=10) - def cond(i): + def cond1(i): return layers.less_than(i, ten) - def body(i): + def body1(i): return i + 1 - dy_ret = layers.while_loop(cond, body, [i]) + dy_ret = layers.while_loop(cond1, body1, [i]) with self.assertRaises(ValueError): j = layers.fill_constant(shape=[1], dtype='int64', value=0) def body2(i): return i + 1, i + 2 - layers.while_loop(cond, body2, [j]) + layers.while_loop(cond1, body2, [j]) self.assertTrue(np.array_equal(static_ret[0], dy_ret[0].numpy())) diff --git a/python/paddle/fluid/tests/unittests/test_matmul_op.py b/python/paddle/fluid/tests/unittests/test_matmul_op.py index b936567d5b5a81..aa67d923370171 100644 --- a/python/paddle/fluid/tests/unittests/test_matmul_op.py +++ b/python/paddle/fluid/tests/unittests/test_matmul_op.py @@ -207,8 +207,8 @@ def inject_test(dim_x, dim_y, trans_x, trans_y): # Test case more batch_size and N, M, K -def generate_compatible_shapes(dim_X, dim_Y, transpose_X, transpose_Y, - batch_size): +def generate_compatible_shapes_batch(dim_X, dim_Y, transpose_X, transpose_Y, + batch_size): BATCH_SIZE = 2 M = 3 N = 4 @@ -243,7 +243,7 @@ def generate_compatible_shapes(dim_X, dim_Y, transpose_X, transpose_Y, # Test case n-dim -def generate_compatible_shapes(dim, transpose_X, transpose_Y): +def generate_compatible_shapes_ndim(dim, transpose_X, transpose_Y): M = 2 N = 4 K = 3 @@ -270,8 +270,8 @@ def generate_compatible_shapes(dim, transpose_X, transpose_Y): test_name = ( 'TestMatMulOp_dimX_{}_dim_Y_{}_transX_{}_transY_{}'.format( dim, dim, transpose_X, transpose_Y)) - shape_X, shape_Y = generate_compatible_shapes(dim, transpose_X, - transpose_Y) + shape_X, shape_Y = generate_compatible_shapes_ndim(dim, transpose_X, + transpose_Y) globals()[test_name] = type(test_name, (Generator, OpTest), { 'shape_X': shape_X, 'shape_Y': shape_Y, diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_pipeline_parallel.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_pipeline_parallel.py index 35fd49dfffff83..7a4f7f9fbd62bd 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_pipeline_parallel.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_pipeline_parallel.py @@ -36,10 +36,10 @@ def test_pipeline_parallel(self): def test_hybrid_parallel_transformer(self): self.run_mnist_2gpu('hybrid_parallel_pp_transformer.py') - def test_hybrid_parallel_transformer(self): + def test_hybrid_parallel_save_load(self): self.run_mnist_2gpu('hybrid_parallel_pp_save_load.py') - def test_hybrid_parallel_transformer(self): + def test_hybrid_parallel_recompute(self): self.run_mnist_2gpu('hybrid_parallel_pp_recompute.py') diff --git a/python/paddle/fluid/tests/unittests/test_reduce_op.py b/python/paddle/fluid/tests/unittests/test_reduce_op.py index 047366145584e5..25bf60334e7f34 100644 --- a/python/paddle/fluid/tests/unittests/test_reduce_op.py +++ b/python/paddle/fluid/tests/unittests/test_reduce_op.py @@ -561,24 +561,6 @@ def setUp(self): } -class TestReduceAll(Test1DReduce): - def setUp(self): - self.op_type = "reduce_sum" - self.inputs = {'X': np.random.random((5, 6, 2, 10)).astype("float64")} - self.attrs = {'reduce_all': True} - self.outputs = {'Out': self.inputs['X'].sum()} - - -class TestReduceAll(Test1DReduce): - def setUp(self): - self.op_type = "reduce_sum" - self.inputs = { - 'X': np.random.random((2, 5, 3, 2, 2, 3, 4, 2)).astype("float64") - } - self.attrs = {'reduce_all': True, 'dim': (3, 4, 5)} - self.outputs = {'Out': 
self.inputs['X'].sum(axis=self.attrs['dim'])} - - @skip_check_grad_ci( reason="reduce_max is discontinuous non-derivable function," " its gradient check is not supported by unittest framework.") diff --git a/python/paddle/fluid/tests/unittests/test_sum_op.py b/python/paddle/fluid/tests/unittests/test_sum_op.py index f0fbd143c5a77a..b42520a6a1c139 100644 --- a/python/paddle/fluid/tests/unittests/test_sum_op.py +++ b/python/paddle/fluid/tests/unittests/test_sum_op.py @@ -49,9 +49,6 @@ def test_check_output(self): def test_check_grad(self): self.check_grad(['x0'], 'Out') - def init_kernel_type(self): - pass - class TestSelectedRowsSumOp(unittest.TestCase): def setUp(self): From 9cb65653d81a86f9c50f6a0b2a2e7fe2150fda0f Mon Sep 17 00:00:00 2001 From: Jacek Czaja Date: Mon, 16 Aug 2021 10:00:35 +0200 Subject: [PATCH 055/126] [oneDNN] Fix to 34554 (same as previous PR but should build with GPU) (#34859) * - Added softmax without caching * - Binary is no longer manually cached * - Activation onednn caching removed * - Removed manual caching of activation * - modified UT * - fix * - fix * - fixes to building * - fix * - fix * - fix to UT * - Faulty UT workaround * - approval workaround * - Fixes after review * - compilation fixes * - more lint fixes * - more fixes after review * - fixes after another round of review * - hopefully compilation fix - compilation fix --- .../mkldnn/elementwise_mkldnn_op.h | 19 +- .../mkldnn/elementwise_mul_mkldnn_op.cc | 10 +- .../operators/mkldnn/activation_mkldnn_op.cc | 11 +- .../operators/mkldnn/caching_tests.cmake | 7 +- .../fluid/operators/mkldnn/scale_mkldnn_op.cc | 8 +- .../operators/mkldnn/softmax_mkldnn_op.cc | 105 ++-- .../operators/mkldnn/test_mkldnn_caching.cc | 84 ++-- paddle/fluid/platform/mkldnn_reuse.h | 476 ++++++++++++------ 8 files changed, 441 insertions(+), 279 deletions(-) diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h index ddad70a6a5f31c..ffcdc079985fa6 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h @@ -47,13 +47,24 @@ class EltwiseMKLDNNKernel : public framework::OpKernel { float scale_o = ctx.Attr("Scale_out"); int axis = ctx.Attr("axis"); - platform::BinaryMKLDNNHandler handler( - BINARY_OP, axis, dev_ctx, mkldnn_engine, ctx.GetPlace(), x, y, z, - scale_x, scale_y, scale_o, ctx.OutputName("Out")); + platform::BinaryMKLDNNHandler handler(BINARY_OP, axis, mkldnn_engine, + ctx.GetPlace(), x, y, z, scale_x, + scale_y, scale_o); const auto src_x_memory = handler.AcquireSrcMemory(x); const auto src_y_memory = handler.AcquireSecondSrcMemory(y); - const auto dst_memory = handler.AcquireDstMemory(z); + // (jczaja) For Inplace src and dst should be the same memory object. + // So x should share buffer with z. But UT mechanics is testing inplace + // execution for this op not checking that x can be bradcasted to match in + // shape y tensor. + // This is wrong as when x is to be broadcasted then z(out) will match the + // shape of y which is bigger than x. Hence if x is smaller in shape than z + // and they share a buffer (of + // shape x) then this buffer is not big enough to hold result of elementwise + // operation. + auto dst_memory = (x->numel() == z->numel() && x->IsSharedBufferWith(*z)) + ? 
src_x_memory + : handler.AcquireDstMemory(z); const auto binary_prim = handler.AcquireForwardPrimitive(); diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc index 1c246e8d189370..af4aab8047888a 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc @@ -48,9 +48,8 @@ class EltwiseMulMKLDNNGradKernel : public ElemwiseGradKernel { if (dx) { // dx = dout*y platform::BinaryMKLDNNHandler handler( - dnnl::algorithm::binary_mul, axis, dev_ctx, mkldnn_engine, - ctx.GetPlace(), dout, y, dx, 1.0f, 1.0f, 1.0f, - ctx.InputName(framework::GradVarName("Out"))); + dnnl::algorithm::binary_mul, axis, mkldnn_engine, ctx.GetPlace(), + dout, y, dx, 1.0f, 1.0f, 1.0f); const auto src_dout_memory = handler.AcquireSrcMemory(dout); const auto src_y_memory = handler.AcquireSecondSrcMemory(y); @@ -75,9 +74,8 @@ class EltwiseMulMKLDNNGradKernel : public ElemwiseGradKernel { // Handler is having nullptr passed instead of output tensor as // we want Dst buffer to be allocated by oneDNN not to use Tensor platform::BinaryMKLDNNHandler handler( - dnnl::algorithm::binary_mul, axis, dev_ctx, mkldnn_engine, - ctx.GetPlace(), dout, x, nullptr, 1.0f, 1.0f, 1.0f, - ctx.InputName(framework::GradVarName("Out"))); + dnnl::algorithm::binary_mul, axis, mkldnn_engine, ctx.GetPlace(), + dout, x, nullptr, 1.0f, 1.0f, 1.0f); const auto src_dout_memory = handler.AcquireSrcMemory(dout); const auto src_x_memory = handler.AcquireSecondSrcMemory(x); diff --git a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc index 3b92d2e2d88913..d992890adeec3e 100644 --- a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc @@ -79,15 +79,15 @@ void eltwise_forward(const framework::ExecutionContext &ctx, paddle::platform::errors::PreconditionNotMet( "Operator DNNL eletwise_forward must use CPUPlace")); auto &dev_ctx = ctx.template device_context(); + const auto &mkldnn_engine = dev_ctx.GetEngine(); const auto *x = ctx.Input("X"); auto *y = ctx.Output("Out"); bool is_inplaced = x->IsSharedBufferWith(*y); - platform::ActivationMKLDNNHandler handler(algorithm, ctx, dev_ctx, - ctx.GetPlace(), x, - ctx.InputName("X"), is_inplaced); + platform::ActivationMKLDNNHandler handler(algorithm, ctx, mkldnn_engine, + ctx.GetPlace(), x); auto src_memory_p = handler.AcquireSrcMemory(x); auto dst_memory_p = is_inplaced ? 
src_memory_p : handler.AcquireDstMemory(y); @@ -106,13 +106,14 @@ template void eltwise_grad(const framework::ExecutionContext &ctx, mkldnn::algorithm algorithm) { auto &dev_ctx = ctx.template device_context(); + const auto &mkldnn_engine = dev_ctx.GetEngine(); const auto *x = ctx.Input("X"); const auto *diff_y = ctx.Input(framework::GradVarName("Out")); auto *diff_x = ctx.Output(framework::GradVarName("X")); - platform::ActivationMKLDNNHandler handler( - algorithm, ctx, dev_ctx, ctx.GetPlace(), x, diff_y, ctx.InputName("X")); + platform::ActivationMKLDNNHandler handler(algorithm, ctx, mkldnn_engine, + ctx.GetPlace(), x, diff_y); auto src_memory_p = handler.AcquireBackwardSrcMemory(x); auto diff_dst_memory_p = handler.AcquireDiffDstMemory(diff_y); diff --git a/paddle/fluid/operators/mkldnn/caching_tests.cmake b/paddle/fluid/operators/mkldnn/caching_tests.cmake index 4130c295b203eb..f48a5d822f8dc8 100644 --- a/paddle/fluid/operators/mkldnn/caching_tests.cmake +++ b/paddle/fluid/operators/mkldnn/caching_tests.cmake @@ -1 +1,6 @@ -cc_test(test_mkldnn_caching SRCS mkldnn/test_mkldnn_caching.cc DEPS op_registry elementwise_mul_op elementwise_add_op activation_op softmax_op softmax scope device_context enforce) +set(TEST_MKLDNN_CACHING_DEPS op_registry elementwise_mul_op elementwise_add_op activation_op softmax_op conv_op im2col vol2col softmax scope device_context enforce) +if (WITH_GPU OR WITH_ROCM) + set(TEST_MKLDNN_CACHING_DEPS ${TEST_MKLDNN_CACHING_DEPS} depthwise_conv) +endif() +cc_test(test_mkldnn_caching SRCS mkldnn/test_mkldnn_caching.cc DEPS ${TEST_MKLDNN_CACHING_DEPS}) + diff --git a/paddle/fluid/operators/mkldnn/scale_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/scale_mkldnn_op.cc index ae17048b5d568b..84ac14d04b85b3 100644 --- a/paddle/fluid/operators/mkldnn/scale_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/scale_mkldnn_op.cc @@ -29,6 +29,7 @@ class ScaleMKLDNNKernel : public framework::OpKernel { void RunKernel(const framework::ExecutionContext& ctx) const { const auto& dev_ctx = ctx.template device_context(); + const auto& mkldnn_engine = dev_ctx.GetEngine(); auto* x = ctx.Input("X"); auto* out = ctx.Output("Out"); @@ -36,11 +37,12 @@ class ScaleMKLDNNKernel : public framework::OpKernel { bool is_inplaced = x->IsSharedBufferWith(*out); platform::ActivationMKLDNNHandler handler( - mkldnn::algorithm::eltwise_linear, ctx, dev_ctx, ctx.GetPlace(), x, - ctx.InputName("X"), is_inplaced); + mkldnn::algorithm::eltwise_linear, ctx, mkldnn_engine, ctx.GetPlace(), + x); auto src_memory_p = handler.AcquireSrcMemory(x); - auto dst_memory_p = handler.AcquireDstMemory(out); + auto dst_memory_p = + is_inplaced ? 
src_memory_p : handler.AcquireDstMemory(out); auto activation_p = handler.AcquireForwardPrimitive(); auto& astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream(); diff --git a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc index e065800e4d1c71..b0f27719bf9adc 100644 --- a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc @@ -32,69 +32,56 @@ using platform::to_void_cast; template class SoftmaxMKLDNNHandler - : public platform::MKLDNNHandlerT { + : public platform::MKLDNNHandlerNoCachingT { public: - SoftmaxMKLDNNHandler(const MKLDNNDeviceContext& dev_ctx, - const mkldnn::engine mkldnn_engine, + SoftmaxMKLDNNHandler(const mkldnn::engine mkldnn_engine, platform::Place cpu_place, const Tensor* input, - Tensor* output, const int axis, - const std::string uniq_name, bool is_inplaced) - : platform::MKLDNNHandlerT( - dev_ctx, mkldnn_engine, cpu_place, - // Softmax may be inplace then uniq_name is no longer unique - is_inplaced ? platform::CreateKey( - dev_ctx, framework::vectorize(input->dims()), - axis, uniq_name) - : platform::CreateKey( - dev_ctx, framework::vectorize(input->dims()), - uniq_name)) { - if (!this->isCached()) { - PADDLE_ENFORCE_EQ( - input->dims(), output->dims(), - platform::errors::InvalidArgument( - "The shape of input and output tensor must be identical.")); - - auto softmax_tz = framework::vectorize(input->dims()); - auto md = memory::desc(softmax_tz, platform::MKLDNNGetDataType(), - input->format()); - - this->AcquireForwardPrimitiveDescriptor(prop_kind::forward_scoring, md, - axis); - } + Tensor* output, const int axis) + : platform::MKLDNNHandlerNoCachingT( + mkldnn_engine, cpu_place) { + PADDLE_ENFORCE_EQ( + input->dims(), output->dims(), + platform::errors::InvalidArgument( + "The shape of input and output tensor must be identical.")); + + auto softmax_tz = framework::vectorize(input->dims()); + auto md = memory::desc(softmax_tz, platform::MKLDNNGetDataType(), + input->format()); + + this->AcquireForwardPrimitiveDescriptor(prop_kind::forward_scoring, md, + axis); } SoftmaxMKLDNNHandler(const framework::ExecutionContext& ctx, - const MKLDNNDeviceContext& dev_ctx, + const mkldnn::engine mkldnn_engine, platform::Place cpu_place, const Tensor* out, const Tensor* out_grad, Tensor* in_x_grad, const std::string& unique_name) - : platform::MKLDNNHandlerT( - dev_ctx, dev_ctx.GetEngine(), cpu_place, - platform::CreateKey(dev_ctx, framework::vectorize(out->dims()), - unique_name)) { - if (!this->isBwdCached()) { - PADDLE_ENFORCE_EQ( - out_grad->dims(), in_x_grad->dims(), - platform::errors::InvalidArgument("The shape of softmax_grad's input " - "and output must be identical.")); - - auto dims = out_grad->dims(); // input and output share the same shape - const int axis = CanonicalAxis(ctx.Attr("axis"), dims.size()); - auto softmax_tz = framework::vectorize(dims); - - auto data_softmax_md = MKLDNNMemDesc( - softmax_tz, platform::MKLDNNGetDataType(), out->format()); - auto diff_softmax_md = MKLDNNMemDesc( - softmax_tz, platform::MKLDNNGetDataType(), out_grad->format()); - - this->AcquireForwardPrimitiveDescriptor(prop_kind::forward_scoring, - data_softmax_md, axis); - this->AcquireBackwardPrimitiveDescriptor(diff_softmax_md, data_softmax_md, - axis); - } + : platform::MKLDNNHandlerNoCachingT( + mkldnn_engine, cpu_place) { + PADDLE_ENFORCE_EQ(out_grad->dims(), in_x_grad->dims(), + platform::errors::InvalidArgument( + "The shape of softmax_grad's input " + 
"and output must be identical, but shapes differ, " + "out_grad: %s in_grad: %s", + out_grad->dims(), in_x_grad->dims())); + + auto dims = out_grad->dims(); // input and output share the same shape + const int axis = CanonicalAxis(ctx.Attr("axis"), dims.size()); + auto softmax_tz = framework::vectorize(dims); + + auto data_softmax_md = MKLDNNMemDesc( + softmax_tz, platform::MKLDNNGetDataType(), out->format()); + auto diff_softmax_md = MKLDNNMemDesc( + softmax_tz, platform::MKLDNNGetDataType(), out_grad->format()); + + this->AcquireForwardPrimitiveDescriptor(prop_kind::forward_scoring, + data_softmax_md, axis); + this->AcquireBackwardPrimitiveDescriptor(diff_softmax_md, data_softmax_md, + axis); } }; @@ -111,9 +98,8 @@ class SoftmaxMKLDNNKernel : public paddle::framework::OpKernel { const int axis = CanonicalAxis(ctx.Attr("axis"), input->dims().size()); - SoftmaxMKLDNNHandler handler(dev_ctx, mkldnn_engine, ctx.GetPlace(), - input, output, axis, ctx.OutputName("Out"), - is_inplaced); + SoftmaxMKLDNNHandler handler(mkldnn_engine, ctx.GetPlace(), input, + output, axis); auto softmax_src_memory_p = handler.AcquireSrcMemory(input); // For Inplace src and and dst are the same memory object @@ -149,11 +135,12 @@ class SoftmaxMKLDNNGradKernel : public paddle::framework::OpKernel { paddle::platform::errors::PreconditionNotMet( "Operator DNNL SoftmaxGrad must use CPUPlace")); auto& dev_ctx = ctx.template device_context(); + const auto& mkldnn_engine = dev_ctx.GetEngine(); const Tensor* output = ctx.Input("Out"); auto* out_grad = ctx.template Input(framework::GradVarName("Out")); auto* in_x_grad = ctx.template Output(framework::GradVarName("X")); - SoftmaxMKLDNNHandler handler(ctx, dev_ctx, ctx.GetPlace(), output, + SoftmaxMKLDNNHandler handler(ctx, mkldnn_engine, ctx.GetPlace(), output, out_grad, in_x_grad, ctx.InputName("Out")); auto dst_memory_p = handler.AcquireDstMemory(output); diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc index cad4f47ec14022..7251653793f899 100644 --- a/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc +++ b/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc @@ -33,6 +33,8 @@ USE_OP(relu); USE_OP_DEVICE_KERNEL(relu, MKLDNN); USE_OP(softmax); USE_OP_DEVICE_KERNEL(softmax, MKLDNN); +USE_OP(conv2d); +USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, FP32); namespace paddle { namespace operators { @@ -64,16 +66,19 @@ class CacheTester { template void RunOperator(const platform::Place &place, const std::string &op_type, - const framework::DDim &dims, const std::string &output_name, - bool inplace = false) { + const framework::DDim &dims, const std::string &first_input) { framework::Scope scope; std::map num_inputs = {{"softmax", 1}, {"relu", 1}, + {"conv2d", 2}, {"elementwise_add", 2}, {"elementwise_mul", 2}}; - std::string first_input = inplace == true ? output_name : "x"; + std::string first_input_var_name = (op_type == "conv2d") ? "Input" : "X"; + std::string second_input_var_name = (op_type == "conv2d") ? "Filter" : "Y"; + std::string output_var_name = (op_type == "conv2d") ? "Output" : "Out"; + std::string output_name = "output"; std::vector input_names = { {first_input, scope.Var(first_input)->GetMutable()}, @@ -113,71 +118,40 @@ void RunOperator(const platform::Place &place, const std::string &op_type, auto &pool = platform::DeviceContextPool::Instance(); - auto op = num_inputs[op_type] > 1 - ? 
framework::OpRegistry::CreateOp( - op_type, {{"X", {first_input}}, {"Y", {"x1"}}}, - {{"Out", {output_name}}}, {{"use_mkldnn", {true}}}) - : framework::OpRegistry::CreateOp( - op_type, {{"X", {first_input}}}, {{"Out", {output_name}}}, - {{"use_mkldnn", {true}}}); + auto op = + num_inputs[op_type] > 1 + ? framework::OpRegistry::CreateOp( + op_type, {{first_input_var_name, {first_input}}, + {second_input_var_name, {"x1"}}}, + {{output_var_name, {output_name}}}, {{"use_mkldnn", {true}}}) + : framework::OpRegistry::CreateOp( + op_type, {{first_input_var_name, {first_input}}}, + {{output_var_name, {output_name}}}, {{"use_mkldnn", {true}}}); op->Run(scope, place); pool.Get(place)->Wait(); } -TEST(test_softmax_reuse_cache, cpu_place) { - framework::DDim dims({32, 64}); +TEST(test_conv2d_reuse_cache, cpu_place) { + framework::DDim dims({1, 16, 32, 64}); platform::CPUPlace p; CacheTester ct; - RunOperator(p, "softmax", dims, "softmax_out"); - RunOperator(p, "softmax", dims, "softmax_out"); - PADDLE_ENFORCE_EQ(ct.Analyze(4), true, + RunOperator(p, "conv2d", dims, "input_signal"); + RunOperator(p, "conv2d", dims, "input_signal"); + PADDLE_ENFORCE_EQ(ct.Analyze(9), true, platform::errors::InvalidArgument( - "Wrong number of cached oneDNN objects")); + "Invalid number of cached oneDNN objects")); } -TEST(test_softmax_noreuse_cache, cpu_place) { - framework::DDim dims({32, 64}); +TEST(test_conv2d_noreuse_cache, cpu_place) { + framework::DDim dims({1, 16, 32, 64}); platform::CPUPlace p; CacheTester ct; - RunOperator(p, "softmax", dims, "softmax_out"); - RunOperator(p, "softmax", dims, "softmax_out2"); - PADDLE_ENFORCE_EQ(ct.Analyze(8), true, + RunOperator(p, "conv2d", dims, "input_signal"); + RunOperator(p, "conv2d", dims, "input_signal2"); + PADDLE_ENFORCE_EQ(ct.Analyze(18), true, platform::errors::InvalidArgument( - "Wrong number of cached oneDNN objects")); -} - -TEST(test_softmax_inplace_cache, cpu_place) { - framework::DDim dims({32, 64}); - platform::CPUPlace p; - CacheTester ct; - RunOperator(p, "softmax", dims, "softmax_out"); - RunOperator(p, "softmax", dims, "softmax_out", true); - PADDLE_ENFORCE_EQ(ct.Analyze(7), true, - platform::errors::InvalidArgument( - "Wrong number of cached oneDNN objects")); -} - -TEST(test_relu_inplace_cache, cpu_place) { - framework::DDim dims({32, 64}); - platform::CPUPlace p; - CacheTester ct; - RunOperator(p, "relu", dims, "relu_out"); - RunOperator(p, "relu", dims, "relu_out", true); - PADDLE_ENFORCE_EQ(ct.Analyze(7), true, - platform::errors::InvalidArgument( - "Wrong number of cached oneDNN objects")); -} - -TEST(test_elementwise_add_reuse_cache, cpu_place) { - framework::DDim dims({32, 64}); - platform::CPUPlace p; - CacheTester ct; - RunOperator(p, "elementwise_add", dims, "elementwise_add_out"); - RunOperator(p, "relu", dims, "elementwise_add_out", true); - PADDLE_ENFORCE_EQ(ct.Analyze(8), true, - platform::errors::InvalidArgument( - "Wrong number of cached oneDNN objects")); + "Invalid number of cached oneDNN objects")); } } // namespace operators diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index f63d45d7ff6ae6..95b8e0c610b1d4 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -34,6 +34,211 @@ using framework::Tensor; using user_function = std::function(const float*)>; using memory = mkldnn::memory; +template +class MKLDNNHandlerNoCachingT { + public: + MKLDNNHandlerNoCachingT(mkldnn::engine engine, platform::Place cpu_place) + : engine_(engine), place_(cpu_place), 
fwd_pd_(nullptr), bwd_pd_(nullptr) { + platform::MKLDNNDeviceContext::tls().log_lib_version(); + } + + std::shared_ptr AcquireForwardPrimitive() { + return std::make_shared(*fwd_pd_); + } + + std::shared_ptr AcquireBackwardPrimitive() { + return std::make_shared(*bwd_pd_); + } + + std::shared_ptr AcquireBackwardWeightsPrimitive() { + PADDLE_ENFORCE_NOT_NULL( + bwd_w_pd_, platform::errors::Unavailable("BWD_PD should be set when " + "getting BWD prim .")); + return std::make_shared(*bwd_w_pd_); + } + + std::shared_ptr AcquireSrcMemory( + const framework::Tensor* input) { + const T* input_data = input->data(); + return this->AcquireMemoryFromPrimitive(fwd_pd_->src_desc(), + to_void_cast(input_data)); + } + + template + std::shared_ptr AcquireDstMemory(framework::Tensor* output) { + T_out* ptr = + output->mutable_data(place_, fwd_pd_->dst_desc().get_size()); + return this->AcquireMemoryFromPrimitive(fwd_pd_->dst_desc(), ptr); + } + + template + std::shared_ptr AcquireDstMemory(void) { + return this->AcquireMemoryFromPrimitive(fwd_pd_->dst_desc()); + } + + template + std::shared_ptr AcquireDstMemory( + const framework::Tensor* output) { + const T_out* output_data = output->data(); + return this->AcquireMemoryFromPrimitive(bwd_pd_->dst_desc(), + to_void_cast(output_data)); + } + + std::shared_ptr AcquireDiffDstMemory( + const framework::Tensor* diffdst) { + const T* ptr = diffdst->data(); + return this->AcquireMemoryFromPrimitive(bwd_pd_->diff_dst_desc(), + to_void_cast(ptr)); + } + + std::shared_ptr AcquireDiffSrcMemory( + framework::Tensor* diffsrc) { + T* ptr = + diffsrc->mutable_data(place_, bwd_pd_->diff_src_desc().get_size()); + return this->AcquireMemoryFromPrimitive(bwd_pd_->diff_src_desc(), ptr); + } + + // Buffer of given Tensor is used for oneDNN computation + std::shared_ptr AcquireDiffWeightsMemory( + framework::Tensor* diff_weights) { + PADDLE_ENFORCE_NOT_NULL( + bwd_w_pd_, + platform::errors::Unavailable( + "BWD_W_PD should be set when getting BWD grad of weights.")); + T* ptr = diff_weights->mutable_data( + place_, bwd_w_pd_->diff_weights_desc().get_size()); + return this->AcquireMemoryFromPrimitive(bwd_w_pd_->diff_weights_desc(), + ptr); + } + + // Buffer is allocated by oneDNN to store computation results + std::shared_ptr AcquireDiffWeightsMemory(void) { + PADDLE_ENFORCE_NOT_NULL( + bwd_w_pd_, + platform::errors::Unavailable( + "BWD_W_PD should be set when getting BWD grad of weights.")); + return this->AcquireMemoryFromPrimitive(bwd_w_pd_->diff_weights_desc()); + } + + protected: + // If your primitive descriptor requires attributes, pass them as a + // first argument and paramters to descriptor constructor in the following + // arguments. Otherwise, all arguments will be forwarded to descriptor + // constructor, including the first one. + template + void AcquireForwardPrimitiveDescriptor(Arg&& first_arg, Args&&... args) { + CreateForwardPrimitiveDescriptor(first_arg, std::forward(args)...); + } + + // Using sfinae to specialise variadic function. Workaround for not having + // if constexpr in C++ 11. + template + typename std::enable_if::type, + dnnl::primitive_attr>::value>::type + CreateForwardPrimitiveDescriptor(First&& first, Args&&... args) { + auto fwd_desc = typename TForward::desc(std::forward(args)...); + fwd_pd_ = std::make_shared( + fwd_desc, first, engine_); + } + + template + typename std::enable_if::type, + dnnl::primitive_attr>::value>::type + CreateForwardPrimitiveDescriptor(First&& first, Args&&... 
args) { + auto fwd_desc = typename TForward::desc(std::forward(first), + std::forward(args)...); + fwd_pd_ = + std::make_shared(fwd_desc, engine_); + } + + template + void AcquireBackwardPrimitiveDescriptor(Args&&... args) { + // fwd_pd_ is set during grad by calling + // AcquireForwardPrimitiveDescriptor + PADDLE_ENFORCE_NOT_NULL(fwd_pd_, + platform::errors::Unavailable( + "Get MKLDNN Forward primitive %s failed.")); + auto bwd_desc = typename TBackward::desc(std::forward(args)...); + bwd_pd_ = std::make_shared( + bwd_desc, engine_, *fwd_pd_); + } + + template + void AcquireBackwardWeightsPrimitiveDescriptor(Args&&... args) { + // fwd_pd_ is set during grad by calling + // AcquireForwardPrimitiveDescriptor + PADDLE_ENFORCE_NOT_NULL(fwd_pd_, + platform::errors::Unavailable( + "Get MKLDNN Forward primitive %s failed.")); + auto bwd_desc = + typename TBackward_params::desc(std::forward(args)...); + bwd_w_pd_ = std::make_shared( + bwd_desc, engine_, *fwd_pd_); + } + + std::shared_ptr AcquireMemoryFromPrimitive( + mkldnn::memory::desc md, void* ptr) { + return std::make_shared(md, engine_, ptr); + } + + std::shared_ptr AcquireMemoryFromPrimitive( + mkldnn::memory::desc md) { + return std::make_shared(md, engine_); + } + + void AcquireReorder(const std::shared_ptr& user_memory_p, + const std::shared_ptr& target_memory_p) { + auto reorder_p = + std::make_shared(*user_memory_p, *target_memory_p); + + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); + + platform::RecordEvent record_reorder("int_reorder", + platform::EventRole::kUniqueOp); + reorder_p->execute(astream, {{MKLDNN_ARG_FROM, *user_memory_p}, + {MKLDNN_ARG_TO, *target_memory_p}}); + astream.wait(); + } + + template + std::shared_ptr AcquireMemoryWithReorder( + const mkldnn::memory::desc& user_md, + const mkldnn::memory::desc& target_md, void* ptr, + const std::string& suffix, bool is_persistent = false, + std::function(const F*)> custom_reorder_func = {}) { + std::shared_ptr target_memory_p; + if (custom_reorder_func) { + auto reordered_data = + custom_reorder_func(reinterpret_cast(ptr)); + ptr = reinterpret_cast(reordered_data.get()); + } + auto user_memory_p = std::make_shared(user_md, engine_, ptr); + if (user_md != target_md) { + target_memory_p = std::make_shared(target_md, engine_); + auto reorder_p = + std::make_shared(*user_memory_p, *target_memory_p); + + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); + platform::RecordEvent record_reorder("int_reorder", + platform::EventRole::kUniqueOp); + reorder_p->execute(astream, {{MKLDNN_ARG_FROM, *user_memory_p}, + {MKLDNN_ARG_TO, *target_memory_p}}); + astream.wait(); + } else { + target_memory_p = user_memory_p; + } + return target_memory_p; + } + + mkldnn::engine engine_; + platform::Place place_; + std::shared_ptr fwd_pd_; + std::shared_ptr bwd_pd_; + std::shared_ptr bwd_w_pd_; +}; + template @@ -79,7 +284,7 @@ class MKLDNNHandlerT { std::static_pointer_cast(dev_ctx_.GetBlob(key_p)); if (backward_p == nullptr) { PADDLE_ENFORCE_NOT_NULL(bwd_w_pd_, platform::errors::Unavailable( - "Error: BWD_PD should be set when " + "BWD_PD should be set when " "getting BWD prim witk key: %s .", key_p)); backward_p = std::make_shared(*bwd_w_pd_); @@ -138,7 +343,7 @@ class MKLDNNHandlerT { PADDLE_ENFORCE_NOT_NULL( bwd_w_pd_, platform::errors::Unavailable( - "Error: BWD_W_PD should be set when getting BWD grad of weights.")); + "BWD_W_PD should be set when getting BWD grad of weights.")); T* ptr = diff_weights->mutable_data( place_, 
bwd_w_pd_->diff_weights_desc().get_size()); return this->AcquireMemoryFromPrimitive(bwd_w_pd_->diff_weights_desc(), ptr, @@ -150,7 +355,7 @@ class MKLDNNHandlerT { PADDLE_ENFORCE_NOT_NULL( bwd_w_pd_, platform::errors::Unavailable( - "Error: BWD_W_PD should be set when getting BWD grad of weights.")); + "BWD_W_PD should be set when getting BWD grad of weights.")); return this->AcquireMemoryFromPrimitive(bwd_w_pd_->diff_weights_desc(), "@diff_wei_mem_p"); } @@ -589,70 +794,70 @@ class MKLDNNHandler { }; template -class BinaryMKLDNNHandler : public platform::MKLDNNHandlerT { +class BinaryMKLDNNHandler + : public platform::MKLDNNHandlerNoCachingT { public: BinaryMKLDNNHandler(const dnnl::algorithm algo, const int axis, - const MKLDNNDeviceContext& dev_ctx, const mkldnn::engine engine, platform::Place cpu_place, const Tensor* x, const Tensor* y, Tensor* z, - float scale_x, float scale_y, float scale_z, - const std::string& uniq_name) - : platform::MKLDNNHandlerT( - dev_ctx, engine, cpu_place, - platform::CreateKey(dev_ctx, framework::vectorize(x->dims()), - uniq_name)) { - if (!this->isCached()) { - PADDLE_ENFORCE_EQ( - x->layout(), DataLayout::kMKLDNN, - platform::errors::InvalidArgument("Wrong layout set for X tensor.")); - PADDLE_ENFORCE_NE( - x->format(), MKLDNNMemoryFormat::undef, - platform::errors::InvalidArgument("Wrong format set for X tensor.")); - - PADDLE_ENFORCE_EQ( - y->layout(), DataLayout::kMKLDNN, - platform::errors::InvalidArgument("Wrong layout set for Y tensor.")); - PADDLE_ENFORCE_NE( - y->format(), MKLDNNMemoryFormat::undef, - platform::errors::InvalidArgument("Wrong format set for Y tensor.")); - - const auto src_x_tz = framework::vectorize(x->dims()); - const auto src_y_tz = framework::vectorize(y->dims()); - // if output tensor(z) is nullptr then we are computing into oneDNN - // managed buffer - auto rankdiff = x->dims().size() - y->dims().size(); - const auto dst_tz = (z == nullptr) ? (rankdiff > 0 ? src_x_tz : src_y_tz) - : framework::vectorize(z->dims()); - - auto src0_md = dnnl::memory::desc( - src_x_tz, platform::MKLDNNGetDataType(), x->format()); - auto src1_md = dnnl::memory::desc( - src_y_tz, platform::MKLDNNGetDataType(), y->format()); - if (rankdiff > 0) { // Second input is of smaller rank than first - std::vector dims1_ex(rankdiff, 1); - dims1_ex.insert(next(dims1_ex.begin(), (axis == -1 ? rankdiff : axis)), - src_y_tz.begin(), src_y_tz.end()); - src1_md = src1_md.reshape(dims1_ex); - } else if (rankdiff < 0) { // First input is of smaller than second - std::vector dims0_ex(-rankdiff, 1); - dims0_ex.insert(next(dims0_ex.begin(), (axis == -1 ? -rankdiff : axis)), - src_x_tz.begin(), src_x_tz.end()); - src0_md = src0_md.reshape(dims0_ex); - } - const auto dst_md = memory::desc(dst_tz, platform::MKLDNNGetDataType(), - MKLDNNMemoryFormat::any); - - auto attributes = CreateAttributes(algo, scale_x, scale_y, scale_z); - this->AcquireForwardPrimitiveDescriptor(attributes, algo, src0_md, - src1_md, dst_md); + float scale_x, float scale_y, float scale_z) + : platform::MKLDNNHandlerNoCachingT(engine, cpu_place) { + PADDLE_ENFORCE_EQ( + x->layout(), DataLayout::kMKLDNN, + platform::errors::InvalidArgument( + "Wrong layout set for X tensor. 
Expected: %d (kMKLDNN), Actual: %d", + DataLayout::kMKLDNN, x->layout())); + PADDLE_ENFORCE_NE(x->format(), MKLDNNMemoryFormat::undef, + platform::errors::InvalidArgument( + "Wrong format set for X tensor : %d (undef)", + static_cast(x->format()))); + + PADDLE_ENFORCE_EQ( + y->layout(), DataLayout::kMKLDNN, + platform::errors::InvalidArgument( + "Wrong layout set for Y tensor. Expected: %d (kMKLDNN), Actual: %d", + DataLayout::kMKLDNN, y->layout())); + PADDLE_ENFORCE_NE(y->format(), MKLDNNMemoryFormat::undef, + platform::errors::InvalidArgument( + "Wrong format set for Y tensor : %d (undef)", + static_cast(y->format()))); + + const auto src_x_tz = framework::vectorize(x->dims()); + const auto src_y_tz = framework::vectorize(y->dims()); + // if output tensor(z) is nullptr then we are computing into oneDNN + // managed buffer + auto rankdiff = x->dims().size() - y->dims().size(); + const auto dst_tz = (z == nullptr) ? (rankdiff > 0 ? src_x_tz : src_y_tz) + : framework::vectorize(z->dims()); + + auto src0_md = dnnl::memory::desc( + src_x_tz, platform::MKLDNNGetDataType(), x->format()); + auto src1_md = dnnl::memory::desc( + src_y_tz, platform::MKLDNNGetDataType(), y->format()); + if (rankdiff > 0) { // Second input is of smaller rank than first + std::vector dims1_ex(rankdiff, 1); + dims1_ex.insert(next(dims1_ex.begin(), (axis == -1 ? rankdiff : axis)), + src_y_tz.begin(), src_y_tz.end()); + src1_md = src1_md.reshape(dims1_ex); + } else if (rankdiff < 0) { // First input is of smaller than second + std::vector dims0_ex(-rankdiff, 1); + dims0_ex.insert(next(dims0_ex.begin(), (axis == -1 ? -rankdiff : axis)), + src_x_tz.begin(), src_x_tz.end()); + src0_md = src0_md.reshape(dims0_ex); } + const auto dst_md = memory::desc(dst_tz, platform::MKLDNNGetDataType(), + MKLDNNMemoryFormat::any); + + auto attributes = CreateAttributes(algo, scale_x, scale_y, scale_z); + this->AcquireForwardPrimitiveDescriptor(attributes, algo, src0_md, src1_md, + dst_md); } std::shared_ptr AcquireSecondSrcMemory( const framework::Tensor* input) { const T* input_data = input->data(); - return this->AcquireMemoryFromPrimitive( - this->fwd_pd_->src1_desc(), to_void_cast(input_data), "@src1_mem_p"); + return this->AcquireMemoryFromPrimitive(this->fwd_pd_->src1_desc(), + to_void_cast(input_data)); } private: @@ -775,111 +980,95 @@ class ReductionMKLDNNHandler template class ActivationMKLDNNHandler - : public MKLDNNHandlerT { + : public MKLDNNHandlerNoCachingT { public: ActivationMKLDNNHandler(mkldnn::algorithm algorithm, const framework::ExecutionContext& ctx, - const MKLDNNDeviceContext& dev_ctx, Place cpu_place, - const framework::Tensor* in_x, - const std::string& unique_name, bool is_inplaced) - : platform::MKLDNNHandlerT( - dev_ctx, dev_ctx.GetEngine(), cpu_place, - is_inplaced ? platform::CreateKey( - dev_ctx, framework::vectorize(in_x->dims()), "a", - algorithm, unique_name) - : platform::CreateKey( - dev_ctx, framework::vectorize(in_x->dims()), "a", - unique_name)) { - if (!this->isCached()) { - float alpha = ctx.HasAttr("alpha") ? ctx.Attr("alpha") : 0; - float beta = ctx.HasAttr("beta") ? ctx.Attr("beta") : 0; - // eltwise_linear means we are in scale op - if (algorithm == mkldnn::algorithm::eltwise_linear) { - bool bias_after_scale = ctx.Attr("bias_after_scale"); - auto* scale_tensor = ctx.Input("ScaleTensor"); - alpha = (scale_tensor == nullptr) ? 
ctx.Attr("scale") - : (float)*(scale_tensor->data()); - beta = ctx.Attr("bias"); - // if bias_after_scale == true - // out = scale*X + bias - // else - // out = scale*(X + bias) = scale*X + scale*bias - if (!bias_after_scale) beta *= alpha; - } else { - // paddle uses beta but mkldnn uses alpha for swish - if (algorithm == mkldnn::algorithm::eltwise_swish) { - std::swap(alpha, beta); - } else if (algorithm == dnnl::algorithm::eltwise_bounded_relu) { - alpha = ctx.Attr("threshold"); - } + const mkldnn::engine engine, Place cpu_place, + const framework::Tensor* in_x) + : platform::MKLDNNHandlerNoCachingT(engine, + cpu_place) { + float alpha = ctx.HasAttr("alpha") ? ctx.Attr("alpha") : 0; + float beta = ctx.HasAttr("beta") ? ctx.Attr("beta") : 0; + // eltwise_linear means we are in scale op + if (algorithm == mkldnn::algorithm::eltwise_linear) { + bool bias_after_scale = ctx.Attr("bias_after_scale"); + auto* scale_tensor = ctx.Input("ScaleTensor"); + alpha = (scale_tensor == nullptr) ? ctx.Attr("scale") + : (float)*(scale_tensor->data()); + beta = ctx.Attr("bias"); + // if bias_after_scale == true + // out = scale*X + bias + // else + // out = scale*(X + bias) = scale*X + scale*bias + if (!bias_after_scale) beta *= alpha; + } else { + // paddle uses beta but mkldnn uses alpha for swish + if (algorithm == mkldnn::algorithm::eltwise_swish) { + std::swap(alpha, beta); + } else if (algorithm == dnnl::algorithm::eltwise_bounded_relu) { + alpha = ctx.Attr("threshold"); } + } - PADDLE_ENFORCE(in_x->dims().size() >= 1 || in_x->dims().size() <= 6, - platform::errors::Unimplemented( - "Input dimension size can be 1, 2, 3, 4, " - "5, or 6, but now the dimension size is", - in_x->dims().size())); + PADDLE_ENFORCE(in_x->dims().size() >= 1 || in_x->dims().size() <= 6, + platform::errors::Unimplemented( + "Input dimension size can be 1, 2, 3, 4, " + "5, or 6, but now the dimension size is", + in_x->dims().size())); - auto src_tz = framework::vectorize(in_x->dims()); - auto src_fmt = - src_tz.size() == 2 ? MKLDNNMemoryFormat::nc : in_x->format(); - auto md = mkldnn::memory::desc(src_tz, platform::MKLDNNGetDataType(), - src_fmt); + auto src_tz = framework::vectorize(in_x->dims()); + auto src_fmt = src_tz.size() == 2 ? MKLDNNMemoryFormat::nc : in_x->format(); + auto md = + mkldnn::memory::desc(src_tz, platform::MKLDNNGetDataType(), src_fmt); - this->AcquireForwardPrimitiveDescriptor( - mkldnn::prop_kind::forward_training, algorithm, md, alpha, beta); - } + this->AcquireForwardPrimitiveDescriptor(mkldnn::prop_kind::forward_training, + algorithm, md, alpha, beta); } ActivationMKLDNNHandler(mkldnn::algorithm algorithm, const framework::ExecutionContext& ctx, - const MKLDNNDeviceContext& dev_ctx, Place cpu_place, - const framework::Tensor* in_x, const Tensor* out_grad, - const std::string& unique_name) - : platform::MKLDNNHandlerT( - dev_ctx, dev_ctx.GetEngine(), cpu_place, - platform::CreateKey(dev_ctx, framework::vectorize(in_x->dims()), - "a", unique_name)) { - if (!this->isBwdCached()) { - float alpha = ctx.HasAttr("alpha") ? ctx.Attr("alpha") : 0; - float beta = ctx.HasAttr("beta") ? 
ctx.Attr("beta") : 0; - - // paddle uses beta but mkldnn uses alpha for swish - if (algorithm == mkldnn::algorithm::eltwise_swish) { - std::swap(alpha, beta); - } else if (algorithm == dnnl::algorithm::eltwise_bounded_relu) { - alpha = ctx.Attr("threshold"); - } + const mkldnn::engine engine, Place cpu_place, + const framework::Tensor* in_x, const Tensor* out_grad) + : platform::MKLDNNHandlerNoCachingT(engine, + cpu_place) { + float alpha = ctx.HasAttr("alpha") ? ctx.Attr("alpha") : 0; + float beta = ctx.HasAttr("beta") ? ctx.Attr("beta") : 0; + + // paddle uses beta but mkldnn uses alpha for swish + if (algorithm == mkldnn::algorithm::eltwise_swish) { + std::swap(alpha, beta); + } else if (algorithm == dnnl::algorithm::eltwise_bounded_relu) { + alpha = ctx.Attr("threshold"); + } - auto diff_dst_tz = framework::vectorize(out_grad->dims()); + auto diff_dst_tz = framework::vectorize(out_grad->dims()); - auto src_fmt = - diff_dst_tz.size() == 2 ? MKLDNNMemoryFormat::nc : in_x->format(); - auto diff_fmt = - diff_dst_tz.size() == 2 ? MKLDNNMemoryFormat::nc : out_grad->format(); + auto src_fmt = + diff_dst_tz.size() == 2 ? MKLDNNMemoryFormat::nc : in_x->format(); + auto diff_fmt = + diff_dst_tz.size() == 2 ? MKLDNNMemoryFormat::nc : out_grad->format(); - auto dims = framework::vectorize(in_x->dims()); - auto diff_dst_md = platform::MKLDNNMemDesc( - dims, platform::MKLDNNGetDataType(), diff_fmt); - auto src_md = platform::MKLDNNMemDesc( - dims, platform::MKLDNNGetDataType(), src_fmt); + auto dims = framework::vectorize(in_x->dims()); + auto diff_dst_md = platform::MKLDNNMemDesc( + dims, platform::MKLDNNGetDataType(), diff_fmt); + auto src_md = platform::MKLDNNMemDesc( + dims, platform::MKLDNNGetDataType(), src_fmt); - this->AcquireForwardPrimitiveDescriptor( - mkldnn::prop_kind::forward_training, algorithm, src_md, alpha, beta); - this->AcquireBackwardPrimitiveDescriptor(algorithm, diff_dst_md, src_md, - alpha, beta); - } + this->AcquireForwardPrimitiveDescriptor(mkldnn::prop_kind::forward_training, + algorithm, src_md, alpha, beta); + this->AcquireBackwardPrimitiveDescriptor(algorithm, diff_dst_md, src_md, + alpha, beta); } std::shared_ptr AcquireBackwardSrcMemory( const framework::Tensor* input) { const T* input_data = input->data(); return this->AcquireMemoryFromPrimitive(this->bwd_pd_->src_desc(), - to_void_cast(input_data), - "@bwd-src_mem_p"); + to_void_cast(input_data)); } }; @@ -1430,11 +1619,6 @@ using ConvMKLDNNHandler = mkldnn::convolution_backward_data, mkldnn::convolution_backward_weights>; -using ConvTransposeMKLDNNHandler = - ConvMKLDNNTemplateHandler; - template static std::shared_ptr SetDstMemory( const framework::ExecutionContext& ctx, framework::Tensor* output, From ae80df915533fb1f83cd2c72f2eafead531a3ba3 Mon Sep 17 00:00:00 2001 From: "joanna.wozna.intel" Date: Mon, 16 Aug 2021 10:01:52 +0200 Subject: [PATCH 056/126] Fix elementwise_add quantization (#34820) * Remove force_fp32_output from elementwise_add quantization * Fix cpu_quantize_placement test * Review related changes --- .../framework/ir/graph_pattern_detector.cc | 6 +++--- .../framework/ir/mkldnn/cpu_quantize_pass.cc | 19 ++++++++----------- .../ir/mkldnn/cpu_quantize_pass_tester.cc | 7 +++---- .../cpu_quantize_placement_pass_tester.cc | 8 ++++---- 4 files changed, 18 insertions(+), 22 deletions(-) diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index 70e48755dcd1e3..b4c94010e480a7 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc 
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -2249,9 +2249,9 @@ PDNode *patterns::MultipleQuantize::operator()() { PDNode *patterns::QuantizePlacement::operator()( const std::unordered_set &quantize_enabled_op_types) { std::unordered_set supported_op_types = - std::unordered_set( - {"concat", "conv2d", "elementwise_add", "fc", "matmul", "pool2d", - "prior_box", "relu", "reshape2", "transpose2", "fusion_gru"}); + std::unordered_set({"concat", "conv2d", "elementwise_add", + "fc", "matmul", "pool2d", "prior_box", + "reshape2", "transpose2", "fusion_gru"}); if (!quantize_enabled_op_types.empty()) { supported_op_types = quantize_enabled_op_types; } diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc index 3c06c9ee41d2a2..f50cd0a01d204d 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc @@ -770,7 +770,8 @@ void CPUQuantizePass::QuantizeElementwiseAdd(Graph* graph) const { GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out, elementwise_add_pattern); - if (!AreScalesPresentForNodes({elementwise_add_x, elementwise_add_y})) { + if (!AreScalesPresentForNodes( + {elementwise_add_x, elementwise_add_y, elementwise_add_out})) { LogCannotQuantizeOp(elementwise_add_op); return; } @@ -793,16 +794,12 @@ void CPUQuantizePass::QuantizeElementwiseAdd(Graph* graph) const { QuantizeInput(g, elementwise_add_op, elementwise_add_y, "Y", input_y_scale, is_y_unsigned, "Scale_y"); - // if quantization scale is missing for output tensor, return fp32 data - if (AreScalesPresentForNodes({elementwise_add_out})) { - bool is_output_unsigned{false}; - auto output_scale = - GetScaleValueForNode(elementwise_add_out, &is_output_unsigned); - DequantizeOutput(g, elementwise_add_op, elementwise_add_out, "Out", - output_scale, is_output_unsigned, "Scale_out"); - } else { - elementwise_add_op->Op()->SetAttr("force_fp32_output", true); - } + bool is_output_unsigned{false}; + auto output_scale = + GetScaleValueForNode(elementwise_add_out, &is_output_unsigned); + + DequantizeOutput(g, elementwise_add_op, elementwise_add_out, "Out", + output_scale, is_output_unsigned, "Scale_out"); ++quantize_elementwise_add_count; }; diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc index adb431fdb097f5..6fcea6a66cc5d1 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc @@ -854,13 +854,12 @@ TEST(CpuQuantizePass, elementwise_add) { TEST(CpuQuantizePass, elementwise_add_output_scale_missing) { int elementwise_add_count = 1; - int quant_count = 2; + int quant_count = 0; int dequant_count = 2; - // 2 Quant + 2 IN - int added_nodes_count = 4; + int added_nodes_count = 0; MainTestElementwiseAdd(BuildProgramDescElementwiseAdd(), elementwise_add_count, quant_count, dequant_count, - added_nodes_count, 2.0f * 127, true); + added_nodes_count, 1.f, true); } TEST(CpuQuantizePass, elementwise_add_unsigned_and_signed_input) { diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass_tester.cc index 761defc25ff5c8..daf913bf7d80d1 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass_tester.cc @@ -131,13 +131,13 @@ 
TEST(QuantizerPlacementPass, enabled_conv_excluded_one) { } TEST(QuantizerPlacementPass, empty_list) { - // all operators quantized - MainTest({}, {}, 6); + // all operators except relu should be quantized + MainTest({}, {}, 5); } TEST(QuantizerPlacementPass, default_attr_value) { - // all operators quantized - DefaultAttrTest(6); + // all operators except relu should be quantized + DefaultAttrTest(5); } } // namespace ir From d028214d99405e3201b404b55446268936617730 Mon Sep 17 00:00:00 2001 From: Fan Zhang Date: Mon, 16 Aug 2021 17:25:24 +0800 Subject: [PATCH 057/126] [CPU-PSLIB] Add config for scale_sparse_grad in config_fleet.py,test=develop (#34893) --- paddle/fluid/framework/device_worker.h | 1 + paddle/fluid/framework/downpour_worker.cc | 8 ++++++-- paddle/fluid/framework/downpour_worker_opt.cc | 4 +++- paddle/fluid/framework/fleet/fleet_wrapper.cc | 5 +++-- paddle/fluid/framework/fleet/fleet_wrapper.h | 3 ++- paddle/fluid/framework/trainer_desc.proto | 1 + .../fleet/parameter_server/pslib/optimizer_factory.py | 2 ++ python/paddle/fluid/trainer_desc.py | 4 ++++ python/paddle/fluid/trainer_factory.py | 4 ++++ 9 files changed, 26 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/framework/device_worker.h b/paddle/fluid/framework/device_worker.h index 45efa43ccb74bc..6dd6fed0151585 100644 --- a/paddle/fluid/framework/device_worker.h +++ b/paddle/fluid/framework/device_worker.h @@ -212,6 +212,7 @@ class DeviceWorker { FetchConfig fetch_config_; bool use_cvm_; bool no_cvm_; + bool scale_sparse_gradient_with_batch_size_; TrainerDesc trainer_desc_; // dump params or grads for debug diff --git a/paddle/fluid/framework/downpour_worker.cc b/paddle/fluid/framework/downpour_worker.cc index ad3f27f03fa143..11f70acb73aa7f 100644 --- a/paddle/fluid/framework/downpour_worker.cc +++ b/paddle/fluid/framework/downpour_worker.cc @@ -89,6 +89,8 @@ void DownpourWorker::Initialize(const TrainerDesc& desc) { use_cvm_ = desc.use_cvm(); // for sparse value accessor, embedding only no_cvm_ = desc.no_cvm(); + scale_sparse_gradient_with_batch_size_ = + desc.scale_sparse_gradient_with_batch_size(); scale_datanorm_ = desc.scale_datanorm(); dump_slot_ = desc.dump_slot(); adjust_ins_weight_config_ = desc.adjust_ins_weight_config(); @@ -591,7 +593,8 @@ void DownpourWorker::TrainFilesWithProfiler() { *thread_scope_, tid, features_[tid], feature_labels_[tid], sparse_key_names_[tid], sparse_grad_names_[tid], table.emb_dim(), &feature_grads_[tid], &push_sparse_status_, cur_batch, use_cvm_, - dump_slot_, &sparse_push_keys_[tid], no_cvm_); + dump_slot_, &sparse_push_keys_[tid], no_cvm_, + scale_sparse_gradient_with_batch_size_); timeline.Pause(); push_sparse_time += timeline.ElapsedSec(); total_time += timeline.ElapsedSec(); @@ -866,7 +869,8 @@ void DownpourWorker::TrainFiles() { *thread_scope_, tid, features_[tid], feature_labels_[tid], sparse_key_names_[tid], sparse_grad_names_[tid], table.emb_dim(), &feature_grads_[tid], &push_sparse_status_, cur_batch, use_cvm_, - dump_slot_, &sparse_push_keys_[tid], no_cvm_); + dump_slot_, &sparse_push_keys_[tid], no_cvm_, + scale_sparse_gradient_with_batch_size_); } } diff --git a/paddle/fluid/framework/downpour_worker_opt.cc b/paddle/fluid/framework/downpour_worker_opt.cc index afe6ddfa3d9a63..ed0a9d9107e79d 100644 --- a/paddle/fluid/framework/downpour_worker_opt.cc +++ b/paddle/fluid/framework/downpour_worker_opt.cc @@ -450,11 +450,13 @@ void DownpourWorkerOpt::TrainFiles() { break; } } + bool scale_sparse_gradient_with_batch_size_ = true; 
fleet_ptr_->PushSparseVarsWithLabelAsync( *thread_scope_, tid, features_[tid], feature_labels_[tid], sparse_key_names_[tid], sparse_grad_names_[tid], table.emb_dim(), &feature_grads_[tid], &push_sparse_status_, cur_batch, use_cvm_, - dump_slot_, &sparse_push_keys_[tid], no_cvm_); + dump_slot_, &sparse_push_keys_[tid], no_cvm_, + scale_sparse_gradient_with_batch_size_); } } diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.cc b/paddle/fluid/framework/fleet/fleet_wrapper.cc index bb318e59e46e41..dc5e24ef5de42f 100644 --- a/paddle/fluid/framework/fleet/fleet_wrapper.cc +++ b/paddle/fluid/framework/fleet/fleet_wrapper.cc @@ -870,7 +870,8 @@ void FleetWrapper::PushSparseVarsWithLabelAsync( std::vector>* push_values, std::vector<::std::future>* push_sparse_status, const int batch_size, const bool use_cvm, const bool dump_slot, - std::vector* sparse_push_keys, const bool no_cvm) { + std::vector* sparse_push_keys, const bool no_cvm, + const bool scale_sparse_gradient_with_batch_size) { #ifdef PADDLE_WITH_PSLIB int offset = 2; int slot_offset = 0; @@ -939,7 +940,7 @@ void FleetWrapper::PushSparseVarsWithLabelAsync( } float* g = g_tensor->data(); - if (scale_sparse_gradient_with_batch_size_ && grad_dim > 0) { + if (scale_sparse_gradient_with_batch_size && grad_dim > 0) { int dim = emb_dim; Eigen::Map< Eigen::Matrix> diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.h b/paddle/fluid/framework/fleet/fleet_wrapper.h index 09f7801b19f988..c1db06a298c861 100644 --- a/paddle/fluid/framework/fleet/fleet_wrapper.h +++ b/paddle/fluid/framework/fleet/fleet_wrapper.h @@ -209,7 +209,8 @@ class FleetWrapper { std::vector>* push_values, std::vector<::std::future>* push_sparse_status, const int batch_size, const bool use_cvm, const bool dump_slot, - std::vector* sparse_push_keys, const bool no_cvm); + std::vector* sparse_push_keys, const bool no_cvm, + const bool scale_sparse_gradient_with_batch_size); // Push sparse variables to server in async mode void PushSparseFromTensorWithLabelAsync( diff --git a/paddle/fluid/framework/trainer_desc.proto b/paddle/fluid/framework/trainer_desc.proto index 504885ff5ccbce..6f487d6984cc43 100644 --- a/paddle/fluid/framework/trainer_desc.proto +++ b/paddle/fluid/framework/trainer_desc.proto @@ -61,6 +61,7 @@ message TrainerDesc { optional bool use_ps_gpu = 32 [ default = false ]; optional string user_define_dump_filename = 33; + optional bool scale_sparse_gradient_with_batch_size = 34 [ default = true ]; // device worker parameters optional HogwildWorkerParameter hogwild_param = 101; diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py index 607a3c94f8a4e7..9a21a5a850db97 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py @@ -825,6 +825,8 @@ def _minimize(self, opt_info["worker_skipped_ops"] = worker_skipped_ops opt_info["use_cvm"] = strategy.get("use_cvm", False) opt_info["no_cvm"] = strategy.get("no_cvm", False) + opt_info["scale_sparse_gradient_with_batch_size"] = strategy.get( + "scale_sparse_gradient_with_batch_size", True) opt_info["worker_class"] = strategy.get("worker_class", "DownpourWorker") opt_info["stat_var_names"] = strategy.get("stat_var_names", []) diff --git a/python/paddle/fluid/trainer_desc.py b/python/paddle/fluid/trainer_desc.py index 4eca3a494e25a4..6152bce55ce9f2 100644 --- 
a/python/paddle/fluid/trainer_desc.py +++ b/python/paddle/fluid/trainer_desc.py @@ -124,6 +124,10 @@ def _set_use_cvm(self, use_cvm=False): def _set_no_cvm(self, no_cvm=False): self.proto_desc.no_cvm = no_cvm + def _set_scale_sparse_grad_with_batch_size( + self, scale_sparse_gradient_with_batch_size=True): + self.proto_desc.scale_sparse_gradient_with_batch_size = scale_sparse_gradient_with_batch_size + def _set_scale_datanorm(self, scale_datanorm=-1): self.proto_desc.scale_datanorm = scale_datanorm diff --git a/python/paddle/fluid/trainer_factory.py b/python/paddle/fluid/trainer_factory.py index 7912ffca84ba41..ed10bee2e063a7 100644 --- a/python/paddle/fluid/trainer_factory.py +++ b/python/paddle/fluid/trainer_factory.py @@ -95,6 +95,10 @@ def _create_trainer(self, opt_info=None): trainer._set_use_cvm(opt_info["use_cvm"]) if opt_info.get("no_cvm") is not None: trainer._set_no_cvm(opt_info["no_cvm"]) + if opt_info.get( + "scale_sparse_gradient_with_batch_size") is not None: + trainer._set_scale_sparse_grad_with_batch_size(opt_info[ + "scale_sparse_gradient_with_batch_size"]) if opt_info.get("scale_datanorm") is not None: trainer._set_scale_datanorm(opt_info["scale_datanorm"]) if opt_info.get("adjust_ins_weight") is not None: From 2e30134f7942b34ae40d880b22eae6d90a60db45 Mon Sep 17 00:00:00 2001 From: zyfncg Date: Mon, 16 Aug 2021 18:12:28 +0800 Subject: [PATCH 058/126] Change the invoking method of settiem by Ellipsis and None index from numpy to set_value op (#34911) * Change invoking mathod of the settiem by Ellipsis and None index from numpy to set_value op * add none_axes into attr of set_value_op in dygraph mode --- paddle/fluid/pybind/imperative.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 0b6af3b542395d..6c4213979a46be 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -785,7 +785,8 @@ void BindImperative(py::module *m_ptr) { const int size = PyTuple_GET_SIZE(index_ptr); for (int dim = 0; dim < size; ++dim) { PyObject *slice_item = PyTuple_GetItem(index_ptr, dim); - if (!(PyCheckInteger(slice_item) || PySlice_Check(slice_item))) { + if (!(PyCheckInteger(slice_item) || PySlice_Check(slice_item) || + slice_item == Py_Ellipsis || slice_item == Py_None)) { parse_index = false; break; } @@ -807,7 +808,8 @@ void BindImperative(py::module *m_ptr) { {"starts", starts}, {"ends", ends}, {"steps", steps}, - {"decrease_axes", decrease_axes}}; + {"decrease_axes", decrease_axes}, + {"none_axes", none_axes}}; imperative::NameVarBaseMap ins = {{"Input", {self}}}; imperative::NameVarBaseMap outs = {{"Out", {self}}}; From 49818943027b3c586f02b33d79b15ce6ef998242 Mon Sep 17 00:00:00 2001 From: 0x45f <23097963+0x45f@users.noreply.github.com> Date: Mon, 16 Aug 2021 19:14:34 +0800 Subject: [PATCH 059/126] [NPU] Add size npu op (#34636) * add size npu op * modify support data type * no longer use NPU size OP * remove useless comments, add test case * fix copyright, remove useless include --- paddle/fluid/operators/size_op_npu.cc | 51 +++++++ .../tests/unittests/npu/test_size_op_npu.py | 141 ++++++++++++++++++ 2 files changed, 192 insertions(+) create mode 100644 paddle/fluid/operators/size_op_npu.cc create mode 100755 python/paddle/fluid/tests/unittests/npu/test_size_op_npu.py diff --git a/paddle/fluid/operators/size_op_npu.cc b/paddle/fluid/operators/size_op_npu.cc new file mode 100644 index 00000000000000..4e9c2ec482e927 --- /dev/null +++ 
b/paddle/fluid/operators/size_op_npu.cc @@ -0,0 +1,51 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/mul_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +template +class SizeNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("Input"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + Tensor cpu_tensor; + auto cpu_data = + cpu_tensor.mutable_data(out->dims(), platform::CPUPlace()); + cpu_data[0] = x->numel(); + TensorCopy(cpu_tensor, ctx.GetPlace(), + ctx.template device_context(), out); + ctx.template device_context().Wait(); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL( + size, ops::SizeNPUKernel, + ops::SizeNPUKernel, + ops::SizeNPUKernel, + ops::SizeNPUKernel, + ops::SizeNPUKernel, + ops::SizeNPUKernel); diff --git a/python/paddle/fluid/tests/unittests/npu/test_size_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_size_op_npu.py new file mode 100755 index 00000000000000..80721cbd66a558 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_size_op_npu.py @@ -0,0 +1,141 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
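# A usage sketch, not part of the patch: the SizeNPUKernel added above computes
# x->numel() on the host and copies that single value into the op's int64 output
# tensor on the NPU, so no device-side "Size" operator is needed. The lines below
# are illustrative only; they assume an NPU build of Paddle and the dynamic-graph
# API, and the device string 'npu:0' is an assumption, not taken from this patch.
import paddle
paddle.set_device('npu:0')
x = paddle.zeros([2, 3, 4], dtype='float32')
n = paddle.fluid.layers.size(x)  # returns a 1-element int64 tensor holding 24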
+ +import unittest +import numpy as np +import sys +sys.path.append("..") +import paddle +import paddle.fluid as fluid +from op_test import OpTest + +paddle.enable_static() + + +class TestSizeOp(OpTest): + def setUp(self): + self.set_npu() + self.place = paddle.NPUPlace(0) + self.op_type = "size" + + self.config() + input = np.zeros(self.shape, dtype=self.dtype) + self.inputs = {'Input': input} + self.outputs = {'Out': np.array([np.size(input)], dtype=np.int64)} + + def config(self): + self.shape = [1, 2] + self.dtype = np.int32 + + def test_check_output(self): + self.check_output_with_place(self.place) + + def set_npu(self): + self.__class__.use_npu = True + + +class TestSizeOp1(TestSizeOp): + def config(self): + self.shape = [2] + self.dtype = np.float64 + + +class TestSizeOp2(TestSizeOp): + def config(self): + self.shape = [2, 3] + self.dtype = np.float32 + + +class TestSizeOp3(TestSizeOp): + def config(self): + self.shape = [2, 3, 100] + self.dtype = np.float16 + + +class TestSizeOp4(TestSizeOp): + def config(self): + self.shape = [2**10] + self.dtype = np.bool + + +class TestSizeOp5(TestSizeOp): + def config(self): + self.shape = [7, 8, 9, 10] + self.dtype = np.int64 + + +class TestSizeOp6(TestSizeOp): + def config(self): + self.shape = [] + self.dtype = np.int64 + + +class TestSizeAPI(unittest.TestCase): + def setUp(self): + self.set_npu() + self.place = paddle.NPUPlace(0) + + def set_npu(self): + self.__class__.use_npu = True + + def test_size_static(self): + main_program = fluid.Program() + startup_program = fluid.Program() + with fluid.program_guard(main_program, startup_program): + shape1 = [2, 1, 4, 5] + shape2 = [1, 4, 5] + x_1 = paddle.fluid.data(shape=shape1, dtype='int32', name='x_1') + x_2 = paddle.fluid.data(shape=shape2, dtype='int32', name='x_2') + input_1 = np.random.random(shape1).astype("int32") + input_2 = np.random.random(shape2).astype("int32") + out_1 = paddle.fluid.layers.size(x_1) + out_2 = paddle.fluid.layers.size(x_2) + exe = paddle.static.Executor(place=self.place) + res_1, res_2 = exe.run(feed={ + "x_1": input_1, + "x_2": input_2, + }, + fetch_list=[out_1, out_2]) + assert (np.array_equal( + res_1, np.array([np.size(input_1)]).astype("int64"))) + assert (np.array_equal( + res_2, np.array([np.size(input_2)]).astype("int64"))) + + def test_size_imperative(self): + paddle.disable_static(self.place) + input_1 = np.random.random([2, 1, 4, 5]).astype("int32") + input_2 = np.random.random([1, 4, 5]).astype("int32") + x_1 = paddle.to_tensor(input_1) + x_2 = paddle.to_tensor(input_2) + out_1 = paddle.fluid.layers.size(x_1) + out_2 = paddle.fluid.layers.size(x_2) + assert (np.array_equal(out_1.numpy().item(0), np.size(input_1))) + assert (np.array_equal(out_2.numpy().item(0), np.size(input_2))) + paddle.enable_static() + + def test_error(self): + main_program = fluid.Program() + startup_program = fluid.Program() + with fluid.program_guard(main_program, startup_program): + + def test_x_type(): + shape = [1, 4, 5] + input_1 = np.random.random(shape).astype("int32") + out_1 = paddle.fluid.layers.size(input_1) + + self.assertRaises(TypeError, test_x_type) + + +if __name__ == '__main__': + unittest.main() From 6b4b9fea0e0e5fd8f2428cb8289f03d42460dad8 Mon Sep 17 00:00:00 2001 From: Baibaifan <39549453+Baibaifan@users.noreply.github.com> Date: Mon, 16 Aug 2021 19:50:23 +0800 Subject: [PATCH 060/126] hccl init sync (#34918) --- paddle/fluid/operators/collective/c_comm_init_hccl_op.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git 
a/paddle/fluid/operators/collective/c_comm_init_hccl_op.cc b/paddle/fluid/operators/collective/c_comm_init_hccl_op.cc index 3df0595525941a..7dec645b5b3ad8 100644 --- a/paddle/fluid/operators/collective/c_comm_init_hccl_op.cc +++ b/paddle/fluid/operators/collective/c_comm_init_hccl_op.cc @@ -87,6 +87,8 @@ class CCommInitOpAscend : public framework::OperatorBase { } PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclBroadcast( buff, size, HCCL_DATA_TYPE_FP32, 0, comm->comm(), stream)); + // Synchronize stream to find hccl error in time. + PADDLE_ENFORCE_NPU_SUCCESS(aclrtSynchronizeStream(stream)); VLOG(3) << "Build connection successful."; #else PADDLE_THROW(platform::errors::PreconditionNotMet( From 2a4ed087315de1a2369d0d96933be21bbb41c046 Mon Sep 17 00:00:00 2001 From: feng_shuai Date: Mon, 16 Aug 2021 19:54:01 +0800 Subject: [PATCH 061/126] Jetson nano bilinear (#34751) * change bilinear thread for nano and tx2 * change bilinear thread for nano and tx2 --- cmake/cupti.cmake | 1 + paddle/fluid/operators/interpolate_v2_op.cu | 9 ++++++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/cmake/cupti.cmake b/cmake/cupti.cmake index 17626688531e61..2d7b1917b68731 100644 --- a/cmake/cupti.cmake +++ b/cmake/cupti.cmake @@ -9,6 +9,7 @@ find_path(CUPTI_INCLUDE_DIR cupti.h $ENV{CUPTI_ROOT} $ENV{CUPTI_ROOT}/include ${CUDA_TOOLKIT_ROOT_DIR}/extras/CUPTI/include ${CUDA_TOOLKIT_ROOT_DIR}/targets/x86_64-linux/include + ${CUDA_TOOLKIT_ROOT_DIR}/targets/aarch64-linux/include NO_DEFAULT_PATH ) diff --git a/paddle/fluid/operators/interpolate_v2_op.cu b/paddle/fluid/operators/interpolate_v2_op.cu index 6745592c5c1a8b..d335e1a2f9d58b 100644 --- a/paddle/fluid/operators/interpolate_v2_op.cu +++ b/paddle/fluid/operators/interpolate_v2_op.cu @@ -1186,7 +1186,14 @@ static void Interpolate2DCUDAFwd(const framework::ExecutionContext& ctx, input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n, out_chw, c, ratio_h, ratio_w, align_corners, data_layout); } else if ("bilinear" == interp_method) { - KeBilinearInterpFw<<<<>>( input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n, out_chw, c, ratio_h, ratio_w, align_corners, align_mode, data_layout); From b1cc4a4608a4ab4f5fff8570a08814eb91332223 Mon Sep 17 00:00:00 2001 From: veyron95 <87417304+veyron95@users.noreply.github.com> Date: Mon, 16 Aug 2021 20:50:39 +0800 Subject: [PATCH 062/126] [NPU] Support npu op:(1)arg_min (2)arg_max (#34867) * [NPU] Support npu op:(1)arg_min (2)arg_max * Modify and add unit test cases * Modify unit test cases --- paddle/fluid/operators/arg_max_op_npu.cc | 54 ++++ paddle/fluid/operators/arg_min_op_npu.cc | 54 ++++ .../unittests/npu/test_arg_max_op_npu.py | 273 ++++++++++++++++++ .../unittests/npu/test_arg_min_op_npu.py | 273 ++++++++++++++++++ 4 files changed, 654 insertions(+) create mode 100644 paddle/fluid/operators/arg_max_op_npu.cc create mode 100644 paddle/fluid/operators/arg_min_op_npu.cc create mode 100644 python/paddle/fluid/tests/unittests/npu/test_arg_max_op_npu.py create mode 100644 python/paddle/fluid/tests/unittests/npu/test_arg_min_op_npu.py diff --git a/paddle/fluid/operators/arg_max_op_npu.cc b/paddle/fluid/operators/arg_max_op_npu.cc new file mode 100644 index 00000000000000..38f9813ad02b40 --- /dev/null +++ b/paddle/fluid/operators/arg_max_op_npu.cc @@ -0,0 +1,54 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the Licnse. */ + +#include "paddle/fluid/operators/arg_min_max_op_base.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { +using Tensor = framework::Tensor; + +template +class ArgMaxNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + int64_t axis = ctx.Attr("axis"); + auto dtype = ctx.Attr("dtype"); + + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + NpuOpRunner runner; + runner.SetType("ArgMaxV2") + .AddInput(*x) + .AddInput(std::vector{axis}) + .AddOutput(*out) + .AddAttr("dtype", dtype); + + auto stream = + ctx.template device_context() + .stream(); + runner.Run(stream); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_NPU_KERNEL( + arg_max, ops::ArgMaxNPUKernel, + ops::ArgMaxNPUKernel); diff --git a/paddle/fluid/operators/arg_min_op_npu.cc b/paddle/fluid/operators/arg_min_op_npu.cc new file mode 100644 index 00000000000000..f776412c16239f --- /dev/null +++ b/paddle/fluid/operators/arg_min_op_npu.cc @@ -0,0 +1,54 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the Licnse. */ + +#include "paddle/fluid/operators/arg_min_max_op_base.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { +using Tensor = framework::Tensor; + +template +class ArgMinNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + int64_t axis = ctx.Attr("axis"); + auto dtype = ctx.Attr("dtype"); + + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + NpuOpRunner runner; + runner.SetType("ArgMin") + .AddInput(*x) + .AddInput(std::vector{axis}) + .AddOutput(*out) + .AddAttr("dtype", dtype); + + auto stream = + ctx.template device_context() + .stream(); + runner.Run(stream); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_NPU_KERNEL( + arg_min, ops::ArgMinNPUKernel, + ops::ArgMinNPUKernel); diff --git a/python/paddle/fluid/tests/unittests/npu/test_arg_max_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_arg_max_op_npu.py new file mode 100644 index 00000000000000..9bc46697c0dfc0 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_arg_max_op_npu.py @@ -0,0 +1,273 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid.core as core + +paddle.enable_static() + + +class BaseTestCase(OpTest): + def initTestCase(self): + self.op_type = 'arg_max' + self.dims = (3, 4) + self.dtype = 'float32' + self.axis = 1 + + def setUp(self): + self.initTestCase() + self.__class__.use_npu = True + self.place = paddle.NPUPlace(0) + np.random.seed(2021) + self.x = (np.random.random(self.dims)).astype(self.dtype) + self.inputs = {'X': self.x} + self.attrs = {'axis': self.axis} + if self.op_type == "arg_min": + self.outputs = {'Out': np.argmin(self.x, axis=self.axis)} + else: + self.outputs = {'Out': np.argmax(self.x, axis=self.axis)} + + def test_check_output(self): + self.check_output_with_place(self.place) + + +# test argmax, dtype: float16 +class TestArgMaxFloat16Case1(BaseTestCase): + def initTestCase(self): + self.op_type = 'arg_max' + self.dims = (3, 4, 5) + self.dtype = 'float16' + self.axis = -1 + + +class TestArgMaxFloat16Case2(BaseTestCase): + def initTestCase(self): + self.op_type = 'arg_max' + self.dims = (3, 4, 5) + self.dtype = 'float16' + self.axis = 0 + + +class TestArgMaxFloat16Case3(BaseTestCase): + def initTestCase(self): + self.op_type = 'arg_max' + self.dims = (3, 4, 5) + self.dtype = 'float16' + self.axis = 1 + + +class TestArgMaxFloat16Case4(BaseTestCase): + def initTestCase(self): + self.op_type = 'arg_max' + self.dims = (3, 4, 5) + self.dtype = 'float16' + self.axis = 2 + + +class TestArgMaxFloat16Case5(BaseTestCase): + def initTestCase(self): + self.op_type = 'arg_max' + self.dims = (3, 4) + self.dtype = 'float16' + self.axis = -1 + + +class TestArgMaxFloat16Case6(BaseTestCase): + def initTestCase(self): + self.op_type = 'arg_max' + self.dims = (3, 4) + self.dtype = 'float16' + self.axis = 0 + + +class TestArgMaxFloat16Case7(BaseTestCase): + def initTestCase(self): + self.op_type = 'arg_max' + self.dims = (3, 4) + self.dtype = 'float16' + self.axis = 1 + + +class TestArgMaxFloat16Case8(BaseTestCase): + def initTestCase(self): + self.op_type = 'arg_max' + self.dims = (1, ) + self.dtype = 'float16' + self.axis = 0 + + +class TestArgMaxFloat16Case9(BaseTestCase): + def initTestCase(self): + self.op_type = 'arg_max' + self.dims = (2, ) + self.dtype = 'float16' + self.axis = 0 + + +class TestArgMaxFloat16Case10(BaseTestCase): + def initTestCase(self): + self.op_type = 'arg_max' + self.dims = (3, ) + self.dtype = 'float16' + self.axis = 0 + + +# test argmax, dtype: float32 +class TestArgMaxFloat32Case1(BaseTestCase): + def initTestCase(self): + self.op_type = 'arg_max' + self.dims = (3, 4, 5) + self.dtype = 'float32' + self.axis = -1 + + +class TestArgMaxFloat32Case2(BaseTestCase): + def initTestCase(self): + self.op_type = 'arg_max' + self.dims = (3, 4, 5) + self.dtype = 'float32' + self.axis = 0 + + +class TestArgMaxFloat32Case3(BaseTestCase): + def initTestCase(self): + self.op_type = 
'arg_max' + self.dims = (3, 4, 5) + self.dtype = 'float32' + self.axis = 1 + + +class TestArgMaxFloat32Case4(BaseTestCase): + def initTestCase(self): + self.op_type = 'arg_max' + self.dims = (3, 4, 5) + self.dtype = 'float32' + self.axis = 2 + + +class TestArgMaxFloat32Case5(BaseTestCase): + def initTestCase(self): + self.op_type = 'arg_max' + self.dims = (3, 4) + self.dtype = 'float32' + self.axis = -1 + + +class TestArgMaxFloat32Case6(BaseTestCase): + def initTestCase(self): + self.op_type = 'arg_max' + self.dims = (3, 4) + self.dtype = 'float32' + self.axis = 0 + + +class TestArgMaxFloat32Case7(BaseTestCase): + def initTestCase(self): + self.op_type = 'arg_max' + self.dims = (3, 4) + self.dtype = 'float32' + self.axis = 1 + + +class TestArgMaxFloat32Case8(BaseTestCase): + def initTestCase(self): + self.op_type = 'arg_max' + self.dims = (1, ) + self.dtype = 'float32' + self.axis = 0 + + +class TestArgMaxFloat32Case9(BaseTestCase): + def initTestCase(self): + self.op_type = 'arg_max' + self.dims = (2, ) + self.dtype = 'float32' + self.axis = 0 + + +class TestArgMaxFloat32Case10(BaseTestCase): + def initTestCase(self): + self.op_type = 'arg_max' + self.dims = (3, ) + self.dtype = 'float32' + self.axis = 0 + + +class TestArgMaxAPI(unittest.TestCase): + def initTestCase(self): + self.dims = (3, 4, 5) + self.dtype = 'float32' + self.axis = 0 + + def setUp(self): + self.initTestCase() + self.__class__.use_npu = True + self.place = [paddle.NPUPlace(0)] + + def test_dygraph_api(self): + def run(place): + paddle.disable_static(place) + np.random.seed(2021) + numpy_input = (np.random.random(self.dims)).astype(self.dtype) + tensor_input = paddle.to_tensor(numpy_input) + numpy_output = np.argmax(numpy_input, axis=self.axis) + paddle_output = paddle.argmax(tensor_input, axis=self.axis) + self.assertEqual( + np.allclose(numpy_output, paddle_output.numpy()), True) + paddle.enable_static() + + for place in self.place: + run(place) + + +class TestArgMaxAPI_2(unittest.TestCase): + def initTestCase(self): + self.dims = (3, 4, 5) + self.dtype = 'float32' + self.axis = 0 + self.keep_dims = True + + def setUp(self): + self.initTestCase() + self.__class__.use_npu = True + self.place = [paddle.NPUPlace(0)] + + def test_dygraph_api(self): + def run(place): + paddle.disable_static(place) + np.random.seed(2021) + numpy_input = (np.random.random(self.dims)).astype(self.dtype) + tensor_input = paddle.to_tensor(numpy_input) + numpy_output = np.argmax( + numpy_input, axis=self.axis).reshape(1, 4, 5) + paddle_output = paddle.argmax( + tensor_input, axis=self.axis, keepdim=self.keep_dims) + self.assertEqual( + np.allclose(numpy_output, paddle_output.numpy()), True) + self.assertEqual(numpy_output.shape, paddle_output.numpy().shape) + paddle.enable_static() + + for place in self.place: + run(place) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_arg_min_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_arg_min_op_npu.py new file mode 100644 index 00000000000000..455f92b8ed6cf1 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_arg_min_op_npu.py @@ -0,0 +1,273 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid.core as core + +paddle.enable_static() + + +class BaseTestCase(OpTest): + def initTestCase(self): + self.op_type = 'arg_min' + self.dims = (3, 4) + self.dtype = 'float32' + self.axis = 1 + + def setUp(self): + self.initTestCase() + self.__class__.use_npu = True + self.place = paddle.NPUPlace(0) + np.random.seed(2021) + self.x = (np.random.random(self.dims)).astype(self.dtype) + self.inputs = {'X': self.x} + self.attrs = {'axis': self.axis} + if self.op_type == "arg_min": + self.outputs = {'Out': np.argmin(self.x, axis=self.axis)} + else: + self.outputs = {'Out': np.argmax(self.x, axis=self.axis)} + + def test_check_output(self): + self.check_output_with_place(self.place) + + +# test argmin, dtype: float16 +class TestArgMinFloat16Case1(BaseTestCase): + def initTestCase(self): + self.op_type = 'arg_min' + self.dims = (3, 4, 5) + self.dtype = 'float16' + self.axis = -1 + + +class TestArgMinFloat16Case2(BaseTestCase): + def initTestCase(self): + self.op_type = 'arg_min' + self.dims = (3, 4, 5) + self.dtype = 'float16' + self.axis = 0 + + +class TestArgMinFloat16Case3(BaseTestCase): + def initTestCase(self): + self.op_type = 'arg_min' + self.dims = (3, 4, 5) + self.dtype = 'float16' + self.axis = 1 + + +class TestArgMinFloat16Case4(BaseTestCase): + def initTestCase(self): + self.op_type = 'arg_min' + self.dims = (3, 4, 5) + self.dtype = 'float16' + self.axis = 2 + + +class TestArgMinFloat16Case5(BaseTestCase): + def initTestCase(self): + self.op_type = 'arg_min' + self.dims = (3, 4) + self.dtype = 'float16' + self.axis = -1 + + +class TestArgMinFloat16Case6(BaseTestCase): + def initTestCase(self): + self.op_type = 'arg_min' + self.dims = (3, 4) + self.dtype = 'float16' + self.axis = 0 + + +class TestArgMinFloat16Case7(BaseTestCase): + def initTestCase(self): + self.op_type = 'arg_min' + self.dims = (3, 4) + self.dtype = 'float16' + self.axis = 1 + + +class TestArgMinFloat16Case8(BaseTestCase): + def initTestCase(self): + self.op_type = 'arg_min' + self.dims = (1, ) + self.dtype = 'float16' + self.axis = 0 + + +class TestArgMinFloat16Case9(BaseTestCase): + def initTestCase(self): + self.op_type = 'arg_min' + self.dims = (2, ) + self.dtype = 'float16' + self.axis = 0 + + +class TestArgMinFloat16Case10(BaseTestCase): + def initTestCase(self): + self.op_type = 'arg_min' + self.dims = (3, ) + self.dtype = 'float16' + self.axis = 0 + + +# test argmin, dtype: float32 +class TestArgMinFloat32Case1(BaseTestCase): + def initTestCase(self): + self.op_type = 'arg_min' + self.dims = (3, 4, 5) + self.dtype = 'float32' + self.axis = -1 + + +class TestArgMinFloat32Case2(BaseTestCase): + def initTestCase(self): + self.op_type = 'arg_min' + self.dims = (3, 4, 5) + self.dtype = 'float32' + self.axis = 0 + + +class TestArgMinFloat32Case3(BaseTestCase): + def initTestCase(self): + self.op_type = 'arg_min' + self.dims = (3, 4, 5) + self.dtype = 'float32' + self.axis = 1 + + +class TestArgMinFloat32Case4(BaseTestCase): + def 
initTestCase(self): + self.op_type = 'arg_min' + self.dims = (3, 4, 5) + self.dtype = 'float32' + self.axis = 2 + + +class TestArgMinFloat32Case5(BaseTestCase): + def initTestCase(self): + self.op_type = 'arg_min' + self.dims = (3, 4) + self.dtype = 'float32' + self.axis = -1 + + +class TestArgMinFloat32Case6(BaseTestCase): + def initTestCase(self): + self.op_type = 'arg_min' + self.dims = (3, 4) + self.dtype = 'float32' + self.axis = 0 + + +class TestArgMinFloat32Case7(BaseTestCase): + def initTestCase(self): + self.op_type = 'arg_min' + self.dims = (3, 4) + self.dtype = 'float32' + self.axis = 1 + + +class TestArgMinFloat32Case8(BaseTestCase): + def initTestCase(self): + self.op_type = 'arg_min' + self.dims = (1, ) + self.dtype = 'float32' + self.axis = 0 + + +class TestArgMinFloat32Case9(BaseTestCase): + def initTestCase(self): + self.op_type = 'arg_min' + self.dims = (2, ) + self.dtype = 'float32' + self.axis = 0 + + +class TestArgMinFloat32Case10(BaseTestCase): + def initTestCase(self): + self.op_type = 'arg_min' + self.dims = (3, ) + self.dtype = 'float32' + self.axis = 0 + + +class TestArgMinAPI(unittest.TestCase): + def initTestCase(self): + self.dims = (3, 4, 5) + self.dtype = 'float32' + self.axis = 0 + + def setUp(self): + self.initTestCase() + self.__class__.use_npu = True + self.place = [paddle.NPUPlace(0)] + + def test_dygraph_api(self): + def run(place): + paddle.disable_static(place) + np.random.seed(2021) + numpy_input = (np.random.random(self.dims)).astype(self.dtype) + tensor_input = paddle.to_tensor(numpy_input) + numpy_output = np.argmin(numpy_input, axis=self.axis) + paddle_output = paddle.argmin(tensor_input, axis=self.axis) + self.assertEqual( + np.allclose(numpy_output, paddle_output.numpy()), True) + paddle.enable_static() + + for place in self.place: + run(place) + + +class TestArgMaxAPI_2(unittest.TestCase): + def initTestCase(self): + self.dims = (3, 4, 5) + self.dtype = 'float32' + self.axis = 0 + self.keep_dims = True + + def setUp(self): + self.initTestCase() + self.__class__.use_npu = True + self.place = [paddle.NPUPlace(0)] + + def test_dygraph_api(self): + def run(place): + paddle.disable_static(place) + np.random.seed(2021) + numpy_input = (np.random.random(self.dims)).astype(self.dtype) + tensor_input = paddle.to_tensor(numpy_input) + numpy_output = np.argmin( + numpy_input, axis=self.axis).reshape(1, 4, 5) + paddle_output = paddle.argmin( + tensor_input, axis=self.axis, keepdim=self.keep_dims) + self.assertEqual( + np.allclose(numpy_output, paddle_output.numpy()), True) + self.assertEqual(numpy_output.shape, paddle_output.numpy().shape) + paddle.enable_static() + + for place in self.place: + run(place) + + +if __name__ == '__main__': + unittest.main() From 35ef41800fe0aacf80b4f2f9b6272a169cb226e3 Mon Sep 17 00:00:00 2001 From: Li Min <11663212+limin2021@users.noreply.github.com> Date: Mon, 16 Aug 2021 20:54:30 +0800 Subject: [PATCH 063/126] Fix typos in English docs for diag and diagflat. (#34869) * Fix typos in english docs for diag and diagflat. --- paddle/fluid/operators/diag_v2_op.cc | 2 +- python/paddle/fluid/layers/nn.py | 6 ++++-- python/paddle/tensor/creation.py | 4 ++-- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/operators/diag_v2_op.cc b/paddle/fluid/operators/diag_v2_op.cc index ae78517182a22a..dd5ad739506e0d 100644 --- a/paddle/fluid/operators/diag_v2_op.cc +++ b/paddle/fluid/operators/diag_v2_op.cc @@ -79,7 +79,7 @@ class DiagV2OpMaker : public framework::OpProtoAndCheckerMaker { "Tensor. 
The default value is 0.") .SetDefault(0.0f); AddComment(R"DOC( - If ``x`` is a vector (1-D tensor), a 2-D square tensor whth the elements of ``x`` as the diagonal is returned. + If ``x`` is a vector (1-D tensor), a 2-D square tensor with the elements of ``x`` as the diagonal is returned. If ``x`` is a matrix (2-D tensor), a 1-D tensor with the diagonal elements of ``x`` is returned. diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 656f1efe493dea..6251bdf165a974 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -13342,7 +13342,7 @@ def shuffle_channel(x, group, name=None): [[0.7, 0.8], [0.8, 0.9]]]] Given group: 2 - then we get a 4-D tensor out whth the same shape of input: + then we get a 4-D tensor out with the same shape of input: out.shape = (1, 4, 2, 2) out.data = [[[[0.1, 0.2], [0.2, 0.3]], @@ -13370,7 +13370,9 @@ def shuffle_channel(x, group, name=None): Examples: .. code-block:: python - import paddle.fluid as fluid + import paddle + import paddle.fluid as fluid + paddle.enable_static() input = fluid.data(name='input', shape=[None,4,2,2], dtype='float32') out = fluid.layers.shuffle_channel(x=input, group=2) """ diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py index 6354e8778795ad..71968d67ed693c 100644 --- a/python/paddle/tensor/creation.py +++ b/python/paddle/tensor/creation.py @@ -777,7 +777,7 @@ def meshgrid(*args, **kwargs): def diagflat(x, offset=0, name=None): """ - If ``x`` is a vector (1-D tensor), a 2-D square tensor whth the elements of ``x`` as the diagonal is returned. + If ``x`` is a vector (1-D tensor), a 2-D square tensor with the elements of ``x`` as the diagonal is returned. If ``x`` is a tensor (more than 1-D), a 2-D square tensor with the elements of flattened ``x`` as the diagonal is returned. @@ -902,7 +902,7 @@ def diagflat(x, offset=0, name=None): def diag(x, offset=0, padding_value=0, name=None): """ - If ``x`` is a vector (1-D tensor), a 2-D square tensor whth the elements of ``x`` as the diagonal is returned. + If ``x`` is a vector (1-D tensor), a 2-D square tensor with the elements of ``x`` as the diagonal is returned. If ``x`` is a matrix (2-D tensor), a 1-D tensor with the diagonal elements of ``x`` is returned. 
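
The hunks above are documentation-only; runtime behaviour is unchanged. For reference while reading the corrected wording, a minimal sketch of what paddle.diag and paddle.diagflat return (assuming a Paddle 2.x build in the default dynamic-graph mode; the printed values are illustrative only):

    import paddle

    x = paddle.to_tensor([1, 2, 3])
    # 1-D input: diag builds a 2-D square tensor with x on its diagonal.
    print(paddle.diag(x).shape)        # [3, 3]

    m = paddle.to_tensor([[1, 2], [3, 4]])
    # 2-D input: diag returns the 1-D diagonal instead.
    print(paddle.diag(m).numpy())      # [1 4]

    # diagflat flattens any input first, then places it on the diagonal.
    print(paddle.diagflat(m).shape)    # [4, 4]
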
From ed6624ab78d70aa51ca25d6759e6d2ca4e9da9cb Mon Sep 17 00:00:00 2001 From: zhangchunle Date: Mon, 16 Aug 2021 21:18:43 +0800 Subject: [PATCH 064/126] concurrent (#34908) --- paddle/scripts/paddle_build.sh | 146 ++++--- tools/parallel_UT_rule.py | 692 ++++++++++++++++++++++++++++++++- 2 files changed, 788 insertions(+), 50 deletions(-) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 1d3504556fc3d8..abaae9a361d6bb 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -1059,6 +1059,7 @@ function get_quickly_disable_ut() { function card_test() { set -m + CTEST_PARALLEL_LEVEL=2 case_count $1 $2 ut_startTime_s=`date +%s` @@ -1127,10 +1128,8 @@ function card_test() { ut_endTime_s=`date +%s` if (( $2 == -1 )); then echo "exclusive TestCases Total Time: $[ $ut_endTime_s - $ut_startTime_s ]s" - echo "ipipe_log_param_Exclusive_TestCases_Total_Time: $[ $ut_endTime_s - $ut_startTime_s ]s" >> ${PADDLE_ROOT}/build/build_summary.txt else echo "$2 card TestCases Total Time: $[ $ut_endTime_s - $ut_startTime_s ]s" - echo "ipipe_log_param_${2}_Cards_TestCases_Total_Time: $[ $ut_endTime_s - $ut_startTime_s ]s" >> ${PADDLE_ROOT}/build/build_summary.txt fi set +m } @@ -1181,16 +1180,19 @@ set +x EXIT_CODE=0; test_cases=$(ctest -N -V) # get all test cases # Note(zhouwei): Parallel runs are relative to 'CTEST_PARALLEL_LEVEL', e.g: '4 job each time' means 4*CTEST_PARALLEL_LEVEL - single_card_tests_high_parallel='^job$' # cases list which would run the most job each time with single GPU - single_card_tests_two_parallel='^job$' # cases list which would run 2 job each time with single GPU - single_card_tests_non_parallel='^job$' # cases list which would run 1 job each time with single GPU - single_card_tests='^job$' # all cases list which would take single GPU + single_card_tests_high_parallel='^job$' # cases list which would run 24 job each time with single GPU + single_card_tests_secondary_high_parallel='^job$' # cases list which would run 15 job each time with single GPU + single_card_tests_third_high_parallel='^job$' # cases list which would run 12 job each time with single GPU + single_card_tests_medium_parallel='^job$' # cases list which would run 7 job each time with single GPU + single_card_tests_non_parallel='^job$' # cases list which would run 2 job each time with single GPU + single_card_tests='^job$' # all cases list which would take single GPU - multiple_card_tests_two_parallel='^job$' # cases list which would run 2 job each time with multiple GPUs, most cases would be two GPUs - multiple_card_tests_non_parallel='^job$' # cases list which would run 1 job each time with multiple GPUs, most cases would be two GPUs + multiple_card_tests_medium_parallel='^job$' # cases list which would run 4 job each time with multiple GPUs, most cases would be two GPUs + multiple_card_tests_non_parallel='^job$' # cases list which would run 2 job each time with multiple GPUs, most cases would be two GPUs - exclusive_tests_two_parallel='^job$' # cases list which would run 2 job exclusively(with all GPUs) - exclusive_tests_non_parallel='^job$' # cases list which would run 1 job exclusively(with all GPUs) + exclusive_tests_high_parallel='^job$' # cases list which would run 5 job exclusively(with all GPUs) + exclusive_tests_medium_parallel='^job$' # cases list which would run 3 job exclusively(with all GPUs) + exclusive_tests_non_parallel='^job$' # cases list which would run 2 job exclusively(with all GPUs) is_exclusive='' # indicate whether the case is exclusive 
type is_multicard='' # indicate whether the case is multiple GPUs type @@ -1200,9 +1202,11 @@ set +x UT_list=$(ctest -N | awk -F ': ' '{print $2}' | sed '/^$/d' | sed '$d') output=$(python ${PADDLE_ROOT}/tools/parallel_UT_rule.py "${UT_list}") cpu_parallel_job=$(echo $output | cut -d ";" -f 1) - tetrad_parallel_job=$(echo $output | cut -d ";" -f 2) - two_parallel_job=$(echo $output | cut -d ";" -f 3) - non_parallel_job=$(echo $output | cut -d ";" -f 4) + secondary_cpu_parallel_job=$(echo $output | cut -d ";" -f 2) + third_cpu_parallel_job=$(echo $output | cut -d ";" -f 3) + tetrad_parallel_job=$(echo $output | cut -d ";" -f 4) + two_parallel_job=$(echo $output | cut -d ";" -f 5) + non_parallel_job=$(echo $output | cut -d ";" -f 6) while read -r line; do if [[ "$line" == "" ]]; then continue @@ -1244,22 +1248,28 @@ set +x fi if [[ "$is_exclusive" != "" ]]; then - if [[ $(echo $cpu_parallel_job$tetrad_parallel_job$two_parallel_job | grep -o "\^$testcase\\$") != "" ]]; then - exclusive_tests_two_parallel="$exclusive_tests_two_parallel|^$testcase$" + if [[ $(echo $cpu_parallel_job | grep -o "\^$testcase\\$") != "" ]]; then + exclusive_tests_high_parallel="$exclusive_tests_high_parallel|^$testcase$" + elif [[ $(echo $tetrad_parallel_job$two_parallel_job | grep -o "\^$testcase\\$") != "" ]]; then + exclusive_tests_medium_parallel="$exclusive_tests_medium_parallel|^$testcase$" else exclusive_tests_non_parallel="$exclusive_tests_non_parallel|^$testcase$" fi elif [[ "$is_multicard" != "" ]]; then - if [[ $(echo $cpu_parallel_job$tetrad_parallel_job$two_parallel_job | grep -o "\^$testcase\\$") != "" ]]; then - multiple_card_tests_two_parallel="$multiple_card_tests_two_parallel|^$testcase$" + if [[ $(echo $cpu_parallel_job$tetrad_parallel_job | grep -o "\^$testcase\\$") != "" ]]; then + multiple_card_tests_medium_parallel="$multiple_card_tests_medium_parallel|^$testcase$" else multiple_card_tests_non_parallel="$multiple_card_tests_non_parallel|^$testcase$" fi else if [[ $(echo $cpu_parallel_job | grep -o "\^$testcase\\$") != "" ]]; then single_card_tests_high_parallel="$single_card_tests_high_parallel|^$testcase$" + elif [[ $(echo $secondary_cpu_parallel_job | grep -o "\^$testcase\\$") != "" ]]; then + single_card_tests_secondary_high_parallel="$single_card_tests_secondary_high_parallel|^$testcase$" + elif [[ $(echo $third_cpu_parallel_job | grep -o "\^$testcase\\$") != "" ]]; then + single_card_tests_third_high_parallel="$single_card_tests_third_high_parallel|^$testcase$" elif [[ $(echo $tetrad_parallel_job$two_parallel_job | grep -o "\^$testcase\\$") != "" ]]; then - single_card_tests_two_parallel="$single_card_tests_two_parallel|^$testcase$" + single_card_tests_medium_parallel="$single_card_tests_medium_parallel|^$testcase$" else single_card_tests_non_parallel="$single_card_tests_non_parallel|^$testcase$" fi @@ -1271,24 +1281,42 @@ set +x matchstr='' testcase='' done <<< "$test_cases"; - - card_test "$single_card_tests_high_parallel" 1 6 # run cases the most each time with single GPU - card_test "$single_card_tests_two_parallel" 1 2 # run cases 2 job each time with single GPU - card_test "$single_card_tests_non_parallel" 1 # run cases 1 job each time with single GPU - card_test "$multiple_card_tests_two_parallel" 2 2 # run cases 2 job each time with two GPUs - card_test "$multiple_card_tests_non_parallel" 2 # run cases 1 job each time with two GPUs - - card_test "$exclusive_tests_two_parallel" -1 2 # run cases exclusively, in this cases would be run with 2/4/8 GPUs - card_test 
"$exclusive_tests_non_parallel" -1 # run cases exclusively, in this cases would be run with 2/4/8 GPUs + ut_actual_total_startTime_s=`date +%s` + + single_ut_startTime_s=`date +%s` + card_test "$single_card_tests_high_parallel" 1 24 # run cases 24 job each time with single GPU + card_test "$single_card_tests_secondary_high_parallel" 1 15 # run cases 15 job each time with single GPU + card_test "$single_card_tests_third_high_parallel" 1 12 # run cases 12 job each time with single GPU + card_test "$single_card_tests_medium_parallel" 1 7 # run cases 7 job each time with single GPU + card_test "$single_card_tests_non_parallel" 1 2 # run cases 2 job each time with single GPU + single_ut_endTime_s=`date +%s` + + multi_ut_startTime_s=`date +%s` + card_test "$multiple_card_tests_medium_parallel" 2 4 # run cases 2 job each time with two GPUs + card_test "$multiple_card_tests_non_parallel" 2 2 # run cases 1 job each time with two GPUs + multi_ut_endTime_s=`date +%s` + + exclu_ut_startTime_s=`date +%s` + card_test "$exclusive_tests_high_parallel" -1 5 # run cases exclusively, in this cases would be run with 2/4/8 GPUs + card_test "$exclusive_tests_medium_parallel" -1 3 # run cases exclusively, in this cases would be run with 2/4/8 GPUs + card_test "$exclusive_tests_non_parallel" -1 2 # run cases exclusively, in this cases would be run with 2/4/8 GPUs + exclu_ut_endTime_s=`date +%s` + + echo "ipipe_log_param_1_TestCases_Total_Time: $[ $single_ut_endTime_s - $single_ut_startTime_s ]s" >> ${PADDLE_ROOT}/build/build_summary.txt + echo "ipipe_log_param_2_TestCases_Total_Time: $[ $multi_ut_endTime_s - $multi_ut_startTime_s ]s" >> ${PADDLE_ROOT}/build/build_summary.txt + echo "ipipe_log_param_Exclusive_TestCases_Total_Time: $[ $exclu_ut_endTime_s - $exclu_ut_startTime_s ]s" >> ${PADDLE_ROOT}/build/build_summary.txt + collect_failed_tests rm -f $tmp_dir/* exec_times=0 retry_unittests_record='' - retry_time=3 - exec_time_array=('first' 'second' 'third') + retry_time=4 + exec_time_array=('first' 'second' 'third' 'fourth') + parallel_failed_tests_exec_retry_threshold=80 exec_retry_threshold=10 is_retry_execuate=0 + rerun_ut_startTime_s=`date +%s` if [ -n "$failed_test_lists" ];then if [ ${TIMEOUT_DEBUG_HELP:-OFF} == "ON" ];then bash $PADDLE_ROOT/tools/timeout_debug_help.sh "$failed_test_lists" # cat logs for tiemout uts which killed by ctest @@ -1297,14 +1325,30 @@ set +x need_retry_ut_arr=(${need_retry_ut_str}) need_retry_ut_count=${#need_retry_ut_arr[@]} read retry_unittests <<< $(echo "$failed_test_lists" | grep -oEi "\-.+\(.+\)" | sed 's/(.\+)//' | sed 's/- //' ) - if [ $need_retry_ut_count -lt $exec_retry_threshold ];then - while ( [ $exec_times -lt $retry_time ] ) - do + while ( [ $exec_times -lt $retry_time ] ) + do + if [[ "${exec_times}" == "0" ]] ;then + if [ $need_retry_ut_count -lt $parallel_failed_tests_exec_retry_threshold ];then + is_retry_execuate=0 + else + is_retry_execuate=1 + fi + elif [[ "${exec_times}" == "1" ]] ;then + read need_retry_ut_str <<< $(echo "$failed_test_lists" | grep -oEi "\-.+\(.+\)" | sed 's/(.\+)//' | sed 's/- //' ) + need_retry_ut_arr=(${need_retry_ut_str}) + need_retry_ut_count=${#need_retry_ut_arr[@]} + if [ $need_retry_ut_count -lt $exec_retry_threshold ];then + is_retry_execuate=0 + else + is_retry_execuate=1 + fi + fi + if [[ "$is_retry_execuate" == "0" ]];then set +e retry_unittests_record="$retry_unittests_record$failed_test_lists" failed_test_lists_ult=`echo "${failed_test_lists}" |grep -Po '[^ ].*$'` set -e - if [[ "${exec_times}" == "1" ]];then + if [[ 
"${exec_times}" == "1" ]] || [[ "${exec_times}" == "3" ]];then if [[ "${failed_test_lists}" == "" ]];then break else @@ -1315,11 +1359,9 @@ set +x echo "This is the ${exec_time_array[$exec_times]} time to re-run" echo "=========================================" echo "The following unittest will be re-run:" - echo "${retry_unittests}" - + echo "${retry_unittests}" for line in ${retry_unittests[@]} ; do - read tmp_one_tmp <<< "$( echo $single_card_tests | grep -oEi $line )" read tmp_mul_tmp <<< "$( echo $multiple_card_tests | grep -oEi $line )" read exclusive_tmp <<< "$( echo $exclusive_tests | grep -oEi $line )" @@ -1347,7 +1389,7 @@ set +x done if [[ "$one_card_retry" != "" ]]; then - card_test "$one_card_retry" 1 + card_test "$one_card_retry" 1 4 fi if [[ "$multiple_card_retry" != "" ]]; then @@ -1357,21 +1399,22 @@ set +x if [[ "$exclusive_retry" != "" ]]; then card_test "$exclusive_retry" -1 fi - exec_times=$[$exec_times+1] failed_test_lists='' collect_failed_tests rm -f $tmp_dir/* one_card_retry='' multiple_card_retry='' - exclusive_retry='' - done - else - # There are more than 10 failed unit tests, so no unit test retry - is_retry_execuate=1 - fi + exclusive_retry='' + fi + done fi + rerun_ut_endTime_s=`date +%s` + + echo "ipipe_log_param_Rerun_TestCases_Total_Time: $[ $rerun_ut_endTime_s - $rerun_ut_startTime_s ]s" >> ${PADDLE_ROOT}/build/build_summary.txt + ut_actual_total_endTime_s=`date +%s` + echo "ipipe_log_param_actual_TestCases_Total_Time: $[ $ut_actual_total_endTime_s - $ut_actual_total_startTime_s ]s" >> ${PADDLE_ROOT}/build/build_summary.txt if [[ "$EXIT_CODE" != "0" ]]; then show_ut_retry_result fi @@ -1380,7 +1423,20 @@ set -ex } function show_ut_retry_result() { - if [[ "$is_retry_execuate" != "0" ]];then + if [ "$SYSTEM" == "Darwin" ]; then + exec_retry_threshold_count=10 + else + exec_retry_threshold_count=80 + fi + if [[ "$is_retry_execuate" != "0" ]] && [[ "${exec_times}" == "0" ]] ;then + failed_test_lists_ult=`echo "${failed_test_lists}" | grep -Po '[^ ].*$'` + echo "=========================================" + echo "There are more than ${exec_retry_threshold_count} failed unit tests in parallel test, so no unit test retry!!!" + echo "=========================================" + echo "The following tests FAILED: " + echo "${failed_test_lists_ult}" + exit 8; + elif [[ "$is_retry_execuate" != "0" ]] && [[ "${exec_times}" == "1" ]];then failed_test_lists_ult=`echo "${failed_test_lists}" | grep -Po '[^ ].*$'` echo "=========================================" echo "There are more than 10 failed unit tests, so no unit test retry!!!" 
diff --git a/tools/parallel_UT_rule.py b/tools/parallel_UT_rule.py index e6a628ae2fc95f..ccf849a2975845 100644 --- a/tools/parallel_UT_rule.py +++ b/tools/parallel_UT_rule.py @@ -14,6 +14,660 @@ import sys import os +import platform + +# mem=0 : It run 24 job each time in Single cases; 4 job each time in Multi cases; 5 job each time in exclusive cases +HIGH_PARALLEL_JOB_NEW = [ + 'mask_util_test', 'test_communicator_ps_gpu', 'preprocess_local_imagenet', + 'test_nearest_interp_v2_mkldnn_op', 'op_call_stack_test', + 'test_fleet_amp_meta_optimizer', 'test_mkldnn_scale_matmul_fuse_pass', + 'bfloat16_gpu_test', 'test_fc_gru_fuse_pass_cc', 'device_worker_test', + 'test_custom_conj', 'save_load_util_test', 'infer_io_utils_tester', + 'test_transpose_bf16_mkldnn_op', 'test_container', 'cpu_helper_test', + 'test_fake_init_op', 'test_concat_int8_mkldnn_op', + 'test_lookup_table_dequant_op', 'test_broadcast_shape', + 'test_program_to_string', 'test_generate_mask_labels_op', + 'test_eager_deletion_dynamic_rnn_base', 'test_global_var_getter_setter', + 'test_ifelse_basic', 'test_get_set_flags', 'dim_test', + 'test_py_reader_return_list', 'test_fleet_meta_optimizer_base', + 'test_py_reader_error_msg', 'scope_test', 'buffered_allocator_test', + 'test_scaled_dot_product_attention', 'prune_test', 'test_chunk_eval_op', + 'test_static_analysis', 'test_fleet_lars_meta_optimizer', + 'heter_server_test', 'test_while_op', + 'test_runtime_and_compiletime_exception', 'test_precision_recall_op', + 'test_get_inputs_outputs_in_block', 'test_lite_engine_op', + 'test_repeated_fc_relu_fuse_pass_cc', + 'test_mkldnn_matmul_op_output_fuse_pass', 'cudnn_helper_test', + 'test_check_abi', 'data_type_test', 'test_recurrent_op', 'test_asp_utils', + 'test_paddle_inference_api', 'test_reference_count_pass_last_lived_ops', + 'test_op_support_gpu', 'test_conditional_block', + 'test_fleet_rolemaker_init', 'test_pybind_interface', 'test_io_save_load', + 'test_split_and_merge_lod_tensor_op', 'test_fusion_lstm_int8_mkldnn_op', + 'test_benchmark', 'test_protobuf', 'test_tdm_sampler_op', + 'test_teacher_student_sigmoid_loss_op', 'test_transpose_int8_mkldnn_op', + 'test_transpose_mkldnn_op', 'test_fleet_rolemaker_4', 'to_string_test', + 'test_c_comm_init_all_op', 'test_bilinear_interp_mkldnn_op', + 'test_split_bf16_mkldnn_op', 'test_cpu_quantize_squash_pass', + 'test_batch_norm_act_fuse_pass', 'test_mkldnn_op_inplace', + 'test_seqpool_concat_fuse_pass', 'test_analyzer_save_model', + 'test_exception', 'test_fc_lstm_fuse_pass', 'test_similarity_focus_op', + 'test_conv_batch_norm_mkldnn_fuse_pass', 'test_sequence_last_step', + 'test_mkldnn_cpu_bfloat16_pass', 'op_debug_string_test', + 'test_quant2_int8_mkldnn_pass', 'test_layer', 'test_sampling_id_op', + 'test_nce', 'graph_helper_test', + 'test_static_shape_inferrence_for_shape_tensor', + 'test_layer_norm_mkldnn_op', 'test_fleet_launch_async', + 'test_multi_gru_fuse_pass', 'test_hash_op', 'test_rpn_target_assign_op', + 'test_concat_bf16_mkldnn_op', 'test_fc_lstm_fuse_pass_cc', 'test_version', + 'gather_test', 'test_mkldnn_inplace_fuse_pass', 'test_reshape_bf16_op', + 'test_compat', 'test_data_feeder', 'cpu_vec_test', + 'test_distributed_strategy', 'test_hsigmoid_op', 'test_hooks', + 'test_fleet_base_2', 'op_kernel_type_test', + 'test_layer_norm_bf16_mkldnn_op', 'test_fleetrun', 'cpu_info_test', + 'brpc_utils_test', 'test_fusion_seqexpand_concat_fc_op', 'test_dataset_voc', + 'test_analyzer_capi_exp_int', 'test_post_training_quantization_resnet50', + 'cuda_helper_test', 
'test_conv_concat_relu_mkldnn_fuse_pass', + 'test_bf16_utils', 'test_sum_bf16_mkldnn_op', + 'test_unsqueeze2_eltwise_fuse_pass', 'dense_table_test', + 'test_collective_optimizer', 'test_origin_info', 'test_dgc_optimizer', + 'test_avoid_twice_initialization', 'test_reduce_bf16_mkldnn_op', + 'test_mkldnn_conv_bias_fuse_pass', 'cow_ptr_tests', 'eigen_test', + 'reader_blocking_queue_test', 'test_fusion_gru_op', 'operator_test', + 'test_fusion_gru_int8_mkldnn_op', 'test_cpu_bfloat16_pass', + 'test_multiprocess_dataloader_iterable_dataset_split', 'test_scope', + 'test_analyzer_bfloat16_mobilenetv2', 'test_fleet_rolemaker_2', + 'float16_test', 'test_dpsgd_op', + 'test_conv_elementwise_add_mkldnn_fuse_pass', 'test_crypto', + 'test_sgd_op_bf16', 'test_analyzer_capi_exp_ner', + 'lite_subgraph_pass_tester', 'test_tf32_cudnn', 'threadpool_test', + 'test_cpu_quantize_pass', 'test_analyzer_capi_exp_pd_tensor', 'tuple_test', + 'test_analyzer_lac', 'test_prune', 'test_bilinear_interp_v2_mkldnn_op', + 'test_lod_tensor_array', 'test_logging_utils', 'test_fleet_nocvm_1', + 'stringprintf_test', 'test_nearest_interp_mkldnn_op', + 'test_matmul_mkldnn_op', 'test_debugger', 'test_custom_attrs_jit', + 'test_lrn_mkldnn_op', 'test_set_bool_attr', 'version_test', + 'test_broadcast_to_op', 'test_squared_mat_sub_fuse_pass', + 'test_fleet_ascend_utils', 'test_layer_norm_fuse_pass', + 'test_fused_emb_seq_pool_op', 'test_imperative_data_loader_exit_func', + 'test_feed_fetch_method', 'test_protobuf_descs', 'test_fleet_unitaccessor', + 'test_sequence_scatter_op', 'test_skip_layernorm_fuse_pass', + 'test_fs_interface', 'test_gast_with_compatibility', + 'test_repeated_fc_relu_fuse_pass', 'timer_test', 'var_type_traits_test', + 'test_py_reader_sample_generator', 'test_conv2d_transpose_mkldnn_op', + 'test_fleet_runtime', 'test_rnn_cudnn_params_packing', + 'test_mkldnn_placement_pass', 'test_fc_elementwise_layernorm_fuse_pass', + 'program_desc_test', 'test_simplify_with_basic_ops_pass', + 'test_dygraph_mode_of_unittest', 'gather_op_test', 'test_trainer_desc', + 'test_matmul_bf16_mkldnn_op', 'test_analyzer_seq_conv1', + 'test_fused_embedding_fc_lstm_op', 'test_conv2d_transpose_bf16_mkldnn_op', + 'check_reduce_rank_test', 'test_progressbar', 'test_seed_op', + 'test_shrink_rnn_memory', 'test_fc_bf16_mkldnn_op', + 'test_sequence_first_step', 'test_layer_norm_fuse_pass_cc', + 'test_fusion_lstm_mkldnn_op', 'test_elementwise_add_bf16_mkldnn_op', + 'test_static_save_load_bf16', 'test_elementwise_mul_bf16_mkldnn_op', + 'test_distributions', 'operator_exception_test', 'dropout_op_test', + 'test_gpu_package_without_gpu_device', 'test_detection_map_op', + 'test_zeros_op', 'test_launch_coverage', + 'test_mkldnn_conv_activation_fuse_pass', 'test_inference_model_io', + 'heter_listen_and_server_test', 'test_fusion_repeated_fc_relu_op', + 'cudnn_desc_test', 'test_beam_search_op', 'test_var_conv_2d', + 'test_listen_and_serv_op', 'test_dequantize_mkldnn_op', + 'test_analyzer_capi_exp_pd_threads', 'test_selected_rows', + 'test_fleet_sharding_meta_optimizer', 'test_inference_api', + 'test_mkldnn_inplace_pass', 'test_data_generator', + 'test_deprecated_memory_optimize_interfaces', 'test_ir_skip_layernorm_pass', + 'broadcast_op_test', 'test_multihead_matmul_fuse_pass', + 'test_lookup_table_bf16_op', 'test_positive_negative_pair_op', 'init_test', + 'test_tensorrt', 'test_check_error', 'test_program', 'mmap_allocator_test', + 'test_reshape_transpose_matmul_mkldnn_fuse_pass', 'test_communicator_async', + 'test_downpoursgd', 'variable_test', 
'test_quantization_mkldnn_pass', + 'test_quantize_mkldnn_op', 'test_create_op_doc_string', + 'test_analyzer_lexical_gru_bfloat16', 'test_imperative_data_loader_process', + 'assign_op_test', 'test_analyzer_capi_exp_xpu', 'test_conv_bn_fuse_pass_cc', + 'test_recommender_system', 'test_ones_op', 'test_fc_mkldnn_op', + 'test_load_op_xpu', 'test_pool2d_int8_mkldnn_op', 'test_mul_int8_mkldnn_op', + 'test_scale_matmul_fuse_pass', 'test_fleet_graph_executor', 'decorator_test', + 'test_collective_base', 'test_lod_rank_table', 'test_multi_gru_mkldnn_op', + 'test_eager_deletion_conditional_block', 'op_proto_maker_test', + 'test_mkldnn_op_nhwc', 'test_fc_act_mkldnn_fuse_pass', 'test_fleet_base_3', + 'test_basic_rnn_name', 'test_query_op', 'test_fleet_base_4', + 'save_load_op_test', 'test_batch_sampler', + 'test_image_classification_layer', 'test_fusion_gru_mkldnn_op', + 'graph_test', 'test_ir_graph', 'test_hapi_hub_model', + 'test_requantize_mkldnn_op', 'test_depthwise_conv_mkldnn_pass', + 'test_fleet_metric', 'test_fc_fuse_pass_cc', 'test_fleet_private_function', + 'test_fleet', 'test_executor_check_feed', 'test_py_reader_lod_level_share', + 'nccl_context_test', 'inlined_vector_test', + 'test_generate_proposal_labels_op', 'test_analyzer_capi_exp_pd_config', + 'test_locality_aware_nms_op', 'test_imperative_decorator', + 'test_npair_loss_op', 'test_ps_dispatcher', 'test_analyzer_rnn2', + 'test_multi_gru_seq_fuse_pass', 'test_filter_by_instag_op', 'test_switch', + 'test_matmul_transpose_reshape_fuse_pass', 'test_mkldnn_caching', + 'test_fetch_var', 'op_compatible_info_test', 'complex_test', + 'test_fleet_static_mp_layers', 'test_aligned_allocator', + 'test_analyzer_transformer_fuse', 'test_sequence_topk_avg_pooling', + 'test_analyzer_lexical_gru', 'test_broadcast_error', 'test_context_manager', + 'test_registry', 'brpc_service_sparse_sgd_test', 'test_operator', + 'test_mkldnn_conv_concat_relu_mkldnn_fuse_pass', 'test_collective_api_base', + 'test_entry_attr', 'test_get_places_op', 'test_softmax_mkldnn_op', + 'test_dynrnn_static_input', 'auto_growth_best_fit_allocator_test', + 'test_batch_norm_mkldnn_op', 'test_bpr_loss_op', + 'no_need_buffer_vars_inference_test', 'test_fleet_cc', 'test_download', + 'test_fleet_recompute_meta_optimizer', 'test_seqpool_cvm_concat_fuse_pass', + 'test_common_infer_shape_functions', 'test_fusion_seqpool_concat_op', + 'test_op_compat_sensible_pass', 'test_fs', 'test_fc_rnn_mkldnn_fuse_pass', + 'split_test', 'test_fusion_group_pass', 'test_fusion_lstm_bf16_mkldnn_op', + 'test_executor_feed_non_tensor', 'test_var_info', 'test_reducescatter', + 'test_fleet_ps', 'test_check_import_scipy', 'test_load_vars_shape_check', + 'test_nn_functional_embedding_static', 'test_fleet_rolemaker_new', + 'test_imperative_base', 'dist_multi_trainer_test', + 'test_mine_hard_examples_op', 'test_post_training_quantization_lstm_model', + 'aes_cipher_test', 'test_analyzer_zerocopytensor_tensor', 'rw_lock_test', + 'exception_holder_test', 'enforce_test', 'test_rnn_memory_helper_op', + 'ddim_test', 'test_eager_deletion_padding_rnn', 'test_is_test_pass', + 'test_fusion_seqconv_eltadd_relu_op', 'test_fleet_localsgd_meta_optimizer', + 'node_test', 'test_analyzer_text_classification', + 'test_seq_concat_fc_fuse_pass', 'test_imperative_numpy_bridge', + 'test_adaptive_pool2d_convert_global_pass', 'test_lookup_table_v2_bf16_op', + 'test_operator_desc', 'test_elementwise_mul_mkldnn_op', + 'test_fetch_handler', 'test_cpu_bfloat16_placement_pass', + 'test_match_matrix_tensor_op', 'test_fleet_run_random_port', + 
'test_mkldnn_matmul_transpose_reshape_fuse_pass', + 'test_fleet_lamb_meta_optimizer', 'test_op_version', + 'fused_broadcast_op_test', 'stringpiece_test', 'test_tdm_child_op', + 'test_imperative_group', 'test_analyzer_capi_exp', + 'test_post_training_quantization_mobilenetv1', 'test_load_op', + 'test_executor_and_use_program_cache', 'op_registry_test', + 'test_create_global_var', 'test_dispatch_jit', 'table_test', 'test_full_op', + 'test_recv_save_op', 'test_fusion_lstm_op', + 'test_eager_deletion_recurrent_op', 'brpc_service_dense_sgd_test', + 'op_tester', 'test_eager_deletion_mnist', 'test_infer_shape', + 'test_fleet_rolemaker', 'test_entry_attr2', 'test_monitor', + 'test_require_version', 'test_function_spec', 'test_image', + 'lod_tensor_test', 'place_test', 'test_fleet_launch_cloud', + 'test_conv2d_bf16_mkldnn_op', + 'test_parallel_executor_run_load_infer_program', 'scatter_test', + 'graph_to_program_pass_test', 'test_lod_tensor_array_ops', + 'test_embedding_eltwise_layernorm_fuse_pass', 'complex_gpu_test', + 'save_load_combine_op_test', 'test_logger', 'test_analyzer', 'test_utils', + 'barrier_table_test', 'test_memory_usage', 'test_sysconfig', 'reader_test', + 'test_conv_bias_mkldnn_fuse_pass', 'math_function_test', + 'beam_search_decode_op_test', 'save_quant2_model_resnet50', 'bfloat16_test', + 'test_scale_bf16_mkldnn_op', 'test_fp16_utils', + 'test_cpu_quantize_placement_pass', 'test_slice_var', 'test_analyzer_ocr', + 'test_flags_use_mkldnn', 'pass_test', 'test_trainable', + 'test_sync_batch_norm_pass', 'lodtensor_printer_test', 'test_calc_gradient', + 'test_create_parameter', 'test_infer_no_need_buffer_slots', + 'test_run_fluid_by_module_or_command_line', 'test_boxps', + 'test_initializer', 'test_fusion_squared_mat_sub_op', 'test_desc_clone', + 'test_analyzer_mobilenet_depthwise_conv', 'test_analyzer_pyramid_dnn', + 'test_analyzer_detect_functional_mkldnn', 'errors_test', 'test_name_scope', + 'var_type_inference_test', 'test_const_value', + 'test_spawn_and_init_parallel_env', 'test_fleet_gradient_scale', + 'unroll_array_ops_test', 'test_fc_gru_fuse_pass', 'op_version_registry_test', + 'test_cudnn_placement_pass', 'cipher_utils_test', 'test_program_code', + 'test_save_model_without_var', 'program_processing_test', + 'test_fleet_distributed_strategy', 'test_hybrid_parallel_topology', + 'test_ascend_trigger', 'test_fleet_rolemaker_3', + 'test_conv_activation_mkldnn_fuse_pass', 'test_fusion_gru_bf16_mkldnn_op', + 'test_model_cast_to_bf16', 'test_quantize_transpiler', + 'conditional_block_op_test', 'test_fleet_gradient_merge_meta_optimizer', + 'test_graph_pattern_detector', 'test_fleet_fp16_allreduce_meta_optimizer', + 'test_unique_name', 'test_multi_out_jit', 'test_attention_lstm_op', + 'test_mkldnn_quantizer_config', 'data_layout_transform_test', + 'test_conv2d_int8_mkldnn_op', 'test_fusion_seqpool_cvm_concat_op', + 'save_quant2_model_gru', 'test_generator', 'test_sum_mkldnn_op', + 'test_fleet_util', 'test_fleet_dgc_meta_optimizer', + 'selected_rows_functor_test', 'test_default_scope_funcs', + 'test_communicator_sync', 'test_communicator_half_async', + 'test_dynrnn_gradient_check', 'test_pool2d_bf16_mkldnn_op', + 'test_table_printer', 'test_framework_debug_str', 'test_dist_fleet_ps2', + 'test_collective_scatter_api', 'test_dist_sparse_tensor_load_ftrl', + 'test_dist_mnist_dgc_nccl', 'test_dist_oneps', 'test_dist_tree_index', + 'test_dist_fleet_ps', 'test_dist_fleet_a_sync_optimizer_sync', + 'test_dist_fleet_decay', 'test_auto_checkpoint2', + 'test_dist_fleet_heter_ctr', 
'test_dist_fleet_simnet', + 'test_dist_sparse_load_ps1', 'test_dist_mnist_fleet_save', + 'test_dist_fleet_ps7', 'test_dist_mnist_fleetapi', + 'test_dist_sparse_tensor_load_adam', 'test_dist_fleet_ps_gpu_ctr', + 'test_dist_mnist_ring_allreduce', 'test_dist_op', 'test_new_group_api', + 'test_dist_fleet_heter_base', 'test_collective_split_col_linear', + 'test_parallel_executor_mnist', 'test_dist_fleet_ctr2', + 'test_dist_fleet_heter_program', 'test_dist_fleet_ctr', + 'test_collective_allreduce_api', 'test_dataloader_unkeep_order', + 'test_dataloader_keep_order', 'test_dist_se_resnext_sync', 'test_hdfs2', + 'test_dist_fleet_ps6', 'test_dist_fleet_a_sync_optimizer_auto_async', + 'test_dist_fleet_a_sync_optimizer_auto', 'test_dist_fleet_ps9', + 'test_dist_fleet_raw_program_optimizer_fuse_allreduce', + 'test_dist_fleet_ps11', 'test_dist_fleet_ps8', + 'test_dist_mnist_fp16_allreduce', 'test_dist_fleet_ps12', + 'test_collective_split_row_linear', 'test_collective_reduce_api', + 'test_multiprocess_dataloader_exception', 'test_collective_allgather_api', + 'test_dist_fleet_ps10', 'test_dist_sparse_tensor_load_rmsprop', + 'test_collective_split_embedding_none_divisible', + 'test_parallel_dygraph_dataparallel', 'test_auto_checkpoint3', + 'test_fleet_graph_execution_meta_optimizer', 'test_auto_checkpoint1', + 'test_dist_fleet_ps3', 'test_dist_mnist_pg', 'test_pipeline_parallel', + 'test_dist_fleet_ps5', 'test_dist_fleet_sparse_embedding_ctr', + 'test_collective_broadcast_api', 'test_fleet_checkpoint', + 'retry_allocator_test', 'test_auto_checkpoint_multiple', + 'test_dist_mnist_backward_deps', 'test_dist_mnist_multi_comm', 'test_hdfs3', + 'test_hdfs1', 'test_dist_allreduce_op', + 'test_parallel_dygraph_sparse_embedding', 'test_dist_se_resnext_dgc', + 'test_dist_sharding_save', 'test_dist_fleet_a_sync_optimizer_async', + 'test_gen_nccl_id_op', 'test_auto_checkpoint', + 'test_collective_split_embedding', + 'test_parallel_dygraph_sparse_embedding_over_height', + 'test_dist_sparse_tensor_load_momentum', 'test_auto_checkpoint_dist_basic', + 'test_dist_fleet_ps4', 'test_collective_alltoall_api', + 'test_dist_fleet_raw_program_optimizer', 'test_parallel_dygraph_mp_layers', + 'test_dist_fleet_geo', 'test_fleet_raw_program_meta_optimizer', + 'test_sync_batch_norm_op', 'test_dist_mnist_batch_merge', + 'test_fleet_launch_ps', 'test_dist_sparse_tensor_load_sgd', + 'test_dist_fleet_a_sync_optimizer_auto_geo', + 'test_dist_lookup_sparse_table_fuse_ops', + 'test_dist_fleet_a_sync_optimizer_geo', + 'test_multiprocess_dataloader_iterable_dataset_static', + 'test_dist_fleet_grad_clip', + 'test_fleet_pipeline_meta_optimizer_with_recompute', + 'test_dist_sparse_load_ps0', 'test_collective_barrier_api', + 'test_fleet_pipeline_meta_optimizer', 'test_parallel_dygraph_mnist', + 'test_dist_sparse_tensor_load_adagrad', 'test_new_group', + 'test_imperative_signal_handler', 'test_parallel_dygraph_sharding_parallel', + 'test_dist_hapi_model', 'test_dist_mnist_gradient_merge' +] + +# mem=0 but always timeout or failed : It run 15 job each time in Single cases; +SECONDARY_HIGH_PARALLEL_JOB_NEW = [ + 'test_dataset_conll05', 'test_conv3d_mkldnn_op', 'test_matrix_nms_op', + 'test_data', 'test_analyzer_paddletensor_tensor', + 'test_linear_chain_crf_op', 'test_analyzer_multi_model_prediction', + 'test_default_dtype', 'device_context_test', 'test_analyzer_googlenet', + 'jit_kernel_test', 'profiler_test', 'preprocess_local_pascalvoc', + 'test_conv2d_transpose_layer', 'test_analyzer_int8_googlenet', + 
'test_analyzer_seq_pool1_compare_determine', 'save_quant2_model_ernie', + 'test_parallel_executor_seresnext_with_fuse_all_reduce_cpu', + 'test_dataset_uci_housing', 'test_parallel_executor_seresnext_base_cpu', + 'test_dataset_download', 'test_quant_int8_mobilenetv1_mkldnn', + 'test_crf_decoding_op', 'test_conv3d_transpose_layer', + 'test_quant2_int8_mobilenetv1_mkldnn', 'test_softmax_bf16_mkldnn_op', + 'test_quant2_int8_resnet50_range_mkldnn', 'test_pool2d_mkldnn_op', + 'test_flags_mkldnn_ops_on_off', 'test_c_comm_init_op', + 'test_uniform_random_bf16_op', 'test_custom_concat', + 'test_weight_quantization_mobilenetv1', 'test_retinanet_detection_output', + 'test_concat_mkldnn_op', 'test_gaussian_random_mkldnn_op', + 'test_parallel_executor_seresnext_with_reduce_cpu', 'test_dataset_imikolov', + 'test_analyzer_rnn1', 'test_conv2d_mkldnn_op', 'test_conv3d_layer', + 'test_error_clip', 'selected_rows_test', 'test_static_save_load_large', + 'test_bipartite_match_op', 'test_conv2d_layer', + 'test_analyzer_seq_pool1_fuse_statis', 'test_split_plugin', + 'test_analyzer_small_dam', 'test_analyzer_capi_exp_gpu', + 'test_quant2_int8_resnet50_channelwise_mkldnn', 'test_analyzer_bert', + 'test_directory_migration', 'test_elementwise_add_mkldnn_op', + 'test_quant_int8_googlenet_mkldnn', 'test_callback_early_stop', + 'test_quant2_int8_resnet50_mkldnn' +] + +# mem=0 but always timeout or failed : It run 12 job each time in Single cases; +THIRD_HIGH_PARALLEL_JOB_NEW = [ + 'test_api_impl', 'test_analyzer_seq_pool1_fuse_compare_zero_copy', + 'test_analyzer_seq_pool1_profile', 'test_analyzer_mobilenet_transpose', + 'test_analyzer_resnet50', 'test_analyzer_int8_resnet50', + 'test_analyzer_int8_mobilenetv2', 'test_analyzer_bfloat16_resnet50', + 'test_analyzer_bfloat16_mobilenetv1', 'test_analyzer_int8_mobilenet_ssd', + 'test_dataset_cifar', 'test_dataset_imdb', 'test_dataset_movielens', + 'test_datasets', 'test_allgather', 'test_c_concat', 'test_c_split', + 'test_collective_reduce', 'test_collective_sendrecv', + 'test_collective_wait', 'test_cyclic_cifar_dataset', 'test_dyn_rnn', + 'test_gru_op', 'test_multiclass_nms_op', 'test_communicator_geo', + 'test_quant_int8_mobilenetv2_mkldnn', + 'test_post_training_quantization_mnist', 'test_analyzer_seq_pool1', + 'test_analyzer_transformer', 'test_analyzer_transformer_profile', + 'test_analyzer_int8_mobilenetv1', 'test_analyzer_bfloat16_googlenet', + 'test_analyzer_quant_performance_benchmark', 'test_dataset_wmt', + 'test_allreduce', 'test_broadcast', 'test_c_identity', + 'test_collective_scatter', 'test_collective_sendrecv_api', + 'test_fleet_utils', 'test_fused_elemwise_activation_op', + 'test_group_norm_op', 'test_reducescatter_api', 'test_fleet_launch_nproc', + 'test_quant_int8_resnet50_mkldnn', 'test_quant2_int8_ernie_mkldnn', + 'convert_model2dot_ernie' +] + +# mem != 0: It run 7 job each time in Single cases; 4 job each time in Multi cases; 3 job each time in exclusive cases +TETRAD_PARALLEL_JOB_NEW = [ + 'test_meshgrid_op', 'test_gather_op', 'test_word2vec', 'test_analyzer_ner', + 'test_fetch_lod_tensor_array', 'test_adagrad_op_v2', + 'test_conv2d_fusion_op', 'test_hapi_amp', 'test_metrics', + 'test_clip_by_norm_op', 'test_lr_scheduler', 'test_generate_proposals_op', + 'test_masked_select_op', 'test_trt_anchor_generator_op', + 'test_imperative_ocr_attention_model', 'test_sentiment', 'test_chunk_op', + 'test_memcpy_op', 'test_warpctc_op', 'test_row_conv_op', + 'test_grid_sample_function', 'test_rnn_nets', 'test_pad3d_op', + 'test_imperative_mnist_sorted_gradient', 
'tensor_test', + 'test_elementwise_nn_grad', 'test_tensorrt_engine_op', 'test_dot_op', + 'test_real_imag_op', 'test_adam_optimizer_fp32_fp64', 'test_reduce_op', + 'test_density_prior_box_op', 'test_top_k_op', 'test_grid_generator', + 'test_randn_op', 'test_activation_mkldnn_op', 'test_lac', 'test_pad_op', + 'test_lstmp_op', 'test_loop', 'test_pylayer_op', + 'data_device_transform_test', 'test_trt_roi_align_op', + 'test_nn_functional_hot_op', 'test_top_k_v2_op', 'test_crop_op', + 'test_conv_bn_fuse_pass', 'test_beam_search_decode_op', 'test_auc_op', + 'test_pool2d_op', 'test_gaussian_random_op', 'test_maximum_op', + 'test_rnn_cell_api', 'device_code_test', 'test_ir_inplace_pass', + 'test_cos_sim_op', 'test_lite_tensor_utils', 'test_fit_a_line', + 'test_mish_op', 'test_transpose_op', 'test_mean_iou', + 'test_conv3d_transpose_op', 'test_jit_save_load', 'test_unsqueeze2_op', + 'test_eager_deletion_while_op', 'test_zeros_like_op', 'test_c_embedding_op', + 'test_regularizer', 'zero_copy_tensor_test', 'test_tensor_shape', + 'test_resnet', 'test_dygraph_weight_norm', 'test_tracer', 'test_list', + 'test_sequence_concat', 'test_adaptive_avg_pool1d', + 'test_elementwise_div_op', 'test_conv1d_transpose_layer', 'test_adamw_op', + 'trt_fc_prelu_test', 'test_temporal_shift_op', + 'test_naive_best_fit_gpu_memory_limit', 'dlpack_tensor_test', + 'test_elementwise_max_op', 'test_typing', 'test_asp_pruning_2d_greedy', + 'test_fake_dequantize_op', 'test_crop_tensor_op', + 'test_imperative_load_static_param', 'test_imperative_qat_user_defined', + 'test_anchor_generator_op', 'test_if_else_op', 'test_prepare_op', + 'test_conj_op', 'test_imperative_hook_for_layer', 'test_roi_pool_op', + 'test_strided_slice_op', 'test_norm_all', 'test_weight_decay', + 'test_functional_conv2d', 'test_functional_conv3d_transpose', + 'test_imperative_layer_trainable', 'test_imperative_data_parallel', + 'test_digamma_op', 'test_distribution', 'test_box_clip_op', + 'custom_tensor_test', 'test_marker_op', 'test_dataloader_early_reset', + 'test_gather_nd_op', 'test_tensor_register_hook', 'test_retain_graph', + 'test_network_with_dtype', 'test_basic_api_transformation', 'test_diag', + 'test_lod_array_length_op', 'test_reinforcement_learning', + 'test_softmax_op', 'test_fc_fuse_pass', 'test_adaptive_max_pool2d', + 'test_inverse_op', 'test_declarative', 'test_imperative_double_grad', + 'test_tensor_methods', 'test_pool1d_api', 'system_allocator_test', + 'test_print', 'test_tensor_type_promotion', 'test_bce_with_logits_loss', + 'test_tensor', 'test_cross_op', 'concat_test', 'test_ast_util', + 'test_proximal_adagrad_op', 'test_pairwise_distance', + 'test_imperative_mnist', 'test_beam_search_decoder', + 'test_build_strategy_fusion_group_pass', 'test_dygraph_spectral_norm', + 'test_scale_mkldnn_op', 'test_load_state_dict_from_old_format', + 'test_margin_rank_loss_op', 'test_lookup_table_v2_op', + 'test_mix_precision_all_reduce_fuse', 'test_spp_op', 'test_op_converter', + 'mixed_vector_test', 'test_roi_align_op', 'test_pad_constant_like', + 'test_mul_op', 'test_spectral_norm_op', 'test_transformer', + 'test_for_enumerate', 'test_variable_trans_func', + 'test_squared_l2_distance_op', 'test_quantize_transpiler_v2', + 'test_im2sequence_op', 'test_reader_reset', 'test_one_hot_op', + 'test_adaptive_max_pool1d', 'test_label_smooth_op', + 'test_parallel_executor_fetch_feed', 'test_cast', + 'test_parallel_dygraph_sync_batch_norm', 'test_collect_fpn_proposals_op', + 'test_expand_as_v2_op', 'test_device', 'test_code_generator', + 
'test_asp_pruning_2d_best', 'test_fleet_with_asp', 'test_pool2d_api', + 'test_mean_op', 'test_is_tensor', 'test_run_program_op', + 'test_cuda_random_seed', 'test_linear_interp_op', + 'test_fuse_all_reduce_pass', 'tensor_util_test', 'test_median', + 'test_linear', 'test_imperative_qat_amp', + 'test_truncated_gaussian_random_op', 'test_lstm_cudnn_op', + 'copy_same_tensor_test', 'test_squeeze2_op', + 'naive_best_fit_allocator_test', 'test_model', 'test_py_reader_combination', + 'test_prior_box_op', 'test_matmul_v2_mkldnn_op', 'test_sum_op', + 'test_paddle_imperative_double_grad', 'test_norm_op', 'test_pool3d_api', + 'test_imperative_gan', 'test_sequence_softmax_op', 'test_rand_op', + 'test_expand_v2_op', 'test_word2vec_book', 'test_histogram_op', + 'test_min_op', 'test_mse_loss', 'test_sign_op', + 'selected_rows_functor_gpu_test', 'test_fleet_base', 'test_logsumexp', + 'test_detection', 'test_image_classification_fp16', 'test_random_seed', + 'test_op_function_generator', 'test_unique_with_counts', + 'test_complex_elementwise_layers', 'test_array_read_write_op', + 'test_fusion_group_op', 'test_imperative_layer_apply', + 'test_executor_return_tensor_not_overwriting', + 'test_optimizer_in_control_flow', 'test_lookup_table_op', 'test_randint_op', + 'test_convert_call', 'test_sigmoid_cross_entropy_with_logits_op', + 'copy_cross_scope_test', 'test_normalization_wrapper', + 'test_pretrained_model', 'test_flip', 'test_cosine_similarity_api', + 'test_cumsum_op', 'test_range', 'test_log_loss_op', 'test_where_index', + 'test_tril_triu_op', 'test_lod_reset_op', 'test_lod_tensor', 'test_addmm_op', + 'test_index_select_op', 'test_nvprof', 'test_index_sample_op', + 'test_unstack_op', 'test_increment', 'strided_memcpy_test', + 'test_target_assign_op', 'test_trt_dynamic_shape_transformer_prune', + 'test_box_decoder_and_assign_op', 'test_trt_dynamic_shape', 'test_mnist', + 'test_convert_operators', 'test_fill_any_like_op', 'test_fill_constant_op', + 'test_callback_reduce_lr_on_plateau', 'test_tile_op', 'test_logical', + 'test_deformable_conv_op', 'test_elementwise_add_grad_grad', + 'test_simple_rnn_op', 'test_bicubic_interp_op', 'test_batch_norm_op_v2', + 'test_trt_slice_plugin', 'test_custom_relu_op_jit', + 'test_math_op_patch_var_base', 'test_se_resnet', 'test_device_guard', + 'test_elementwise_div_grad_grad', 'test_minus_op', 'test_shard_index_op', + 'test_dygraph_recompute', 'test_momentum_op', 'test_trt_nearest_interp_op', + 'test_modelaverage', 'test_compare_reduce_op', 'test_affine_grid_op', + 'test_allclose_layer', 'test_elementwise_pow_op', 'test_trt_subgraph_pass', + 'test_adaptive_avg_pool2d', 'test_functional_conv3d', + 'test_executor_and_mul', 'test_kron_op', 'test_cast_mkldnn_op', + 'test_imperative_auto_prune', 'allocator_facade_frac_flags_test', + 'test_fill_zeros_like_op', 'test_gather_tree_op', 'test_elementwise_mul_op', + 'test_cycle_gan', 'test_parallel_executor_transformer_auto_growth', + 'test_bitwise_op', 'test_uniform_random_op', 'trt_split_converter_test', + 'test_huber_loss_op', 'test_slice', 'test_label_smooth_functional', + 'test_conv_shift_op', 'test_imperative_optimizer_v2', 'test_len', + 'test_imperative_named_members', 'test_sequence_reshape', + 'test_elementwise_min_op', 'test_flatten2_op', 'test_param_guard', + 'test_imperative_ptb_rnn', 'test_batch_fc_op', 'test_Tensor_type', + 'test_complex_getitem', 'lod_tensor_gpu_test', 'im2col_test', + 'test_unbind_op', 'test_imperative_ptq', 'test_auc_single_pred_op', + 'test_imperative_reinforcement', 'test_tf32_cublas', 'test_return', 
+ 'test_py_reader_push_pop', 'test_lstm', 'test_dygraph_mnist_fp16', + 'test_shuffle_channel_op', 'test_partial_concat_op', + 'test_fill_zeros_like2_op', 'test_deformable_conv_v1_op', + 'test_complex_grad_accumulated', 'test_sequence_mask', 'test_fill_op', + 'test_imperative_deepcf', 'test_reorder_lod_tensor', + 'test_cross_entropy_loss', 'test_multiply', 'test_partial_program', + 'test_fetch_feed', 'test_group', 'test_trt_reduce_sum_op', + 'data_type_transform_test', 'test_gru_rnn_op', 'test_cudnn_grucell', + 'test_argsort_op', 'test_batch_norm_op', 'test_inplace', + 'test_deprecated_decorator', 'test_complex_cast', 'test_diag_v2', + 'test_iou_similarity_op', 'test_inplace_auto_generated_apis', 'test_dataset', + 'test_bilinear_api', 'test_empty_like_op', 'test_imperative_layer_children', + 'nccl_op_test', 'test_tree_conv_op', 'test_share_data_op', + 'test_ir_memory_optimize_transformer', 'test_lod_append_op', + 'test_math_op_patch', 'test_base_layer', 'test_dequantize_log_op', + 'test_complex_matmul', 'test_prelu_op', 'test_l1_norm_op', + 'test_rmsprop_op', 'test_fuse_bn_act_pass', 'test_inplace_addto_strategy', + 'test_ptb_lm_v2', 'test_paddle_save_load', 'test_prelu_mkldnn_op', + 'test_box_coder_op', 'test_atan2_op', 'test_unsqueeze_op', 'test_profiler', + 'test_affine_channel_op', 'test_leaky_relu_grad_grad_functor', + 'test_ctc_align', 'test_fuse_relu_depthwise_conv_pass', 'test_complex_kron', + 'test_imperative_skip_op', 'test_dgc_op', 'test_regularizer_api', + 'test_nll_loss', 'test_imperative_layers', 'test_rnn_decode_api', + 'test_imperative_partitial_backward', 'test_where_op', 'test_std_layer', + 'test_ir_embedding_eltwise_layernorm_fuse_pass', 'test_multihead_attention', + 'test_ir_memory_optimize_ifelse_op', 'test_grid_sampler_op', + 'test_initializer_nn', 'test_var_base', 'test_fuse_elewise_add_act_pass', + 'test_select_input_output_op', 'test_lstm_op', 'test_break_continue', + 'test_imperative_parallel_coalesce_split', 'test_expand_as_op', + 'test_user_defined_quantization', 'test_tensor_to_list', + 'test_limit_gpu_memory', 'test_adamax_api', + 'test_softmax_mask_fuse_upper_triangle_op', 'test_fake_quantize_op', + 'vol2col_test', 'test_cast_op', 'test_proximal_gd_op', 'test_mul_nn_grad', + 'test_full_like_op', 'test_imperative_static_runner_while', + 'trt_instance_norm_test', 'test_elementwise_mod_op', + 'test_grad_clip_minimize', 'test_one_hot_v2_op', 'test_complex_sum_layer', + 'test_isfinite_v2_op', 'test_is_empty_op', 'test_simnet_v2', + 'beam_search_test', 'test_randperm_op', 'test_elementwise_add_op_inplace', + 'test_imperative_selected_rows', 'test_py_reader_using_executor', + 'test_activation_op', 'test_nn_functional_embedding_dygraph', + 'test_reshape_op', 'test_maxout_op', 'test_cudnn_lstmcell', + 'test_sigmoid_focal_loss', 'test_manual_seed', 'test_lrn_op', + 'test_ir_memory_optimize_nlp', 'test_dataset_dataloader', + 'test_complex_variable', 'test_lite_engine', 'test_neg_op', + 'test_view_op_reuse_allocation', 'test_split_op', 'test_ptb_lm', + 'test_elementwise_sub_op', 'test_compare_op', 'test_simnet', + 'test_label_semantic_roles', 'test_normal', + 'test_tensor_scalar_type_promotion_static', 'test_trt_group_norm_op', + 'test_learning_rate_scheduler', 'test_numel_op', 'test_adaptive_max_pool3d', + 'test_sequential', 'test_imperative_optimizer', 'test_subtract_op', + 'test_conv_transpose_nn_grad', 'test_sigmoid_focal_loss_op', + 'test_cuda_stream_event', 'test_sequence_pad_op', 'test_rnn_cells', + 'test_partial_sum_op', 'test_rnn_nets_static', 'test_max_op', + 
'test_logical_op', 'test_squared_l2_norm_op', 'test_center_loss', + 'test_quantization_pass', 'test_imperative_gnn', + 'test_conv_elementwise_add_act_fuse_pass', 'test_roll_op', + 'test_imperative_container_layerdict', 'test_shape_op', 'test_bmm_op', + 'test_matmul_v2_op', 'test_hinge_loss_op', 'test_imperative_qat', + 'test_add_position_encoding_op', 'test_rnn_op', 'test_gradient_clip', + 'test_py_reader_pin_memory', 'test_concat_op', 'test_weight_decay_extend', + 'test_accuracy_op', 'test_cond', 'test_resnet_v2', 'test_adagrad_op', + 'test_mv_op', 'test_print_op', 'test_grad', 'test_square_error_cost', + 'test_rnn_cells_static', 'test_mkldnn_batch_norm_act_fuse_pass', + 'test_input_spec', 'test_adam_op', 'test_elementwise_floordiv_op', + 'test_eager_deletion_gru_net', 'test_diagonal_op', + 'test_imperative_static_runner_mnist', 'test_nearest_interp_op', + 'test_conv2d_transpose_op', 'test_diag_embed', 'test_imperative_basic', + 'test_merge_selectedrows_op', 'test_feed_data_check_shape_type', + 'test_complex_trace_layer', 'test_slice_op', 'test_bmn', + 'test_nn_quant_functional_layers', 'test_broadcast_tensors_op', + 'test_selu_op', 'test_group_norm_op_v2', 'test_tensor_to_numpy', + 'test_queue', 'test_rank_loss_op', 'test_trace_op', 'test_case', + 'test_prroi_pool_op', 'test_op_name_conflict', 'test_psroi_pool_op', + 'test_set_value_op', 'test_ones_like', 'test_assign_value_op', 'test_ema', + 'test_lamb_op', 'test_dgc_momentum_op', 'test_custom_grad_input', + 'test_trunc_op', 'test_bernoulli_op', 'test_custom_relu_model', + 'test_backward', 'test_conv3d_transpose_part2_op', 'test_complex_transpose', + 'test_memory_reuse_exclude_feed_var', 'test_polygon_box_transform', + 'math_function_gpu_test', 'test_program_prune_backward', + 'test_fleet_amp_init', 'test_normalize', 'test_correlation', + 'test_conv_elementwise_add2_act_fuse_pass', + 'test_imperative_container_layerlist', 'test_dequantize_abs_max_op', + 'test_fuse_optimizer_pass', 'test_optimizer', + 'test_dynamic_rnn_stop_gradient', 'test_raw_program_optimizer', 'test_pow', + 'test_inplace_softmax_with_cross_entropy', 'test_transforms', + 'test_unfold_op', 'test_assign_op', 'test_isinstance', + 'test_conv_affine_channel_fuse_pass', + 'auto_growth_best_fit_allocator_facade_test', 'test_cholesky_op', + 'test_adaptive_avg_pool3d', 'test_paddle_save_load_binary', + 'test_fused_fc_elementwise_layernorm_op', 'test_sequence_enumerate_op', + 'test_lgamma_op', 'test_modified_huber_loss_op', 'trt_quant_int8_test', + 'test_callback_visualdl', 'test_linspace', 'test_update_loss_scaling_op', + 'test_arg_min_max_op', 'test_empty_op', 'test_bce_loss', + 'test_nn_margin_rank_loss', 'test_arg_min_max_v2_op', 'test_variance_layer', + 'test_quantization_scale_pass', 'test_segment_ops', 'test_layers', + 'test_isfinite_op', 'test_imperative_qat_channelwise', 'test_eye_op', + 'test_imperative_framework', 'test_l1_loss', 'test_ifelse', + 'test_cache_program', 'test_ir_fc_fuse_pass', 'test_kldiv_loss_op', + 'test_switch_case', 'test_unique', 'test_prod_op', 'test_edit_distance_op', + 'test_sequence_expand_as', 'test_full_name_usage', 'test_glu', + 'test_pad2d_op', 'test_read_file', 'test_erf_op', 'test_sequence_unpad_op', + 'test_sequence_conv', 'allocator_facade_abs_flags_test', 'test_detach', + 'test_cross_entropy_op', 'test_wrappers', 'test_fleet_base_single', + 'test_conv_elementwise_add_fuse_pass', 'test_auto_growth_gpu_memory_limit', + 'test_sequence_reverse', 'test_fc_op', 'test_diagflat', 'test_adamax_op', + 'test_op_attr', 'paddle_infer_api_test', 
'test_mixed_precision', + 'lite_mul_model_test', 'test_sort_op', 'test_scatter_op', + 'test_imperative_out_scale', 'test_vision_models', + 'test_rnn_encoder_decoder', 'test_fleet_with_asp_amp', + 'test_partial_eager_deletion_transformer', + 'test_imperative_star_gan_with_gradient_penalty', 'test_stack_op', + 'test_shuffle_batch_op', 'test_clip_op', 'test_py_func_op', + 'test_pool_max_op', 'test_log_softmax', + 'test_imperative_container_parameterlist', 'test_multiplex_op', + 'test_trt_transpose_flatten_concat_fuse_pass', + 'test_seqconv_eltadd_relu_fuse_pass', 'test_assert_op', 'test_scatter_nd_op', + 'test_sequence_expand', 'test_arange', 'test_translated_layer', + 'test_decoupled_py_reader_data_check', 'test_analyzer_ernie_large', + 'test_tensor_array_to_tensor', 'test_functional_conv2d_transpose', + 'test_error', 'test_callbacks', 'test_imperative_recurrent_usage', + 'test_deform_conv2d', 'test_coalesce_tensor_op', 'test_tsm', + 'test_fused_multihead_matmul_op', 'test_softmax_mask_fuse_op', + 'test_optimizer_grad', 'test_complex_abs', 'test_gradient_accmulator', + 'test_instance_norm_op_v2', 'test_random_crop_op', 'test_mobile_net', + 'test_parallel_executor_transformer', + 'test_tensor_scalar_type_promotion_dynamic', + 'test_eager_deletion_delete_vars', 'test_asp_pruning_1d', + 'test_imperative_auto_mixed_precision', 'test_imperative_using_non_zero_gpu', + 'test_machine_translation', 'test_flatten_op', 'test_onnx_export', + 'test_optimizer_for_varbase', 'test_fusion_transpose_flatten_concat_op', + 'best_fit_allocator_test', 'test_ir_fusion_group_pass', + 'test_trt_quant_conv2d_dequant_fuse_pass', 'test_allclose_op', + 'test_ftrl_op', 'test_elementwise_add_op', 'test_instance_norm_op', + 'test_lambv2_op', 'test_yolo_box_op', 'test_parallel_executor_drop_scope', + 'test_generator_dataloader', 'test_conv2d_transpose_op_depthwise_conv', + 'test_imperative_save_load_v2', 'test_lookahead', + 'test_moving_average_abs_max_scale_op', 'test_roi_perspective_transform_op', + 'test_tensorrt_engine', 'test_affine_grid_function', 'test_nonzero_api', + 'test_ir_memory_optimize_pass', 'test_reduce_mkldnn_op', + 'test_bilinear_interp_op', 'test_cvm_op', 'test_scale_op', 'test_matmul_op', + 'test_sequence_pool', 'test_complex_simplenet', 'test_complex_reshape', + 'test_flatten_contiguous_range_op', 'test_python_operator_overriding', + 'lite_resnet50_test', 'test_sequence_erase_op', + 'test_deformable_psroi_pooling', 'test_multi_precision_fp16_train', + 'test_adam_op_multi_thread', 'test_decoupled_py_reader', + 'test_distribute_fpn_proposals_op', 'transform_test', 'test_nan_inf', + 'test_fuse_bn_add_act_pass', 'test_unpool_op', + 'test_parallel_executor_dry_run', 'test_layer_norm_op_v2', + 'test_embedding_id_stop_gradient', 'test_mkldnn_fc_act_fuse_pass', + 'sequence_pooling_test', 'test_get_tensor_from_selected_rows_op', + 'test_imperative_ptb_rnn_sorted_gradient', 'test_hapi_hub', + 'test_reverse_op', 'test_compiled_program', 'test_lambda', 'test_adadelta_op', + 'test_nn_sigmoid_op', 'test_nearest_interp_v2_op', 'test_sequence_slice_op', + 'test_program_translator', 'test_eager_deletion_lstm_net', 'malloc_test', + 'test_size_op', 'test_analysis_predictor', 'test_recognize_digits', + 'test_parameter', 'test_transpose_flatten_concat_fuse_pass', + 'test_imperative_trace_non_persistable_inputs', 'test_pass_builder', + 'thread_local_allocator_test', 'test_variable', 'test_fsp_op', + 'test_elementwise_gradient_op', 'test_multinomial_op', + 'test_trt_shuffle_channel_detect_pass', 
'test_generate_proposals_v2_op', + 'test_graph', 'test_gelu_op', 'test_sample_logits_op', + 'test_weight_normalization', 'test_activation_bf16_mkldnn_op', + 'trt_dynamic_shape_test', 'test_traced_layer_err_msg', 'test_conv1d_layer', + 'test_asp_optimize', 'test_imperative_container_sequential', 'test_bert', + 'test_transformer_api', 'test_linear_interp_v2_op', 'test_pixel_shuffle', + 'test_expand_op', 'test_save_load', 'test_dygraph_multi_forward', + 'test_dropout_op', 'test_while_loop_op', 'float16_gpu_test', 'test_dict', + 'test_bilinear_tensor_product_op', 'test_parallel_executor_pg', 'test_assert', + 'test_smooth_l1_loss_op', 'sequence_padding_test', 'test_analyzer_ernie', + 'test_minimum_op', 'test_yolov3_loss_op', 'test_decayed_adagrad_op', + 'test_split_mkldnn_op', 'test_squeeze_op', 'test_save_inference_model', + 'test_smooth_l1_loss', 'test_bilateral_slice_op', 'test_inplace_abn_op', + 'test_fetch_unmerged', 'test_parallel_executor_feed_persistable_var', + 'test_parallel_executor_fetch_isolated_var', + 'test_parallel_executor_inference_feed_partial_data', + 'test_parallel_executor_seresnext_base_gpu', + 'test_parallel_executor_test_while_train', + 'test_parallel_executor_seresnext_with_fuse_all_reduce_gpu', + 'test_parallel_ssa_graph_inference_feed_partial_data', + 'test_parallel_executor_seresnext_with_reduce_gpu', 'test_data_norm_op', + 'test_install_check', 'graph_node_test', 'trt_quant_int8_yolov3_r50_test', + 'test_trt_dynamic_shape_ernie', 'trt_mobilenet_test', + 'trt_cascade_rcnn_test', 'trt_resnext_test', 'test_activation_nn_grad', + 'test_trt_dynamic_shape_ernie_fp16_ser_deser', 'test_bilinear_interp_v2_op', + 'test_cross_entropy2_op', 'test_conv3d_op', 'test_layer_norm_op', + 'test_pool3d_op', 'test_static_save_load', 'test_trilinear_interp_v2_op', + 'test_trilinear_interp_op', 'test_trt_gather_nd_op', 'test_trt_gather_op', + 'test_trt_flatten_op', 'test_trt_instance_norm_op', 'test_trt_yolo_box_op', + 'test_trt_reshape_op', 'test_trt_reduce_mean_op', 'test_trt_pool_op', + 'test_trt_dynamic_shape_ernie_ser_deser', 'test_trt_elementwise_op', + 'test_trt_affine_channel_op', 'test_trt_conv_pass', + 'test_softmax_with_cross_entropy_op', 'test_trt_matmul', + 'test_trt_fc_fuse_pass', 'test_trt_pad_op', 'test_trt_scale_op', + 'test_trt_activation_pass', 'trt_resnet50_test', + 'test_imperative_lod_tensor_to_selected_rows', 'test_gru_unit_op', + 'test_amp_check_finite_and_scale_op', + 'test_imperative_selected_rows_to_lod_tensor', 'test_imperative_save_load', + 'test_add_reader_dependency', 'test_imperative_transformer_sorted_gradient', + 'test_bicubic_interp_v2_op', 'test_rank_attention_op', 'test_seq2seq', + 'test_space_to_depth_op', 'test_image_classification', + 'test_custom_relu_op_setup', 'test_sgd_op' +] + +# mem != 0 : It run 7 job each time in Single cases; 3 job each time in exclusive cases +TWO_PARALLEL_JOB_NEW = [ + 'test_buffer_shared_memory_reuse_pass', + 'test_buffer_shared_memory_reuse_pass_and_fuse_optimization_op_pass', + 'test_parallel_executor_crf', 'test_multiprocess_reader_exception', + 'buddy_allocator_test', 'test_multiprocess_dataloader_dataset', + 'test_multiprocess_dataloader_dynamic', + 'test_multiprocess_dataloader_static', 'test_imperative_resnet', + 'test_nn_grad', 'test_conv2d_op_depthwise_conv', 'test_yolov3', + 'test_conv_nn_grad', 'test_imperative_data_loader_fds_clear', + 'test_conv2d_op', 'test_imperative_data_loader_base', + 'test_imperative_resnet_sorted_gradient', + 'test_multiprocess_dataloader_iterable_dataset_dynamic', + 
'test_imperative_se_resnext', 'test_norm_nn_grad', 'test_conv2d_api' +] # *=======These unittest doesn't occupy GPU memory, just run as CPU unittest=======* # # It run 16 job each time, If it failed due to Insufficient GPU memory or CUBLAS_STATUS_ALLOC_FAILED, @@ -1031,6 +1685,9 @@ def main(): cpu_parallel_job = '^job$' + secondary_cpu_parallel_job = '^job$' + third_cpu_parallel_job = '^job$' + tetrad_parallel_job = '^job$' two_parallel_job = '^job$' non_parallel_job = '^job$' @@ -1038,17 +1695,36 @@ def main(): test_cases = sys.argv[1] test_cases = test_cases.split("\n") - for unittest in CPU_PARALLEL_JOB: + if platform.system() == 'Windows': + cpu_parallel_job_list = CPU_PARALLEL_JOB + tetrad_parallel_job_list = TETRAD_PARALLEL_JOB + two_parallel_job_list = TWO_PARALLEL_JOB + else: + cpu_parallel_job_list = HIGH_PARALLEL_JOB_NEW + tetrad_parallel_job_list = TETRAD_PARALLEL_JOB_NEW + two_parallel_job_list = TWO_PARALLEL_JOB_NEW + + for unittest in cpu_parallel_job_list: if unittest in test_cases: cpu_parallel_job = cpu_parallel_job + '|^' + unittest + '$' test_cases.remove(unittest) - for unittest in TETRAD_PARALLEL_JOB: + if platform.system() != 'Windows': + for unittest in SECONDARY_HIGH_PARALLEL_JOB_NEW: + if unittest in test_cases: + secondary_cpu_parallel_job = secondary_cpu_parallel_job + '|^' + unittest + '$' + test_cases.remove(unittest) + for unittest in THIRD_HIGH_PARALLEL_JOB_NEW: + if unittest in test_cases: + third_cpu_parallel_job = third_cpu_parallel_job + '|^' + unittest + '$' + test_cases.remove(unittest) + + for unittest in tetrad_parallel_job_list: if unittest in test_cases: tetrad_parallel_job = tetrad_parallel_job + '|^' + unittest + '$' test_cases.remove(unittest) - for unittest in TWO_PARALLEL_JOB: + for unittest in two_parallel_job_list: if unittest in test_cases: two_parallel_job = two_parallel_job + '|^' + unittest + '$' test_cases.remove(unittest) @@ -1056,8 +1732,14 @@ def main(): for unittest in test_cases: non_parallel_job = non_parallel_job + '|^' + unittest + '$' - print("{};{};{};{}".format(cpu_parallel_job, tetrad_parallel_job, - two_parallel_job, non_parallel_job)) + if platform.system() == 'Windows': + print("{};{};{};{}".format(cpu_parallel_job, tetrad_parallel_job, + two_parallel_job, non_parallel_job)) + else: + print("{};{};{};{};{};{}".format( + cpu_parallel_job, secondary_cpu_parallel_job, + third_cpu_parallel_job, tetrad_parallel_job, two_parallel_job, + non_parallel_job)) if __name__ == '__main__': From 181f7cec6e72bbd0cf39c4de61d5080a1a10edef Mon Sep 17 00:00:00 2001 From: niuliling123 <51102941+niuliling123@users.noreply.github.com> Date: Tue, 17 Aug 2021 09:31:20 +0800 Subject: [PATCH 065/126] fix a bug in nlp: text_matching/sentence_transformers when last dim is 1 and reduce mid dim (#34941) --- paddle/fluid/operators/reduce_ops/reduce_op.cu.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h index fe77d3158ed27c..30b1cf5ac711d2 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h @@ -770,7 +770,7 @@ void TensorReduceFunctorImpl(const framework::Tensor& x, framework::Tensor* y, auto x_dim = framework::vectorize(x.dims()); auto config = ReduceConfig(origin_reduce_dims, x_dim); config.Run(); // get the parameters of LaunchReduceKernel - + int numel = x.numel(); // after config.run() // SetOutputData for ReduceHigherDim when should_reduce_again is true, // 
temp_output should be stored temp_data in output_data space or stored in @@ -787,7 +787,7 @@ void TensorReduceFunctorImpl(const framework::Tensor& x, framework::Tensor* y, } config.SetOutputData(y_data, x.place(), &tmp); - bool use_cub_reduce = (config.left_num == 1) && + bool use_cub_reduce = (config.reduce_num == numel) && (!std::is_same::value); if (use_cub_reduce) { // launch CUB::Reduce From 16146088e3b815d372dab28f2358eb45bb938982 Mon Sep 17 00:00:00 2001 From: Kaipeng Deng Date: Tue, 17 Aug 2021 10:57:14 +0800 Subject: [PATCH 066/126] fix drop_last not work on IterableDataset (#34801) * fix drop_last not work in IterableDataset. test=develop --- .../fluid/dataloader/dataloader_iter.py | 7 +++--- python/paddle/fluid/dataloader/worker.py | 7 +++--- python/paddle/fluid/reader.py | 1 + .../test_multiprocess_dataloader_dataset.py | 25 +++++++++++++++++++ .../test_multiprocess_dataloader_exception.py | 4 +-- 5 files changed, 36 insertions(+), 8 deletions(-) diff --git a/python/paddle/fluid/dataloader/dataloader_iter.py b/python/paddle/fluid/dataloader/dataloader_iter.py index 069dff28ccf9b7..cc98d378f14894 100644 --- a/python/paddle/fluid/dataloader/dataloader_iter.py +++ b/python/paddle/fluid/dataloader/dataloader_iter.py @@ -59,6 +59,7 @@ def __init__(self, loader): self._places = loader.places self._return_list = loader.return_list self._batch_sampler = loader.batch_sampler + self._drop_last = loader.drop_last self._auto_collate_batch = loader.auto_collate_batch self._num_workers = loader.num_workers self._use_buffer_reader = loader.use_buffer_reader @@ -111,7 +112,7 @@ def __init__(self, loader): self._dataset_fetcher = _DatasetKind.create_fetcher( self._dataset_kind, self._dataset, self._auto_collate_batch, - self._collate_fn, True) + self._collate_fn, self._drop_last) # NOTE: _structrue_infos used to record the data structure of # batch to restore batch structure after reading Tensor @@ -309,8 +310,8 @@ def _init_workers(self): args=(self._dataset, self._dataset_kind, indices_queue, self._data_queue, self._workers_done_event, self._auto_collate_batch, self._collate_fn, - self._worker_init_fn, i, self._num_workers, - self._use_shared_memory)) + self._drop_last, self._worker_init_fn, i, + self._num_workers, self._use_shared_memory)) worker.daemon = True worker.start() self._workers.append(worker) diff --git a/python/paddle/fluid/dataloader/worker.py b/python/paddle/fluid/dataloader/worker.py index 66ca4150460d74..622f85cf65ab79 100644 --- a/python/paddle/fluid/dataloader/worker.py +++ b/python/paddle/fluid/dataloader/worker.py @@ -253,7 +253,7 @@ def mix(x, y): def _worker_loop(dataset, dataset_kind, indices_queue, out_queue, done_event, - auto_collate_batch, collate_fn, init_fn, worker_id, + auto_collate_batch, collate_fn, drop_last, init_fn, worker_id, num_workers, use_shared_memory): try: # NOTE: [ mmap files clear ] When the child process exits unexpectedly, @@ -282,8 +282,9 @@ def _worker_loop(dataset, dataset_kind, indices_queue, out_queue, done_event, try: if init_fn is not None: init_fn(worker_id) - fetcher = _DatasetKind.create_fetcher( - dataset_kind, dataset, auto_collate_batch, collate_fn, True) + fetcher = _DatasetKind.create_fetcher(dataset_kind, dataset, + auto_collate_batch, + collate_fn, drop_last) except: init_exception = _WorkerException(worker_id) diff --git a/python/paddle/fluid/reader.py b/python/paddle/fluid/reader.py index 7076ef22ba605c..dfc887292e7cff 100644 --- a/python/paddle/fluid/reader.py +++ b/python/paddle/fluid/reader.py @@ -401,6 +401,7 @@ def 
__init__(self, shuffle=shuffle, drop_last=drop_last) + self.drop_last = drop_last self.auto_collate_batch = self.batch_sampler is not None self.pin_memory = False diff --git a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_dataset.py b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_dataset.py index 30e70a77c369c1..8f1febcdeddf71 100755 --- a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_dataset.py +++ b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_dataset.py @@ -397,5 +397,30 @@ def test_main(self): assert out == outp +class TestDatasetWithDropLast(unittest.TestCase): + def run_main(self, dataset, num_samples, batch_size): + for num_workers in [0, 1]: + for drop_last in [True, False]: + steps = (num_samples + (1 - int(drop_last)) * \ + (batch_size - 1)) // batch_size + dataloader = DataLoader( + dataset, + batch_size=batch_size, + drop_last=drop_last, + num_workers=num_workers) + datas = [] + for data in dataloader: + datas.append(data) + assert len(datas) == steps + + def test_map_dataset(self): + dataset = RandomDataset(10) + self.run_main(dataset, 10, 3) + + def test_iterable_dataset(self): + dataset = RandomIterableDataset(10) + self.run_main(dataset, 10, 3) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_exception.py b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_exception.py index 1bda6edfecf1c7..52f4c2567730f5 100644 --- a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_exception.py +++ b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_exception.py @@ -180,7 +180,7 @@ def _collate_fn(sample_list): indices_queue.put(None) _worker_loop(loader._dataset, 0, indices_queue, loader._data_queue, loader._workers_done_event, - True, _collate_fn, _init_fn, 0, 1, + True, _collate_fn, True, _init_fn, 0, 1, loader._use_shared_memory) self.assertTrue(False) except AssertionError: @@ -224,7 +224,7 @@ def _collate_fn(sample_list): loader._workers_done_event.set() _worker_loop(loader._dataset, 0, indices_queue, loader._data_queue, loader._workers_done_event, - True, _collate_fn, _init_fn, 0, 1, + True, _collate_fn, True, _init_fn, 0, 1, loader._use_shared_memory) self.assertTrue(True) except AssertionError: From 5de576b0af93519236a2307855b1182c86c5d142 Mon Sep 17 00:00:00 2001 From: zhiboniu <31800336+zhiboniu@users.noreply.github.com> Date: Tue, 17 Aug 2021 11:22:55 +0800 Subject: [PATCH 067/126] add api fill_diagonal_inplace (#34460) --- paddle/fluid/operators/fill_diagonal_op.cc | 217 ++++++++++++++++++ paddle/fluid/operators/fill_diagonal_op.cu | 122 ++++++++++ paddle/fluid/operators/fill_diagonal_op.h | 25 ++ .../unittests/test_tensor_fill_diagonal_.py | 173 ++++++++++++++ python/paddle/tensor/manipulation.py | 49 ++++ 5 files changed, 586 insertions(+) create mode 100644 paddle/fluid/operators/fill_diagonal_op.cc create mode 100644 paddle/fluid/operators/fill_diagonal_op.cu create mode 100644 paddle/fluid/operators/fill_diagonal_op.h create mode 100644 python/paddle/fluid/tests/unittests/test_tensor_fill_diagonal_.py diff --git a/paddle/fluid/operators/fill_diagonal_op.cc b/paddle/fluid/operators/fill_diagonal_op.cc new file mode 100644 index 00000000000000..db55c3e99693ae --- /dev/null +++ b/paddle/fluid/operators/fill_diagonal_op.cc @@ -0,0 +1,217 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/fill_diagonal_op.h" + +namespace paddle { +namespace operators { + +int64_t CalStride(framework::DDim dim) { + int rank = dim.size(); + int64_t dimsum = 1; + int64_t strides = 0; + for (int i = rank - 1; i >= 0; i--) { + strides += dimsum; + dimsum *= dim[i]; + } + return strides; +} + +class FillIDiagonalOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddComment(R"DOC(Fill replace operator + Fill the diagonal of an tensor with 'value'. + )DOC"); + AddInput("X", "(Tensor) The input tensor."); + AddOutput("Out", + "Tensor, the output tensor, with the same shape and data type " + "as input(x)"); + AddAttr( + "value", + "The float values of tensor, whose dim is one, and no need of grad") + .SetDefault(0); + AddAttr("wrap", + "the diagonal 'wrapped' after N columns for tall matrices") + .SetDefault(false); + AddAttr("offset", + "offset of diagonal, zero means no offset, positive means " + "offset to up-right corner; negtive means offset to " + "bottom-left corner") + .SetDefault(0); + } +}; + +class FillIDiagonalOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *context) const override { + OP_INOUT_CHECK(context->HasInput("X"), "Input", "X", "FillIDiagonal"); + OP_INOUT_CHECK(context->HasOutput("Out"), "Output", "Out", "FillIDiagonal"); + auto x_dims = context->GetInputDim("X"); + context->SetOutputDim("Out", x_dims); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); + } +}; + +class FillIDiagonalOpVarTypeInference : public framework::VarTypeInference { + public: + void operator()(framework::InferVarTypeContext *ctx) const override { + auto var_type = ctx->GetInputType("X", 0); + auto data_type = ctx->GetInputDataType("X", 0); + ctx->SetOutputType("Out", var_type, framework::ALL_ELEMENTS); + ctx->SetOutputDataType("Out", data_type, framework::ALL_ELEMENTS); + } +}; + +template +class FillIDiagonalKernel : public framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext &ctx) const override { + auto fill_val = ctx.template Attr("value"); + auto *out = ctx.Output("Out"); + auto offset = ctx.Attr("offset"); + auto wrap = ctx.Attr("wrap"); + + auto *xin = ctx.Input("X"); + + T temp_var = static_cast(fill_val); + + T *out_data = out->mutable_data(ctx.GetPlace()); + framework::TensorCopy(*xin, ctx.GetPlace(), out); + + auto out_dims = out->dims(); + auto strides = CalStride(out_dims); + auto size = out->numel(); + + // The wrap mode supported only the dims equels to 2; In wrap mode, the + // value will be filled in cycles + if (!wrap) { + size = std::min(size, out_dims[1] * out_dims[1]); + } + + for (int64_t i = offset; i < size; i += strides) { + out_data[i] = temp_var; + } + } 
+}; + +class FillIDiagonalGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), "Input", + "Out@GRAD", "mul"); + auto x_dims = ctx->GetInputDim(framework::GradVarName("Out")); + auto x_grad_name = framework::GradVarName("X"); + if (ctx->HasOutput(x_grad_name)) { + ctx->SetOutputDim(x_grad_name, x_dims); + } + } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + // Note: don't get data type from ctx.Input("Input"); + auto dtype = + ctx.Input(framework::GradVarName("Out"))->type(); + return framework::OpKernelType(dtype, ctx.GetPlace()); + } +}; + +template +class FillIDiagonalGradOpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr retv) const override { + retv->SetType("fill_diagonal_grad"); + retv->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + retv->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); + retv->SetAttrMap(this->Attrs()); + } +}; + +template +class FillIDiagonalGradKernel : public framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext &ctx) const override { + auto *dx = ctx.Output(framework::GradVarName("X")); + auto *dout = ctx.Input(framework::GradVarName("Out")); + + auto offset = ctx.Attr("offset"); + auto wrap = ctx.Attr("wrap"); + + if (dx) { + auto *data = dx->mutable_data(ctx.GetPlace()); + framework::TensorCopy(*dout, ctx.GetPlace(), dx); + + auto dx_dims = dx->dims(); + auto strides = CalStride(dx_dims); + auto size = dx->numel(); + auto wrapsize = std::min(size, dx_dims[1] * dx_dims[1]); + + // The wrap mode supported only the dims equels to 2; In wrap mode, the + // value will be filled in cycles + if (wrap) { + wrapsize = size; + } + + for (int64_t i = offset; i < wrapsize; i += strides) { + data[i] = T(0); + } + } + } +}; + +DECLARE_INPLACE_OP_INFERER(FillIDiagonalOpInplaceInferer, {"X", "Out"}); +DECLARE_INPLACE_OP_INFERER(FillIDiagonalGradOpInplaceInferer, + {framework::GradVarName("Out"), + framework::GradVarName("X")}); + +} // namespace operators +} // namespace paddle +namespace ops = paddle::operators; + +REGISTER_OPERATOR(fill_diagonal, ops::FillIDiagonalOp, + ops::FillIDiagonalOpMaker, + ops::FillIDiagonalOpVarTypeInference, + ops::FillIDiagonalGradOpMaker, + ops::FillIDiagonalGradOpMaker, + ops::FillIDiagonalOpInplaceInferer); + +REGISTER_OPERATOR(fill_diagonal_grad, ops::FillIDiagonalGradOp, + ops::FillIDiagonalGradOpInplaceInferer); + +REGISTER_OP_CPU_KERNEL(fill_diagonal, ops::FillIDiagonalKernel, + ops::FillIDiagonalKernel, + ops::FillIDiagonalKernel, + ops::FillIDiagonalKernel, + ops::FillIDiagonalKernel, + ops::FillIDiagonalKernel); + +REGISTER_OP_CPU_KERNEL(fill_diagonal_grad, ops::FillIDiagonalGradKernel, + ops::FillIDiagonalGradKernel, + ops::FillIDiagonalGradKernel, + ops::FillIDiagonalGradKernel, + ops::FillIDiagonalGradKernel, + ops::FillIDiagonalGradKernel); diff --git a/paddle/fluid/operators/fill_diagonal_op.cu b/paddle/fluid/operators/fill_diagonal_op.cu new file mode 100644 index 00000000000000..5047059fb364d3 --- /dev/null +++ b/paddle/fluid/operators/fill_diagonal_op.cu @@ -0,0 +1,122 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/fill_diagonal_op.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using CUDADeviceContext = paddle::platform::CUDADeviceContext; + +template +__global__ void fill_constant_kernel(const int64_t featuresize, T* in_data, + int64_t strides, int offset, T fillvar) { + for (int64_t idx = blockIdx.x * featuresize + threadIdx.x; + idx * strides + offset < (blockIdx.x + 1) * featuresize; + idx += blockDim.x) { + in_data[idx * strides + offset] = fillvar; + } +} + +template +class FillIDiagonalCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#ifdef __HIPCC__ + const int64_t kMaxBlockDim = 256; +#else + const int64_t kMaxBlockDim = 512; +#endif + auto* out = ctx.Output("Out"); + auto offset = ctx.Attr("offset"); + auto wrap = ctx.Attr("wrap"); + + auto* xin = ctx.Input("X"); + framework::TensorCopy(*xin, ctx.GetPlace(), out); + + T* out_data = out->mutable_data(ctx.GetPlace()); + auto fill_val = static_cast(ctx.template Attr("value")); + T temp_var = static_cast(fill_val); + + auto size = out->numel(); + auto out_dims = out->dims(); + auto strides = CalStride(out_dims); + + // The wrap mode supported only the dims equels to 2; In wrap mode, the + // value will be filled in cycles + if (!wrap) { + size = std::min(size, out_dims[1] * out_dims[1]); + } + + int64_t kBlockDim = std::min(int64_t(size / strides), kMaxBlockDim); + fill_constant_kernel<<<1, kBlockDim, 0>>>(size, out_data, strides, + offset, temp_var); + } +}; + +template +class FillIDiagonalGradCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#ifdef __HIPCC__ + const int64_t kMaxBlockDim = 256; +#else + const int64_t kMaxBlockDim = 512; +#endif + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* in_data = dx->mutable_data(ctx.GetPlace()); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto offset = ctx.Attr("offset"); + auto wrap = ctx.Attr("wrap"); + + framework::TensorCopy(*dout, ctx.GetPlace(), dx); + + auto size = dx->numel(); + auto out_dims = dx->dims(); + auto strides = CalStride(out_dims); + + auto wrapsize = std::min(size, out_dims[1] * out_dims[1]); + // The wrap mode supported only the dims equels to 2; In wrap mode, the + // value will be filled in cycles + if (wrap) { + wrapsize = size; + } + + int64_t kBlockDim = std::min(int64_t(size), kMaxBlockDim); + fill_constant_kernel<<<1, kBlockDim, 0>>>(wrapsize, in_data, strides, + offset, T(0)); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL(fill_diagonal, ops::FillIDiagonalCUDAKernel, + ops::FillIDiagonalCUDAKernel, + ops::FillIDiagonalCUDAKernel, + ops::FillIDiagonalCUDAKernel, + ops::FillIDiagonalCUDAKernel, + ops::FillIDiagonalCUDAKernel); + +REGISTER_OP_CUDA_KERNEL(fill_diagonal_grad, + 
ops::FillIDiagonalGradCUDAKernel, + ops::FillIDiagonalGradCUDAKernel, + ops::FillIDiagonalGradCUDAKernel, + ops::FillIDiagonalGradCUDAKernel, + ops::FillIDiagonalGradCUDAKernel, + ops::FillIDiagonalGradCUDAKernel); diff --git a/paddle/fluid/operators/fill_diagonal_op.h b/paddle/fluid/operators/fill_diagonal_op.h new file mode 100644 index 00000000000000..4531503e30de54 --- /dev/null +++ b/paddle/fluid/operators/fill_diagonal_op.h @@ -0,0 +1,25 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +int64_t CalStride(framework::DDim dim); + +} // namespace operators +} // namespace paddle diff --git a/python/paddle/fluid/tests/unittests/test_tensor_fill_diagonal_.py b/python/paddle/fluid/tests/unittests/test_tensor_fill_diagonal_.py new file mode 100644 index 00000000000000..41a8a9750cb64c --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_tensor_fill_diagonal_.py @@ -0,0 +1,173 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import paddle.fluid as fluid +import unittest +import numpy as np +import six +import paddle + + +class TensorFillDiagonal_Test(unittest.TestCase): + def test_dim2_normal(self): + expected_np = np.array( + [[1, 2, 2], [2, 1, 2], [2, 2, 1]]).astype('float32') + expected_grad = np.array( + [[0, 1, 1], [1, 0, 1], [1, 1, 0]]).astype('float32') + + typelist = ['float32', 'float64', 'int32', 'int64'] + places = [fluid.CPUPlace()] + if fluid.core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + + for idx, p in enumerate(places): + if idx == 0: + paddle.set_device('cpu') + else: + paddle.set_device('gpu') + for dtype in typelist: + x = paddle.ones((3, 3), dtype=dtype) + x.stop_gradient = False + y = x * 2 + y.fill_diagonal_(1, offset=0, wrap=True) + loss = y.sum() + loss.backward() + + self.assertEqual( + (y.numpy().astype('float32') == expected_np).all(), True) + self.assertEqual( + (y.grad.numpy().astype('float32') == expected_grad).all(), + True) + + def test_bool(self): + expected_np = np.array( + [[False, True, True], [True, False, True], [True, True, False]]) + + typelist = ['bool'] + places = [fluid.CPUPlace()] + if fluid.core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + + for idx, p in enumerate(places): + if idx == 0: + paddle.set_device('cpu') + else: + paddle.set_device('gpu') + for dtype in typelist: + x = paddle.ones((3, 3), dtype=dtype) + x.stop_gradient = True + x.fill_diagonal_(0, offset=0, wrap=True) + + self.assertEqual((x.numpy() == expected_np).all(), True) + + def test_dim2_unnormal_wrap(self): + expected_np = np.array([[1, 2, 2], [2, 1, 2], [2, 2, 1], [2, 2, 2], + [1, 2, 2], [2, 1, 2], + [2, 2, 1]]).astype('float32') + expected_grad = np.array([[0, 1, 1], [1, 0, 1], [1, 1, 0], [1, 1, 1], + [0, 1, 1], [1, 0, 1], + [1, 1, 0]]).astype('float32') + + typelist = ['float32', 'float64', 'int32', 'int64'] + places = [fluid.CPUPlace()] + if fluid.core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + + for idx, p in enumerate(places): + if idx == 0: + paddle.set_device('cpu') + else: + paddle.set_device('gpu') + for dtype in typelist: + x = paddle.ones((7, 3), dtype=dtype) + x.stop_gradient = False + y = x * 2 + y.fill_diagonal_(1, offset=0, wrap=True) + loss = y.sum() + loss.backward() + + self.assertEqual( + (y.numpy().astype('float32') == expected_np).all(), True) + self.assertEqual( + (y.grad.numpy().astype('float32') == expected_grad).all(), + True) + + def test_dim2_unnormal_unwrap(self): + expected_np = np.array([[1, 2, 2], [2, 1, 2], [2, 2, 1], [2, 2, 2], + [2, 2, 2], [2, 2, 2], + [2, 2, 2]]).astype('float32') + expected_grad = np.array([[0, 1, 1], [1, 0, 1], [1, 1, 0], [1, 1, 1], + [1, 1, 1], [1, 1, 1], + [1, 1, 1]]).astype('float32') + + typelist = ['float32', 'float64', 'int32', 'int64'] + places = [fluid.CPUPlace()] + if fluid.core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + + for idx, p in enumerate(places): + if idx == 0: + paddle.set_device('cpu') + else: + paddle.set_device('gpu') + for dtype in typelist: + x = paddle.ones((7, 3), dtype=dtype) + x.stop_gradient = False + y = x * 2 + y.fill_diagonal_(1, offset=0, wrap=False) + loss = y.sum() + loss.backward() + + self.assertEqual( + (y.numpy().astype('float32') == expected_np).all(), True) + self.assertEqual( + (y.grad.numpy().astype('float32') == expected_grad).all(), + True) + + def test_dim_larger2_normal(self): + expected_np = np.array([[[1, 2, 2], [2, 2, 2], [2, 2, 2]], [[2, 2, 2], [ + 2, 1, 2 + ], [2, 2, 2]], [[2, 2, 2], [2, 2, 2], [2, 2, 
1]]]).astype('float32') + expected_grad = np.array( + [[[0, 1, 1], [1, 1, 1], [1, 1, 1]], [[1, 1, 1], [1, 0, 1], + [1, 1, 1]], + [[1, 1, 1], [1, 1, 1], [1, 1, 0]]]).astype('float32') + + typelist = ['float32', 'float64', 'int32', 'int64'] + places = [fluid.CPUPlace()] + if fluid.core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + + for idx, p in enumerate(places): + if idx == 0: + paddle.set_device('cpu') + else: + paddle.set_device('gpu') + for dtype in typelist: + x = paddle.ones((3, 3, 3), dtype=dtype) + x.stop_gradient = False + y = x * 2 + y.fill_diagonal_(1, offset=0, wrap=True) + loss = y.sum() + loss.backward() + + self.assertEqual( + (y.numpy().astype('float32') == expected_np).all(), True) + self.assertEqual( + (y.grad.numpy().astype('float32') == expected_grad).all(), + True) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py index 4b84401aa09458..1f0c0ba24d95ba 100644 --- a/python/paddle/tensor/manipulation.py +++ b/python/paddle/tensor/manipulation.py @@ -37,6 +37,55 @@ __all__ = [] +@dygraph_only +def fill_diagonal_(x, value, offset=0, wrap=False, name=None): + """ + **Notes**: + **This API is ONLY available in Dygraph mode** + This function fill the value into the x Tensor's diagonal inplace. + Args: + x(Tensor): ``x`` is the original Tensor + value(Scale): ``value`` is the value to filled in x + offset(int,optional): the offset to the main diagonal. Default: 0 (main diagonal). + wrap(bool,optional): the diagonal 'wrapped' after N columns for tall matrices. + name(str,optional): Name for the operation (optional, default is None) + Returns: + Tensor: Tensor with diagonal filled with value. + Returns type: + dtype is same as x Tensor + Examples: + .. 
code-block:: python + import paddle + x = paddle.ones((4, 3)) * 2 + x.fill_diagonal_(1.0) + print(x.tolist()) #[[1.0, 2.0, 2.0], [2.0, 1.0, 2.0], [2.0, 2.0, 1.0], [2.0, 2.0, 2.0]] + """ + helper = LayerHelper("fill_diagonal_", **locals()) + check_type(x, 'X', (Variable), 'fill_diagonal_') + dtype = helper.input_dtype('x') + check_dtype(dtype, 'X', + ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'], + 'fill_diagonal_') + check_type(value, 'value', (bool, int, float), 'fill_diagonal_') + check_type(wrap, 'wrap', (bool), 'fill_diagonal_') + + inshape = x.shape + inshapeset = set(inshape) + assert len(inshape) >= 2, ('Tensor dims should >= 2 in fill_diagonal_ API') + if len(inshape) > 2: + assert len(inshapeset) == 1, ( + 'Tensor dims should be equal while input dims > 2 in fill_diagonal_ API' + ) + if len(inshape) == 2: + return core.ops.fill_diagonal_(x, 'value', value, 'offset', offset, + 'wrap', wrap) + return core.ops.fill_diagonal_(x, 'value', value, 'offset', offset, 'wrap', + True) + + +setattr(core.VarBase, 'fill_diagonal_', fill_diagonal_) + + @dygraph_only def tolist(x): """ From 8046e33db0957475d6fb1634f82caf8a53009f76 Mon Sep 17 00:00:00 2001 From: Zeng Jinle <32832641+sneaxiy@users.noreply.github.com> Date: Tue, 17 Aug 2021 11:31:07 +0800 Subject: [PATCH 068/126] Add some passes which can be applied to Program (#34730) * add inplace passes and tests * update * fix use_cuda undefined fix compile error of op compat * add more ut * fix CPU CI error * check adam unique * fix mac/windows ci, improve coverage * fix ci error * follow weihang's comment * fix BlockDesc::MoveFrom * follow qiuliang's comment * update * follow huihuang's comments --- paddle/fluid/framework/block_desc.cc | 36 +++ paddle/fluid/framework/block_desc.h | 2 + .../fluid/framework/details/build_strategy.h | 5 + paddle/fluid/framework/ir/CMakeLists.txt | 2 +- .../ir/memory_optimize_pass/CMakeLists.txt | 2 +- .../buffer_shared_inplace_op_pass.cc | 139 ++++++++++ .../inplace_addto_op_pass.cc | 262 ++++++++++++++++++ paddle/fluid/framework/ir/pass.cc | 89 +++++- paddle/fluid/framework/ir/pass.h | 6 + paddle/fluid/operators/share_buffer_op.cc | 66 +++++ paddle/fluid/operators/share_buffer_op.cu | 18 ++ paddle/fluid/operators/share_buffer_op.h | 61 ++++ paddle/fluid/pybind/ir.cc | 2 + paddle/fluid/pybind/protobuf.cc | 3 +- paddle/fluid/pybind/pybind.cc | 7 + python/paddle/fluid/framework.py | 48 +++- python/paddle/fluid/ir.py | 83 ++++++ .../unittests/test_apply_pass_to_program.py | 169 +++++++++-- 18 files changed, 966 insertions(+), 34 deletions(-) create mode 100644 paddle/fluid/operators/share_buffer_op.cc create mode 100644 paddle/fluid/operators/share_buffer_op.cu create mode 100644 paddle/fluid/operators/share_buffer_op.h create mode 100644 python/paddle/fluid/ir.py diff --git a/paddle/fluid/framework/block_desc.cc b/paddle/fluid/framework/block_desc.cc index 404c4e32f897e7..c225d4090ab9ab 100644 --- a/paddle/fluid/framework/block_desc.cc +++ b/paddle/fluid/framework/block_desc.cc @@ -238,5 +238,41 @@ BlockDesc *BlockDesc::ForwardBlock() const { return prog_->MutableBlock(static_cast(desc_->forward_block_idx())); } +void BlockDesc::MoveFrom(BlockDesc *block) { + PADDLE_ENFORCE_NOT_NULL( + block, platform::errors::InvalidArgument("Block must be provided.")); + if (this == block) { + return; + } + + for (auto &pair : block->vars_) { + const auto &name = pair.first; + auto &var_ptr = pair.second; + auto &old_var_ptr = vars_[name]; + if (old_var_ptr == nullptr) { + VLOG(10) << "Create new variable " << 
var_ptr->Name(); + old_var_ptr = std::move(var_ptr); + } else { + // NOTE(zjl): cannot release old_var_ptr, because Python + // Variable holds the reference of the C++ VarDesc object. + // If the C++ VarDesc object is destructed, any call to the + // methods of Python Variable may raise segmentation fault. + VLOG(10) << "Update old variable " << var_ptr->Name(); + *old_var_ptr = *var_ptr; + } + } + ops_.clear(); + for (const auto &src_op : block->ops_) { + AppendOp()->CopyFrom(*src_op); + } + need_update_ = true; + Flush(); + + block->ops_.clear(); + block->vars_.clear(); + block->need_update_ = true; + block->Flush(); +} + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/block_desc.h b/paddle/fluid/framework/block_desc.h index 83d31fc2f24f86..e4e5a71a46c860 100644 --- a/paddle/fluid/framework/block_desc.h +++ b/paddle/fluid/framework/block_desc.h @@ -111,6 +111,8 @@ class BlockDesc { ProgramDesc *Program() const { return this->prog_; } + void MoveFrom(BlockDesc *block); + private: ProgramDesc *prog_; // not_own proto::BlockDesc *desc_; // not_own diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index 3f8a27f3d5a020..9dcfb0ff32da2f 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -180,6 +180,11 @@ struct BuildStrategy { bool IsFinalized() const { return is_finalized_; } + void ClearFinalized() { + pass_builder_ = nullptr; + is_finalized_ = false; + } + bool IsMultiDevPass(const std::string &pass_name) const; // Apply the passes built by the pass_builder_. The passes will be diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 384f80395c7784..02e8b7b237e279 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -50,7 +50,7 @@ if (WITH_TESTING) endif(WITH_TESTING) cc_library(graph_pattern_detector SRCS graph_pattern_detector.cc DEPS ${GRAPH_PATTERN_DETECTOR_DEPS}) -cc_library(op_compat_sensible_pass SRCS op_compat_sensible_pass.cc DEPS graph_pattern_detector op_def_api) +cc_library(op_compat_sensible_pass SRCS op_compat_sensible_pass.cc DEPS graph_pattern_detector op_def_api pass) cc_library(subgraph_detector SRCS subgraph_detector.cc DEPS graph_pattern_detector executor) cc_library(fuse_pass_base SRCS fuse_pass_base.cc DEPS op_compat_sensible_pass) cc_library(placement_pass_base SRCS placement_pass_base.cc DEPS pass) diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/CMakeLists.txt b/paddle/fluid/framework/ir/memory_optimize_pass/CMakeLists.txt index 5434678ccb04ac..ee63b314adedb1 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/CMakeLists.txt +++ b/paddle/fluid/framework/ir/memory_optimize_pass/CMakeLists.txt @@ -10,7 +10,7 @@ cc_library(eager_deletion_pass SRCS eager_deletion_pass.cc DEPS computation_op_h cc_library(memory_reuse_pass SRCS memory_reuse_pass.cc DEPS computation_op_handle reference_count_pass_helper share_tensor_buffer_op_handle graph pass multi_devices_helper) -cc_library(buffer_shared_inplace_op_pass SRCS buffer_shared_inplace_op_pass.cc DEPS memory_reuse_pass) +cc_library(buffer_shared_inplace_op_pass SRCS buffer_shared_inplace_op_pass.cc DEPS memory_reuse_pass executor_gc_helper) cc_library(buffer_shared_cross_op_memory_reuse_pass SRCS buffer_shared_cross_op_memory_reuse_pass.cc DEPS memory_reuse_pass) cc_library(inplace_addto_op_pass SRCS inplace_addto_op_pass.cc DEPS memory_reuse_pass) diff 
--git a/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_inplace_op_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_inplace_op_pass.cc index 74d1acac60d6ab..bf7cd55fab2689 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_inplace_op_pass.cc +++ b/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_inplace_op_pass.cc @@ -15,6 +15,7 @@ #include #include "glog/logging.h" +#include "paddle/fluid/framework/executor_gc_helper.h" #include "paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.h" #include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/platform/enforce.h" @@ -30,6 +31,9 @@ class BufferSharedInplaceOpPass : public MemoryReusePass { std::string ReuseType() const override { return "inplace"; } void Run(Graph *graph) const override; + + void ApplyImpl(ProgramDesc *main_program, + ProgramDesc *startup_program) const override; }; void BufferSharedInplaceOpPass::Run(Graph *graph) const { @@ -149,6 +153,141 @@ void BufferSharedInplaceOpPass::Run(Graph *graph) const { } } +static std::string GetFirstVarName(const OpDesc &op, const std::string &slot, + bool is_input) { + const auto &name_map = is_input ? op.Inputs() : op.Outputs(); + auto iter = name_map.find(slot); + if (iter != name_map.end() && !iter->second.empty()) { + return iter->second[0]; + } + return kEmptyVarName; +} + +static std::vector>> +GetInplaceVars(const BlockDesc &block, bool use_cuda, + const std::vector &skip_vars) { + PADDLE_ENFORCE_EQ(block.ID(), 0, platform::errors::Unimplemented( + "Inplace can only perform in block 0.")); + // only take block 0 gc_vars + const auto op_gc_vars = + GetEagerDeletionCleanVars(*block.Program(), skip_vars)[0]; + const auto all_ops = block.AllOps(); + PADDLE_ENFORCE_EQ(op_gc_vars.size(), all_ops.size(), + platform::errors::PermissionDenied( + "GC analysis error: op number not match.")); + size_t n = all_ops.size(); + std::unordered_set visited_vars; + std::unordered_set reused_in_vars(skip_vars.begin(), + skip_vars.end()); + std::unordered_set reused_out_vars(skip_vars.begin(), + skip_vars.end()); + for (const auto *op : all_ops) { + if (op->Type() == "share_buffer" || op->Type() == "share_data") { + const auto &inputs = op->Input("X"); + const auto &outputs = op->Output("Out"); + reused_in_vars.insert(inputs.begin(), inputs.end()); + reused_out_vars.insert(outputs.begin(), outputs.end()); + } + } + + std::vector>> result(n); + for (size_t i = 0; i < n; ++i) { + const auto &op = *all_ops[i]; + const auto &gc_vars = op_gc_vars[i]; + const auto inputs = op.InputArgumentNames(); + const auto outputs = op.OutputArgumentNames(); + visited_vars.insert(inputs.begin(), inputs.end()); + + auto &infer_inplace = OpInfoMap::Instance().Get(op.Type()).infer_inplace_; + if (gc_vars.empty() || !infer_inplace) { + visited_vars.insert(outputs.begin(), outputs.end()); + continue; + } + + const auto var_pair = infer_inplace(use_cuda); + std::unordered_multiset input_set(inputs.begin(), + inputs.end()); + std::unordered_multiset output_set(outputs.begin(), + outputs.end()); + std::unordered_set valid_vars; + for (const auto &var : gc_vars) { + if (var != kEmptyVarName && input_set.count(var) == 1 && + output_set.count(var) == 0 && + block.FindVar(var)->GetType() == proto::VarType::LOD_TENSOR) { + valid_vars.insert(var); + } + } + + if (valid_vars.empty()) { + visited_vars.insert(outputs.begin(), outputs.end()); + continue; + } + + for (const auto &pair : var_pair) { + const auto &input_slot = pair.first; + const auto 
&output_slot = pair.second; + auto input_var = GetFirstVarName(op, input_slot, /*is_input=*/true); + if (input_var == kEmptyVarName || valid_vars.count(input_var) == 0) { + continue; + } + auto output_var = GetFirstVarName(op, output_slot, /*is_input=*/false); + if (output_var == kEmptyVarName || visited_vars.count(output_var) > 0) { + continue; + } + auto output_var_desc = block.FindVar(output_var); + if (output_var_desc == nullptr || output_var_desc->Persistable() || + output_var_desc->GetType() != proto::VarType::LOD_TENSOR) { + continue; + } + + if (reused_in_vars.count(input_var) > 0 || + reused_out_vars.count(output_var) > 0) { + continue; + } + + // input_var -> output_var is reusable + VLOG(10) << "inplace occurs at op " << i << " " << op.Type() << ": " + << input_var << " -> " << output_var; + result[i].emplace_back(input_var, output_var); + reused_in_vars.insert(input_var); + reused_out_vars.insert(output_var); + } + visited_vars.insert(outputs.begin(), outputs.end()); + std::sort(result[i].begin(), result[i].end()); + } + return result; +} + +void BufferSharedInplaceOpPass::ApplyImpl(ProgramDesc *main_program, + ProgramDesc *startup_program) const { + bool use_cuda = Get(kUseCuda); + auto skip_vars = Get>("mem_opt_skip_vars"); + + auto *block = main_program->MutableBlock(0); + auto inplace_vars = GetInplaceVars(*block, use_cuda, skip_vars); + PADDLE_ENFORCE_EQ(inplace_vars.size(), block->OpSize(), + platform::errors::PermissionDenied( + "Inplace analysis error: op number not match.")); + int64_t n = static_cast(inplace_vars.size()); + for (int64_t i = n - 1; i >= 0; --i) { + if (inplace_vars[i].empty()) continue; + auto *op = block->InsertOp(i); + std::vector inputs, outputs; + inputs.reserve(inplace_vars[i].size()); + outputs.reserve(inplace_vars[i].size()); + for (const auto &pair : inplace_vars[i]) { + inputs.push_back(pair.first); + outputs.push_back(pair.second); + } + op->SetType("share_buffer"); + op->SetInput("X", inputs); + op->SetOutput("Out", outputs); + op->SetOutput("XOut", inputs); // add necessary dependency + op->SetAttr("share_dims", std::vector(inputs.size(), false)); + } + block->Flush(); +} + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/inplace_addto_op_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/inplace_addto_op_pass.cc index 58857bb490edc8..849d0dabab7796 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/inplace_addto_op_pass.cc +++ b/paddle/fluid/framework/ir/memory_optimize_pass/inplace_addto_op_pass.cc @@ -13,6 +13,7 @@ // limitations under the License. #include +#include #include "glog/logging.h" #include "paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.h" @@ -40,6 +41,9 @@ class InplaceAddToOpPass : public MemoryReusePass { void Run(Graph *graph) const override; + void ApplyImpl(ProgramDesc *main_program, + ProgramDesc *startup_program) const override; + private: // 1. Add last living op of in_var, add any last living op of out_var // 2. 
Set reference count of in_var to be 2 @@ -216,6 +220,264 @@ void InplaceAddToOpPass::Run(Graph *graph) const { } } +static bool IsValidConv2DGradDataGradNode(const Node &node) { + if (node.inputs.empty()) return false; + auto *generated_op = node.inputs[0]; + auto *op_desc = generated_op->Op(); + if (op_desc == nullptr || op_desc->Type() != "conv2d_grad") { + return false; + } + const auto &outputs = op_desc->Outputs(); + auto iter = outputs.find(GradVarName("Input")); + return iter != outputs.end() && !iter->second.empty() && + iter->second[0] == node.Name() && + !op_desc->GetAttrIfExists("use_addto"); +} + +static bool IsDownstreamNode(const Node &upstream, const Node &downstream) { + std::queue q; + std::unordered_set visited; + q.push(&upstream); + visited.insert(&upstream); + while (!q.empty()) { + const auto *cur = q.front(); + q.pop(); + if (cur == &downstream) { + return true; + } + + for (const auto *out : cur->outputs) { + if (visited.count(out) == 0) { + visited.insert(out); + q.push(out); + } + } + } + return false; +} + +static void BuildInplaceAddToGraph(Node *in_var_0, Node *in_var_1, + Node *out_var, Graph *graph) { + auto *grad_add_op = out_var->inputs[0]; + + // Cut the connection between in_var_0 and grad_add_op + in_var_0->outputs.erase(std::remove(in_var_0->outputs.begin(), + in_var_0->outputs.end(), grad_add_op), + in_var_0->outputs.end()); + grad_add_op->inputs.erase(std::remove(grad_add_op->inputs.begin(), + grad_add_op->inputs.end(), in_var_0), + grad_add_op->inputs.end()); + + // Replace grad_add_op with share_buffer op + auto *grad_add_op_desc = grad_add_op->Op(); + grad_add_op_desc->SetType("share_buffer"); + grad_add_op_desc->SetInput("X", {in_var_1->Name()}); + grad_add_op_desc->SetOutput("Out", {out_var->Name()}); + grad_add_op_desc->SetOutput("XOut", {in_var_1->Name()}); + grad_add_op_desc->SetAttr("share_dims", std::vector(1, true)); + + // Add share_buffer op between in_var_0 and in_var_1 + OpDesc share_buffer_op; + share_buffer_op.SetType("share_buffer"); + share_buffer_op.SetInput("X", {in_var_0->Name()}); + share_buffer_op.SetOutput("Out", {in_var_1->Name()}); + share_buffer_op.SetOutput("XOut", {in_var_0->Name()}); + share_buffer_op.SetAttr("share_dims", std::vector(1, false)); + + auto *new_share_buffer_op = graph->CreateOpNode(&share_buffer_op); + new_share_buffer_op->inputs.push_back(in_var_0); + in_var_0->outputs.push_back(new_share_buffer_op); + new_share_buffer_op->outputs.push_back(in_var_1); + in_var_1->inputs.push_back(new_share_buffer_op); + + auto *dep_var = graph->CreateControlDepVar(); + new_share_buffer_op->outputs.push_back(dep_var); + dep_var->inputs.push_back(new_share_buffer_op); + + auto in_var_1_gen_op = in_var_1->inputs[0]; + in_var_1_gen_op->inputs.push_back(dep_var); + dep_var->outputs.push_back(in_var_1_gen_op); + + in_var_1_gen_op->Op()->SetAttr("use_addto", true); +} + +static std::unordered_map> +GetAllVersionVarsMap(const Graph &graph) { + const auto &nodes = graph.Nodes(); + std::unordered_map deps; + std::vector sorted_nodes; + sorted_nodes.reserve(nodes.size()); + + std::queue q; + for (auto *node : nodes) { + size_t in_degree = node->inputs.size(); + if (in_degree == 0) { + q.push(node); + sorted_nodes.push_back(node); + } else { + deps[node] = node->inputs.size(); + } + } + + while (!q.empty()) { + auto *cur = q.front(); + q.pop(); + for (auto *node : cur->outputs) { + if (--deps.at(node) == 0) { + sorted_nodes.push_back(node); + q.push(node); + } + } + } + + PADDLE_ENFORCE_EQ( + sorted_nodes.size(), nodes.size(), + 
platform::errors::PermissionDenied("Wrong toplogical sort algorithm.")); + std::unordered_map> result; + for (auto *node : sorted_nodes) { + if (node->IsVar() && !node->IsCtrlVar()) { + result[node->Name()].push_back(node); + } + } + return result; +} + +void InplaceAddToOpPass::ApplyImpl(ProgramDesc *main_program, + ProgramDesc *startup_program) const { + if (!Get(kUseCuda)) return; + + Graph graph(*main_program); + auto all_ver_vars = GetAllVersionVarsMap(graph); + + const auto all_nodes = graph.Nodes(); // Deep copy + std::unordered_set reused_in_vars; + std::unordered_set reused_out_vars; + for (auto *node : all_nodes) { + if (!node->IsOp() || node->Op() == nullptr || + node->Op()->Type() != "grad_add") { + continue; + } + + VLOG(10) << "Found grad_add op"; + + // Step 1: find input vars first + std::vector input_vars; + input_vars.reserve(2); + for (auto *in : node->inputs) { + if (in->IsCtrlVar() || in->Name() == kEmptyVarName) { + continue; + } + PADDLE_ENFORCE_LT(input_vars.size(), 2, + platform::errors::InvalidArgument( + "The size of inputs of grad_add should be 2.")); + input_vars.push_back(in); + } + + if (input_vars.size() != 2) { // may have kEmptyVarName + continue; + } + + bool is_first_var_valid = IsValidConv2DGradDataGradNode(*input_vars[0]); + bool is_second_var_valid = IsValidConv2DGradDataGradNode(*input_vars[1]); + if (!is_first_var_valid && !is_second_var_valid) { + continue; + } + + VLOG(10) << "validation " << is_first_var_valid << " " + << is_second_var_valid; + + // make sure that input_vars[1] is always the Input@GRAD of conv2d_grad op + if (is_first_var_valid) { + std::swap(input_vars[0], input_vars[1]); + } + + // Step 2: find the unique output var + Node *output_var = nullptr; + std::string output_var_name = node->Op()->Output("Out")[0]; + PADDLE_ENFORCE_NE(output_var_name, kEmptyVarName, + platform::errors::InvalidArgument( + "Output of grad_add should be provided.")); + for (auto *out : node->outputs) { + if (output_var_name == out->Name()) { + output_var = out; + break; + } + } + PADDLE_ENFORCE_NOT_NULL(output_var, + platform::errors::InvalidArgument( + "Output of grad_add should be provided.")); + + VLOG(10) << "Check inplace chain: " << input_vars[0]->Name() << " -> " + << input_vars[1]->Name() << " -> " << output_var->Name(); + + // Step 3: check whether input_vars[0]->generated_op is not the downstream + // op of input_vars[0]->generated_op. If yes, circle would occur. + if (!input_vars[0]->inputs.empty() && !input_vars[1]->inputs.empty()) { + auto *gen_op_0 = input_vars[0]->inputs[0]; + auto *gen_op_1 = input_vars[1]->inputs[0]; + if (IsDownstreamNode(*gen_op_1, *gen_op_0)) { + VLOG(10) << "Downstream node detected, cannot inplace addto"; + continue; + } + } + + // Step 4: name not the same + if (input_vars[0]->Name() == input_vars[1]->Name() || + input_vars[0]->Name() == output_var->Name() || + input_vars[1]->Name() == output_var->Name()) { + continue; + } + + // Step 5: check var version. The inplace var chain is: input_vars[0] -> + // input_vars[1] -> output_var + // Therefore, input_vars[0] must be last version, input_vars[1] must be 1st + // version and last version, and output_var must be the 1st version. 
+ auto iter = all_ver_vars.find(input_vars[0]->Name()); + PADDLE_ENFORCE_EQ(iter != all_ver_vars.end(), true, + platform::errors::InvalidArgument( + "Variable %s not found.", input_vars[0]->Name())); + if (iter->second[iter->second.size() - 1] != input_vars[0]) continue; + + iter = all_ver_vars.find(input_vars[1]->Name()); + if (iter->second.size() != 1) continue; + PADDLE_ENFORCE_EQ(iter->second[0], input_vars[1], + platform::errors::InvalidArgument( + "Variable %s not found.", input_vars[1]->Name())); + iter = all_ver_vars.find(output_var->Name()); + if (iter->second[0] != output_var) continue; + + // Step 6: input_vars[0] and input_vars[1] should only have one output op! + // This output op must be grad_add op. + if (input_vars[0]->outputs.size() != 1 || + input_vars[1]->outputs.size() != 1) { + continue; + } + + // Step 7: check whether the var has been reused + if (reused_in_vars.count(input_vars[0]->Name()) > 0 || + reused_in_vars.count(input_vars[1]->Name()) > 0 || + reused_out_vars.count(input_vars[1]->Name()) > 0 || + reused_out_vars.count(output_var->Name()) > 0) { + continue; + } + + VLOG(10) << "inplace occurs at " << input_vars[0]->Name() << " -> " + << input_vars[1]->Name() << " -> " << output_var->Name(); + // Step 8: inplace addto can be performed now! + BuildInplaceAddToGraph(input_vars[0], input_vars[1], output_var, &graph); + reused_in_vars.insert(input_vars[0]->Name()); + reused_in_vars.insert(input_vars[1]->Name()); + reused_out_vars.insert(input_vars[1]->Name()); + reused_out_vars.insert(output_var->Name()); + } + + // Convert Graph to main_program + ProgramDesc tmp; + GraphToProgram(graph, &tmp); + main_program->CopyFrom(*tmp.Proto()); + main_program->Flush(); +} + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/pass.cc b/paddle/fluid/framework/ir/pass.cc index 42b6244788da09..350f00ae2ab490 100644 --- a/paddle/fluid/framework/ir/pass.cc +++ b/paddle/fluid/framework/ir/pass.cc @@ -14,6 +14,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/ir/pass.h" +#include #include "paddle/fluid/framework/ir/graph_helper.h" namespace paddle { @@ -31,17 +32,18 @@ namespace paddle { namespace framework { namespace ir { -Graph* Pass::Apply(Graph* graph) const { +Graph *Pass::Apply(Graph *graph) const { + VLOG(10) << "start to apply pass " << Type() << " to graph"; CheckPrevPass(); PADDLE_ENFORCE_NOT_NULL( graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); - for (const std::string& attr : required_pass_attrs_) { + for (const std::string &attr : required_pass_attrs_) { PADDLE_ENFORCE_NE( attrs_.find(attr), attrs_.end(), platform::errors::InvalidArgument( "Required atrribute %s for pass < %s > is not set.", attr, Type())); } - for (const std::string& attr : required_graph_attrs_) { + for (const std::string &attr : required_graph_attrs_) { PADDLE_ENFORCE_EQ(graph->Has(attr), true, platform::errors::InvalidArgument( "Required atrribute %s for graph is not set.", attr)); @@ -66,30 +68,103 @@ Graph* Pass::Apply(Graph* graph) const { // Passes can change params, tensors, so caching need to be discarded ClearMKLDNNCache(paddle::platform::CPUPlace()); #endif + VLOG(10) << "finish to apply pass " << Type() << " to graph"; return graph; } -void Pass::Apply(ProgramDesc* main_program, - ProgramDesc* startup_program) const { +void Pass::Apply(ProgramDesc *main_program, + ProgramDesc *startup_program) const { + VLOG(10) << "apply pass " << Type() << " to program"; PADDLE_ENFORCE_NOT_NULL(main_program, platform::errors::InvalidArgument( "main program must be provided")); PADDLE_ENFORCE_NOT_NULL( startup_program, platform::errors::InvalidArgument("startup program must be provided")); + ApplyImpl(main_program, startup_program); + VLOG(10) << "finish to apply pass " << Type() << " to program"; +} + +template +static void VisitAllElements(Container &&container, Visitor &&visitor, + bool reverse) { + if (reverse) { + std::for_each(container.rbegin(), container.rend(), visitor); + } else { + std::for_each(container.begin(), container.end(), visitor); + } +} + +void Pass::MergePrograms(ProgramDesc *dst, const details::ProgramDescs &srcs, + bool append) { + PADDLE_ENFORCE_NOT_NULL( + dst, platform::errors::InvalidArgument("Dst program must be provided.")); + bool reverse = !append; + + auto create_var_visitor = [dst](const ProgramDesc &src) { + PADDLE_ENFORCE_EQ(src.Size(), 1, platform::errors::Unimplemented( + "MergePrograms can only support to " + "merge program with only one block.")); + const auto &src_block = src.Block(0); + auto *dst_block = dst->MutableBlock(0); + for (const auto *src_new_var : src_block.AllVars()) { + if (dst_block->FindVar(src_new_var->Name())) continue; + auto *dst_new_var = dst_block->Var(src_new_var->Name()); + *dst_new_var = *src_new_var; + VLOG(10) << "Create new variable " << dst_new_var->Name(); + } + }; + VisitAllElements(srcs, create_var_visitor, reverse); + + auto create_op_visitor = [dst, reverse](const ProgramDesc &src) { + auto ops = src.Block(0).AllOps(); + auto copy_op_visitor = [dst, reverse](const OpDesc *src_op) { + auto *dst_block = dst->MutableBlock(0); + auto *op = reverse ? dst_block->PrependOp() : dst_block->AppendOp(); + op->CopyFrom(*src_op); + VLOG(10) << (reverse ? "Prepend" : "Append") << " op " << op->Type(); + // FIXME(zjl): some passes does not add VarDesc to program, + // we should fix this bug later... 
+ for (const auto &in_var_name : op->InputArgumentNames()) { + dst_block->Var(in_var_name); + } + for (const auto &out_var_name : op->OutputArgumentNames()) { + dst_block->Var(out_var_name); + } + }; + VisitAllElements(ops, copy_op_visitor, reverse); + }; + VisitAllElements(srcs, create_op_visitor, reverse); +} + +void Pass::ApplyImpl(ProgramDesc *main_program, + ProgramDesc *startup_program) const { Graph graph(*main_program); Apply(&graph); - // TODO(zjl): support details::kStartupProgramDescs and details::kProgramDescs ProgramDesc new_main_program; GraphToProgram(graph, &new_main_program); main_program->CopyFrom(*new_main_program.Proto()); + if (graph.Has(details::kStartupProgramDescs)) { + const auto &startups = + graph.Get(details::kStartupProgramDescs); + VLOG(10) << "Merge startup programs"; + MergePrograms(startup_program, startups, /*append=*/true); + } + + if (graph.Has(details::kProgramDescs)) { + const auto &mains = + graph.Get(details::kProgramDescs); + VLOG(10) << "Merge main programs"; + MergePrograms(main_program, mains, /*append=*/false); + } + startup_program->Flush(); main_program->Flush(); } -PassRegistry& PassRegistry::Instance() { +PassRegistry &PassRegistry::Instance() { static PassRegistry g_pass_info_map; return g_pass_info_map; } diff --git a/paddle/fluid/framework/ir/pass.h b/paddle/fluid/framework/ir/pass.h index fecdfc404e6dca..1d1ebcb17ea63a 100644 --- a/paddle/fluid/framework/ir/pass.h +++ b/paddle/fluid/framework/ir/pass.h @@ -148,6 +148,12 @@ class Pass { "The virtual pass called is not implemented.")); } + virtual void ApplyImpl(ProgramDesc *main_program, + ProgramDesc *startup_program) const; + + static void MergePrograms(ProgramDesc *dst, const details::ProgramDescs &srcs, + bool append); + // Some Pass must be placed before this Pass, and some // Pass must be placed after this Pass. virtual void CheckPrevPass() const {} diff --git a/paddle/fluid/operators/share_buffer_op.cc b/paddle/fluid/operators/share_buffer_op.cc new file mode 100644 index 00000000000000..a161b9272b7b20 --- /dev/null +++ b/paddle/fluid/operators/share_buffer_op.cc @@ -0,0 +1,66 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/operators/share_buffer_op.h" + +namespace paddle { +namespace operators { + +class ShareBufferOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override {} + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + // dtype is not important + return framework::OpKernelType(framework::proto::VarType::FP32, + ctx.GetPlace()); + } + + framework::OpKernelType GetKernelTypeForVar( + const std::string& var_name, const framework::Tensor& tensor, + const framework::OpKernelType& expected_kernel_type) const override { + return expected_kernel_type; + } +}; + +class ShareBufferOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor), The input tensors of share buffer op") + .AsDuplicable(); + AddOutput("Out", "(Tensor), The output tensors of share buffer op") + .AsDuplicable(); + AddOutput("XOut", + "(Tensor), The output tensors which are the same as X. It is " + "used to build the graph dependency") + .AsDuplicable(); + AddAttr>("share_dims", "Whether to share dims") + .SetDefault(std::vector()); + AddComment( + R"DOC(Operator used to perform inplace memory reuse. It should be not exposed to Python APIs.)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(share_buffer, ops::ShareBufferOp, ops::ShareBufferOpMaker); + +// dtype is not important +REGISTER_OP_CPU_KERNEL(share_buffer, ops::ShareBufferOpKernel); diff --git a/paddle/fluid/operators/share_buffer_op.cu b/paddle/fluid/operators/share_buffer_op.cu new file mode 100644 index 00000000000000..15c106df746fc2 --- /dev/null +++ b/paddle/fluid/operators/share_buffer_op.cu @@ -0,0 +1,18 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/share_buffer_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(share_buffer, ops::ShareBufferOpKernel); diff --git a/paddle/fluid/operators/share_buffer_op.h b/paddle/fluid/operators/share_buffer_op.h new file mode 100644 index 00000000000000..5138ad9d54b79a --- /dev/null +++ b/paddle/fluid/operators/share_buffer_op.h @@ -0,0 +1,61 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace operators { + +template +class ShareBufferOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + const auto inputs = ctx.MultiInput("X"); + auto outputs = ctx.MultiOutput("Out"); + size_t n = inputs.size(); + PADDLE_ENFORCE_EQ(n, outputs.size(), platform::errors::PermissionDenied( + "Variable number not match.")); + const auto &share_dims = ctx.Attr>("share_dims"); + if (!share_dims.empty()) { + PADDLE_ENFORCE_EQ( + n, share_dims.size(), + platform::errors::PermissionDenied( + "Attribute share_dims number not match input variable number.")); + } + + const std::vector *input_args = nullptr, + *output_args = nullptr; + if (VLOG_IS_ON(10)) { + input_args = &ctx.GetOp().Inputs("X"); + output_args = &ctx.GetOp().Outputs("Out"); + } + for (size_t i = 0; i < n; ++i) { + if (inputs[i] == nullptr || outputs[i] == nullptr) { + continue; + } + outputs[i]->ShareBufferWith(*inputs[i]); + VLOG(10) << "Share tensor buffer " << (*input_args)[i] << " -> " + << (*output_args)[i]; + if (!share_dims.empty() && share_dims[i]) { + outputs[i]->Resize(inputs[i]->dims()); + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/pybind/ir.cc b/paddle/fluid/pybind/ir.cc index 4a4c34b149e400..788d8d15ff0cdb 100644 --- a/paddle/fluid/pybind/ir.cc +++ b/paddle/fluid/pybind/ir.cc @@ -301,6 +301,7 @@ void BindPass(py::module *m) { // pass_attr_types to indicate the type of "nranks" explicitly, // i.e. pass_attr_types = {"nranks": "size_t"} means that the type of // "nranks" is size_t in C++. + REGISTER_PASS_ATTR_GETTER_SETTER("bool", bool); REGISTER_PASS_ATTR_GETTER_SETTER("int", int64_t); REGISTER_PASS_ATTR_GETTER_SETTER("long", int64_t); REGISTER_PASS_ATTR_GETTER_SETTER("size_t", size_t); @@ -309,6 +310,7 @@ void BindPass(py::module *m) { REGISTER_PASS_ATTR_GETTER_SETTER("float", double); REGISTER_PASS_ATTR_GETTER_SETTER("bytes", std::string); REGISTER_PASS_ATTR_GETTER_SETTER("str", std::string); + REGISTER_PASS_ATTR_GETTER_SETTER("list[str]", std::vector); m->def( "apply_pass", diff --git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc index 7cd21785a47591..7caa2494dc0144 100644 --- a/paddle/fluid/pybind/protobuf.cc +++ b/paddle/fluid/pybind/protobuf.cc @@ -156,7 +156,8 @@ void BindBlockDesc(pybind11::module *m) { pybind11::return_value_policy::reference) .def("op_size", &pd::BlockDesc::OpSize) .def("op", &pd::BlockDesc::Op, pybind11::return_value_policy::reference) - .def("serialize_to_string", SerializeMessage); + .def("serialize_to_string", SerializeMessage) + .def("_move_from", &pd::BlockDesc::MoveFrom); } void BindVarDsec(pybind11::module *m) { diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 7137115ac0a396..5001cc4a0172fc 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -2553,6 +2553,7 @@ All parameter, weight, gradient are variables in Paddle. .value("Customized", BuildStrategy::GradientScaleStrategy::kCustomized); build_strategy.def(py::init()) + .def("_clear_finalized", &BuildStrategy::ClearFinalized) .def_property( "reduce_strategy", [](const BuildStrategy &self) { return self.reduce_; }, @@ -3074,6 +3075,12 @@ All parameter, weight, gradient are variables in Paddle. 
[](BuildStrategy &self, bool fix_op_run_order) { self.fix_op_run_order_ = fix_op_run_order; }) + .def("_copy", + [](const BuildStrategy &self) { + auto new_bs = self; + new_bs.ClearFinalized(); + return new_bs; + }) .def("_finalize_strategy_and_create_passes", [](BuildStrategy &self) -> std::shared_ptr { return self.CreatePassesFromStrategy(true); diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 10b7292a0b6bb5..343ce352c3eaaf 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -3353,6 +3353,12 @@ def _clone_variable(self, var, force_persistable=True): return ret_var +# NOTE(zjl): you should be careful that after you call this method, +# some Python Variable and all Python Operators should not be used +# again. Because all Python Variables and all Python Operators are +# re-constructed inside this method. The underlying VarDesc(OpDesc) +# of some old Python Variables(all old Python Operators) may have +# been destructed. def _apply_pass(main_program, startup_program, pass_name, @@ -4286,6 +4292,14 @@ def __init__(self): self._graph = None def _find_var_class_kwargs(self, new_desc): + # NOTE: not all variables support shape/dtype/lod_level methods. + # For example: RAW, STEP_SCOPES, etc. + def get_var_desc_attr_or_none(var_desc, attr_name, allowed_types): + if var_desc.type() in allowed_types: + return getattr(var_desc, attr_name)() + else: + return None + old_desc = self.desc all_new_vars = [] block_num = new_desc.num_blocks() @@ -4302,9 +4316,21 @@ def _find_var_class_kwargs(self, new_desc): kwargs = { 'type': new_var_desc.type(), 'name': new_var_desc.name(), - 'shape': new_var_desc.shape(), - 'dtype': new_var_desc.dtype(), - 'lod_level': new_var_desc.lod_level(), + 'shape': get_var_desc_attr_or_none(new_var_desc, "shape", [ + core.VarDesc.VarType.LOD_TENSOR, + core.VarDesc.VarType.SELECTED_ROWS, + core.VarDesc.VarType.LOD_TENSOR_ARRAY, + ]), + 'dtype': get_var_desc_attr_or_none(new_var_desc, "dtype", [ + core.VarDesc.VarType.LOD_TENSOR, + core.VarDesc.VarType.SELECTED_ROWS, + core.VarDesc.VarType.LOD_TENSOR_ARRAY, + ]), + 'lod_level': + get_var_desc_attr_or_none(new_var_desc, "lod_level", [ + core.VarDesc.VarType.LOD_TENSOR, + core.VarDesc.VarType.LOD_TENSOR_ARRAY, + ]), 'error_clip': old_var.error_clip if old_var is not None else None, 'stop_gradient': old_var.stop_gradient @@ -4343,14 +4369,20 @@ def _rebuild_from_desc(self, desc): all_new_vars = self._find_var_class_kwargs(desc) block_num = desc.num_blocks() assert block_num == len(all_new_vars) + assert block_num == self.desc.num_blocks() # clear old blocks and desc - self.blocks = [] - self.desc = None + for idx in range(block_num): + block = self.blocks[idx] + block.vars.clear() + block.ops.clear() + + for idx in range(block_num): + block_desc = self.blocks[idx].desc + new_block_desc = desc.block(idx) + block_desc._move_from(new_block_desc) - # create new blocks and set desc - self.desc = desc - self.blocks = [Block(self, idx) for idx in range(block_num)] + del desc # add new vars first for idx in range(block_num): diff --git a/python/paddle/fluid/ir.py b/python/paddle/fluid/ir.py new file mode 100644 index 00000000000000..765272f9dc98e7 --- /dev/null +++ b/python/paddle/fluid/ir.py @@ -0,0 +1,83 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import copy +from .framework import _apply_pass + + +def get_data_vars(program): + data_vars = [] + for var_name, var in program.global_block().vars.items(): + if var.is_data: + data_vars.append(var_name) + return data_vars + + +def apply_build_strategy(main_program, startup_program, build_strategy, + pass_attrs): + def update_attr(attrs, attr_types, name, value, typ=None): + if name not in attrs: + attrs[name] = value + if typ: + attr_types[name] = typ + + def apply_pass(name): + attrs = dict(pass_attrs) + attr_types = {} + update_attr(attrs, attr_types, "nranks", 1, "size_t") + update_attr(attrs, attr_types, "use_cuda", False, "bool") + # TODO(zjl): how to skip fetch variables ? + update_attr(attrs, attr_types, "mem_opt_skip_vars", + get_data_vars(main_program), "list[str]") + _apply_pass(main_program, startup_program, name, attrs, attr_types) + + use_cuda = pass_attrs.get("use_cuda", False) + build_strategy = build_strategy._copy() + if build_strategy.sync_batch_norm: + apply_pass("sync_batch_norm_pass") + build_strategy.sync_batch_norm = False + if build_strategy.fuse_relu_depthwise_conv and use_cuda: + apply_pass("fuse_relu_depthwise_conv_pass") + build_strategy.fuse_relu_depthwise_conv = False + if build_strategy.fuse_bn_act_ops and use_cuda: + apply_pass("fuse_bn_act_pass") + build_strategy.fuse_bn_act_ops = False + if build_strategy.fuse_bn_add_act_ops and use_cuda: + apply_pass("fuse_bn_add_act_pass") + build_strategy.fuse_bn_add_act_ops = False + if build_strategy.enable_auto_fusion and use_cuda: + apply_pass("fusion_group_pass") + build_strategy.enable_auto_fusion = False + if build_strategy.fuse_elewise_add_act_ops: + apply_pass("fuse_elewise_add_act_pass") + build_strategy.fuse_elewise_add_act_ops = False + if build_strategy.fuse_all_optimizer_ops: + apply_pass("fuse_adam_op_pass") + apply_pass("fuse_sgd_op_pass") + apply_pass("fuse_momentum_op_pass") + build_strategy.fuse_all_optimizer_ops = False + # TODO(zjl): support fuse all reduce ops + if build_strategy.cache_runtime_context: + apply_pass("runtime_context_cache_pass") + build_strategy.cache_runtime_context = False + if build_strategy.enable_addto and use_cuda: + # NOTE: how to get fetch vars to skip memory optimization? 
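For illustration only (this sketch is not part of the patch): one way the apply_build_strategy helper being added here can be driven once the remaining branches below are in place. The toy network, the flag choices and the use_cuda value are assumptions; the unit test later in this patch exercises the same call with a resnet50 program.

    import paddle
    from paddle.fluid.ir import apply_build_strategy

    paddle.enable_static()
    main, startup = paddle.static.Program(), paddle.static.Program()
    with paddle.static.program_guard(main, startup):
        x = paddle.static.data(name="x", shape=[None, 8], dtype="float32")
        loss = paddle.mean(paddle.static.nn.fc(x, size=1))
        paddle.optimizer.Adam(learning_rate=1e-3).minimize(loss)

    build_strategy = paddle.static.BuildStrategy()
    build_strategy.fuse_elewise_add_act_ops = True
    build_strategy.enable_inplace = True
    # Programs are rewritten in place; the helper returns a copy of the strategy
    # with the flags it has already handled switched off.
    build_strategy = apply_build_strategy(main, startup, build_strategy,
                                          {"use_cuda": False})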
+ apply_pass("inplace_addto_op_pass") + build_strategy.enable_addto = False + if build_strategy.enable_inplace: + apply_pass("buffer_shared_inplace_pass") + build_strategy.enable_inplace = False + build_strategy._clear_finalized() + return build_strategy diff --git a/python/paddle/fluid/tests/unittests/test_apply_pass_to_program.py b/python/paddle/fluid/tests/unittests/test_apply_pass_to_program.py index b35fc9bae651a8..422cb58ff9ab6b 100644 --- a/python/paddle/fluid/tests/unittests/test_apply_pass_to_program.py +++ b/python/paddle/fluid/tests/unittests/test_apply_pass_to_program.py @@ -16,20 +16,16 @@ from paddle.vision.models import resnet50 from paddle.nn import CrossEntropyLoss from paddle.fluid.framework import _apply_pass +from paddle.fluid.ir import apply_build_strategy +import paddle.fluid as fluid import unittest +import numpy as np -class TestApplyPassToProgram(unittest.TestCase): - def setUp(self): - paddle.enable_static() - - def global_block_contains_op(self, program, op_type): - for op in program.global_block().ops: - if op.type == op_type: - return True - return False - - def test_case(self): +def get_resnet50_model(): + main = paddle.static.Program() + startup = paddle.static.Program() + with paddle.static.program_guard(main, startup): image = paddle.static.data( name="image", shape=[None, 3, 224, 224], dtype="float32") label = paddle.static.data(name="label", shape=[None, 1], dtype="int64") @@ -37,14 +33,27 @@ def test_case(self): loss_fn = CrossEntropyLoss() pred = model(image) loss = loss_fn(pred, label) - optimizer = paddle.optimizer.SGD(learning_rate=1e-3) + optimizer = paddle.optimizer.Adam(learning_rate=1e-3) optimizer.minimize(loss) - startup = paddle.static.default_startup_program() - main = paddle.static.default_main_program() + return main, startup, image, label, loss + + +def global_block_contains_op(program, op_type): + for op in program.global_block().ops: + if op.type == op_type: + return True + return False + +class TestApplyPassToProgram(unittest.TestCase): + def setUp(self): + paddle.enable_static() + + def test_case(self): + main, startup, image, label, loss = get_resnet50_model() fused_op = "fused_elemwise_add_activation" - self.assertFalse(self.global_block_contains_op(main, fused_op)) + self.assertFalse(global_block_contains_op(main, fused_op)) attrs = { "int_attr": -3, "size_t_attr": 10, @@ -59,7 +68,135 @@ def test_case(self): ret_attrs = _apply_pass(main, startup, "fuse_elewise_add_act_pass", attrs, attr_types) self.assertEqual(attrs, ret_attrs) - self.assertTrue(self.global_block_contains_op(main, fused_op)) + self.assertTrue(global_block_contains_op(main, fused_op)) + + +class TestIRPassBase(unittest.TestCase): + def setUp(self): + paddle.enable_static() + if paddle.is_compiled_with_cuda(): + fluid.set_flags({ + 'FLAGS_cudnn_deterministic': 1, + 'FLAGS_max_inplace_grad_add': 6, + }) + self.place = paddle.CUDAPlace(0) + else: + self.place = paddle.CPUPlace() + self.use_cuda = isinstance(self.place, paddle.CUDAPlace) + self.executor = paddle.static.Executor(self.place) + self.num_classes = 1000 + self.seed = 1 + + def get_strategy(self): + return { + 'enable_inplace': True, + 'enable_addto': True, + 'fuse_all_optimizer_ops': True, + 'fuse_elewise_add_act_ops': True, + 'fuse_relu_depthwise_conv': True, + 'fuse_bn_act_ops': True, + } + + def check_before_applied(self, main, startup): + self.assertFalse(global_block_contains_op(main, "share_buffer")) + self.assertFalse(global_block_contains_op(main, "coalesce_tensor")) + self.assertFalse( + 
global_block_contains_op(main, "fused_elemwise_add_activation")) + + adam_cnt = 0 + for op in main.global_block().ops: + if op.type == "adam": + adam_cnt += 1 + self.assertGreater(adam_cnt, 1) + + def check_after_applied(self, main, startup): + self.assertTrue(global_block_contains_op(main, "share_buffer")) + # fused all optimizer pass requires this + if paddle.is_compiled_with_cuda(): + self.assertTrue(global_block_contains_op(main, "coalesce_tensor")) + self.assertTrue( + global_block_contains_op(main, "fused_elemwise_add_activation")) + + share_dims_cnt = 0 + non_share_dims_cnt = 0 + for op in main.global_block().ops: + if op.type != "share_buffer": + continue + + share_dims = op.attr("share_dims") + if share_dims: + for i in range(len(share_dims)): + self.assertEqual(share_dims[0], share_dims[i]) + if share_dims[0] is True: + share_dims_cnt += 1 + else: + non_share_dims_cnt += 1 + else: + non_share_dims_cnt += 1 + if self.use_cuda: + self.assertGreaterEqual(share_dims_cnt, 1) + else: + self.assertEqual(share_dims_cnt, 0) + self.assertGreaterEqual(non_share_dims_cnt, 1) + + if paddle.is_compiled_with_cuda(): + adam_cnt = 0 + for op in main.global_block().ops: + if op.type == "adam": + adam_cnt += 1 + self.assertEqual(adam_cnt, 1) + + def test_main(self): + if self.use_cuda: + batch_num = 20 + batch_size = 4 + else: + batch_num = 3 + batch_size = 2 + + paddle.seed(self.seed) + main1, startup1, image, label, loss1 = get_resnet50_model() + main2, startup2, image, label, loss2 = get_resnet50_model() + + build_strategy = paddle.static.BuildStrategy() + for k, v in self.get_strategy().items(): + setattr(build_strategy, k, v) + self.check_before_applied(main2, startup2) + apply_build_strategy(main2, startup2, build_strategy, + {"use_cuda": self.use_cuda}) + self.check_after_applied(main2, startup2) + + image_shape = [batch_size] + list(image.shape)[1:] + label_shape = [batch_size] + list(label.shape)[1:] + + paddle.seed(self.seed) + scope1 = paddle.static.Scope() + with paddle.static.scope_guard(scope1): + self.executor.run(startup1) + + paddle.seed(self.seed) + scope2 = paddle.static.Scope() + with paddle.static.scope_guard(scope2): + self.executor.run(startup2) + + for idx in range(batch_num): + feed = { + image.name: np.random.rand(*image_shape).astype('float32'), + label.name: np.random.randint( + low=0, + high=self.num_classes, + size=label_shape, + dtype='int64'), + } + with paddle.static.scope_guard(scope1): + loss_value1 = self.executor.run(main1, + feed=feed, + fetch_list=[loss1])[0] + with paddle.static.scope_guard(scope2): + loss_value2 = self.executor.run(main2, + feed=feed, + fetch_list=[loss2])[0] + self.assertEqual(loss_value1, loss_value2, "batch {}".format(idx)) if __name__ == "__main__": From 10f9644cc4cb4eb23807007d678df880db4b0336 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Mon, 16 Aug 2021 22:50:47 -0500 Subject: [PATCH 069/126] Align CTC grad scale same with ESPNet (#34729) * dygraph support more ctc grad scale * scale for 1.x * fix unitest * fix unitest * format code * fix unittest * fix log info * unittest cov * fix format;notest,test=cpu,coverage * skip ctc_loss egs;test=cpu * warpctc grad cov;test=coverage * add dygraph test;test=coverage * format;test=cpu,coverage * format;test=cpu * add api compat;test=cpu * add cpu test * rename * rename * fix * fix test * format * eigen cpu * eigen gpu grad pass * cuda gpu pass * format * fix ci --- paddle/fluid/operators/CMakeLists.txt | 4 +- .../fluid/operators/math/sequence_padding.cc | 31 ++- 
.../fluid/operators/math/sequence_padding.cu | 24 +- .../fluid/operators/math/sequence_padding.h | 4 + .../operators/math/sequence_padding_test.cc | 4 +- .../operators/sequence_ops/sequence_pad_op.h | 4 +- .../sequence_ops/sequence_unpad_op.h | 5 +- paddle/fluid/operators/warpctc_op.cc | 29 +++ paddle/fluid/operators/warpctc_op.cu | 203 +++++++++++++++++ paddle/fluid/operators/warpctc_op.cu.cc | 24 -- paddle/fluid/operators/warpctc_op.h | 110 +++++---- python/paddle/fluid/layers/loss.py | 25 +- .../fluid/tests/unittests/test_warpctc_op.py | 215 ++++++++++++++++++ python/paddle/nn/functional/loss.py | 16 +- python/paddle/nn/layer/loss.py | 8 +- 15 files changed, 602 insertions(+), 104 deletions(-) create mode 100644 paddle/fluid/operators/warpctc_op.cu delete mode 100644 paddle/fluid/operators/warpctc_op.cu.cc diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 0956410041bb23..ff232b7ea59afb 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -81,10 +81,10 @@ op_library(run_program_op SRCS run_program_op.cc run_program_op.cu.cc DEPS execu if (WITH_GPU OR WITH_ROCM) if(WITH_ROCM) - op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale SRCS warpctc_op.cc warpctc_op.cu.cc) + op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale SRCS warpctc_op.cc warpctc_op.cu) # warpctc_op needs cudnn 7 above elseif(${CUDNN_MAJOR_VERSION} VERSION_LESS 7) - op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale SRCS warpctc_op.cc warpctc_op.cu.cc) + op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale SRCS warpctc_op.cc warpctc_op.cu) else() op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale) endif() diff --git a/paddle/fluid/operators/math/sequence_padding.cc b/paddle/fluid/operators/math/sequence_padding.cc index e29313e9f742ca..dca58f796a76f5 100644 --- a/paddle/fluid/operators/math/sequence_padding.cc +++ b/paddle/fluid/operators/math/sequence_padding.cc @@ -33,7 +33,8 @@ void CopyValidData(framework::Tensor* dst_tensor, const framework::Tensor* src_tensor, const framework::Vector& seq_offsets, int pad_seq_len, int step_width, bool norm_by_len, - CopyType type, PadLayout layout) { + bool norm_by_batchsize, bool norm_by_total_logits_len, + int total_logits_len, CopyType type, PadLayout layout) { int seq_num = seq_offsets.size() - 1; const T* src_data = src_tensor->data(); T* dst_data = dst_tensor->data(); @@ -54,7 +55,21 @@ void CopyValidData(framework::Tensor* dst_tensor, int pad_data_offset = layout == kBatchLengthWidth ? 
seq_idx * pad_seq_len * step_width : seq_idx * step_width; - float scale = 1.0f / static_cast(valid_seq_len); + + float scale = 1.0f; + if (norm_by_total_logits_len) { + scale = 1.0f / static_cast(total_logits_len); + VLOG(3) << "[warpctc grad][norm_by_total_logits_len]: scale " << scale + << "total_logits_len " << total_logits_len; + } else if (norm_by_batchsize) { + scale = 1.0f / static_cast(seq_num); + VLOG(3) << "[warpctc grad][norm_by_batchsize]: scale " << scale << "B " + << seq_num; + } else if (norm_by_len) { + scale = 1.0f / static_cast(valid_seq_len); + VLOG(3) << "[warpctc grad][norm_by_len]: scale " << scale << "T " + << valid_seq_len; + } for (int step_idx = 0; step_idx < valid_seq_len; ++step_idx) { const T* src = @@ -97,6 +112,8 @@ class PaddingLoDTensorFunctor { framework::LoDTensor* pad_tensor, const framework::LoDTensor& pad_value, int pad_seq_len = -1, int lod_level = 0, bool norm_by_times = false, + bool norm_by_batchsize = false, + bool norm_by_total_logits_len = false, const PadLayout layout = kBatchLengthWidth) { auto seq_lod = seq_tensor.lod(); const auto seq_offsets = framework::ToAbsOffset(seq_lod)[lod_level]; @@ -131,7 +148,8 @@ class PaddingLoDTensorFunctor { } CopyValidData(pad_tensor, &seq_tensor, seq_offsets, pad_seq_len, - step_width, norm_by_times, kSeqToPad, layout); + step_width, norm_by_times, false, false, 0, kSeqToPad, + layout); } }; @@ -142,6 +160,8 @@ class UnpaddingLoDTensorFunctor { const framework::LoDTensor& pad_tensor, framework::LoDTensor* seq_tensor, int pad_seq_len = -1, int lod_level = 0, bool norm_by_times = false, + bool norm_by_batchsize = false, + bool norm_by_total_logits_len = false, const PadLayout layout = kBatchLengthWidth) { auto seq_offsets = framework::ToAbsOffset(seq_tensor->lod())[lod_level]; const auto& seq_tensor_dims = seq_tensor->dims(); @@ -149,13 +169,16 @@ class UnpaddingLoDTensorFunctor { if (pad_seq_len == -1) { pad_seq_len = MaximumSequenceLength(seq_offsets); } + int total_logits_len = TotalSequenceLength(seq_offsets); int step_width = seq_tensor->numel() / seq_tensor_dims[0]; CheckDims(seq_tensor_dims, pad_tensor_dims, seq_offsets, pad_seq_len, step_width, layout); CopyValidData(seq_tensor, &pad_tensor, seq_offsets, pad_seq_len, - step_width, norm_by_times, kPadToSeq, layout); + step_width, norm_by_times, norm_by_batchsize, + norm_by_total_logits_len, total_logits_len, kPadToSeq, + layout); } }; diff --git a/paddle/fluid/operators/math/sequence_padding.cu b/paddle/fluid/operators/math/sequence_padding.cu index 19c3af03411b8c..3578d7e91fd8c6 100644 --- a/paddle/fluid/operators/math/sequence_padding.cu +++ b/paddle/fluid/operators/math/sequence_padding.cu @@ -23,7 +23,9 @@ template __global__ void SequencePaddingKernel( T* dst, const T* src, const T* pad_value, bool is_constant_pad, const size_t* seq_offsets, const size_t seq_num, const size_t pad_seq_len, - const size_t step_width, bool norm_by_len, const PadLayout layout) { + const size_t step_width, bool norm_by_len, bool norm_by_batchsize, + bool norm_by_total_logits_len, int total_logits_len, + const PadLayout layout) { size_t seq_idx = blockIdx.y; size_t seq_len = seq_offsets[seq_idx + 1] - seq_offsets[seq_idx]; @@ -38,7 +40,15 @@ __global__ void SequencePaddingKernel( src + (Type == kSeqToPad ? seq_data_offset : pad_data_offset); if (step_idx < seq_len) { - float scale = norm_by_len ? 
(1.0f / static_cast(seq_len)) : 1.0f; + float scale = 1.0f; + if (norm_by_total_logits_len) { + scale = 1.0f / static_cast(total_logits_len); + } else if (norm_by_batchsize) { + scale = 1.0f / static_cast(seq_num); + } else if (norm_by_len) { + scale = norm_by_len ? (1.0f / static_cast(seq_len)) : 1.0f; + } + for (size_t i = threadIdx.x; i < step_width; i += blockDim.x) { dst_data[i] = scale * src_data[i]; } @@ -57,6 +67,8 @@ class PaddingLoDTensorFunctor { framework::LoDTensor* pad_tensor, const framework::LoDTensor& pad_value, int pad_seq_len = -1, int lod_level = 0, bool norm_by_times = false, + bool norm_by_batchsize = false, + bool norm_by_total_logits_len = false, const PadLayout layout = kBatchLengthWidth) { auto seq_lod = seq_tensor.lod(); const auto seq_offsets = framework::ToAbsOffset(seq_lod)[lod_level]; @@ -107,7 +119,7 @@ class PaddingLoDTensorFunctor { SequencePaddingKernel<<>>( pad_data, seq_data, pad_value_data, pad_value.numel() == 1, seq_offsets.CUDAData(context.GetPlace()), seq_num, pad_seq_len, - step_width, norm_by_times, layout); + step_width, norm_by_times, false, false, 0, layout); } }; @@ -118,6 +130,8 @@ class UnpaddingLoDTensorFunctor { const framework::LoDTensor& pad_tensor, framework::LoDTensor* seq_tensor, int pad_seq_len = -1, int lod_level = 0, bool norm_by_times = false, + bool norm_by_batchsize = false, + bool norm_by_total_logits_len = false, const PadLayout layout = kBatchLengthWidth) { auto seq_offsets = framework::ToAbsOffset(seq_tensor->lod())[lod_level]; const auto& seq_tensor_dims = seq_tensor->dims(); @@ -126,6 +140,7 @@ class UnpaddingLoDTensorFunctor { if (pad_seq_len == -1) { pad_seq_len = max_seq_len; } + int total_logits_len = TotalSequenceLength(seq_offsets); int step_width = seq_tensor->numel() / seq_tensor_dims[0]; int seq_num = seq_offsets.size() - 1; @@ -159,7 +174,8 @@ class UnpaddingLoDTensorFunctor { SequencePaddingKernel<<>>( seq_data, pad_data, nullptr, false, seq_offsets.CUDAData(context.GetPlace()), seq_num, pad_seq_len, - step_width, norm_by_times, layout); + step_width, norm_by_times, norm_by_batchsize, norm_by_total_logits_len, + total_logits_len, layout); } }; diff --git a/paddle/fluid/operators/math/sequence_padding.h b/paddle/fluid/operators/math/sequence_padding.h index 956a4ff6a2d45c..308e1eedebd37d 100644 --- a/paddle/fluid/operators/math/sequence_padding.h +++ b/paddle/fluid/operators/math/sequence_padding.h @@ -107,6 +107,8 @@ class PaddingLoDTensorFunctor { framework::LoDTensor* pad_tensor, const framework::LoDTensor& pad_value, int pad_seq_len = -1, int lod_level = 0, bool norm_by_times = false, + bool norm_by_batchsize = false, + bool norm_by_total_logits_len = false, const PadLayout layout = kBatchLengthWidth); }; @@ -117,6 +119,8 @@ class UnpaddingLoDTensorFunctor { const framework::LoDTensor& pad_tensor, framework::LoDTensor* seq_tensor, int pad_seq_len = -1, int lod_level = 0, bool norm_by_times = false, + bool norm_by_batchsize = false, + bool norm_by_total_logits_len = false, const PadLayout layout = kBatchLengthWidth); }; diff --git a/paddle/fluid/operators/math/sequence_padding_test.cc b/paddle/fluid/operators/math/sequence_padding_test.cc index ea31b10c5558f6..590d1d6191de43 100644 --- a/paddle/fluid/operators/math/sequence_padding_test.cc +++ b/paddle/fluid/operators/math/sequence_padding_test.cc @@ -66,13 +66,13 @@ void TestSequencePadding(const DeviceContext &context, } paddle::operators::math::PaddingLoDTensorFunctor()( - context, seq, &padding, pad_value, -1, 0, false, + context, seq, &padding, 
pad_value, -1, 0, false, false, false, paddle::operators::math::kLengthBatchWidth); seq_back.set_lod(lod); seq_back.mutable_data(seq_dims, place); paddle::operators::math::UnpaddingLoDTensorFunctor()( - context, padding, &seq_back, -1, 0, false, + context, padding, &seq_back, -1, 0, false, false, false, paddle::operators::math::kLengthBatchWidth); if (paddle::platform::is_cpu_place(place)) { diff --git a/paddle/fluid/operators/sequence_ops/sequence_pad_op.h b/paddle/fluid/operators/sequence_ops/sequence_pad_op.h index a9660f05c3c6b6..d8ae0b200df7d4 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_pad_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_pad_op.h @@ -46,7 +46,7 @@ class SequencePadOpKernel : public framework::OpKernel { math::PaddingLoDTensorFunctor()( ctx.template device_context(), *x, out, *pad_value, - padded_length, 0, false, math::kBatchLengthWidth); + padded_length, 0, false, false, false, math::kBatchLengthWidth); LoDTensor seq_len; seq_len.Resize(len_t->dims()); @@ -72,7 +72,7 @@ class SequencePadGradOpKernel : public framework::OpKernel { math::UnpaddingLoDTensorFunctor()( ctx.template device_context(), *d_out, d_x, - padded_length, 0, false, math::kBatchLengthWidth); + padded_length, 0, false, false, false, math::kBatchLengthWidth); } } }; diff --git a/paddle/fluid/operators/sequence_ops/sequence_unpad_op.h b/paddle/fluid/operators/sequence_ops/sequence_unpad_op.h index 60ba4797db1e2a..398c3bba075693 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_unpad_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_unpad_op.h @@ -69,7 +69,8 @@ class SequenceUnpadOpKernel : public framework::OpKernel { int64_t padded_length = x_t->dims()[1]; math::UnpaddingLoDTensorFunctor()( - dev_ctx, *x_t, out_t, padded_length, 0, false, math::kBatchLengthWidth); + dev_ctx, *x_t, out_t, padded_length, 0, false, false, false, + math::kBatchLengthWidth); } }; @@ -93,7 +94,7 @@ class SequenceUnpadGradOpKernel : public framework::OpKernel { math::PaddingLoDTensorFunctor()( ctx.template device_context(), *d_out, d_x, zero_pads, - padded_length, 0, false, math::kBatchLengthWidth); + padded_length, 0, false, false, false, math::kBatchLengthWidth); } } }; diff --git a/paddle/fluid/operators/warpctc_op.cc b/paddle/fluid/operators/warpctc_op.cc index f38f5d9f723579..92862929159d4b 100644 --- a/paddle/fluid/operators/warpctc_op.cc +++ b/paddle/fluid/operators/warpctc_op.cc @@ -125,6 +125,17 @@ class WarpCTCOpMaker : public framework::OpProtoAndCheckerMaker { "normalize the gradients by the number of time-step, " "which is also the sequence's length.") .SetDefault(false); + AddAttr( + "norm_by_batchsize", + "(bool, default: false), normalize the loss by the batch size." + "If True, supersedes norm_by_times") + .SetDefault(false); + AddAttr( + "norm_by_total_logits_len", + "(bool, default: false), normalize the loss by the total number of " + "frames" + "in the batch. 
If True, supersedes norm_by_batchsize and norm_by_times") + .SetDefault(false); AddComment(R"DOC( An operator integrating the open-source [warp-ctc](https://github.com/baidu-research/warp-ctc) library, which is used in @@ -206,3 +217,21 @@ REGISTER_OP_CPU_KERNEL( warpctc_grad, ops::WarpCTCGradKernel, ops::WarpCTCGradKernel); + +REGISTER_OP_VERSION(warpctc) + .AddCheckpoint( + R"ROC( + Upgrade warpctc add a new attribute [norm_by_batchsize] and [norm_by_total_logits_len])ROC", + paddle::framework::compatible::OpVersionDesc() + .NewAttr( + "norm_by_batchsize", + "(bool, default: false), normalize the loss by the batch size." + "If True, supersedes norm_by_times", + false) + .NewAttr("norm_by_total_logits_len", + "(bool, default: false), normalize the loss by the total " + "number of " + "frames" + "in the batch. If True, supersedes norm_by_batchsize and " + "norm_by_times", + false)); \ No newline at end of file diff --git a/paddle/fluid/operators/warpctc_op.cu b/paddle/fluid/operators/warpctc_op.cu new file mode 100644 index 00000000000000..27c17eb6de8ab4 --- /dev/null +++ b/paddle/fluid/operators/warpctc_op.cu @@ -0,0 +1,203 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/operators/warpctc_op.h" +#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/gpu_info.h" + +namespace paddle { +namespace operators { + +using platform::PADDLE_CUDA_NUM_THREADS; + +template +void PrintTensor(const framework::LoDTensor& src, + const framework::ExecutionContext& ctx) { + std::vector vec(src.numel()); + TensorToVector(src, ctx.device_context(), &vec); + for (int i = 0; i < static_cast(vec.size()); ++i) { + VLOG(3) << "vec[" << i << "] : " << vec[i]; + } +} + +template +__global__ void ReduceSumKernel(const T* d_in, T* d_out) { + // Allocate shared memory + extern __shared__ int partial_sum[]; + + // Calculate thread ID + int tid = blockIdx.x * blockDim.x + threadIdx.x; + + // Load elements into shared memory + partial_sum[threadIdx.x] = d_in[tid]; + __syncthreads(); + + // Start at 1/2 block stride and divide by two each iteration + for (int s = blockDim.x / 2; s > 0; s >>= 1) { + // Each thread does work unless it is further than the stride + if (threadIdx.x < s) { + partial_sum[threadIdx.x] += partial_sum[threadIdx.x + s]; + } + __syncthreads(); + } + + // Let the thread 0 for this block write it's result to main memory + // Result is inexed by this block + if (threadIdx.x == 0) { + d_out[blockIdx.x] = partial_sum[0]; + } +} + +template +__global__ void CTCGradScaleKernel(T* d_out, const T* d_ctc, const T* d_loss, + int scale, int Tmax, int B, int D) { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int n_elems = Tmax * B * D; + int b_idx = (tid / D) % B; + for (; tid < n_elems; tid += gridDim.x * blockDim.x) { + d_out[tid] = d_ctc[tid] * d_loss[b_idx] / static_cast(scale); + } +} + +template +__global__ void 
CTCGradScaleKernel(T* d_out, const T* d_ctc, const T* d_loss, + int64_t* scale, int Tmax, int B, int D) { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int n_elems = Tmax * B * D; + int b_idx = (tid / D) % B; + for (; tid < n_elems; tid += gridDim.x * blockDim.x) { + d_out[tid] = d_ctc[tid] * d_loss[b_idx] / static_cast(scale[0]); + } +} + +template +__global__ void CTCGradBatchScaleKernel(T* d_out, const T* d_ctc, + const T* d_loss, const int64_t* scales, + int Tmax, int B, int D) { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int n_elems = Tmax * B * D; + int b_idx = (tid / D) % B; + // scale is vector, (B) + for (; tid < n_elems; tid += gridDim.x * blockDim.x) { + d_out[tid] = d_ctc[tid] * d_loss[b_idx] / scales[b_idx]; + } +} + +template +class WarpCTCGradCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* warpctc_grad = ctx.Input("WarpCTCGrad"); + auto* logits_grad = ctx.Output(framework::GradVarName("Logits")); + const Tensor* loss_grad = ctx.Input(framework::GradVarName("Loss")); + + logits_grad->mutable_data(ctx.GetPlace()); + bool norm_by_times = ctx.Attr("norm_by_times"); + bool norm_by_batchsize = ctx.Attr("norm_by_batchsize"); + bool norm_by_total_logits_len = ctx.Attr("norm_by_total_logits_len"); + + if ((norm_by_times && norm_by_batchsize) || + (norm_by_times && norm_by_total_logits_len) || + (norm_by_batchsize && norm_by_total_logits_len)) { + PADDLE_THROW(platform::errors::InvalidArgument( + "[warpctc grad] norm_by_times, norm_by_batchsize and " + "norm_by_total_logits_len " + "should one be true.")); + } + + if (ctx.HasInput("LogitsLength")) { + auto& dev_ctx = ctx.template device_context(); + auto stream = dev_ctx.stream(); + int max_seq_length = warpctc_grad->dims()[0]; // Tmax + int num_sequences = warpctc_grad->dims()[1]; // B + int seq_width = warpctc_grad->dims()[2]; // D + + auto* logits_length = ctx.Input("LogitsLength"); + const int64_t* logits_length_ptr = logits_length->data(); + + int n_elems = max_seq_length * num_sequences * seq_width; + int num_blocks = + (n_elems + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS; + int shm_bytes = PADDLE_CUDA_NUM_THREADS * sizeof(T); + + auto logits_grad_ptr = + logits_grad->mutable_data(ctx.GetPlace()); // (Tmax, B, D) + auto warpctc_grad_ptr = warpctc_grad->data(); // (Tmax, B, D) + auto loss_grad_ptr = loss_grad->data(); // (B, 1) + + if (norm_by_total_logits_len) { + VLOG(3) << "norm_by_total_logits_len no impl "; + // total length + Tensor total_length; + int64_t* total_length_ptr = + total_length.mutable_data({1}, ctx.GetPlace()); + int bytes = num_sequences * sizeof(int64_t); + ReduceSumKernel<<<1, num_sequences, bytes, stream>>>( + logits_length_ptr, total_length_ptr); + + CTCGradScaleKernel< + T><<>>( + logits_grad_ptr, warpctc_grad_ptr, loss_grad_ptr, total_length_ptr, + max_seq_length, num_sequences, seq_width); + + } else if (norm_by_batchsize) { + VLOG(3) << "norm_by_batchsize "; + CTCGradScaleKernel< + T><<>>( + logits_grad_ptr, warpctc_grad_ptr, loss_grad_ptr, num_sequences, + max_seq_length, num_sequences, seq_width); + } else if (norm_by_times) { + VLOG(3) << "norm_by_times "; + CTCGradBatchScaleKernel< + T><<>>( + logits_grad_ptr, warpctc_grad_ptr, loss_grad_ptr, logits_length_ptr, + max_seq_length, num_sequences, seq_width); + } else { + VLOG(3) << "default "; + CTCGradScaleKernel< + T><<>>( + logits_grad_ptr, warpctc_grad_ptr, loss_grad_ptr, 1, max_seq_length, + num_sequences, seq_width); + } + } else { 
+ math::UnpaddingLoDTensorFunctor()( + ctx.template device_context(), *warpctc_grad, + logits_grad, -1, 0, norm_by_times, norm_by_batchsize, + norm_by_total_logits_len, math::kLengthBatchWidth); + + const T* loss_grad_data = loss_grad->data(); + math::ScaleLoDTensorFunctor()( + ctx.template device_context(), loss_grad_data, + logits_grad); + } + } +}; + +} // operators +} // paddle + +namespace ops = paddle::operators; + +// register forward and backward of CUDA OP must in same *.cu file. +// Eigen can be used on GPU device, but must be in *.cu file not *.cu.cc file. +// *.cu.cc also using GCC compiler. *.cu using NVCC compiler +REGISTER_OP_CUDA_KERNEL( + warpctc, ops::WarpCTCKernel, + ops::WarpCTCKernel); +REGISTER_OP_CUDA_KERNEL( + warpctc_grad, + ops::WarpCTCGradCUDAKernel, + ops::WarpCTCGradCUDAKernel); diff --git a/paddle/fluid/operators/warpctc_op.cu.cc b/paddle/fluid/operators/warpctc_op.cu.cc deleted file mode 100644 index a42093aaa29e33..00000000000000 --- a/paddle/fluid/operators/warpctc_op.cu.cc +++ /dev/null @@ -1,24 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/warpctc_op.h" - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - warpctc, ops::WarpCTCKernel, - ops::WarpCTCKernel); -REGISTER_OP_CUDA_KERNEL( - warpctc_grad, - ops::WarpCTCGradKernel, - ops::WarpCTCGradKernel); diff --git a/paddle/fluid/operators/warpctc_op.h b/paddle/fluid/operators/warpctc_op.h index f5b51da3d85831..b515adc43fdfe4 100644 --- a/paddle/fluid/operators/warpctc_op.h +++ b/paddle/fluid/operators/warpctc_op.h @@ -15,7 +15,9 @@ limitations under the License. 
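A numpy restatement (not code from the patch) of the flag-dependent gradient scaling that the CUDA kernels above and the Eigen CPU path later in this patch implement; the shapes and names are assumptions used only for illustration.

    import numpy as np

    def scale_ctc_grad(warpctc_grad, loss_grad, logits_len,
                       norm_by_times=False, norm_by_batchsize=False,
                       norm_by_total_logits_len=False):
        # warpctc_grad: (Tmax, B, D) raw gradient returned by warp-ctc
        # loss_grad:    (B,) or (B, 1) upstream gradient of the per-sample loss
        # logits_len:   (B,) valid number of time steps per sample
        Tmax, B, D = warpctc_grad.shape
        g = warpctc_grad * np.reshape(loss_grad, (1, B, 1))
        if norm_by_total_logits_len:   # divide by total frames in the batch
            return g / logits_len.sum()
        if norm_by_batchsize:          # divide by the batch size
            return g / B
        if norm_by_times:              # divide by each sample's own length
            return g / np.reshape(logits_len, (1, B, 1))
        return g                       # no extra normalization

    # example call with arbitrary values
    grad = scale_ctc_grad(np.random.rand(5, 2, 3), np.ones(2),
                          np.array([5, 5]), norm_by_times=True)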
*/ #pragma once #include +#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/sequence_padding.h" #include "paddle/fluid/operators/math/sequence_scale.h" @@ -150,7 +152,7 @@ class WarpCTCFunctor { PADDLE_ENFORCE_EQ( CTC_STATUS_SUCCESS, status, platform::errors::PreconditionNotMet( - "warp-ctc [version %d] Error in get_workspace_size: %s", + "warp-ctc [version %d] Error in ComputeCtcLossFunctor: %s", warpctc_version_, platform::dynload::ctcGetStatusString(status))); } @@ -313,8 +315,8 @@ class WarpCTCKernel : public framework::OpKernel { math::PaddingLoDTensorFunctor()( ctx.template device_context(), *logits, - &warpctc_logits, pad_value, -1, 0, false /* norm_by_times */, - math::kLengthBatchWidth); + &warpctc_logits, pad_value, -1, 0, false /* norm_by_times */, false, + false, math::kLengthBatchWidth); } const T* warpctc_logits_data = warpctc_logits.data(); @@ -349,7 +351,7 @@ class WarpCTCKernel : public framework::OpKernel { math::UnpaddingLoDTensorFunctor()( ctx.template device_context(), *label, &warpctc_label, label->dims()[1] /*pad_seq_len*/, 0 /*lod_level*/, - false /*norm_by_times*/, math::kBatchLengthWidth); + false /*norm_by_times*/, false, false, math::kBatchLengthWidth); } else { LoDTensor gpu_label; gpu_label.mutable_data( @@ -359,7 +361,7 @@ class WarpCTCKernel : public framework::OpKernel { math::UnpaddingLoDTensorFunctor()( ctx.template device_context(), *label, &gpu_label, label->dims()[1] /*pad_seq_len*/, 0 /*lod_level*/, - false /*norm_by_times*/, math::kBatchLengthWidth); + false /*norm_by_times*/, false, false, math::kBatchLengthWidth); TensorCopySync(gpu_label, platform::CPUPlace(), &warpctc_label); } } else { @@ -388,62 +390,74 @@ template class WarpCTCGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { + const Tensor* loss_grad = ctx.Input(framework::GradVarName("Loss")); auto* warpctc_grad = ctx.Input("WarpCTCGrad"); auto* logits_grad = ctx.Output(framework::GradVarName("Logits")); - const Tensor* loss_grad = ctx.Input(framework::GradVarName("Loss")); logits_grad->mutable_data(ctx.GetPlace()); bool norm_by_times = ctx.Attr("norm_by_times"); + bool norm_by_batchsize = ctx.Attr("norm_by_batchsize"); + bool norm_by_total_logits_len = ctx.Attr("norm_by_total_logits_len"); + + if ((norm_by_times && norm_by_batchsize) || + (norm_by_times && norm_by_total_logits_len) || + (norm_by_batchsize && norm_by_total_logits_len)) { + PADDLE_THROW(platform::errors::InvalidArgument( + "[warpctc grad] norm_by_times, norm_by_batchsize and " + "norm_by_total_logits_len " + "should one be true.")); + } if (ctx.HasInput("LogitsLength")) { - size_t max_seq_length = warpctc_grad->dims()[0]; - size_t num_sequences = warpctc_grad->dims()[1]; - size_t seq_width = warpctc_grad->dims()[2]; + int max_seq_length = warpctc_grad->dims()[0]; // Tmax + int num_sequences = warpctc_grad->dims()[1]; // B + int seq_width = warpctc_grad->dims()[2]; // D auto* logits_length = ctx.Input("LogitsLength"); - framework::Tensor logits_length_cpu; - framework::TensorCopy(*logits_length, platform::CPUPlace(), - &logits_length_cpu); - - LoDTensor logits_grad_with_lod; - auto logits_grad_dims = - framework::make_ddim({static_cast(max_seq_length), - static_cast(num_sequences), - static_cast(seq_width)}); - T* logits_grad_cpu_data = logits_grad_with_lod.mutable_data( 
- logits_grad_dims, platform::CPUPlace()); - - TensorCopySync(*warpctc_grad, platform::CPUPlace(), - &logits_grad_with_lod); - - Tensor loss_grad_cpu; - loss_grad_cpu.mutable_data(loss_grad->dims(), platform::CPUPlace()); - TensorCopySync(*loss_grad, platform::CPUPlace(), &loss_grad_cpu); - - LoDTensor scaled_logits; - T* scaled_logits_data = - scaled_logits.mutable_data(logits_grad_dims, platform::CPUPlace()); - - const T* loss_grad_data = loss_grad_cpu.data(); - for (size_t i = 0; i < max_seq_length; ++i) { - for (size_t j = 0; j < num_sequences; ++j) { - T scale = 1.0; - if (norm_by_times) { - scale = 1.0 / static_cast(logits_length_cpu.data()[j]); - } - for (size_t k = 0; k < seq_width; ++k) { - size_t idx = i * (num_sequences * seq_width) + j * seq_width + k; - scaled_logits_data[idx] = - logits_grad_cpu_data[idx] * loss_grad_data[j] * scale; - } - } + // B + auto logits_len_e = + framework::EigenTensor::From(*logits_length); + // (B, 1) + auto loss_grad_e = framework::EigenTensor::From(*loss_grad); + // (T, B, D) + auto warpctc_grad_e = framework::EigenTensor::From(*warpctc_grad); + + auto logits_grad_e = framework::EigenTensor::From(*logits_grad); + + Eigen::DSizes grad_shape(1, num_sequences, 1); + Eigen::DSizes bcast(max_seq_length, 1, seq_width); + auto logits_g = warpctc_grad_e * + loss_grad_e.reshape(grad_shape).broadcast(bcast).eval(); + + auto* place = ctx.template device_context().eigen_device(); + if (norm_by_total_logits_len) { + // Compute the avg. log-probability per batch sample and frame. + // Rank is 0 + auto inv_len = logits_len_e.sum().cast().inverse().eval(); + logits_grad_e.device(*place) = + logits_g * + inv_len.reshape(Eigen::DSizes{1, 1, 1}) + .broadcast(Eigen::DSizes{max_seq_length, num_sequences, + seq_width}); + } else if (norm_by_batchsize) { + // Compute the avg. log-probability per batch sample. + T scale = 1.0 / static_cast(num_sequences); + logits_grad_e.device(*place) = logits_g * scale; + } else if (norm_by_times) { + auto scales = logits_len_e.cast() + .inverse() + .reshape(grad_shape) + .broadcast(bcast) + .eval(); + logits_grad_e.device(*place) = logits_g * scales; + } else { + logits_grad_e.device(*place) = logits_g; } - - TensorCopySync(scaled_logits, ctx.GetPlace(), logits_grad); } else { math::UnpaddingLoDTensorFunctor()( ctx.template device_context(), *warpctc_grad, - logits_grad, -1, 0, norm_by_times, math::kLengthBatchWidth); + logits_grad, -1, 0, norm_by_times, norm_by_batchsize, + norm_by_total_logits_len, math::kLengthBatchWidth); const T* loss_grad_data = loss_grad->data(); math::ScaleLoDTensorFunctor()( diff --git a/python/paddle/fluid/layers/loss.py b/python/paddle/fluid/layers/loss.py index 0e5291c24de56e..0954fe7f548d31 100644 --- a/python/paddle/fluid/layers/loss.py +++ b/python/paddle/fluid/layers/loss.py @@ -478,7 +478,9 @@ def warpctc(input, blank=0, norm_by_times=False, input_length=None, - label_length=None): + label_length=None, + norm_by_batchsize=False, + norm_by_total_logits_len=False): """ An operator integrating the open source Warp-CTC library (https://github.com/baidu-research/warp-ctc) @@ -515,6 +517,12 @@ def warpctc(input, of Tensor type, it should have shape `[batch_size]` and dtype int64. label_length(Variable): The length for each label sequence if it is of Tensor type, it should have shape `[batch_size]` and dtype int64. + norm_by_batchsize (bool): normalize the loss by the batch size. 
+ If `True`, supersedes `norm_by_times` + (default: `False`) + norm_by_total_logits_len (bool): normalize the loss by the total number of frames + in the batch. If `True`, supersedes `norm_by_batchsize` and `norm_by_times` + (default: `False`) Returns: Variable: The Connectionist Temporal Classification (CTC) loss, @@ -602,15 +610,12 @@ def warpctc(input, "input_length and label_length must not be None in dygraph mode!" ) grad, loss_out = _C_ops.warpctc( - input, - label, - input_length, - label_length, - 'blank', - blank, - 'norm_by_times', - norm_by_times, ) + input, label, input_length, label_length, 'blank', blank, + 'norm_by_times', norm_by_times, 'norm_by_batchsize', + norm_by_batchsize, 'norm_by_total_logits_len', + norm_by_total_logits_len) return loss_out + helper = LayerHelper('warpctc', **locals()) check_variable_and_dtype(input, 'input', ['float32', 'float64'], "warpctc") check_variable_and_dtype(label, 'label', ['int32'], "warpctc") @@ -634,6 +639,8 @@ def warpctc(input, attrs={ 'blank': blank, 'norm_by_times': norm_by_times, + 'norm_by_batchsize': norm_by_batchsize, + 'norm_by_total_logits_len': norm_by_total_logits_len, }) return loss_out diff --git a/python/paddle/fluid/tests/unittests/test_warpctc_op.py b/python/paddle/fluid/tests/unittests/test_warpctc_op.py index 53f3b3cf53d765..6358cbcf0bbb22 100644 --- a/python/paddle/fluid/tests/unittests/test_warpctc_op.py +++ b/python/paddle/fluid/tests/unittests/test_warpctc_op.py @@ -18,6 +18,7 @@ import unittest import numpy as np from op_test import OpTest +from op_test import skip_check_grad_ci from test_softmax_op import stable_softmax import paddle.fluid as fluid import paddle.fluid.core as core @@ -456,6 +457,220 @@ def test_check_grad(self): self.check_grad(["Logits"], "Loss") +@skip_check_grad_ci(reason="For warpctc, not check grad.") +class TestWarpCTCOpAttr(OpTest): + def config(self): + self.batch_size = 4 + self.num_classes = 8 + self.logits_lod = [[4, 1, 5, 5]] + self.labels_lod = [[3, 1, 4, 2]] + self.logits_length = np.array([4, 1, 5, 5], dtype=np.int64) + self.labels_length = np.array([3, 1, 4, 2], dtype=np.int64) + self.blank = self.num_classes - 1 + self.norm_by_times = False + self.norm_by_batchsize = False + self.norm_by_total_logits_len = False + + def setUp(self): + self.op_type = "warpctc" + self.config() + + logits = np.random.uniform( + 0.1, 1.0, + [sum(self.logits_length), self.num_classes]).astype("float64") + softmax = np.apply_along_axis(stable_softmax, 1, logits) + # labels should not be blank + labels = np.random.randint( + 0, + self.num_classes - 1, [sum(self.labels_length), 1], + dtype="int32") + + ctc = CTCForward(softmax, self.logits_lod, labels, self.labels_lod, + self.num_classes, self.batch_size, self.blank, + self.norm_by_times) + loss = ctc.forward() + + max_sequence_length = 0 + for i in range(self.batch_size): + max_sequence_length = max(max_sequence_length, + self.logits_length[i]) + # reshape logits to T*N*S + new_logits = np.zeros( + [max_sequence_length, self.batch_size, self.num_classes], + dtype=logits.dtype) + + cur = 0 + for batch_id in range(self.batch_size): + for i in range(self.logits_length[batch_id]): + for j in range(self.num_classes): + new_logits[i, batch_id, j] = logits[cur + i, j] + cur = cur + self.logits_length[batch_id] + + # reshape labels to N*S + max_target_seq_length = 0 + for i in range(self.batch_size): + max_target_seq_length = max(max_target_seq_length, + self.labels_length[i]) + new_labels = np.zeros( + [self.batch_size, max_target_seq_length], dtype="int32") 
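Not part of the patch: a minimal dygraph sketch of the new normalization switches through the CTCLoss wrapper updated later in this commit, mirroring TestWarpCTCOpDygraph below; the logits follow the (T, B, D) layout and all values are arbitrary.

    import numpy as np
    import paddle

    paddle.disable_static()
    log_probs = paddle.to_tensor(
        np.random.rand(5, 2, 3).astype("float32"), stop_gradient=False)
    labels = paddle.to_tensor(np.array([[1, 2, 2], [1, 2, 2]], dtype="int32"))
    input_lengths = paddle.to_tensor(np.array([5, 5], dtype="int64"))
    label_lengths = paddle.to_tensor(np.array([3, 3], dtype="int64"))

    loss_fn = paddle.nn.CTCLoss(blank=0, reduction='sum')
    # norm_by_total_logits_len is one of the switches added by this patch
    loss = loss_fn(log_probs, labels, input_lengths, label_lengths,
                   norm_by_total_logits_len=True)
    loss.backward()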
+ + cur = 0 + for batch_id in range(self.batch_size): + for i in range(self.labels_length[batch_id]): + new_labels[batch_id, i] = labels[cur + i] + cur = cur + self.labels_length[batch_id] + + self.gradient = np.zeros( + [max_sequence_length, self.batch_size, self.num_classes], + dtype=logits.dtype) + + self.inputs = { + "Logits": new_logits, + "Label": new_labels, + "LogitsLength": self.logits_length, + "LabelLength": self.labels_length + } + self.outputs = {"Loss": loss} + self.attrs = { + "blank": self.blank, + "norm_by_times": self.norm_by_times, + "norm_by_batchsize": self.norm_by_batchsize, + "norm_by_total_logits_len": self.norm_by_total_logits_len, + } + + def test_check_output(self): + self.check_output() + + +@skip_check_grad_ci(reason="For warpctc, not check grad.") +class TestWarpCTCOpFp64NormByTimes(TestWarpCTCOpAttr): + def config(self): + self.batch_size = 4 + self.num_classes = 8 + self.logits_lod = [[4, 1, 5, 5]] + self.labels_lod = [[3, 1, 4, 2]] + self.logits_length = np.array([4, 1, 5, 5], dtype=np.int64) + self.labels_length = np.array([3, 1, 4, 2], dtype=np.int64) + self.blank = self.num_classes - 1 + self.norm_by_times = True + self.norm_by_batchsize = False + self.norm_by_total_logits_len = False + + +@skip_check_grad_ci(reason="For warpctc, not check grad.") +class TestWarpCTCOpFp64SizeAverage(TestWarpCTCOpAttr): + def config(self): + self.batch_size = 4 + self.num_classes = 8 + self.logits_lod = [[4, 1, 5, 5]] + self.labels_lod = [[3, 1, 4, 2]] + self.logits_length = np.array([4, 1, 5, 5], dtype=np.int64) + self.labels_length = np.array([3, 1, 4, 2], dtype=np.int64) + self.blank = self.num_classes - 1 + self.norm_by_times = False + self.norm_by_batchsize = True + self.norm_by_total_logits_len = False + + +@skip_check_grad_ci(reason="For warpctc, not check grad.") +class TestWarpCTCOpFp64LengthAverage(TestWarpCTCOpAttr): + def config(self): + self.batch_size = 4 + self.num_classes = 8 + self.logits_lod = [[4, 1, 5, 5]] + self.labels_lod = [[3, 1, 4, 2]] + self.logits_length = np.array([4, 1, 5, 5], dtype=np.int64) + self.labels_length = np.array([3, 1, 4, 2], dtype=np.int64) + self.blank = self.num_classes - 1 + self.norm_by_times = False + self.norm_by_batchsize = False + self.norm_by_total_logits_len = True + + +class TestWarpCTCOpDygraph(unittest.TestCase): + def test_dygraph(self): + places = ['cpu'] + if paddle.is_compiled_with_cuda(): + places += ['gpu:0'] + + for p in places: + paddle.set_device(p) + paddle.disable_static() + paddle.seed(1) + np.random.seed(1) + #(B=2) + log_probs = np.array( + [[[4.17021990e-01, 7.20324516e-01, 1.14374816e-04], + [3.02332580e-01, 1.46755889e-01, 9.23385918e-02]], [ + [1.86260208e-01, 3.45560730e-01, 3.96767467e-01], + [5.38816750e-01, 4.19194520e-01, 6.85219526e-01] + ], [[2.04452246e-01, 8.78117442e-01, 2.73875929e-02], + [6.70467496e-01, 4.17304814e-01, 5.58689833e-01]], + [[1.40386939e-01, 1.98101491e-01, 8.00744593e-01], + [9.68261600e-01, 3.13424170e-01, 6.92322612e-01]], + [[8.76389146e-01, 8.94606650e-01, 8.50442126e-02], + [3.90547849e-02, 1.69830427e-01, + 8.78142476e-01]]]).astype("float32") + labels = np.array([[1, 2, 2], [1, 2, 2]]).astype("int32") + input_lengths = np.array([5, 5]).astype("int64") + label_lengths = np.array([3, 3]).astype("int64") + + log_probs = paddle.to_tensor(log_probs, stop_gradient=False) + labels = paddle.to_tensor(labels) + input_lengths = paddle.to_tensor(input_lengths) + label_lengths = paddle.to_tensor(label_lengths) + + loss = paddle.nn.CTCLoss( + blank=0, 
reduction='sum')(log_probs, + labels, + input_lengths, + label_lengths, + norm_by_times=False, + norm_by_batchsize=False, + norm_by_total_logits_len=False) + self.assertTrue(np.allclose(loss, [6.82563686], atol=1)) + loss.backward() + log_probs.clear_gradient() + + loss = paddle.nn.CTCLoss( + blank=0, reduction='sum')(log_probs, + labels, + input_lengths, + label_lengths, + norm_by_times=True, + norm_by_batchsize=False, + norm_by_total_logits_len=False) + self.assertTrue(np.allclose(loss, [6.82563686], atol=1)) + loss.backward() + log_probs.clear_gradient() + + loss = paddle.nn.CTCLoss( + blank=0, reduction='sum')(log_probs, + labels, + input_lengths, + label_lengths, + norm_by_times=False, + norm_by_batchsize=True, + norm_by_total_logits_len=False) + self.assertTrue(np.allclose(loss, [6.82563686], atol=1)) + loss.backward() + log_probs.clear_gradient() + + loss = paddle.nn.CTCLoss( + blank=0, reduction='sum')(log_probs, + labels, + input_lengths, + label_lengths, + norm_by_times=False, + norm_by_batchsize=False, + norm_by_total_logits_len=True) + self.assertTrue(np.allclose(loss, [6.82563686], atol=1)) + loss.backward() + log_probs.clear_gradient() + + paddle.enable_static() + + class TestWarpCTCOpError(unittest.TestCase): def test_errors(self): with program_guard(Program(), Program()): diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index d7b781c84767f2..a1cd80e42f7bb2 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -1001,7 +1001,9 @@ def ctc_loss(log_probs, label_lengths, blank=0, reduction='mean', - norm_by_times=False): + norm_by_times=False, + norm_by_batchsize=False, + norm_by_total_logits_len=False): """ An operator integrating the open source Warp-CTC library (https://github.com/baidu-research/warp-ctc) @@ -1017,7 +1019,9 @@ def ctc_loss(log_probs, blank (int, optional): The blank label index of Connectionist Temporal Classification (CTC) loss, which is in the half-opened interval [0, num_classes + 1). The data type must be int32. Default is 0. reduction (string, optional): Indicate how to average the loss, the candicates are ``'none'`` | ``'mean'`` | ``'sum'``. If :attr:`reduction` is ``'mean'``, the output loss will be divided by the label_lengths, and then return the mean of quotient; If :attr:`reduction` is ``'sum'``, return the sum of loss; If :attr:`reduction` is ``'none'``, no reduction will be applied. Default is ``'mean'``. norm_by_times (bool, default False) – Whether to normalize the gradients by the number of time-step, which is also the sequence’s length. There is no need to normalize the gradients if reduction mode is 'mean'. - + norm_by_batchsize (bool): normalize the loss by the batch size (default: `False`). If `True`, supersedes `norm_by_times` (default: `False`) + norm_by_total_logits_len (bool): normalize the loss by the total number of frames in the batch. If `True`, supersedes `norm_by_batchsize` and `norm_by_times` (default: `False`) + Returns: Tensor, The Connectionist Temporal Classification (CTC) loss between ``log_probs`` and ``labels``. If attr:`reduction` is ``'none'``, the shape of loss is [batch_size], otherwise, the shape of loss is [1]. Data type is the same as ``log_probs``. @@ -1025,6 +1029,7 @@ def ctc_loss(log_probs, .. 
code-block:: python + # required: skiptest # declarative mode import paddle.nn.functional as F import numpy as np @@ -1081,9 +1086,10 @@ def ctc_loss(log_probs, """ loss_out = fluid.layers.warpctc(log_probs, labels, blank, norm_by_times, - input_lengths, label_lengths) + input_lengths, label_lengths, + norm_by_batchsize, norm_by_total_logits_len) - loss_out = fluid.layers.squeeze(loss_out, [-1]) + loss_out = fluid.layers.squeeze(loss_out, [-1]) # (B) assert reduction in ['mean', 'sum', 'none'] if reduction == 'mean': loss_out = paddle.mean(loss_out / label_lengths) @@ -1536,7 +1542,7 @@ def cross_entropy(input, Indicate how to average the loss by batch_size, the candicates are ``'none'`` | ``'mean'`` | ``'sum'``. If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned; - If :attr:`size_average` is ``'sum'``, the reduced sum loss is returned. + If :attr:`norm_by_batchsize` is ``'sum'``, the reduced sum loss is returned. If :attr:`reduction` is ``'none'``, the unreduced loss is returned. Default is ``'mean'``. diff --git a/python/paddle/nn/layer/loss.py b/python/paddle/nn/layer/loss.py index 3ac0d675fb72c6..781e13867f2432 100644 --- a/python/paddle/nn/layer/loss.py +++ b/python/paddle/nn/layer/loss.py @@ -1119,7 +1119,9 @@ def forward(self, labels, input_lengths, label_lengths, - norm_by_times=False): + norm_by_times=False, + norm_by_batchsize=False, + norm_by_total_logits_len=False): return paddle.nn.functional.ctc_loss( log_probs, labels, @@ -1127,7 +1129,9 @@ def forward(self, label_lengths, self.blank, self.reduction, - norm_by_times=norm_by_times) + norm_by_times=norm_by_times, + norm_by_batchsize=norm_by_batchsize, + norm_by_total_logits_len=norm_by_total_logits_len) class SmoothL1Loss(Layer): From 9d4f00bc5420c85af5c20b977e76c9fb5be72db0 Mon Sep 17 00:00:00 2001 From: Peihan Date: Tue, 17 Aug 2021 13:18:49 +0800 Subject: [PATCH 070/126] add mkl multi-thread test cases in PR-CI-INFERENCE (#34946) * add mkl multi-thread test cases * fix codestyle * fix codestyle & enable ernie mkl test --- paddle/fluid/inference/tests/infer_ut/run.sh | 26 ++++ .../tests/infer_ut/test_det_mv3_db.cc | 46 +++++- .../tests/infer_ut/test_ernie_text_cls.cc | 137 ++++++++++++++++++ .../inference/tests/infer_ut/test_suite.h | 41 +++++- 4 files changed, 243 insertions(+), 7 deletions(-) create mode 100644 paddle/fluid/inference/tests/infer_ut/test_ernie_text_cls.cc diff --git a/paddle/fluid/inference/tests/infer_ut/run.sh b/paddle/fluid/inference/tests/infer_ut/run.sh index 64ada23767f1fa..7d17bb647a1103 100755 --- a/paddle/fluid/inference/tests/infer_ut/run.sh +++ b/paddle/fluid/inference/tests/infer_ut/run.sh @@ -77,6 +77,12 @@ for model_name in $clas_download_list; do download $url_prefix $model_name done +nlp_download_list='ernie_text_cls' +for model_name in $nlp_download_list; do + url_prefix="https://paddle-qa.bj.bcebos.com/inference_model/2.1.1/nlp" + download $url_prefix $model_name +done + # compile and run test cd $current_dir mkdir -p build @@ -144,6 +150,26 @@ if [ $USE_TENSORRT == ON -a $TEST_GPU_CPU == ON ]; then fi fi +# ---------gpu ernie_text_cls on linux--------- +if [ $USE_TENSORRT == ON -a $TEST_GPU_CPU == ON ]; then + cmake .. 
-DPADDLE_LIB=${inference_install_dir} \ + -DWITH_MKL=$TURN_ON_MKL \ + -DDEMO_NAME=test_ernie_text_cls \ + -DWITH_GPU=$TEST_GPU_CPU \ + -DWITH_STATIC_LIB=OFF \ + -DUSE_TENSORRT=$USE_TENSORRT \ + -DTENSORRT_ROOT=$TENSORRT_ROOT_DIR \ + -DWITH_GTEST=ON + make -j$(nproc) + ./test_ernie_text_cls \ + --modeldir=$DATA_DIR/ernie_text_cls/ernie_text_cls \ + --gtest_output=xml:test_ernie_text_cls.xml + if [ $? -ne 0 ]; then + echo "test_ernie_text_cls runs failed" >> ${current_dir}/build/test_summary.txt + EXIT_CODE=1 + fi +fi + if [[ -f ${current_dir}/build/test_summary.txt ]];then echo "=====================test summary======================" cat ${current_dir}/build/test_summary.txt diff --git a/paddle/fluid/inference/tests/infer_ut/test_det_mv3_db.cc b/paddle/fluid/inference/tests/infer_ut/test_det_mv3_db.cc index c5920d3b2d8d55..ce7b8ce4637279 100644 --- a/paddle/fluid/inference/tests/infer_ut/test_det_mv3_db.cc +++ b/paddle/fluid/inference/tests/infer_ut/test_det_mv3_db.cc @@ -123,7 +123,7 @@ TEST(test_det_mv3_db, multi_thread2_trt_fp32_dynamic_shape_bz2) { FLAGS_modeldir + "/inference.pdiparams"); config.EnableUseGpu(100, 0); config.EnableTensorRtEngine( - 1 << 20, 2, 3, paddle_infer::PrecisionType::kFloat32, false, false); + 1 << 20, 2, 3, paddle_infer::PrecisionType::kFloat32, true, false); PrepareDynamicShape(&config, 4); // get groudtruth by disbale ir paddle_infer::services::PredictorPool pred_pool_no_ir(config_no_ir, 1); @@ -149,6 +149,50 @@ TEST(test_det_mv3_db, multi_thread2_trt_fp32_dynamic_shape_bz2) { std::cout << "finish multi-thread test" << std::endl; } +TEST(test_det_mv3_db, multi_thread2_mkl_fp32_bz2) { + int thread_num = 2; // thread > 2 may OOM + // init input data + std::map my_input_data_map; + my_input_data_map["x"] = PrepareInput(2, 640); + // init output data + std::map infer_output_data, + truth_output_data; + // prepare groudtruth config + paddle_infer::Config config, config_no_ir; + config_no_ir.SetModel(FLAGS_modeldir + "/inference.pdmodel", + FLAGS_modeldir + "/inference.pdiparams"); + config_no_ir.SwitchIrOptim(false); + // prepare inference config + config.SetModel(FLAGS_modeldir + "/inference.pdmodel", + FLAGS_modeldir + "/inference.pdiparams"); + config.DisableGpu(); + config.EnableMKLDNN(); + config.SetMkldnnCacheCapacity(10); + config.SetCpuMathLibraryNumThreads(10); + // get groudtruth by disbale ir + paddle_infer::services::PredictorPool pred_pool_no_ir(config_no_ir, 1); + SingleThreadPrediction(pred_pool_no_ir.Retrive(0), &my_input_data_map, + &truth_output_data, 1); + + // get infer results from multi threads + std::vector threads; + services::PredictorPool pred_pool(config, thread_num); + for (int i = 0; i < thread_num; ++i) { + threads.emplace_back(paddle::test::SingleThreadPrediction, + pred_pool.Retrive(i), &my_input_data_map, + &infer_output_data, 2); + } + + // thread join & check outputs + for (int i = 0; i < thread_num; ++i) { + LOG(INFO) << "join tid : " << i; + threads[i].join(); + CompareRecord(&truth_output_data, &infer_output_data, 1e-4); + } + + std::cout << "finish multi-thread test" << std::endl; +} + } // namespace paddle_infer int main(int argc, char** argv) { diff --git a/paddle/fluid/inference/tests/infer_ut/test_ernie_text_cls.cc b/paddle/fluid/inference/tests/infer_ut/test_ernie_text_cls.cc new file mode 100644 index 00000000000000..f73803fe593356 --- /dev/null +++ b/paddle/fluid/inference/tests/infer_ut/test_ernie_text_cls.cc @@ -0,0 +1,137 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
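+// Editorial note (added by the editor, not part of the original file): this new
+// test generates pseudo-random int64 token ids, loads the ernie_text_cls model
+// both from disk and from an in-memory buffer via SetModelBuffer, and compares
+// MKLDNN multi-threaded predictions against a no-IR-optimization baseline.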
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "test_suite.h" // NOLINT + +DEFINE_string(modeldir, "", "Directory of the inference model."); + +namespace paddle_infer { + +template +T cRandom(int min, int max) { + unsigned int seed = 100; + return (min + + static_cast(max * rand_r(&seed) / static_cast(RAND_MAX + 1))); +} + +std::map PrepareInput(int batch_size) { + // init input data + int digit_length = 115; + paddle::test::Record input_ids, segment_ids; + int input_num = batch_size * digit_length; + std::vector input_data(input_num, 1); + std::vector segment_data(input_num, 0); + srand((unsigned)time(NULL)); + for (int x = 0; x < input_data.size(); x++) { + input_data[x] = cRandom(1, 100); + } + input_ids.data = std::vector(input_data.begin(), input_data.end()); + input_ids.shape = std::vector{batch_size, digit_length}; + input_ids.type = paddle::PaddleDType::INT64; + + segment_ids.data = + std::vector(segment_data.begin(), segment_data.end()); + segment_ids.shape = std::vector{batch_size, digit_length}; + segment_ids.type = paddle::PaddleDType::INT64; + + std::map my_input_data_map; + my_input_data_map.insert({"input_ids", input_ids}); + my_input_data_map.insert({"token_type_ids", segment_ids}); + + return my_input_data_map; +} + +TEST(test_ernie_text_cls, analysis_gpu_bz2_buffer) { + // init input data + auto my_input_data_map = PrepareInput(2); + // init output data + std::map infer_output_data, + truth_output_data; + // prepare groudtruth config + paddle_infer::Config config, config_no_ir; + config_no_ir.SetModel(FLAGS_modeldir + "/inference.pdmodel", + FLAGS_modeldir + "/inference.pdiparams"); + config_no_ir.SwitchIrOptim(false); + + // prepare inference config from buffer + std::string prog_file = FLAGS_modeldir + "/inference.pdmodel"; + std::string params_file = FLAGS_modeldir + "/inference.pdiparams"; + std::string prog_str = paddle::test::read_file(prog_file); + std::string params_str = paddle::test::read_file(params_file); + config.SetModelBuffer(prog_str.c_str(), prog_str.size(), params_str.c_str(), + params_str.size()); + // get groudtruth by disbale ir + paddle_infer::services::PredictorPool pred_pool_no_ir(config_no_ir, 1); + SingleThreadPrediction(pred_pool_no_ir.Retrive(0), &my_input_data_map, + &truth_output_data, 1); + // get infer results + paddle_infer::services::PredictorPool pred_pool(config, 1); + SingleThreadPrediction(pred_pool.Retrive(0), &my_input_data_map, + &infer_output_data); + // check outputs + CompareRecord(&truth_output_data, &infer_output_data); + std::cout << "finish test" << std::endl; +} + +TEST(test_ernie_text_cls, multi_thread4_mkl_fp32_bz2) { + int thread_num = 4; + // init input data + auto my_input_data_map = PrepareInput(2); + // init output data + std::map infer_output_data, + truth_output_data; + // prepare groudtruth config + paddle_infer::Config config, config_no_ir; + config_no_ir.SetModel(FLAGS_modeldir + "/inference.pdmodel", + FLAGS_modeldir + "/inference.pdiparams"); + config.DisableGpu(); + 
config_no_ir.SwitchIrOptim(false); + // prepare inference config + config.SetModel(FLAGS_modeldir + "/inference.pdmodel", + FLAGS_modeldir + "/inference.pdiparams"); + config.DisableGpu(); + config.EnableMKLDNN(); + config.SetMkldnnCacheCapacity(10); + config.SetCpuMathLibraryNumThreads(10); + // get groudtruth by disbale ir + paddle_infer::services::PredictorPool pred_pool_no_ir(config_no_ir, 1); + SingleThreadPrediction(pred_pool_no_ir.Retrive(0), &my_input_data_map, + &truth_output_data, 1); + + // get infer results from multi threads + std::vector threads; + services::PredictorPool pred_pool(config, thread_num); + for (int i = 0; i < thread_num; ++i) { + threads.emplace_back(paddle::test::SingleThreadPrediction, + pred_pool.Retrive(i), &my_input_data_map, + &infer_output_data, 2); + } + + // thread join & check outputs + for (int i = 0; i < thread_num; ++i) { + LOG(INFO) << "join tid : " << i; + threads[i].join(); + CompareRecord(&truth_output_data, &infer_output_data); + } + + std::cout << "finish multi-thread test" << std::endl; +} + +} // namespace paddle_infer + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + ::google::ParseCommandLineFlags(&argc, &argv, true); + return RUN_ALL_TESTS(); +} diff --git a/paddle/fluid/inference/tests/infer_ut/test_suite.h b/paddle/fluid/inference/tests/infer_ut/test_suite.h index b0da828998ca24..2f1034d4df9a65 100644 --- a/paddle/fluid/inference/tests/infer_ut/test_suite.h +++ b/paddle/fluid/inference/tests/infer_ut/test_suite.h @@ -37,6 +37,12 @@ class Record { paddle::PaddleDType type; }; +std::string read_file(std::string filename) { + std::ifstream file(filename); + return std::string((std::istreambuf_iterator(file)), + std::istreambuf_iterator()); +} + void SingleThreadPrediction(paddle_infer::Predictor *predictor, std::map *input_data_map, std::map *output_data_map, @@ -44,14 +50,37 @@ void SingleThreadPrediction(paddle_infer::Predictor *predictor, // prepare input tensor auto input_names = predictor->GetInputNames(); for (const auto & [ key, value ] : *input_data_map) { - auto input_tensor = predictor->GetInputHandle(key); - input_tensor->Reshape(value.shape); - input_tensor->CopyFromCpu(value.data.data()); + switch (value.type) { + case paddle::PaddleDType::INT64: { + std::vector input_value = + std::vector(value.data.begin(), value.data.end()); + auto input_tensor = predictor->GetInputHandle(key); + input_tensor->Reshape(value.shape); + input_tensor->CopyFromCpu(input_value.data()); + break; + } + case paddle::PaddleDType::INT32: { + std::vector input_value = + std::vector(value.data.begin(), value.data.end()); + auto input_tensor = predictor->GetInputHandle(key); + input_tensor->Reshape(value.shape); + input_tensor->CopyFromCpu(input_value.data()); + break; + } + case paddle::PaddleDType::FLOAT32: { + std::vector input_value = + std::vector(value.data.begin(), value.data.end()); + auto input_tensor = predictor->GetInputHandle(key); + input_tensor->Reshape(value.shape); + input_tensor->CopyFromCpu(input_value.data()); + break; + } + } } // inference for (size_t i = 0; i < repeat_times; ++i) { - CHECK(predictor->Run()); + ASSERT_TRUE(predictor->Run()); } // get output data to Record @@ -112,8 +141,8 @@ void CompareRecord(std::map *truth_output_data, size_t numel = value.data.size() / sizeof(float); EXPECT_EQ(value.data.size(), truth_record.data.size()); for (size_t i = 0; i < numel; ++i) { - CHECK_LT(fabs(value.data.data()[i] - truth_record.data.data()[i]), - epislon); + ASSERT_LT(fabs(value.data.data()[i] - 
truth_record.data.data()[i]), + epislon); } } } From 8ef1bf870c890afe989130cce976efca11d94a07 Mon Sep 17 00:00:00 2001 From: shangliang Xu Date: Tue, 17 Aug 2021 14:14:00 +0800 Subject: [PATCH 071/126] [bug fix] fix unfold negative_size_param (#34943) * [bug fix] fix unfold negative_size_param --- paddle/fluid/operators/unfold_op.cc | 19 +++++++++++++++ .../fluid/tests/unittests/test_layers.py | 24 +++++++++++++++++++ 2 files changed, 43 insertions(+) diff --git a/paddle/fluid/operators/unfold_op.cc b/paddle/fluid/operators/unfold_op.cc index 5c0eb64993b556..d4155960bebe59 100644 --- a/paddle/fluid/operators/unfold_op.cc +++ b/paddle/fluid/operators/unfold_op.cc @@ -154,6 +154,25 @@ class UnfoldOp : public framework::OperatorWithKernel { paddings[2], strides[0]); int output_width = CalcOutputSize(in_dims[3], kernel_sizes[1], dilations[1], paddings[1], paddings[3], strides[1]); + // check output height and width + PADDLE_ENFORCE_GT( + output_height, 0, + platform::errors::InvalidArgument( + "The sliding blocks calculated from input spatial size (%d, %d), " + "kernel_sizes (%d, %d), strides (%d, %d), dilations (%d, %d), " + "is (%d, %d), which should be a positive integer.", + in_dims[2], in_dims[3], kernel_sizes[0], kernel_sizes[1], + strides[0], strides[1], dilations[0], dilations[1], output_height, + output_width)); + PADDLE_ENFORCE_GT( + output_width, 0, + platform::errors::InvalidArgument( + "The sliding blocks calculated from input spatial size (%d, %d), " + "kernel_sizes (%d, %d), strides (%d, %d), dilations (%d, %d), " + "is (%d, %d), which should be a positive integer.", + in_dims[2], in_dims[3], kernel_sizes[0], kernel_sizes[1], + strides[0], strides[1], dilations[0], dilations[1], output_height, + output_width)); int output_col_length = output_height * output_width; out_dims.push_back(output_col_length); diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index ad53c815cd1c81..1bd5e08e28ef0f 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -3320,6 +3320,30 @@ def test_roi_align(self): dy_res_value = dy_res.numpy() self.assertTrue(np.array_equal(static_res, dy_res_value)) + def test_dice_loss(self): + num_classes = 4 + eps = 1e-6 + input_np = np.random.rand(2, 3, num_classes).astype('float32') + label_np = np.random.randint(0, num_classes, [2, 3, 1], dtype=np.int64) + + with self.static_graph(): + input_ = layers.data( + name="input", shape=[None, 3, num_classes], dtype="float32") + label_ = layers.data( + name="label", shape=[None, 3, 1], dtype="int64") + output = layers.dice_loss(input_, label_, eps) + static_res = self.get_static_graph_result( + feed={'input': input_np, + 'label': label_np}, + fetch_list=[output])[0] + + with self.dynamic_graph(): + input_ = base.to_variable(input_np) + label_ = base.to_variable(label_np) + dy_res = layers.dice_loss(input_, label_, eps) + dy_res_value = dy_res.numpy() + self.assertTrue(np.array_equal(static_res, dy_res_value)) + def test_roi_perspective_transform(self): # TODO(minqiyang): dygraph do not support lod now with self.static_graph(): From 01a3a2e06f87e8de33ec61e4fdfa70720498f282 Mon Sep 17 00:00:00 2001 From: WeiXin Date: Tue, 17 Aug 2021 15:00:06 +0800 Subject: [PATCH 072/126] Modify the name of class in unittest with the same name (#34952) * polish unittest. 
* polish code * polish code --- .../tests/unittests/test_strided_slice_op.py | 114 +++++++++--------- 1 file changed, 54 insertions(+), 60 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_strided_slice_op.py b/python/paddle/fluid/tests/unittests/test_strided_slice_op.py index 3c1a2649a7dfad..9d89c7cbe13971 100644 --- a/python/paddle/fluid/tests/unittests/test_strided_slice_op.py +++ b/python/paddle/fluid/tests/unittests/test_strided_slice_op.py @@ -745,173 +745,167 @@ def forward(self, inps): self.assertFalse(result.place.is_cuda_pinned_place()) def test_strided_slice_tensor_array(self): - class Net(ArrayLayer): + class Net01(ArrayLayer): def array_slice(self, tensors): return tensors[::-1] - self.create_case(Net(array_size=10)) + self.create_case(Net01(array_size=10)) - class Net(ArrayLayer): + class Net02(ArrayLayer): def array_slice(self, tensors): return tensors[::-2] - self.create_case(Net(input_size=112, array_size=11)) + self.create_case(Net02(input_size=112, array_size=11)) - class Net(ArrayLayer): + class Net03(ArrayLayer): def array_slice(self, tensors): return tensors[::-3] - self.create_case(Net(input_size=112, array_size=9)) + self.create_case(Net03(input_size=112, array_size=9)) - class Net(ArrayLayer): + class Net04(ArrayLayer): def array_slice(self, tensors): return tensors[1::-4] - self.create_case(Net(input_size=112, array_size=9)) + self.create_case(Net04(input_size=112, array_size=9)) - class Net(ArrayLayer): + class Net05(ArrayLayer): def array_slice(self, tensors): return tensors[:7:-4] - self.create_case(Net(input_size=112, array_size=9)) + self.create_case(Net05(input_size=112, array_size=9)) - class Net(ArrayLayer): + class Net06(ArrayLayer): def array_slice(self, tensors): return tensors[8:0:-4] - self.create_case(Net(input_size=112, array_size=9)) + self.create_case(Net06(input_size=112, array_size=9)) - class Net(ArrayLayer): + class Net07(ArrayLayer): def array_slice(self, tensors): return tensors[8:1:-4] - self.create_case(Net(input_size=112, array_size=9)) + self.create_case(Net07(input_size=112, array_size=9)) - class Net(ArrayLayer): + class Net08(ArrayLayer): def array_slice(self, tensors): return tensors[::2] - self.create_case(Net(input_size=112, array_size=11)) + self.create_case(Net08(input_size=112, array_size=11)) - class Net(ArrayLayer): + class Net09(ArrayLayer): def array_slice(self, tensors): return tensors[::3] - self.create_case(Net(input_size=112, array_size=9)) + self.create_case(Net09(input_size=112, array_size=9)) - class Net(ArrayLayer): + class Net10(ArrayLayer): def array_slice(self, tensors): return tensors[1::4] - self.create_case(Net(input_size=112, array_size=9)) + self.create_case(Net10(input_size=112, array_size=9)) - class Net(ArrayLayer): + class Net11(ArrayLayer): def array_slice(self, tensors): return tensors[:8:4] - self.create_case(Net(input_size=112, array_size=9)) + self.create_case(Net11(input_size=112, array_size=9)) - class Net(ArrayLayer): + class Net12(ArrayLayer): def array_slice(self, tensors): return tensors[1:8:4] - self.create_case(Net(input_size=112, array_size=9)) + self.create_case(Net12(input_size=112, array_size=9)) - class Net(ArrayLayer): + class Net13(ArrayLayer): def array_slice(self, tensors): return tensors[8:10:4] - self.create_case(Net(input_size=112, array_size=13)) + self.create_case(Net13(input_size=112, array_size=13)) - class Net(ArrayLayer): + class Net14(ArrayLayer): def array_slice(self, tensors): return tensors[3:10:4] - self.create_case(Net(input_size=112, array_size=13)) + 
self.create_case(Net14(input_size=112, array_size=13)) - class Net(ArrayLayer): + class Net15(ArrayLayer): def array_slice(self, tensors): return tensors[2:10:4] - self.create_case(Net(input_size=112, array_size=13)) + self.create_case(Net15(input_size=112, array_size=13)) - class Net(ArrayLayer): + class Net16(ArrayLayer): def array_slice(self, tensors): return tensors[3:10:3] - self.create_case(Net(input_size=112, array_size=13)) + self.create_case(Net16(input_size=112, array_size=13)) - class Net(ArrayLayer): + class Net17(ArrayLayer): def array_slice(self, tensors): return tensors[3:15:3] - self.create_case(Net(input_size=112, array_size=13)) + self.create_case(Net17(input_size=112, array_size=13)) - class Net(ArrayLayer): + class Net18(ArrayLayer): def array_slice(self, tensors): return tensors[0:15:3] - self.create_case(Net(input_size=112, array_size=13)) + self.create_case(Net18(input_size=112, array_size=13)) - class Net(ArrayLayer): + class Net19(ArrayLayer): def array_slice(self, tensors): return tensors[-1:-5:-3] - self.create_case(Net(input_size=112, array_size=13)) + self.create_case(Net19(input_size=112, array_size=13)) - class Net(ArrayLayer): + class Net20(ArrayLayer): def array_slice(self, tensors): return tensors[-1:-6:-3] - self.create_case(Net(input_size=112, array_size=13)) + self.create_case(Net20(input_size=112, array_size=13)) - class Net(ArrayLayer): + class Net21(ArrayLayer): def array_slice(self, tensors): return tensors[-3:-6:-3] - self.create_case(Net(input_size=112, array_size=13)) + self.create_case(Net21(input_size=112, array_size=13)) - self.create_case(Net(input_size=112, array_size=13)) - - class Net(ArrayLayer): + class Net22(ArrayLayer): def array_slice(self, tensors): return tensors[-5:-1:3] - self.create_case(Net(input_size=112, array_size=13)) + self.create_case(Net22(input_size=112, array_size=13)) - class Net(ArrayLayer): + class Net23(ArrayLayer): def array_slice(self, tensors): return tensors[-6:-1:3] - self.create_case(Net(input_size=112, array_size=13)) + self.create_case(Net23(input_size=112, array_size=13)) - class Net(ArrayLayer): + class Net24(ArrayLayer): def array_slice(self, tensors): return tensors[-6:-3:3] - self.create_case(Net(input_size=112, array_size=13)) + self.create_case(Net24(input_size=112, array_size=13)) - class Net(ArrayLayer): + class Net25(ArrayLayer): def array_slice(self, tensors): return tensors[0::3] - self.create_case(Net(input_size=112, array_size=13)) + self.create_case(Net25(input_size=112, array_size=13)) - class Net(ArrayLayer): + class Net26(ArrayLayer): def array_slice(self, tensors): return tensors[-60:20:3] - self.create_case(Net(input_size=112, array_size=13)) + self.create_case(Net26(input_size=112, array_size=13)) - class Net(ArrayLayer): + class Net27(ArrayLayer): def array_slice(self, tensors): return tensors[-3:-60:-3] - self.create_case(Net(input_size=112, array_size=13)) - - class Net(ArrayLayer): - def array_slice(self, tensors): - return tensors[-1:-60:-3] + self.create_case(Net27(input_size=112, array_size=13)) if __name__ == "__main__": From 7b3295a4cc68cd3c14c208d388f384a7af16234f Mon Sep 17 00:00:00 2001 From: chentianyu03 Date: Tue, 17 Aug 2021 15:45:49 +0800 Subject: [PATCH 073/126] add exclude rules of pre-commit for paddle/utils and third_party (#34880) * add exclude rules of pre-commit to paddle/utils and third_party * remove exclude direction distributed/third_party * remove exclude of paddle/utils for format cpplint check --- .pre-commit-config.yaml | 5 ++++- 1 file changed, 4 
insertions(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index fc7e70619e3be1..df2e59b7647bf0 100755 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -49,4 +49,7 @@ repos: entry: python ./tools/codestyle/copyright.hook language: system files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|py|sh)$ - exclude: (?!.*third_party)^.*$ | (?!.*book)^.*$ + exclude: | + (?x)^( + paddle/utils/.* + )$ From f1c1d9e00260c5eb0187702d3e10b60e9d60a942 Mon Sep 17 00:00:00 2001 From: Jacek Czaja Date: Tue, 17 Aug 2021 09:51:48 +0200 Subject: [PATCH 074/126] [oneDNN ] disabling more ops caching (#34830) * - disabled caching of layer norm - fix in compilation - compilation fix - transpose caching disabled - compilation fix - more compilation fixes - sum caching disabled - compilation fix * - LRN with disabled cache * lint fixes --- .../operators/mkldnn/layer_norm_mkldnn_op.cc | 88 +++++------- .../fluid/operators/mkldnn/lrn_mkldnn_op.cc | 136 +++++++++--------- .../fluid/operators/mkldnn/sum_mkldnn_op.cc | 75 ++++------ .../operators/mkldnn/transpose_mkldnn_op.cc | 76 ++++++++-- paddle/fluid/platform/mkldnn_reuse.h | 93 ------------ 5 files changed, 197 insertions(+), 271 deletions(-) diff --git a/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc index cc4bfbae2665fe..e84266caa227c9 100644 --- a/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc @@ -19,45 +19,36 @@ namespace paddle { namespace operators { template -class LayerNormMKLDNNHandler - : public platform::MKLDNNHandlerT { +class LayerNormMKLDNNHandler : public platform::MKLDNNHandlerNoCachingT< + T, dnnl::layer_normalization_forward> { public: LayerNormMKLDNNHandler(const std::vector& dims, const float& epsilon, const dnnl::normalization_flags& flags, const bool& is_test, const MKLDNNMemoryFormat fmt, - const platform::MKLDNNDeviceContext& dev_ctx, - platform::Place cpu_place, - const std::string& uniq_name) - : platform::MKLDNNHandlerT( - dev_ctx, dev_ctx.GetEngine(), cpu_place, - platform::CreateKey(dev_ctx, dims, uniq_name)) { - if (!this->isCached()) { - auto md = dnnl::memory::desc(dims, platform::MKLDNNGetDataType(), fmt); - if (!is_test) { - // TODO(grygielski) Delete forcing stats_md after DNNL 1.2 is introduced - auto stats_md = dnnl::memory::desc( - {begin(dims), end(dims) - 1}, platform::MKLDNNGetDataType(), - platform::MKLDNNFormatForSize(dims.size() - 1, - MKLDNNMemoryFormat::nchw)); - this->AcquireForwardPrimitiveDescriptor( - dnnl::prop_kind::forward_training, md, stats_md, epsilon, flags); - } else { - this->AcquireForwardPrimitiveDescriptor( - dnnl::prop_kind::forward_inference, md, epsilon, flags); - } + const mkldnn::engine engine, platform::Place cpu_place) + : platform::MKLDNNHandlerNoCachingT( + engine, cpu_place) { + auto md = dnnl::memory::desc(dims, platform::MKLDNNGetDataType(), fmt); + if (!is_test) { + // TODO(grygielski) Delete forcing stats_md after DNNL 1.2 is introduced + auto stats_md = dnnl::memory::desc( + {begin(dims), end(dims) - 1}, platform::MKLDNNGetDataType(), + platform::MKLDNNFormatForSize(dims.size() - 1, + MKLDNNMemoryFormat::nchw)); + this->AcquireForwardPrimitiveDescriptor(dnnl::prop_kind::forward_training, + md, stats_md, epsilon, flags); + } else { + this->AcquireForwardPrimitiveDescriptor( + dnnl::prop_kind::forward_inference, md, epsilon, flags); } } - std::shared_ptr AcquireScaleShiftMemory() { - return 
this->AcquireMemoryFromPrimitive("@scaleshift_mem_p"); - } - std::shared_ptr AcquireScaleShiftMemory( std::vector& scaleshift_data) { // scaleshift_data comes from temporary buffer so we need to copy it into // created memory primitivie - auto scaleshift_mem = this->AcquireMemoryFromPrimitive( - this->fwd_pd_->weights_desc(), "@scaleshift_mem_p"); + auto scaleshift_mem = + this->AcquireMemoryFromPrimitive(this->fwd_pd_->weights_desc()); auto data_ptr = scaleshift_mem->get_data_handle(); std::size_t num_bytes = scaleshift_data.size() * sizeof(float); std::memcpy(data_ptr, scaleshift_data.data(), num_bytes); @@ -68,7 +59,7 @@ class LayerNormMKLDNNHandler T* mean_data = mean->mutable_data(this->place_, this->fwd_pd_->mean_desc().get_size()); return this->AcquireMemoryFromPrimitive(this->fwd_pd_->mean_desc(), - mean_data, "@mean_mem_p"); + mean_data); } std::shared_ptr AcquireVarianceMemory( @@ -76,7 +67,7 @@ class LayerNormMKLDNNHandler T* variance_data = variance->mutable_data( this->place_, this->fwd_pd_->variance_desc().get_size()); return this->AcquireMemoryFromPrimitive(this->fwd_pd_->variance_desc(), - variance_data, "@variance_mem_p"); + variance_data); } }; @@ -95,6 +86,7 @@ class LayerNormMKLDNNOpKernel : public paddle::framework::OpKernel { auto& dev_ctx = ctx.template device_context(); + const auto& mkldnn_engine = dev_ctx.GetEngine(); auto src_tz = paddle::framework::vectorize(x->dims()); PADDLE_ENFORCE_EQ(begin_norm_axis, (src_tz.size() - 1), @@ -112,8 +104,8 @@ class LayerNormMKLDNNOpKernel : public paddle::framework::OpKernel { } LayerNormMKLDNNHandler handler(src_tz, epsilon, flags, is_test, - x->format(), dev_ctx, ctx.GetPlace(), - ctx.OutputName("Y")); + x->format(), mkldnn_engine, + ctx.GetPlace()); auto src_memory = handler.AcquireSrcMemory(x); auto dst_memory = handler.AcquireDstMemory(y); @@ -139,24 +131,22 @@ class LayerNormMKLDNNOpKernel : public paddle::framework::OpKernel { args.insert({DNNL_ARG_VARIANCE, *variance_memory}); } - auto scaleshift_memory = handler.AcquireScaleShiftMemory(); + std::shared_ptr scaleshift_memory; if (with_scaleshift) { - if (scaleshift_memory == nullptr || !is_test) { - auto scale_tz = paddle::framework::vectorize(scale->dims()); - const unsigned int C = scale_tz[0]; - - // MKLDNN requires a single piece of memory for scale and shift/bias - // data - std::vector scaleshift_data; - scaleshift_data.reserve(2 * C); - scaleshift_data.insert(scaleshift_data.begin(), scale->data(), - scale->data() + C); - - scaleshift_data.insert(scaleshift_data.end(), bias->data(), - bias->data() + C); - - scaleshift_memory = handler.AcquireScaleShiftMemory(scaleshift_data); - } + auto scale_tz = paddle::framework::vectorize(scale->dims()); + const unsigned int C = scale_tz[0]; + + // MKLDNN requires a single piece of memory for scale and shift/bias + // data + std::vector scaleshift_data; + scaleshift_data.reserve(2 * C); + scaleshift_data.insert(scaleshift_data.begin(), scale->data(), + scale->data() + C); + + scaleshift_data.insert(scaleshift_data.end(), bias->data(), + bias->data() + C); + + scaleshift_memory = handler.AcquireScaleShiftMemory(scaleshift_data); args.insert({DNNL_ARG_SCALE_SHIFT, *scaleshift_memory}); } diff --git a/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc index 5b563e666af0aa..8a89499e4b5744 100644 --- a/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc @@ -21,86 +21,78 @@ using paddle::framework::Tensor; using 
paddle::platform::MKLDNNDeviceContext; template -class LRNMKLDNNHandler : public platform::MKLDNNHandlerT { +class LRNMKLDNNHandler + : public platform::MKLDNNHandlerNoCachingT { public: LRNMKLDNNHandler(const framework::ExecutionContext& ctx, - const MKLDNNDeviceContext& dev_ctx, const mkldnn::engine mkldnn_engine, - platform::Place cpu_place, const Tensor* input, - const std::string& unique_name) - - : platform::MKLDNNHandlerT( - dev_ctx, mkldnn_engine, cpu_place, - platform::CreateKey(dev_ctx, framework::vectorize(input->dims()), - unique_name)) { - if (!this->isCached()) { - const int n = ctx.Attr("n"); - // MKL-DNN implements LRN in a caffe way: - // http://caffe.berkeleyvision.org/tutorial/layers/lrn.html - // Where sum of squares is divided by size of normalization window - // this is not the case for PaddlePaddle LRN. - // Hence we need to compensate for this diffrence by - // multipliing alpha by size of window(n) - const float alpha = ctx.Attr("alpha") * static_cast(n); - const float beta = ctx.Attr("beta"); - const float k = ctx.Attr("k"); - bool is_test = ctx.Attr("is_test"); - - auto dims = framework::vectorize(input->dims()); - - auto src_md = mkldnn::memory::desc(dims, platform::MKLDNNGetDataType(), - input->format()); - - this->AcquireForwardPrimitiveDescriptor( - is_test ? mkldnn::prop_kind::forward_inference - : mkldnn::prop_kind::forward_training, - mkldnn::algorithm::lrn_across_channels, src_md, n, alpha, beta, k); - } + platform::Place cpu_place, const Tensor* input) + + : platform::MKLDNNHandlerNoCachingT(mkldnn_engine, + cpu_place) { + const int n = ctx.Attr("n"); + // MKL-DNN implements LRN in a caffe way: + // http://caffe.berkeleyvision.org/tutorial/layers/lrn.html + // Where sum of squares is divided by size of normalization window + // this is not the case for PaddlePaddle LRN. + // Hence we need to compensate for this diffrence by + // multipliing alpha by size of window(n) + const float alpha = ctx.Attr("alpha") * static_cast(n); + const float beta = ctx.Attr("beta"); + const float k = ctx.Attr("k"); + bool is_test = ctx.Attr("is_test"); + + auto dims = framework::vectorize(input->dims()); + + auto src_md = mkldnn::memory::desc(dims, platform::MKLDNNGetDataType(), + input->format()); + + this->AcquireForwardPrimitiveDescriptor( + is_test ? 
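+        // Editorial note (added by the editor, not part of the original
+        // patch): the handler now derives from MKLDNNHandlerNoCachingT, so
+        // primitives are rebuilt on every call instead of being looked up by
+        // key in the device context. Note also the alpha compensation above:
+        // with n = 5 and a Paddle-side alpha of 1e-4, oneDNN receives 5e-4.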
mkldnn::prop_kind::forward_inference + : mkldnn::prop_kind::forward_training, + mkldnn::algorithm::lrn_across_channels, src_md, n, alpha, beta, k); } LRNMKLDNNHandler(const framework::ExecutionContext& ctx, - const MKLDNNDeviceContext& dev_ctx, + const mkldnn::engine mkldnn_engine, platform::Place cpu_place, const Tensor* in_x, - const Tensor* out_grad, Tensor* in_x_grad, - const std::string& unique_name) - : platform::MKLDNNHandlerT( - dev_ctx, dev_ctx.GetEngine(), cpu_place, - platform::CreateKey(dev_ctx, framework::vectorize(in_x->dims()), - unique_name)) { - if (!this->isBwdCached()) { - PADDLE_ENFORCE_EQ( - ctx.Attr("is_test"), false, - platform::errors::PreconditionNotMet( - "is_test attribute should be set to False in training phase.")); - - const int n = ctx.Attr("n"); - const float alpha = ctx.Attr("alpha") * static_cast(n); - const float beta = ctx.Attr("beta"); - const float k = ctx.Attr("k"); - - auto dims = framework::vectorize(in_x->dims()); - - auto src_md = mkldnn::memory::desc(dims, platform::MKLDNNGetDataType(), - in_x->format()); - auto diff_md = mkldnn::memory::desc( - dims, platform::MKLDNNGetDataType(), out_grad->format()); - - this->AcquireForwardPrimitiveDescriptor( - mkldnn::prop_kind::forward_training, - mkldnn::algorithm::lrn_across_channels, src_md, n, alpha, beta, k); - - this->AcquireBackwardPrimitiveDescriptor( - mkldnn::algorithm::lrn_across_channels, src_md, diff_md, n, alpha, - beta, k); - } + const Tensor* out_grad, Tensor* in_x_grad) + : platform::MKLDNNHandlerNoCachingT(mkldnn_engine, + cpu_place) { + PADDLE_ENFORCE_EQ( + ctx.Attr("is_test"), false, + platform::errors::PreconditionNotMet( + "is_test attribute should be set to False in training phase.")); + + const int n = ctx.Attr("n"); + const float alpha = ctx.Attr("alpha") * static_cast(n); + const float beta = ctx.Attr("beta"); + const float k = ctx.Attr("k"); + + auto dims = framework::vectorize(in_x->dims()); + + auto src_md = mkldnn::memory::desc(dims, platform::MKLDNNGetDataType(), + in_x->format()); + auto diff_md = mkldnn::memory::desc(dims, platform::MKLDNNGetDataType(), + out_grad->format()); + + this->AcquireForwardPrimitiveDescriptor( + mkldnn::prop_kind::forward_training, + mkldnn::algorithm::lrn_across_channels, src_md, n, alpha, beta, k); + + this->AcquireBackwardPrimitiveDescriptor( + mkldnn::algorithm::lrn_across_channels, src_md, diff_md, n, alpha, beta, + k); } std::shared_ptr AcquireWorkspaceMemory(Tensor* workspace) { T* ptr = workspace->mutable_data( this->place_, this->fwd_pd_->workspace_desc().get_size()); return this->AcquireMemoryFromPrimitive(this->fwd_pd_->workspace_desc(), - ptr, "@wrk_mem_p"); + ptr); } std::shared_ptr AcquireBackwardWorkspaceMemory( @@ -108,7 +100,7 @@ class LRNMKLDNNHandler : public platform::MKLDNNHandlerTdata(); return this->AcquireMemoryFromPrimitive( this->fwd_pd_->workspace_desc(), - platform::to_void_cast(workspace_data), "@bwd-wrk_mem_p"); + platform::to_void_cast(workspace_data)); } }; @@ -131,8 +123,7 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { auto out = ctx.Output("Out"); auto mid = ctx.Output("MidOut"); - LRNMKLDNNHandler handler(ctx, dev_ctx, mkldnn_engine, ctx.GetPlace(), x, - ctx.OutputName("Out")); + LRNMKLDNNHandler handler(ctx, mkldnn_engine, ctx.GetPlace(), x); auto src_memory = handler.AcquireSrcMemory(x); auto dst_memory = handler.AcquireDstMemory(out); @@ -178,9 +169,10 @@ class LRNMKLDNNGradOpKernel : public paddle::framework::OpKernel { auto in_x_grad = ctx.Output(framework::GradVarName("X")); auto& dev_ctx 
= ctx.template device_context(); + const auto& mkldnn_engine = dev_ctx.GetEngine(); - LRNMKLDNNHandler handler(ctx, dev_ctx, ctx.GetPlace(), in_x, out_grad, - in_x_grad, ctx.InputName("Out")); + LRNMKLDNNHandler handler(ctx, mkldnn_engine, ctx.GetPlace(), in_x, + out_grad, in_x_grad); auto src_memory = handler.AcquireSrcMemory(in_x); auto workspace = handler.AcquireBackwardWorkspaceMemory(mid); diff --git a/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc index 1813aabf1d8548..4cc9f53b9b6b22 100644 --- a/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc @@ -45,44 +45,35 @@ using paddle::platform::MKLDNNDeviceContext; using platform::to_void_cast; template -class SumMKLDNNHandler : public platform::MKLDNNHandlerT { +class SumMKLDNNHandler + : public platform::MKLDNNHandlerNoCachingT { public: - SumMKLDNNHandler(const MKLDNNDeviceContext& dev_ctx, - platform::Place cpu_place, + SumMKLDNNHandler(mkldnn::engine engine, platform::Place cpu_place, const std::vector& in_vars, - framework::LoDTensor* z, const std::string& uniq_name) + framework::LoDTensor* z) - : platform::MKLDNNHandlerT( - dev_ctx, dev_ctx.GetEngine(), cpu_place, - platform::CreateKey(dev_ctx, framework::vectorize(z->dims()), - uniq_name)), + : platform::MKLDNNHandlerNoCachingT(engine, cpu_place), num_inputs_(0) { - for (size_t i = 0; i < in_vars.size(); i++) { - srcs_suffix_.push_back(std::string("-") + std::to_string(i)); - } + auto dst_tz = framework::vectorize(z->dims()); + auto src_tz = dst_tz; - if (!this->isCached()) { - auto dst_tz = framework::vectorize(z->dims()); - auto src_tz = dst_tz; - - std::vector srcs_md; - for (size_t i = 0; i < in_vars.size(); i++) { - auto& input_it = in_vars[i]->Get(); - if (input_it.numel() == 0) { - continue; - } - MKLDNNMemoryFormat input_format = input_it.format(); - srcs_md.push_back(mkldnn::memory::desc( - src_tz, platform::MKLDNNGetDataType(), input_format)); - ++num_inputs_; + std::vector srcs_md; + for (size_t i = 0; i < in_vars.size(); i++) { + auto& input_it = in_vars[i]->Get(); + if (input_it.numel() == 0) { + continue; } - std::vector scales(num_inputs_, 1.0); + MKLDNNMemoryFormat input_format = input_it.format(); + srcs_md.push_back(mkldnn::memory::desc( + src_tz, platform::MKLDNNGetDataType(), input_format)); + ++num_inputs_; + } + std::vector scales(num_inputs_, 1.0); - auto dst_md = mkldnn::memory::desc( - dst_tz, platform::MKLDNNGetDataType(), MKLDNNMemoryFormat::any); + auto dst_md = mkldnn::memory::desc(dst_tz, platform::MKLDNNGetDataType(), + MKLDNNMemoryFormat::any); - this->AcquireForwardPrimitiveDescriptor(dst_md, scales, srcs_md); - } + this->AcquireForwardPrimitiveDescriptor(dst_md, scales, srcs_md); } // (jczaja) sum oneDNN prim is not having .desc attribute so @@ -90,37 +81,27 @@ class SumMKLDNNHandler : public platform::MKLDNNHandlerT { void AcquireForwardPrimitiveDescriptor( const mkldnn::memory::desc& dst_md, const std::vector& scales, const std::vector& srcs_md) { - // Sum op does not have backward so no passing from FWD to BWD is needed - const std::string key_pd = this->key_ + "@fwd_pd"; - this->fwd_pd_ = std::static_pointer_cast( - this->dev_ctx_.GetBlob(key_pd)); - if (this->fwd_pd_ == nullptr) { - this->fwd_pd_.reset(new dnnl::sum::primitive_desc(dst_md, scales, srcs_md, - this->engine_)); - this->dev_ctx_.SetBlob(key_pd, this->fwd_pd_); - } + this->fwd_pd_.reset( + new dnnl::sum::primitive_desc(dst_md, scales, srcs_md, this->engine_)); } std::shared_ptr 
AcquireSrcMemory( const framework::Tensor& input, int i) { const T* input_data = input.data(); return this->AcquireMemoryFromPrimitive(this->fwd_pd_->src_desc(i), - to_void_cast(input_data), - "@src_mem_p" + srcs_suffix_[i]); + to_void_cast(input_data)); } - using platform::MKLDNNHandlerT::AcquireDstMemory; + using platform::MKLDNNHandlerNoCachingT::AcquireDstMemory; std::shared_ptr AcquireDstMemory(void) { - return this->AcquireMemoryFromPrimitive(this->fwd_pd_->dst_desc(), - "@dst_mem_p"); + return this->AcquireMemoryFromPrimitive(this->fwd_pd_->dst_desc()); } inline int GetNumInputs(void) { return num_inputs_; } private: int num_inputs_; - std::vector srcs_suffix_; }; template @@ -131,6 +112,7 @@ class SumMKLDNNOpKernel : public paddle::framework::OpKernel { paddle::platform::errors::PreconditionNotMet( "Operator DNNL Sum must use CPUPlace")); auto& dev_ctx = ctx.template device_context(); + const auto& mkldnn_engine = dev_ctx.GetEngine(); auto in_vars = ctx.MultiInputVar("X"); PADDLE_ENFORCE_NE(in_vars.empty(), true, platform::errors::InvalidArgument( @@ -140,8 +122,7 @@ class SumMKLDNNOpKernel : public paddle::framework::OpKernel { bool in_place = (input0.numel() > 0) && input0.IsSharedBufferWith(*output); - SumMKLDNNHandler handler(dev_ctx, ctx.GetPlace(), in_vars, output, - ctx.OutputName("Out")); + SumMKLDNNHandler handler(mkldnn_engine, ctx.GetPlace(), in_vars, output); // Create list of SRC MEMs std::vector> srcs_mem; diff --git a/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc index 4c46a92700996a..a92e8e6cb047f9 100644 --- a/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc @@ -24,6 +24,70 @@ namespace operators { using Tensor = framework::Tensor; using framework::DataLayout; +template +class TransposeMKLDNNHandler { + public: + TransposeMKLDNNHandler(std::vector& dims, // NOLINT + std::vector& axis, // NOLINT + mkldnn::engine engine) + : dims_(dims), + axis_(axis), + logical_axis_(dims.size(), 0), + engine_(engine) {} + + std::shared_ptr AcquireSrcMemory( + const MKLDNNMemoryFormat& fmt, void* ptr) { + // Make memory descriptor using input format, unless it + // cannot be trusted (nchw) then make up memory fmt manually + for (size_t i = 0; i < this->logical_axis_.size(); ++i) { + this->logical_axis_[i] = i; + } + + auto src_md = fmt != MKLDNNMemoryFormat::nchw + ? 
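+                      // Editorial note (added by the editor, not part of the
+                      // original patch): Axis2MemoryDesc keeps the original
+                      // dims but lays strides out in the given axis order
+                      // (identity here for the source; AcquireDstMemory uses
+                      // the real permutation). For example, dims {2, 3, 4, 5}
+                      // with axis {0, 2, 3, 1} yields strides {60, 1, 15, 3}.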
platform::MKLDNNMemDesc( + dims_, platform::MKLDNNGetDataType(), fmt) + : Axis2MemoryDesc(dims_, logical_axis_); + return std::make_shared(src_md, engine_, ptr); + } + + std::shared_ptr AcquireDstMemory(framework::Tensor* output, + platform::Place place) { + auto dst_md = Axis2MemoryDesc(dims_, axis_); + auto dst_data = output->mutable_data(place, dst_md.get_size()); + return std::make_shared(dst_md, engine_, dst_data); + } + + std::shared_ptr AcquireTranspose( + std::shared_ptr dst_memory_p, + std::shared_ptr src_memory_p) { + return std::make_shared(*(src_memory_p), *(dst_memory_p)); + } + + protected: + mkldnn::memory::desc Axis2MemoryDesc(std::vector& nchw_tz, // NOLINT + std::vector& axis // NOLINT + ) { + size_t ndims = axis.size(); + + std::vector strides(ndims); + unsigned int total_stride = 1; + for (int i = ndims - 1; i >= 0; --i) { + strides[axis[i]] = total_stride; + total_stride *= nchw_tz[axis[i]]; + } + mkldnn::memory::desc mem_d(nchw_tz, platform::MKLDNNGetDataType(), + strides); + + return mem_d; + } + + private: + std::vector dims_; + std::vector axis_; + std::vector logical_axis_; + mkldnn::engine engine_; +}; + template class TransposeMKLDNNOpKernel : public paddle::framework::OpKernel { public: @@ -48,11 +112,7 @@ class TransposeMKLDNNOpKernel : public paddle::framework::OpKernel { auto nchw_tz = paddle::framework::vectorize(input->dims()); - const std::string key = - platform::CreateKey(dev_ctx, nchw_tz, ctx.OutputName("Out")); - - platform::TransposeMKLDNNHandler handler(nchw_tz, axis, dev_ctx, - mkldnn_engine, key); + TransposeMKLDNNHandler handler(nchw_tz, axis, mkldnn_engine); auto transpose_src_memory_p = handler.AcquireSrcMemory( input->format(), platform::to_void_cast(input_data)); @@ -103,11 +163,7 @@ class TransposeMKLDNNGradOpKernel : public paddle::framework::OpKernel { auto nchw_tz = paddle::framework::vectorize(out_grad->dims()); - const std::string key = platform::CreateKey( - dev_ctx, nchw_tz, ctx.OutputName(framework::GradVarName("X"))); - - platform::TransposeMKLDNNHandler handler(nchw_tz, reversed_axis, dev_ctx, - mkldnn_engine, key); + TransposeMKLDNNHandler handler(nchw_tz, reversed_axis, mkldnn_engine); auto transpose_src_memory_p = handler.AcquireSrcMemory( out_grad->format(), platform::to_void_cast(out_grad_data)); diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index 95b8e0c610b1d4..0b7e96a25477b9 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -1072,99 +1072,6 @@ class ActivationMKLDNNHandler } }; -template -class TransposeMKLDNNHandler : public MKLDNNHandler { - public: - TransposeMKLDNNHandler(std::vector& dims, // NOLINT - std::vector& axis, // NOLINT - const platform::MKLDNNDeviceContext& dev_ctx, - mkldnn::engine engine, const std::string& base_key) - : platform::MKLDNNHandler(dev_ctx, engine, base_key), - dims_(dims), - axis_(axis), - logical_axis_(dims.size(), 0) {} - - std::shared_ptr AcquireSrcMemory( - const MKLDNNMemoryFormat& fmt, void* ptr) { - auto local_key = key_ + "@user_src_mem_p"; - auto mem_p = - std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); - if (mem_p == nullptr) { - // Make memory descriptor using input format, unless it - // cannot be trusted (nchw) then make up memory fmt manually - for (size_t i = 0; i < logical_axis_.size(); ++i) { - logical_axis_[i] = i; - } - - auto src_md = fmt != MKLDNNMemoryFormat::nchw - ? 
platform::MKLDNNMemDesc( - dims_, platform::MKLDNNGetDataType(), fmt) - : Axis2MemoryDesc(dims_, logical_axis_); - mem_p = std::make_shared(src_md, engine_, ptr); - dev_ctx_.SetBlob(local_key, mem_p); - } else { - mem_p->set_data_handle(ptr); - } - return mem_p; - } - - std::shared_ptr AcquireDstMemory(framework::Tensor* output, - platform::Place place) { - auto local_key = key_ + "@user_dst_mem_p"; - auto mem_p = - std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); - if (mem_p == nullptr) { - auto dst_md = Axis2MemoryDesc(dims_, axis_); - - auto dst_data = output->mutable_data(place, dst_md.get_size()); - - mem_p = std::make_shared(dst_md, engine_, dst_data); - dev_ctx_.SetBlob(local_key, mem_p); - } else { - auto dst_data = output->mutable_data(place); - mem_p->set_data_handle(dst_data); - } - return mem_p; - } - - std::shared_ptr AcquireTranspose( - std::shared_ptr dst_memory_p, - std::shared_ptr src_memory_p) { - auto prim_key = key_ + "@transpose_p"; - auto transpose_p = - std::static_pointer_cast(dev_ctx_.GetBlob(prim_key)); - if (transpose_p == nullptr) { - transpose_p = - std::make_shared(*(src_memory_p), *(dst_memory_p)); - dev_ctx_.SetBlob(prim_key, transpose_p); - } - return transpose_p; - } - - protected: - mkldnn::memory::desc Axis2MemoryDesc(std::vector& nchw_tz, // NOLINT - std::vector& axis // NOLINT - ) { - size_t ndims = axis.size(); - - std::vector strides(ndims); - unsigned int total_stride = 1; - for (int i = ndims - 1; i >= 0; --i) { - strides[axis[i]] = total_stride; - total_stride *= nchw_tz[axis[i]]; - } - mkldnn::memory::desc mem_d(nchw_tz, platform::MKLDNNGetDataType(), - strides); - - return mem_d; - } - - private: - std::vector dims_; - std::vector axis_; - std::vector logical_axis_; -}; - class ReorderMKLDNNHandler : public MKLDNNHandler { public: ReorderMKLDNNHandler(std::vector& dims, // NOLINT From 9be41447baaa8c7d398189d8f98a574beec9c750 Mon Sep 17 00:00:00 2001 From: chentianyu03 Date: Tue, 17 Aug 2021 16:22:23 +0800 Subject: [PATCH 075/126] Copy boost optional to Paddle (#34780) * copy boost optional.hpp to paddle * copy boost optional.hpp to paddle * move directions * del fluid/utils * modify .hpp to .h * move directions * modify to paddle::optional * add modification description * format code stype for the files in paddle/utils * format code stype --- .../fluid/framework/details/build_strategy.cc | 6 +- .../fluid/framework/details/build_strategy.h | 8 +- .../conv_elementwise_add_mkldnn_fuse_pass.cc | 4 +- .../conv_elementwise_add_mkldnn_fuse_pass.h | 2 +- paddle/fluid/framework/mixed_vector.h | 12 +- paddle/fluid/framework/op_version_registry.h | 5 +- paddle/fluid/operators/flip_op.cc | 2 +- .../operators/mkldnn/concat_mkldnn_op.cc | 2 +- .../fluid/operators/mkldnn/conv_mkldnn_op.cc | 2 +- paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc | 9 +- .../fluid/operators/mkldnn/mul_mkldnn_op.cc | 8 +- .../sequence_ops/sequence_concat_op.h | 4 +- paddle/fluid/platform/mkldnn_reuse.h | 2 +- paddle/fluid/pybind/pybind.cc | 9 +- paddle/fluid/pybind/reader_py.cc | 6 +- paddle/utils/any.h | 295 +++--- paddle/utils/none.h | 42 + paddle/utils/optional.h | 869 ++++++++++++++++++ 18 files changed, 1075 insertions(+), 212 deletions(-) create mode 100644 paddle/utils/none.h create mode 100644 paddle/utils/optional.h diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index b517c232755398..0d55882953db35 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ 
b/paddle/fluid/framework/details/build_strategy.cc @@ -36,8 +36,8 @@ static inline bool SeqOnlyAllReduceOps(const BuildStrategy &strategy) { !strategy.enable_parallel_graph_; } -static inline void ConvertDefaultValue(boost::optional *default_value) { - if (*default_value == boost::none) { +static inline void ConvertDefaultValue(paddle::optional *default_value) { + if (*default_value == paddle::none) { *default_value = true; } } @@ -247,7 +247,7 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { } } - void AppendPassWithCheck(const boost::optional &append_pass, + void AppendPassWithCheck(const paddle::optional &append_pass, const std::string &pass_name) { AppendPassWithCheck(append_pass == true, pass_name); } diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index 9dcfb0ff32da2f..e1e9db2ece672c 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -112,8 +112,8 @@ struct BuildStrategy { bool enable_auto_fusion_{false}; // Fuse_all_optimizer_ops and fuse_all_reduce_ops require that gradients // should not be sparse types - boost::optional fuse_all_optimizer_ops_{false}; - boost::optional fuse_all_reduce_ops_{boost::none}; + paddle::optional fuse_all_optimizer_ops_{false}; + paddle::optional fuse_all_reduce_ops_{boost::none}; // fuse_relu_depthwise_conv can fuse the `relu -> // depthwise_conv` bool fuse_relu_depthwise_conv_{false}; @@ -121,7 +121,7 @@ struct BuildStrategy { // faster. Because fusing broadcast OP equals delaying the execution of all // broadcast Ops, in this case, all nccl streams are used only for reduce // operations for a period of time. - boost::optional fuse_broadcast_ops_{boost::none}; + paddle::optional fuse_broadcast_ops_{boost::none}; // replace batch_norm with sync_batch_norm. bool sync_batch_norm_{false}; @@ -135,7 +135,7 @@ struct BuildStrategy { // By default, memory_optimize would be opened if gc is disabled, and // be closed if gc is enabled. // Users can forcely enable/disable memory_optimize by setting True/False. - boost::optional memory_optimize_{boost::none}; + paddle::optional memory_optimize_{boost::none}; // Turn on inplace by default. 
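  // Editorial note (added by the editor, not part of the original patch): these
  // flags are paddle::optional<bool> so that "not set by the user"
  // (paddle::none) can be distinguished from an explicit true/false; the
  // ConvertDefaultValue helper in build_strategy.cc turns a still-unset flag
  // into true.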
bool enable_inplace_{true}; diff --git a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc index b07cc58959faa0..8031f56752ac8a 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc @@ -74,11 +74,11 @@ bool IsReachable(ir::Graph* graph, Node* from, Node* to) { } template -boost::optional HasAttribute(const Node& op, const std::string& attr) { +paddle::optional HasAttribute(const Node& op, const std::string& attr) { if (op.Op()->HasAttr(attr)) return BOOST_GET_CONST(T, op.Op()->GetAttr(attr)); else - return boost::none; + return paddle::none; } ResidualConnectionMKLDNNFusePass::ResidualConnectionMKLDNNFusePass() { diff --git a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h index 5b4f941836ce0b..c83335da2f629c 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h @@ -40,7 +40,7 @@ using GraphWithStats = std::pair; void CorrectGraphEdges(Graph* graph, Node* from, Node* to); bool IsReachable(ir::Graph* graph, Node* from, Node* to); -boost::optional HasBias(const Node& op, const std::string& bias_name); +paddle::optional HasBias(const Node& op, const std::string& bias_name); class ResidualConnectionMKLDNNFusePass : public FusePassBase { private: diff --git a/paddle/fluid/framework/mixed_vector.h b/paddle/fluid/framework/mixed_vector.h index 1e9b498bb2bfbf..cf71cdfc6d6510 100644 --- a/paddle/fluid/framework/mixed_vector.h +++ b/paddle/fluid/framework/mixed_vector.h @@ -27,6 +27,8 @@ limitations under the License. */ #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/memory/memcpy.h" +#include "paddle/utils/none.h" +#include "paddle/utils/optional.h" namespace paddle { namespace framework { @@ -195,10 +197,10 @@ class Vector { std::mutex &Mutex() const { return mtx_; } - boost::optional CUDAPlace() const { + paddle::optional CUDAPlace() const { return gpu_ == nullptr - ? boost::none - : boost::optional( + ? paddle::none + : paddle::optional( BOOST_GET_CONST(platform::CUDAPlace, gpu_->place())); } @@ -389,7 +391,7 @@ class Vector { auto &mtx = m_.Data().Mutex(); std::lock_guard guard(mtx); auto cuda_place = m_.Data().CUDAPlace(); - if (cuda_place == boost::none || + if (cuda_place == paddle::none || cuda_place == BOOST_GET(platform::CUDAPlace, place)) { return m_.Data().CUDAData(place); } @@ -405,7 +407,7 @@ class Vector { auto &mtx = m_.Data().Mutex(); std::lock_guard guard(mtx); auto cuda_place = m_.Data().CUDAPlace(); - if (cuda_place == boost::none || + if (cuda_place == paddle::none || cuda_place == BOOST_GET(platform::CUDAPlace, place)) { return m_.MutableData()->CUDAMutableData(place); } diff --git a/paddle/fluid/framework/op_version_registry.h b/paddle/fluid/framework/op_version_registry.h index 5ae8f255d63be5..45ec01868d937a 100644 --- a/paddle/fluid/framework/op_version_registry.h +++ b/paddle/fluid/framework/op_version_registry.h @@ -22,6 +22,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_version_proto.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/utils/none.h" namespace paddle { namespace framework { @@ -42,7 +43,7 @@ using OpAttrVariantT = std::vector, /* AttrType::INTS */ std::vector, /* AttrType::LONGS */ std::vector, /* AttrType::STRINGS */ - boost::none_t /* None */ + paddle::none_t /* None */ >; struct OpUpdateInfo { @@ -51,7 +52,7 @@ struct OpUpdateInfo { struct OpAttrInfo : OpUpdateInfo { OpAttrInfo(const std::string& name, const std::string& remark, - const OpAttrVariantT& default_value = boost::none) + const OpAttrVariantT& default_value = paddle::none) : name_{name}, default_value_{default_value}, remark_{remark} {} const std::string& name() const { return name_; } diff --git a/paddle/fluid/operators/flip_op.cc b/paddle/fluid/operators/flip_op.cc index d7ed5fb767cd9a..d062243acf39a1 100644 --- a/paddle/fluid/operators/flip_op.cc +++ b/paddle/fluid/operators/flip_op.cc @@ -161,5 +161,5 @@ REGISTER_OP_VERSION(flip) R"ROC(Upgrade flip, add new attr [axis] and delete attr [dims].)ROC", paddle::framework::compatible::OpVersionDesc() .NewAttr("axis", "The added attr 'axis' doesn't set default value.", - boost::none) + paddle::none) .DeleteAttr("dims", "The attr 'dims' is deleted.")); diff --git a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc index df4750321e3fce..8901c0afb369ad 100644 --- a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc @@ -137,7 +137,7 @@ class ConcatPrimitiveFactory { private: std::vector srcs_d; std::vector srcs; - boost::optional dst_mem; + paddle::optional dst_mem; }; template diff --git a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc index 0065f3ae394832..b353ce4c322e40 100644 --- a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc @@ -893,7 +893,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { fuse_residual_conn, propagation, output_shift_scale, sum_scale); } else { conv_pd = handler->AcquireConvolutionPrimitiveDescriptor( - src_md, weights_md, boost::none, dst_md, strides, dilations, + src_md, weights_md, paddle::none, dst_md, strides, dilations, paddings, mkldnn_engine, fuse_activation, fuse_alpha, fuse_beta, fuse_residual_conn, propagation, output_shift_scale, sum_scale); } diff --git a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc index d7e5d9b9e021f1..08fdd3b74c71bb 100644 --- a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc @@ -89,7 +89,8 @@ class FCPrimitiveFactory { // descriptor has been divided into separate cases, based on the number // of input dimensions. 
size_t input_dim_num = input->dims().size(); - boost::optional fc_prim_desc; + paddle::optional + fc_prim_desc; memory::desc usr_weights_desc = {}; switch (input_dim_num) { case 2: @@ -545,11 +546,11 @@ class FCPrimitiveFactory { private: const mkldnn::engine& engine_; - boost::optional input_; - boost::optional output_; + paddle::optional input_; + paddle::optional output_; std::shared_ptr bias_; std::shared_ptr weights_; - boost::optional fc_; + paddle::optional fc_; }; // Attempt to fetch cached primitive factory based on provided parameters diff --git a/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc index b3d970c7f0513f..422944107fb280 100644 --- a/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc @@ -290,10 +290,10 @@ class MulPrimitiveFactory { } const mkldnn::engine &engine_; - boost::optional x_input_; - boost::optional y_input_; - boost::optional output_; - boost::optional mul_; + paddle::optional x_input_; + paddle::optional y_input_; + paddle::optional output_; + paddle::optional mul_; static constexpr bool is_int8_ = std::is_same::value || std::is_same::value; }; diff --git a/paddle/fluid/operators/sequence_ops/sequence_concat_op.h b/paddle/fluid/operators/sequence_ops/sequence_concat_op.h index 339db996fc9b66..1b8525febe2d49 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_concat_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_concat_op.h @@ -123,7 +123,7 @@ class SeqConcatGradKernel : public framework::OpKernel { } std::vector sliced_x; - std::vector> sliced_dx; + std::vector> sliced_dx; for (size_t i = 1; i < xs[0]->lod()[0].size(); ++i) { for (size_t j = 0; j < xs.size(); ++j) { @@ -145,7 +145,7 @@ class SeqConcatGradKernel : public framework::OpKernel { if (dx) { sliced_dx.emplace_back(dx->Slice(prev_lod, next_lod)); } else { - sliced_dx.emplace_back(boost::none); + sliced_dx.emplace_back(paddle::none); } } } diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index 0b7e96a25477b9..c27bc6c6e55c01 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -1426,7 +1426,7 @@ class ConvMKLDNNTemplateHandler : public MKLDNNHandler { std::shared_ptr AcquireConvolutionPrimitiveDescriptor( const mkldnn::memory::desc& src, const mkldnn::memory::desc& weights, - boost::optional bias, + paddle::optional bias, const mkldnn::memory::desc& dst, const std::vector& strides, const std::vector& dilations, const std::vector& paddings, const mkldnn::engine& engine, diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 5001cc4a0172fc..0663da88ac75f1 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -73,6 +73,7 @@ limitations under the License. */ #include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/pybind/cuda_streams_py.h" #include "paddle/fluid/pybind/io.h" +#include "paddle/utils/none.h" #ifdef PADDLE_WITH_ASCEND #include "paddle/fluid/pybind/ascend_wrapper_py.h" #endif @@ -2910,7 +2911,7 @@ All parameter, weight, gradient are variables in Paddle. .def_property("fuse_broadcast_ops", [](const BuildStrategy &self) { return self.fuse_broadcast_ops_ == true || - self.fuse_broadcast_ops_ == boost::none; + self.fuse_broadcast_ops_ == paddle::none; }, [](BuildStrategy &self, bool b) { PADDLE_ENFORCE_NE(self.IsFinalized(), true, @@ -2940,7 +2941,7 @@ All parameter, weight, gradient are variables in Paddle. 
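The pybind.cc hunks just above and below keep BuildStrategy's fusion flags three-state: unset, explicitly on, or explicitly off. A small sketch of that pattern, assuming the members are declared as paddle::optional<bool> (the local names here are illustrative):

    paddle::optional<bool> fuse_flag;                 // paddle::none means the user never set it
    bool effective = (fuse_flag == true) ||           // explicitly enabled, or
                     (fuse_flag == paddle::none);     // left to the framework's default-on behaviour
    fuse_flag = false;                                // an explicit user choice disables the fusion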
.def_property("fuse_all_optimizer_ops", [](const BuildStrategy &self) { return self.fuse_all_optimizer_ops_ == true || - self.fuse_all_optimizer_ops_ == boost::none; + self.fuse_all_optimizer_ops_ == paddle::none; }, [](BuildStrategy &self, bool b) { PADDLE_ENFORCE_NE(self.IsFinalized(), true, @@ -2989,7 +2990,7 @@ All parameter, weight, gradient are variables in Paddle. [](BuildStrategy &self, const py::handle &value) { auto *py_obj = value.ptr(); if (py_obj == nullptr || py_obj == Py_None) { - self.memory_optimize_ = boost::none; + self.memory_optimize_ = paddle::none; } else if (PyBool_Check(py_obj)) { self.memory_optimize_ = (py_obj == Py_True); } else { @@ -3046,7 +3047,7 @@ All parameter, weight, gradient are variables in Paddle. "fuse_all_reduce_ops", [](const BuildStrategy &self) { return self.fuse_all_reduce_ops_ == true || - self.fuse_all_reduce_ops_ == boost::none; + self.fuse_all_reduce_ops_ == paddle::none; }, [](BuildStrategy &self, bool b) { self.fuse_all_reduce_ops_ = b; }) .def_property("enable_backward_optimizer_op_deps", diff --git a/paddle/fluid/pybind/reader_py.cc b/paddle/fluid/pybind/reader_py.cc index abe1977eb6978b..9ed1ed30324b28 100644 --- a/paddle/fluid/pybind/reader_py.cc +++ b/paddle/fluid/pybind/reader_py.cc @@ -44,7 +44,7 @@ namespace reader = operators::reader; // Check whether the tensor shape matches the VarDesc shape // Return the different shape if exists -static boost::optional> DiffTensorShapeWithVarDesc( +static paddle::optional> DiffTensorShapeWithVarDesc( const framework::LoDTensor &tensor, const framework::VarDesc &var_desc, size_t num_places) { auto tensor_shape = tensor.dims(); @@ -56,7 +56,7 @@ static boost::optional> DiffTensorShapeWithVarDesc( if (desc_shape.size() != 0) { // Tensor rank = 0 but desc does not match return framework::vectorize(tensor_shape); } else { - return boost::none; + return paddle::none; } } @@ -92,7 +92,7 @@ static boost::optional> DiffTensorShapeWithVarDesc( } } - return boost::none; + return paddle::none; } static const std::shared_ptr &GetQueue( diff --git a/paddle/utils/any.h b/paddle/utils/any.h index ec803647c11f7e..d0e72b70635791 100644 --- a/paddle/utils/any.h +++ b/paddle/utils/any.h @@ -1,8 +1,8 @@ -//This file copy from boost/any.hpp and boost version: 1.41.0 -//Modified the following points: -//1. modify namespace from boost::any to paddle::any -//2. remove the depending boost header files -//3. remove/modify some macro +// This file copy from boost/any.hpp and boost version: 1.41.0 +// Modified the following points: +// 1. modify namespace from boost::any to paddle::any +// 2. remove the depending boost header files +// 3. remove/modify some macro // See http://www.boost.org/libs/any for Documentation. 
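The paddle/utils/any.h file reformatted below is, per its own header comment, a copy of boost::any from boost 1.41 with the namespace changed to paddle, so the usual any/any_cast idiom should carry over unchanged. A short usage sketch under that assumption (the stored string is only an example):

    #include <string>
    #include "paddle/utils/any.h"

    paddle::any box = std::string("conv2d");              // holds any copy-constructible type
    if (box.type() == typeid(std::string)) {
      std::string op = paddle::any_cast<std::string>(box); // throws paddle::bad_any_cast on mismatch
    }
    int* wrong = paddle::any_cast<int>(&box);              // pointer overload returns 0 instead of throwing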
@@ -17,210 +17,157 @@ // where: tested with BCC 5.5, MSVC 6.0, and g++ 2.95 #include -#include #include +#include // See boost/python/type_id.hpp // TODO: add BOOST_TYPEID_COMPARE_BY_NAME to config.hpp -# if (defined(__GNUC__) && __GNUC__ >= 3) \ - || defined(_AIX) \ - || ( defined(__sgi) && defined(__host_mips)) \ - || (defined(__hpux) && defined(__HP_aCC)) \ - || (defined(linux) && defined(__INTEL_COMPILER) && defined(__ICC)) -# define BOOST_AUX_ANY_TYPE_ID_NAME +#if (defined(__GNUC__) && __GNUC__ >= 3) || defined(_AIX) || \ + (defined(__sgi) && defined(__host_mips)) || \ + (defined(__hpux) && defined(__HP_aCC)) || \ + (defined(linux) && defined(__INTEL_COMPILER) && defined(__ICC)) +#define BOOST_AUX_ANY_TYPE_ID_NAME #include -# endif - -namespace paddle -{ - class any - { - public: // structors - - any() - : content(0) - { - } +#endif - template - any(const ValueType & value) - : content(new holder(value)) - { - } +namespace paddle { +class any { + public: // structors + any() : content(0) {} - any(const any & other) - : content(other.content ? other.content->clone() : 0) - { - } + template + any(const ValueType &value) : content(new holder(value)) {} - ~any() - { - delete content; - } + any(const any &other) : content(other.content ? other.content->clone() : 0) {} - public: // modifiers + ~any() { delete content; } - any & swap(any & rhs) - { - std::swap(content, rhs.content); - return *this; - } + public: // modifiers + any &swap(any &rhs) { + std::swap(content, rhs.content); + return *this; + } - template - any & operator=(const ValueType & rhs) - { - any(rhs).swap(*this); - return *this; - } + template + any &operator=(const ValueType &rhs) { + any(rhs).swap(*this); + return *this; + } - any & operator=(any rhs) - { - rhs.swap(*this); - return *this; - } + any &operator=(any rhs) { + rhs.swap(*this); + return *this; + } - public: // queries + public: // queries + bool empty() const { return !content; } - bool empty() const - { - return !content; - } + const std::type_info &type() const { + return content ? content->type() : typeid(void); + } - const std::type_info & type() const - { - return content ? 
content->type() : typeid(void); - } + public: // types (public so any_cast can be non-friend) + class placeholder { + public: // structors + virtual ~placeholder() {} - public: // types (public so any_cast can be non-friend) + public: // queries + virtual const std::type_info &type() const = 0; - class placeholder - { - public: // structors + virtual placeholder *clone() const = 0; + }; - virtual ~placeholder() - { - } + template + class holder : public placeholder { + public: // structors + holder(const ValueType &value) : held(value) {} - public: // queries + public: // queries + virtual const std::type_info &type() const { return typeid(ValueType); } - virtual const std::type_info & type() const = 0; + virtual placeholder *clone() const { return new holder(held); } - virtual placeholder * clone() const = 0; + public: // representation + ValueType held; - }; + private: // intentionally left unimplemented + holder &operator=(const holder &); + }; - template - class holder : public placeholder - { - public: // structors + public: // representation (public so any_cast can be non-friend) + placeholder *content; +}; - holder(const ValueType & value) - : held(value) - { - } +class bad_any_cast : public std::bad_cast { + public: + virtual const char *what() const throw() { + return "paddle::bad_any_cast: " + "failed conversion using paddle::any_cast"; + } +}; - public: // queries +template +ValueType *any_cast(any *operand) { + return operand && +#ifdef BOOST_AUX_ANY_TYPE_ID_NAME + std::strcmp(operand->type().name(), + typeid(ValueType).name()) == 0 +#else + operand->type() == typeid(ValueType) +#endif + ? &static_cast *>(operand->content)->held + : 0; +} - virtual const std::type_info & type() const - { - return typeid(ValueType); - } +template +inline const ValueType *any_cast(const any *operand) { + return any_cast(const_cast(operand)); +} - virtual placeholder * clone() const - { - return new holder(held); - } +template +ValueType any_cast(any &operand) { + typedef typename std::remove_reference::type nonref; - public: // representation + // If 'nonref' is still reference type, it means the user has not + // specialized 'remove_reference'. - ValueType held; + // Please use BOOST_BROKEN_COMPILER_TYPE_TRAITS_SPECIALIZATION macro + // to generate specialization of remove_reference for your class + // See type traits library documentation for details + static_assert(!std::is_reference::value, + "!std::is_reference::value"); - private: // intentionally left unimplemented - holder & operator=(const holder &); - }; + nonref *result = any_cast(&operand); + if (!result) throw bad_any_cast(); + return *result; +} - public: // representation (public so any_cast can be non-friend) +template +inline ValueType any_cast(const any &operand) { + typedef typename std::remove_reference::type nonref; - placeholder * content; + // The comment in the above version of 'any_cast' explains when this + // assert is fired and what to do. + static_assert(!std::is_reference::value, + "!std::is_reference::value"); - }; + return any_cast(const_cast(operand)); +} - class bad_any_cast : public std::bad_cast - { - public: - virtual const char * what() const throw() - { - return "paddle::bad_any_cast: " - "failed conversion using paddle::any_cast"; - } - }; +// Note: The "unsafe" versions of any_cast are not part of the +// public interface and may be removed at any time. 
They are +// required where we know what type is stored in the any and can't +// use typeid() comparison, e.g., when our types may travel across +// different shared libraries. +template +inline ValueType *unsafe_any_cast(any *operand) { + return &static_cast *>(operand->content)->held; +} - template - ValueType * any_cast(any * operand) - { - return operand && -#ifdef BOOST_AUX_ANY_TYPE_ID_NAME - std::strcmp(operand->type().name(), typeid(ValueType).name()) == 0 -#else - operand->type() == typeid(ValueType) -#endif - ? &static_cast *>(operand->content)->held - : 0; - } - - template - inline const ValueType * any_cast(const any * operand) - { - return any_cast(const_cast(operand)); - } - - template - ValueType any_cast(any & operand) - { - typedef typename std::remove_reference::type nonref; - - // If 'nonref' is still reference type, it means the user has not - // specialized 'remove_reference'. - - // Please use BOOST_BROKEN_COMPILER_TYPE_TRAITS_SPECIALIZATION macro - // to generate specialization of remove_reference for your class - // See type traits library documentation for details - static_assert(!std::is_reference::value, "!std::is_reference::value"); - - nonref * result = any_cast(&operand); - if(!result) - throw bad_any_cast(); - return *result; - } - - template - inline ValueType any_cast(const any & operand) - { - typedef typename std::remove_reference::type nonref; - - // The comment in the above version of 'any_cast' explains when this - // assert is fired and what to do. - static_assert(!std::is_reference::value, "!std::is_reference::value"); - - return any_cast(const_cast(operand)); - } - - // Note: The "unsafe" versions of any_cast are not part of the - // public interface and may be removed at any time. They are - // required where we know what type is stored in the any and can't - // use typeid() comparison, e.g., when our types may travel across - // different shared libraries. - template - inline ValueType * unsafe_any_cast(any * operand) - { - return &static_cast *>(operand->content)->held; - } - - template - inline const ValueType * unsafe_any_cast(const any * operand) - { - return unsafe_any_cast(const_cast(operand)); - } +template +inline const ValueType *unsafe_any_cast(const any *operand) { + return unsafe_any_cast(const_cast(operand)); +} } // Copyright Kevlin Henney, 2000, 2001, 2002. All rights reserved. diff --git a/paddle/utils/none.h b/paddle/utils/none.h new file mode 100644 index 00000000000000..20d6f4d2c7ddea --- /dev/null +++ b/paddle/utils/none.h @@ -0,0 +1,42 @@ +// This file copy from boost/none_t.hpp and boost/none.hpp and boost version: +// 1.41.0 +// Modified the following points: +// 1. modify namespace from boost::none to paddle::none +// 2. modify namespace from boost::none_t to paddle::none_t + +// Copyright (C) 2003, Fernando Luis Cacciola Carballal. +// +// Distributed under the Boost Software License, Version 1.0. +// (See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) +// +// See http://www.boost.org/libs/optional for documentation. 
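One detail of the boost 1.41 design that the new paddle/utils/none.h copies is easy to miss: none_t is a pointer-to-data-member type, so the only value anyone can portably name of that type is the null member pointer, which makes paddle::none an unambiguous "empty" tag that ordinary user types cannot convert to by accident. A sketch of how it is meant to be consumed, assuming paddle::optional keeps the matching boost constructors and assignment operators:

    paddle::optional<int> v = paddle::none;   // constructs an uninitialized optional
    v = 3;                                    // now holds a value
    v = paddle::none;                         // assign(none_t) destroys the held value again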
+// +// You are welcome to contact the author at: +// fernando_cacciola@hotmail.com +// +#ifndef PADDLE_NONE_17SEP2003_HPP +#define PADDLE_NONE_17SEP2003_HPP + +namespace paddle { + +namespace detail { +struct none_helper {}; +} + +typedef int detail::none_helper::*none_t; + +} // namespace boost + +// NOTE: Borland users have to include this header outside any precompiled +// headers +// (bcc<=5.64 cannot include instance data in a precompiled header) +// -- * To be verified, now that there's no unnamed namespace + +namespace paddle { + +none_t const none = ((none_t)0); + +} // namespace boost + +#endif diff --git a/paddle/utils/optional.h b/paddle/utils/optional.h new file mode 100644 index 00000000000000..00d8ae28ee836a --- /dev/null +++ b/paddle/utils/optional.h @@ -0,0 +1,869 @@ +// This file copy from boost/optional/optional.hpp and boost version: 1.41.0 +// Modified the following points: +// 1. modify namespace from boost::optional to paddle::optional +// 2. remove the depending boost header files +// 3. remove/modify some macro +// 4. copy some necessary data structures which are the depended by optional +// 5. replace type_with_alignment with std::aligned_storage + +// Copyright (C) 2003, Fernando Luis Cacciola Carballal. +// +// Use, modification, and distribution is subject to the Boost Software +// License, Version 1.0. (See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) +// +// See http://www.boost.org/lib/optional for documentation. +// +// You are welcome to contact the author at: +// fernando_cacciola@hotmail.com +// +#ifndef PADDLE_OPTIONAL_OPTIONAL_FLC_19NOV2002_HPP +#define PADDLE_OPTIONAL_OPTIONAL_FLC_19NOV2002_HPP + +#include +#include +#include +#include + +#include "none.h" + +// Daniel Wallin discovered that bind/apply.hpp badly interacts with the apply<> +// member template of a factory as used in the optional<> implementation. +// He proposed this simple fix which is to move the call to apply<> outside +// namespace boost. +namespace paddle_optional_detail { +template +void construct(Factory const& factory, void* address) { + factory.template apply(address); +} +} + +namespace paddle { +template +class optional; + +class in_place_factory_base {}; +class typed_in_place_factory_base {}; + +// template bool equal_pointees(OP const& x, OP const& y); +// template struct equal_pointees_t; +// +// Being OP a model of OptionalPointee (either a pointer or an optional): +// +// If both x and y have valid pointees, returns the result of (*x == *y) +// If only one has a valid pointee, returns false. +// If none have valid pointees, returns true. +// No-throw +template +inline bool equal_pointees(OptionalPointee const& x, OptionalPointee const& y) { + return (!x) != (!y) ? false : (!x ? true : (*x) == (*y)); +} + +template +struct equal_pointees_t + : std::binary_function { + bool operator()(OptionalPointee const& x, OptionalPointee const& y) const { + return equal_pointees(x, y); + } +}; + +// template bool less_pointees(OP const& x, OP const& y); +// template struct less_pointees_t; +// +// Being OP a model of OptionalPointee (either a pointer or an optional): +// +// If y has not a valid pointee, returns false. +// ElseIf x has not a valid pointee, returns true. +// ElseIf both x and y have valid pointees, returns the result of (*x < *y) +// No-throw +template +inline bool less_pointees(OptionalPointee const& x, OptionalPointee const& y) { + return !y ? false : (!x ? 
true : (*x) < (*y)); +} + +template +struct less_pointees_t + : std::binary_function { + bool operator()(OptionalPointee const& x, OptionalPointee const& y) const { + return less_pointees(x, y); + } +}; + +namespace detail { + +template +class reference_content { + private: // representation + RefT content_; + + public: // structors + ~reference_content() {} + + reference_content(RefT r) : content_(r) {} + + reference_content(const reference_content& operand) + : content_(operand.content_) {} + + private: // non-Assignable + reference_content& operator=(const reference_content&); + + public: // queries + RefT get() const { return content_; } +}; + +template +struct make_reference_content { + typedef T type; +}; + +template +struct make_reference_content { + typedef reference_content type; +}; + +} // namespace detail + +namespace optional_detail { + +// This local class is used instead of that in "aligned_storage.hpp" +// because I've found the 'official' class to ICE BCB5.5 +// when some types are used with optional<> +// (due to sizeof() passed down as a non-type template parameter) +template +class aligned_storage { + // Borland ICEs if unnamed unions are used for this! + union dummy_u { + char data[sizeof(T)]; + typename std::aligned_storage<::std::alignment_of::value>::type aligner_; + } dummy_; + + public: + void const* address() const { return &dummy_.data[0]; } + void* address() { return &dummy_.data[0]; } +}; + +template +struct types_when_isnt_ref { + typedef T const& reference_const_type; + typedef T& reference_type; + typedef T const* pointer_const_type; + typedef T* pointer_type; + typedef T const& argument_type; +}; +template +struct types_when_is_ref { + typedef typename std::remove_reference::type raw_type; + + typedef raw_type& reference_const_type; + typedef raw_type& reference_type; + typedef raw_type* pointer_const_type; + typedef raw_type* pointer_type; + typedef raw_type& argument_type; +}; + +struct optional_tag {}; + +template +class optional_base : public optional_tag { + private: + typedef + typename ::paddle::detail::make_reference_content::type internal_type; + + typedef aligned_storage storage_type; + + typedef types_when_isnt_ref types_when_not_ref; + typedef types_when_is_ref types_when_ref; + + typedef optional_base this_type; + + protected: + typedef T value_type; + + typedef std::true_type is_reference_tag; + typedef std::false_type is_not_reference_tag; + + typedef typename std::is_reference::type is_reference_predicate; + + typedef typename std::conditional::type types; + + typedef bool (this_type::*unspecified_bool_type)() const; + + typedef typename types::reference_type reference_type; + typedef typename types::reference_const_type reference_const_type; + typedef typename types::pointer_type pointer_type; + typedef typename types::pointer_const_type pointer_const_type; + typedef typename types::argument_type argument_type; + + // Creates an optional uninitialized. + // No-throw + optional_base() : m_initialized(false) {} + + // Creates an optional uninitialized. + // No-throw + optional_base(none_t) : m_initialized(false) {} + + // Creates an optional initialized with 'val'. + // Can throw if T::T(T const&) does + optional_base(argument_type val) : m_initialized(false) { construct(val); } + + // Creates an optional initialized with 'val' IFF cond is true, otherwise + // creates an uninitialzed optional. 
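The machinery above (reference_content, the local aligned_storage, and the types_when_* helpers) exists so optional_base can keep its value in a raw, correctly aligned buffer and build it with placement new only when construct() runs. A stripped-down illustration of the same storage technique, independent of this header; the type and member names below are illustrative:

    #include <new>
    #include <type_traits>

    template <typename T>
    struct tiny_optional {
      typename std::aligned_storage<sizeof(T), alignof(T)>::type buf;  // raw, aligned bytes
      bool initialized = false;

      void construct(const T& v) { new (&buf) T(v); initialized = true; }   // placement new into buf
      void destroy() {
        if (initialized) {
          reinterpret_cast<T*>(&buf)->~T();   // explicit destructor call, mirrors destroy_impl()
          initialized = false;
        }
      }
      ~tiny_optional() { destroy(); }
    };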
+ // Can throw if T::T(T const&) does + optional_base(bool cond, argument_type val) : m_initialized(false) { + if (cond) construct(val); + } + + // Creates a deep copy of another optional + // Can throw if T::T(T const&) does + optional_base(optional_base const& rhs) : m_initialized(false) { + if (rhs.is_initialized()) construct(rhs.get_impl()); + } + + // This is used for both converting and in-place constructions. + // Derived classes use the 'tag' to select the appropriate + // implementation (the correct 'construct()' overload) + template + explicit optional_base(Expr const& expr, Expr const* tag) + : m_initialized(false) { + construct(expr, tag); + } + + // No-throw (assuming T::~T() doesn't) + ~optional_base() { destroy(); } + + // Assigns from another optional (deep-copies the rhs value) + void assign(optional_base const& rhs) { + if (is_initialized()) { + if (rhs.is_initialized()) + assign_value(rhs.get_impl(), is_reference_predicate()); + else + destroy(); + } else { + if (rhs.is_initialized()) construct(rhs.get_impl()); + } + } + + // Assigns from another _convertible_ optional (deep-copies the rhs value) + template + void assign(optional const& rhs) { + if (is_initialized()) { + if (rhs.is_initialized()) + assign_value(static_cast(rhs.get()), + is_reference_predicate()); + else + destroy(); + } else { + if (rhs.is_initialized()) construct(static_cast(rhs.get())); + } + } + + // Assigns from a T (deep-copies the rhs value) + void assign(argument_type val) { + if (is_initialized()) + assign_value(val, is_reference_predicate()); + else + construct(val); + } + + // Assigns from "none", destroying the current value, if any, leaving this + // UNINITIALIZED + // No-throw (assuming T::~T() doesn't) + void assign(none_t) { destroy(); } + + template + void assign_expr(Expr const& expr, Expr const* tag) { + if (is_initialized()) + assign_expr_to_initialized(expr, tag); + else + construct(expr, tag); + } + + public: + // Destroys the current value, if any, leaving this UNINITIALIZED + // No-throw (assuming T::~T() doesn't) + void reset() { destroy(); } + + // Replaces the current value -if any- with 'val' + void reset(argument_type val) { assign(val); } + + // Returns a pointer to the value if this is initialized, otherwise, + // returns NULL. + // No-throw + pointer_const_type get_ptr() const { + return m_initialized ? get_ptr_impl() : 0; + } + pointer_type get_ptr() { return m_initialized ? 
get_ptr_impl() : 0; } + + bool is_initialized() const { return m_initialized; } + + protected: + void construct(argument_type val) { + new (m_storage.address()) internal_type(val); + m_initialized = true; + } + + // Constructs in-place using the given factory + template + void construct(Expr const& factory, in_place_factory_base const*) { + static_assert(!is_reference_predicate::value, + "!is_reference_predicate::value"); + paddle_optional_detail::construct(factory, m_storage.address()); + m_initialized = true; + } + + // Constructs in-place using the given typed factory + template + void construct(Expr const& factory, typed_in_place_factory_base const*) { + static_assert(!is_reference_predicate::value, + "!is_reference_predicate::value"); + factory.apply(m_storage.address()); + m_initialized = true; + } + + template + void assign_expr_to_initialized(Expr const& factory, + in_place_factory_base const* tag) { + destroy(); + construct(factory, tag); + } + + // Constructs in-place using the given typed factory + template + void assign_expr_to_initialized(Expr const& factory, + typed_in_place_factory_base const* tag) { + destroy(); + construct(factory, tag); + } + + // Constructs using any expression implicitely convertible to the single + // argument + // of a one-argument T constructor. + // Converting constructions of optional from optional uses this function + // with + // 'Expr' being of type 'U' and relying on a converting constructor of T from + // U. + template + void construct(Expr const& expr, void const*) { + new (m_storage.address()) internal_type(expr); + m_initialized = true; + } + + // Assigns using a form any expression implicitely convertible to the single + // argument + // of a T's assignment operator. + // Converting assignments of optional from optional uses this function + // with + // 'Expr' being of type 'U' and relying on a converting assignment of T from + // U. + template + void assign_expr_to_initialized(Expr const& expr, void const*) { + assign_value(expr, is_reference_predicate()); + } + + void assign_value(argument_type val, is_not_reference_tag) { + get_impl() = val; + } + void assign_value(argument_type val, is_reference_tag) { construct(val); } + + void destroy() { + if (m_initialized) destroy_impl(is_reference_predicate()); + } + + unspecified_bool_type safe_bool() const { + return m_initialized ? &this_type::is_initialized : 0; + } + + reference_const_type get_impl() const { + return dereference(get_object(), is_reference_predicate()); + } + reference_type get_impl() { + return dereference(get_object(), is_reference_predicate()); + } + + pointer_const_type get_ptr_impl() const { + return cast_ptr(get_object(), is_reference_predicate()); + } + pointer_type get_ptr_impl() { + return cast_ptr(get_object(), is_reference_predicate()); + } + + private: + // internal_type can be either T or reference_content + internal_type const* get_object() const { + return static_cast(m_storage.address()); + } + internal_type* get_object() { + return static_cast(m_storage.address()); + } + + // reference_content lacks an implicit conversion to T&, so the following + // is needed to obtain a proper reference. 
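The public optional<T> wrapper defined further down in this header exposes the familiar value-access interface on top of optional_base. A short usage sketch, assuming it behaves like the boost 1.41 original it is copied from:

    paddle::optional<int> found = paddle::make_optional(42);
    int v1 = found.get_value_or(-1);     // 42: the held value
    paddle::optional<int> empty;
    int v2 = empty.get_value_or(-1);     // -1: fallback when uninitialized
    if (!empty) { /* operator! reports the uninitialized state */ }
    empty.reset(7);                      // reset(val) assigns a fresh value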
+ reference_const_type dereference(internal_type const* p, + is_not_reference_tag) const { + return *p; + } + reference_type dereference(internal_type* p, is_not_reference_tag) { + return *p; + } + reference_const_type dereference(internal_type const* p, + is_reference_tag) const { + return p->get(); + } + reference_type dereference(internal_type* p, is_reference_tag) { + return p->get(); + } + + void destroy_impl(is_not_reference_tag) { + get_ptr_impl()->T::~T(); + m_initialized = false; + } + + void destroy_impl(is_reference_tag) { m_initialized = false; } + + // If T is of reference type, trying to get a pointer to the held value must + // result in a compile-time error. + // Decent compilers should disallow conversions from reference_content* to + // T*, but just in case, + // the following olverloads are used to filter out the case and guarantee an + // error in case of T being a reference. + pointer_const_type cast_ptr(internal_type const* p, + is_not_reference_tag) const { + return p; + } + pointer_type cast_ptr(internal_type* p, is_not_reference_tag) { return p; } + pointer_const_type cast_ptr(internal_type const* p, is_reference_tag) const { + return &p->get(); + } + pointer_type cast_ptr(internal_type* p, is_reference_tag) { + return &p->get(); + } + + bool m_initialized; + storage_type m_storage; +}; + +} // namespace optional_detail + +template +class optional : public optional_detail::optional_base { + typedef optional_detail::optional_base base; + + typedef typename base::unspecified_bool_type unspecified_bool_type; + + public: + typedef optional this_type; + + typedef typename base::value_type value_type; + typedef typename base::reference_type reference_type; + typedef typename base::reference_const_type reference_const_type; + typedef typename base::pointer_type pointer_type; + typedef typename base::pointer_const_type pointer_const_type; + typedef typename base::argument_type argument_type; + + // Creates an optional uninitialized. + // No-throw + optional() : base() {} + + // Creates an optional uninitialized. + // No-throw + optional(none_t none_) : base(none_) {} + + // Creates an optional initialized with 'val'. + // Can throw if T::T(T const&) does + optional(argument_type val) : base(val) {} + + // Creates an optional initialized with 'val' IFF cond is true, otherwise + // creates an uninitialized optional. + // Can throw if T::T(T const&) does + optional(bool cond, argument_type val) : base(cond, val) {} + + // Creates a deep copy of another convertible optional + // Requires a valid conversion from U to T. + // Can throw if T::T(U const&) does + template + explicit optional(optional const& rhs) : base() { + if (rhs.is_initialized()) this->construct(rhs.get()); + } + + // Creates an optional with an expression which can be either + // (a) An instance of InPlaceFactory (i.e. in_place(a,b,...,n); + // (b) An instance of TypedInPlaceFactory ( i.e. in_place(a,b,...,n); + // (c) Any expression implicitely convertible to the single type + // of a one-argument T's constructor. + // (d*) Weak compilers (BCB) might also resolved Expr as optional and + // optional + // even though explicit overloads are present for these. + // Depending on the above some T ctor is called. + // Can throw is the resolved T ctor throws. 
+ template + explicit optional(Expr const& expr) : base(expr, &expr) {} + + // Creates a deep copy of another optional + // Can throw if T::T(T const&) does + optional(optional const& rhs) : base(rhs) {} + + // No-throw (assuming T::~T() doesn't) + ~optional() {} + + // Assigns from an expression. See corresponding constructor. + // Basic Guarantee: If the resolved T ctor throws, this is left UNINITIALIZED + template + optional& operator=(Expr expr) { + this->assign_expr(expr, &expr); + return *this; + } + + // Assigns from another convertible optional (converts && deep-copies the + // rhs value) + // Requires a valid conversion from U to T. + // Basic Guarantee: If T::T( U const& ) throws, this is left UNINITIALIZED + template + optional& operator=(optional const& rhs) { + this->assign(rhs); + return *this; + } + + // Assigns from another optional (deep-copies the rhs value) + // Basic Guarantee: If T::T( T const& ) throws, this is left UNINITIALIZED + // (NOTE: On BCB, this operator is not actually called and left is left + // UNMODIFIED in case of a throw) + optional& operator=(optional const& rhs) { + this->assign(rhs); + return *this; + } + + // Assigns from a T (deep-copies the rhs value) + // Basic Guarantee: If T::( T const& ) throws, this is left UNINITIALIZED + optional& operator=(argument_type val) { + this->assign(val); + return *this; + } + + // Assigns from a "none" + // Which destroys the current value, if any, leaving this UNINITIALIZED + // No-throw (assuming T::~T() doesn't) + optional& operator=(none_t none_) { + this->assign(none_); + return *this; + } + + // Returns a reference to the value if this is initialized, otherwise, + // the behaviour is UNDEFINED + // No-throw + reference_const_type get() const { + assert(this->is_initialized()); + return this->get_impl(); + } + reference_type get() { + assert(this->is_initialized()); + return this->get_impl(); + } + + // Returns a copy of the value if this is initialized, 'v' otherwise + reference_const_type get_value_or(reference_const_type v) const { + return this->is_initialized() ? get() : v; + } + reference_type get_value_or(reference_type v) { + return this->is_initialized() ? get() : v; + } + + // Returns a pointer to the value if this is initialized, otherwise, + // the behaviour is UNDEFINED + // No-throw + pointer_const_type operator->() const { + assert(this->is_initialized()); + return this->get_ptr_impl(); + } + pointer_type operator->() { + assert(this->is_initialized()); + return this->get_ptr_impl(); + } + + // Returns a reference to the value if this is initialized, otherwise, + // the behaviour is UNDEFINED + // No-throw + reference_const_type operator*() const { return this->get(); } + reference_type operator*() { return this->get(); } + + // implicit conversion to "bool" + // No-throw + operator unspecified_bool_type() const { return this->safe_bool(); } + + // This is provided for those compilers which don't like the conversion to + // bool + // on some contexts. + bool operator!() const { return !this->is_initialized(); } +}; + +// Returns optional(v) +template +inline optional make_optional(T const& v) { + return optional(v); +} + +// Returns optional(cond,v) +template +inline optional make_optional(bool cond, T const& v) { + return optional(cond, v); +} + +// Returns a reference to the value if this is initialized, otherwise, the +// behaviour is UNDEFINED. 
+// No-throw +template +inline typename optional::reference_const_type get(optional const& opt) { + return opt.get(); +} + +template +inline typename optional::reference_type get(optional& opt) { + return opt.get(); +} + +// Returns a pointer to the value if this is initialized, otherwise, returns +// NULL. +// No-throw +template +inline typename optional::pointer_const_type get(optional const* opt) { + return opt->get_ptr(); +} + +template +inline typename optional::pointer_type get(optional* opt) { + return opt->get_ptr(); +} + +// Returns a reference to the value if this is initialized, otherwise, the +// behaviour is UNDEFINED. +// No-throw +template +inline typename optional::reference_const_type get_optional_value_or( + optional const& opt, typename optional::reference_const_type v) { + return opt.get_value_or(v); +} + +template +inline typename optional::reference_type get_optional_value_or( + optional& opt, typename optional::reference_type v) { + return opt.get_value_or(v); +} + +// Returns a pointer to the value if this is initialized, otherwise, returns +// NULL. +// No-throw +template +inline typename optional::pointer_const_type get_pointer( + optional const& opt) { + return opt.get_ptr(); +} + +template +inline typename optional::pointer_type get_pointer(optional& opt) { + return opt.get_ptr(); +} + +// optional's relational operators ( ==, !=, <, >, <=, >= ) have deep-semantics +// (compare values). +// WARNING: This is UNLIKE pointers. Use equal_pointees()/less_pointess() in +// generic code instead. + +// +// optional vs optional cases +// + +template +inline bool operator==(optional const& x, optional const& y) { + return equal_pointees(x, y); +} + +template +inline bool operator<(optional const& x, optional const& y) { + return less_pointees(x, y); +} + +template +inline bool operator!=(optional const& x, optional const& y) { + return !(x == y); +} + +template +inline bool operator>(optional const& x, optional const& y) { + return y < x; +} + +template +inline bool operator<=(optional const& x, optional const& y) { + return !(y < x); +} + +template +inline bool operator>=(optional const& x, optional const& y) { + return !(x < y); +} + +// +// optional vs T cases +// +template +inline bool operator==(optional const& x, T const& y) { + return equal_pointees(x, optional(y)); +} + +template +inline bool operator<(optional const& x, T const& y) { + return less_pointees(x, optional(y)); +} + +template +inline bool operator!=(optional const& x, T const& y) { + return !(x == y); +} + +template +inline bool operator>(optional const& x, T const& y) { + return y < x; +} + +template +inline bool operator<=(optional const& x, T const& y) { + return !(y < x); +} + +template +inline bool operator>=(optional const& x, T const& y) { + return !(x < y); +} + +// +// T vs optional cases +// + +template +inline bool operator==(T const& x, optional const& y) { + return equal_pointees(optional(x), y); +} + +template +inline bool operator<(T const& x, optional const& y) { + return less_pointees(optional(x), y); +} + +template +inline bool operator!=(T const& x, optional const& y) { + return !(x == y); +} + +template +inline bool operator>(T const& x, optional const& y) { + return y < x; +} + +template +inline bool operator<=(T const& x, optional const& y) { + return !(y < x); +} + +template +inline bool operator>=(T const& x, optional const& y) { + return !(x < y); +} + +// +// optional vs none cases +// + +template +inline bool operator==(optional const& x, none_t) { + return equal_pointees(x, 
optional()); +} + +template +inline bool operator<(optional const& x, none_t) { + return less_pointees(x, optional()); +} + +template +inline bool operator!=(optional const& x, none_t y) { + return !(x == y); +} + +template +inline bool operator>(optional const& x, none_t y) { + return y < x; +} + +template +inline bool operator<=(optional const& x, none_t y) { + return !(y < x); +} + +template +inline bool operator>=(optional const& x, none_t y) { + return !(x < y); +} + +// +// none vs optional cases +// + +template +inline bool operator==(none_t x, optional const& y) { + return equal_pointees(optional(), y); +} + +template +inline bool operator<(none_t x, optional const& y) { + return less_pointees(optional(), y); +} + +template +inline bool operator!=(none_t x, optional const& y) { + return !(x == y); +} + +template +inline bool operator>(none_t x, optional const& y) { + return y < x; +} + +template +inline bool operator<=(none_t x, optional const& y) { + return !(y < x); +} + +template +inline bool operator>=(none_t x, optional const& y) { + return !(x < y); +} + +namespace optional_detail { + +// optional's swap: +// If both are initialized, calls swap(T&, T&). If this swap throws, both will +// remain initialized but their values are now unspecified. +// If only one is initialized, calls U.reset(*I), THEN I.reset(). +// If U.reset(*I) throws, both are left UNCHANGED (U is kept uinitialized and I +// is never reset) +// If both are uninitialized, do nothing (no-throw) +template +inline void optional_swap(optional& x, optional& y) { + if (!x && !!y) { + x.reset(*y); + y.reset(); + } else if (!!x && !y) { + y.reset(*x); + x.reset(); + } else if (!!x && !!y) { + // allow for Koenig lookup + using std::swap; + swap(*x, *y); + } +} + +} // namespace optional_detail + +} // namespace paddle + +#endif From 690f583152b8e3bb2600e00f269f4fdf6cf08787 Mon Sep 17 00:00:00 2001 From: tianshuo78520a <707759223@qq.com> Date: Tue, 17 Aug 2021 17:15:26 +0800 Subject: [PATCH 076/126] Update op-benchmark CI (#34962) * fix op-benchmark * test=document_fix --- tools/test_ci_op_benchmark.sh | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tools/test_ci_op_benchmark.sh b/tools/test_ci_op_benchmark.sh index ff1024ba948ed8..6d5f88c1f57ee9 100644 --- a/tools/test_ci_op_benchmark.sh +++ b/tools/test_ci_op_benchmark.sh @@ -281,7 +281,11 @@ function summary_problems { function cpu_op_benchmark { LOG "[INFO] Start run op benchmark cpu test ..." load_CHANGE_OP_FILES + prepare_benchmark_environment + load_CHANGE_OP_MAP + load_BENCHMARK_OP_MAP build_whl + summary_problems LOG "[INFO] Op benchmark run success and no error!" exit 0 } @@ -289,9 +293,6 @@ function cpu_op_benchmark { function gpu_op_benchmark { LOG "[INFO] Start run op benchmark gpu test ..." 
- load_CHANGE_OP_FILES - prepare_benchmark_environment - load_CHANGE_OP_MAP load_BENCHMARK_OP_MAP run_op_benchmark_test summary_problems From 1ef21855c4e7f5e0caabb8f34723ac5d5ce7d743 Mon Sep 17 00:00:00 2001 From: Aganlengzi Date: Tue, 17 Aug 2021 20:19:08 +0800 Subject: [PATCH 077/126] [NPU] add where_index op and tests (#34951) --- paddle/fluid/operators/where_index_op_npu.cc | 97 ++++++++++++++++ .../unittests/npu/test_where_index_npu.py | 106 ++++++++++++++++++ 2 files changed, 203 insertions(+) create mode 100644 paddle/fluid/operators/where_index_op_npu.cc create mode 100644 python/paddle/fluid/tests/unittests/npu/test_where_index_npu.py diff --git a/paddle/fluid/operators/where_index_op_npu.cc b/paddle/fluid/operators/where_index_op_npu.cc new file mode 100644 index 00000000000000..da252094df96a2 --- /dev/null +++ b/paddle/fluid/operators/where_index_op_npu.cc @@ -0,0 +1,97 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/where_index_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class NPUWhereIndexKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto& dev_ctx = + context.template device_context(); + auto* condition = context.Input("Condition"); + auto* out = context.Output("Out"); + + auto dims = condition->dims(); + const int rank = dims.size(); + + auto place = context.GetPlace(); + const aclrtStream& stream = dev_ctx.stream(); + + // Run Cast and ReduceSum to get 0 dim of Out + Tensor booled_cond; + if (condition->type() != framework::proto::VarType::BOOL) { + auto bool_type = ConvertToNpuDtype(framework::proto::VarType::BOOL); + booled_cond.mutable_data(dims, place); + const auto& booled_runner = + NpuOpRunner("Cast", {*condition}, {booled_cond}, + {{"dst_type", static_cast(bool_type)}}); + booled_runner.Run(stream); + } else { + booled_cond.ShareDataWith(*condition); + } + Tensor casted_cond; + auto dst_dtype = ConvertToNpuDtype(framework::proto::VarType::INT64); + casted_cond.mutable_data(dims, place); + const auto& cast_runner = + NpuOpRunner("Cast", {booled_cond}, {casted_cond}, + {{"dst_type", static_cast(dst_dtype)}}); + cast_runner.Run(stream); + + Tensor sumed_true_num; + sumed_true_num.mutable_data({1}, place); + Tensor cond_axes; + cond_axes.mutable_data({dims.size()}, place); + std::vector axes_vec; + for (int i = 0; i < dims.size(); ++i) { + axes_vec.push_back(i); + } + framework::TensorFromVector(axes_vec, dev_ctx, &cond_axes); + const auto& sum_runner = + NpuOpRunner("ReduceSum", {casted_cond, cond_axes}, {sumed_true_num}, + {{"keep_dims", false}}); + sum_runner.Run(stream); + + Tensor local_true_num; + TensorCopySync(sumed_true_num, platform::CPUPlace(), &local_true_num); + auto true_num = *local_true_num.data(); + + out->Resize(framework::make_ddim({true_num, rank})); + out->mutable_data(place); + + if 
(true_num == 0) { + return; + } + + out->set_layout(DataLayout::kAnyLayout); + NpuOpRunner runner{"Where", {*condition}, {*out}}; + runner.Run(stream); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_NPU_KERNEL(where_index, ops::NPUWhereIndexKernel, + ops::NPUWhereIndexKernel, + ops::NPUWhereIndexKernel, + ops::NPUWhereIndexKernel, + ops::NPUWhereIndexKernel); diff --git a/python/paddle/fluid/tests/unittests/npu/test_where_index_npu.py b/python/paddle/fluid/tests/unittests/npu/test_where_index_npu.py new file mode 100644 index 00000000000000..20d7fb6879d443 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_where_index_npu.py @@ -0,0 +1,106 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import unittest +import paddle +import sys +sys.path.append("..") +from op_test import OpTest +from paddle.fluid.op import Operator +import paddle.fluid as fluid +from paddle.fluid import Program, program_guard + +paddle.enable_static() + + +class TestWhereIndexOp(OpTest): + def setUp(self): + self.set_npu() + self.op_type = "where_index" + self.place = paddle.NPUPlace(0) + self.init_config() + + def test_check_output(self): + self.check_output_with_place(self.place) + + def init_config(self): + self.inputs = {'Condition': np.array([True, False, True]), } + + self.outputs = {'Out': np.array([[0], [2]], dtype='int64')} + + def set_npu(self): + self.__class__.use_npu = True + + +class TestNotBool(TestWhereIndexOp): + def init_config(self): + self.inputs = {'Condition': np.array([1, 0, 8]), } + + self.outputs = {'Out': np.array([[0], [2]], dtype='int64')} + + +class TestAllFalse(TestWhereIndexOp): + def init_config(self): + self.inputs = {'Condition': np.array([False, False, False]), } + + self.outputs = {'Out': np.array([], dtype='int64')} + + +class TestRank2(TestWhereIndexOp): + def init_config(self): + self.inputs = {'Condition': np.array([[True, False], [False, True]]), } + + self.outputs = {'Out': np.array([[0, 0], [1, 1]], dtype='int64')} + + +class TestRank3(TestWhereIndexOp): + def init_config(self): + self.inputs = { + 'Condition': np.array([[[True, False], [False, True]], + [[False, True], [True, False]], + [[False, False], [False, True]]]), + } + + self.outputs = { + 'Out': np.array( + [[0, 0, 0], [0, 1, 1], [1, 0, 1], [1, 1, 0], [2, 1, 1]], + dtype='int64') + } + + +class TestWhereOpError(unittest.TestCase): + def test_api(self): + with program_guard(Program(), Program()): + cond = fluid.layers.data(name='cond', shape=[4], dtype='bool') + result = fluid.layers.where(cond) + + exe = fluid.Executor(paddle.NPUPlace(0)) + exe.run(fluid.default_startup_program()) + cond_i = np.array([True, False, False, False]).astype("bool") + out = exe.run(fluid.default_main_program(), feed={'cond': cond_i}) + + +class TestWhereRaiseError(unittest.TestCase): + def test_errors(self): + def test_type(): + 
fluid.layers.where([10]) + + self.assertRaises(TypeError, test_type) + + +if __name__ == "__main__": + unittest.main() From b4474fb40d563a9fc9c98247c644b9983e787403 Mon Sep 17 00:00:00 2001 From: Roc <30228238+sljlp@users.noreply.github.com> Date: Tue, 17 Aug 2021 20:42:12 +0800 Subject: [PATCH 078/126] [NPU]Adamw skip update for npu (#34897) --- paddle/fluid/operators/optimizers/adam_op.cc | 20 ++ .../fluid/operators/optimizers/adam_op_npu.cc | 76 ++++++ paddle/fluid/operators/optimizers/adamw_op.cc | 20 ++ paddle/fluid/operators/optimizers/adamw_op.h | 105 ++++++++ .../meta_optimizers/sharding/fp16_helper.py | 5 +- .../contrib/mixed_precision/decorator.py | 14 +- python/paddle/fluid/optimizer.py | 8 +- .../tests/unittests/npu/test_adamw_op_npu.py | 250 ++++++++++++++++++ .../fluid/tests/unittests/test_adam_op.py | 39 +++ python/paddle/optimizer/adamw.py | 123 ++++++++- 10 files changed, 645 insertions(+), 15 deletions(-) create mode 100644 paddle/fluid/operators/optimizers/adamw_op.cc create mode 100644 paddle/fluid/operators/optimizers/adamw_op.h create mode 100644 python/paddle/fluid/tests/unittests/npu/test_adamw_op_npu.py diff --git a/paddle/fluid/operators/optimizers/adam_op.cc b/paddle/fluid/operators/optimizers/adam_op.cc index 130e10a1f8de30..d4355c89f31cc3 100644 --- a/paddle/fluid/operators/optimizers/adam_op.cc +++ b/paddle/fluid/operators/optimizers/adam_op.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/optimizers/adam_op.h" #include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/operators/optimizers/adamw_op.h" namespace paddle { namespace operators { @@ -230,11 +231,30 @@ param\_out = param - learning\_rate * \frac{moment\_1}{\sqrt{moment\_2} + \epsil )DOC"); } }; + +class AdamWOpMaker : public AdamOpMaker { + public: + void Make() { + AdamOpMaker::Make(); + AddAttr("coeff", + "(float, default 0.01) " + "coeff of the weight decay") + .SetDefault(0.01f); + AddAttr("with_decay", + "(bool, default false) " + "whether to do weight decay") + .SetDefault(false); + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(adam, ops::AdamOp, ops::AdamOpMaker); + +REGISTER_OP_WITHOUT_GRADIENT(adamw, ops::AdamWOp, ops::AdamWOpMaker); + REGISTER_OP_CPU_KERNEL( adam, ops::AdamOpKernel, ops::AdamOpKernel); diff --git a/paddle/fluid/operators/optimizers/adam_op_npu.cc b/paddle/fluid/operators/optimizers/adam_op_npu.cc index d0de480c1a0ccc..1169bc12ac230c 100644 --- a/paddle/fluid/operators/optimizers/adam_op_npu.cc +++ b/paddle/fluid/operators/optimizers/adam_op_npu.cc @@ -225,6 +225,79 @@ class AdamNPUKernel : public framework::OpKernel { } }; +template +class AdamWNPUKernel : public AdamNPUKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + VLOG(3) << "NPU AdamW Kernel"; + bool skip_update = false; + if (ctx.HasInput("SkipUpdate")) { + VLOG(3) << "Has SkipUpdate"; + auto* skip_update_tensor = ctx.Input("SkipUpdate"); + PADDLE_ENFORCE_EQ(skip_update_tensor->numel(), 1, + platform::errors::InvalidArgument( + "Input(SkipUpdate) size must be 1, but get %d", + skip_update_tensor->numel())); + std::vector skip_update_vec; + TensorToVector(*skip_update_tensor, ctx.device_context(), + &skip_update_vec); + skip_update = skip_update_vec[0]; + } + VLOG(3) << "Skip update" << skip_update; + bool with_decay = ctx.Attr("with_decay"); + if (!skip_update && with_decay) { + float coeff = ctx.Attr("coeff"); + auto* lr = 
ctx.Input("LearningRate"); + + auto place = ctx.GetPlace(); + + auto stream = + ctx.template device_context() + .stream(); + + Tensor one(framework::proto::VarType::FP32); + Tensor decay(framework::proto::VarType::FP32); + Tensor tmp(framework::proto::VarType::FP32); + + tmp.mutable_data({1}, place); + one.mutable_data({1}, place); + decay.mutable_data({1}, place); + + FillNpuTensorWithConstant(&one, 1.0f); + framework::NPUAttributeMap attr_input = {{"value", coeff}}; + + const auto& runner1 = NpuOpRunner("Muls", {*lr}, {tmp}, attr_input); + runner1.Run(stream); + + const auto& runner2 = NpuOpRunner("Sub", {one, tmp}, {decay}, {}); + runner2.Run(stream); + + if (ctx.HasInput("MasterParam")) { + PADDLE_THROW(platform::errors::Unimplemented( + "Master Parma is not supported on npu")); + } else { + auto* param_out = ctx.Output("ParamOut"); + param_out->mutable_data(ctx.GetPlace()); + + const auto* param_var = ctx.InputVar("Param"); + PADDLE_ENFORCE_EQ(param_var->IsType(), true, + platform::errors::InvalidArgument( + "The Var(%s)'s type should be LoDTensor, " + "but the received is %s", + ctx.InputNames("Param").front(), + framework::ToTypeName(param_var->Type()))); + auto* param = ctx.Input("Param"); + + const auto& runner = + NpuOpRunner("Mul", {*param, decay}, + {*const_cast(param)}, {}); + runner.Run(stream); + } + } + AdamNPUKernel::Compute(ctx); + } +}; + } // namespace operators } // namespace paddle @@ -234,3 +307,6 @@ REGISTER_OP_NPU_KERNEL( adam, ops::AdamNPUKernel, ops::AdamNPUKernel); + +REGISTER_OP_NPU_KERNEL(adamw, ops::AdamWNPUKernel, + ops::AdamWNPUKernel); diff --git a/paddle/fluid/operators/optimizers/adamw_op.cc b/paddle/fluid/operators/optimizers/adamw_op.cc new file mode 100644 index 00000000000000..c2111d53f3a45f --- /dev/null +++ b/paddle/fluid/operators/optimizers/adamw_op.cc @@ -0,0 +1,20 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +namespace ops = paddle::operators; +REGISTER_OP_CPU_KERNEL( + adamw, ops::AdamWOpKernel, + ops::AdamWOpKernel); diff --git a/paddle/fluid/operators/optimizers/adamw_op.h b/paddle/fluid/operators/optimizers/adamw_op.h new file mode 100644 index 00000000000000..3301bc4808e3a8 --- /dev/null +++ b/paddle/fluid/operators/optimizers/adamw_op.h @@ -0,0 +1,105 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
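Both AdamW implementations in this patch apply the decoupled weight decay before the ordinary Adam update: the NPU kernel above composes it from Muls, Sub and Mul on device, and the CPU AdamWFunctor below folds it into a single Eigen expression. A worked scalar sketch of that step, using the values from the new unit test (the real kernels do this element-wise on tensors), with the rule that the decay only runs when with_decay is set and SkipUpdate is not:

    float lr = 0.5f, coeff = 0.9f;        // values used by TestAdamW
    float param = 1.0f;
    float decay = 1.0f - lr * coeff;      // 1 - learning_rate * coeff = 0.55
    param = param * decay;                // shrink the parameter first...
    // ...then run the unchanged Adam moment/parameter update on the decayed value.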
*/ + +#pragma once +#include + +namespace paddle { +namespace operators { + +class AdamWOp : public AdamOp { + using AdamOp::AdamOp; +}; + +struct CPUAdamW; + +template +class AdamWFunctor; + +template +class AdamWFunctor { + private: + const float coeff_; + const float learning_rate_; + T* param_; + + public: + AdamWFunctor(const float& coeff, const float& learning_rate, T* param) + : coeff_(coeff), learning_rate_(learning_rate), param_(param) {} + + inline HOSTDEVICE void operator()(size_t numel) const { + Eigen::Map> param{ + param_, static_cast(numel)}; + // Calculation + param = param * (1.0f - learning_rate_ * coeff_); + } +}; + +template +class AdamWOpKernel : public AdamOpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const auto* param_var = ctx.InputVar("Param"); + PADDLE_ENFORCE_EQ(param_var->IsType(), true, + platform::errors::InvalidArgument( + "The Var(%s)'s type should be LoDTensor, " + "but the received is %s", + ctx.InputNames("Param").front(), + framework::ToTypeName(param_var->Type()))); + + using paddle::framework::LoDTensor; + bool skip_update = false; + // TODO(liupeng): + if (ctx.HasInput("SkipUpdate")) { + VLOG(3) << "Has SkipUpdate"; + auto* skip_update_tensor = ctx.Input("SkipUpdate"); + PADDLE_ENFORCE_EQ(skip_update_tensor->numel(), 1, + platform::errors::InvalidArgument( + "Input(SkipUpdate) size must be 1, but get %d", + skip_update_tensor->numel())); + std::vector skip_update_vec; + TensorToVector(*skip_update_tensor, ctx.device_context(), + &skip_update_vec); + skip_update = skip_update_vec[0]; + } + VLOG(3) << "Skip update" << skip_update; + bool with_decay = ctx.Attr("with_decay"); + + if (skip_update || !with_decay) { + AdamOpKernel::Compute(ctx); + return; + } + + float coeff = ctx.Attr("coeff"); + auto* lr = ctx.Input("LearningRate"); + + LoDTensor* param; + + if (ctx.HasInput("MasterParam")) { + // TODO(liupeng): master + param = const_cast(ctx.Input("MasterParam")); + } else { + param = const_cast(ctx.Input("Param")); + } + + // AdamWFunctor(float coeff, const float* learning_rate, T* parma) + AdamWFunctor functor(coeff, *lr->data(), + param->data()); + functor(param->numel()); + + AdamOpKernel::Compute(ctx); + } +}; +} // namespace operators +} // namespace paddle diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding/fp16_helper.py b/python/paddle/distributed/fleet/meta_optimizers/sharding/fp16_helper.py index 07272404768ff7..e939ac765b2c9e 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/sharding/fp16_helper.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding/fp16_helper.py @@ -197,7 +197,6 @@ def sync_amp_check_nan_inf(block, ring_ids): if op.type == "update_loss_scaling": update_loss_scaling_op_idx = idx inf_var_name = op.desc.input('FoundInfinite')[0] - op._rename_input(inf_var_name, inf_var_name + "@GLOBAL_WORLD") break # not use amp @@ -246,10 +245,10 @@ def sync_amp_check_nan_inf(block, ring_ids): update_loss_scaling_op_idx, type='cast', inputs={'X': inf_var_int32}, - outputs={'Out': inf_var_global}, + outputs={'Out': inf_var}, attrs={ "in_dtype": inf_var_int32.dtype, - "out_dtype": inf_var_global.dtype, + "out_dtype": inf_var.dtype, OP_ROLE_KEY: OpRole.Optimize }) update_loss_scaling_op_idx += 1 diff --git a/python/paddle/fluid/contrib/mixed_precision/decorator.py b/python/paddle/fluid/contrib/mixed_precision/decorator.py index 22eb2d20f3db7f..563c394c9fbfe8 100644 --- a/python/paddle/fluid/contrib/mixed_precision/decorator.py +++ 
b/python/paddle/fluid/contrib/mixed_precision/decorator.py @@ -399,12 +399,18 @@ def apply_gradients(self, params_grads): self._decr_ratio, name="update_loss_scaling") # Pass found_inf to adam, to skip update for not only param, but also momentum and beta_pow - if isinstance(self._optimizer, paddle.fluid.optimizer.Adam): + # With fleet, optimizers are nested and the real optimizer set by user is the inner most one. + real_optimizer = self._optimizer + while hasattr(real_optimizer, "inner_opt"): + real_optimizer = real_optimizer.inner_opt + if isinstance(real_optimizer, (paddle.fluid.optimizer.Adam, + paddle.optimizer.AdamW)): # NOTE(zhiqiu): Since found_inf needs to be on cpu in adam op, we # copy it in advance to avoid multiple time copies. - found_inf = paddle.tensor.creation._memcpy(found_inf, - paddle.CPUPlace()) - self._optimizer._set_auxiliary_var('found_inf', found_inf) + with self._train_program._optimized_guard([]): + found_inf = paddle.tensor.creation._memcpy(found_inf, + paddle.CPUPlace()) + real_optimizer._set_auxiliary_var('found_inf', found_inf) optimize_ops = self._optimizer.apply_gradients(params_grads) return optimize_ops diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 3cb6d24c86faf2..9e87681c4bef30 100755 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -4661,12 +4661,8 @@ def _add_op_device_attr_for_op(self, op, idx, block): op._set_attr(self._op_device_key, f"{self._device}:all") else: other_known_ops = [ - 'update_loss_scaling', - 'reduce_any', - 'concat', - 'sum', - 'check_finite_and_unscale', - 'alloc_float_status', + 'update_loss_scaling', 'reduce_any', 'concat', 'sum', + 'check_finite_and_unscale', 'alloc_float_status', 'memcpy' ] assert op.type in other_known_ops, "For other ops without " \ "op_device set, they must be one of {}, but it " \ diff --git a/python/paddle/fluid/tests/unittests/npu/test_adamw_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_adamw_op_npu.py new file mode 100644 index 00000000000000..78ee572d11fee6 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_adamw_op_npu.py @@ -0,0 +1,250 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
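The decorator.py change above works because fleet wraps the user's optimizer in one or more meta optimizers, each exposing the wrapped object as inner_opt; the loop peels those wrappers until the innermost, user-set optimizer remains. A minimal standalone sketch of that unwrapping pattern follows; the two classes are hypothetical stand-ins, not fleet's real types.

class _UserAdamW(object):        # stands in for paddle.optimizer.AdamW
    pass

class _MetaOptimizer(object):    # stands in for a fleet meta optimizer wrapper
    def __init__(self, inner_opt):
        self.inner_opt = inner_opt

real_optimizer = _MetaOptimizer(_MetaOptimizer(_UserAdamW()))
# Same loop as in decorator.py: unwrap until no further inner_opt exists.
while hasattr(real_optimizer, "inner_opt"):
    real_optimizer = real_optimizer.inner_opt
assert isinstance(real_optimizer, _UserAdamW)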
+ +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +from test_adam_op import adamw_step + +paddle.enable_static() +SEED = 2021 + + +class TestAdamW(OpTest): + def setUp(self): + self.set_npu() + self.place = paddle.NPUPlace(0) + self.op_type = "adamw" + param = np.random.uniform(-1, 1, (105, 102)).astype("float32") + grad = np.random.uniform(-1, 1, (105, 102)).astype("float32") + moment1 = np.random.uniform(-1, 1, (105, 102)).astype("float32") + # The second moment is positive + moment2 = np.random.random((105, 102)).astype("float32") + + learning_rate = 0.5 + beta1 = 0.78 + beta2 = 0.836 + epsilon = 1e-4 + beta1_pow = beta1**10 + beta2_pow = beta2**10 + + self.inputs = { + 'Param': param, + 'Grad': grad, + 'Moment1': moment1, + 'Moment2': moment2, + 'LearningRate': np.array([learning_rate]).astype("float32"), + 'Beta1Pow': np.array([beta1_pow]).astype("float32"), + 'Beta2Pow': np.array([beta2_pow]).astype("float32") + } + + self.attrs = { + 'epsilon': epsilon, + 'beta1': beta1, + 'beta2': beta2, + "coeff": 0.9, + "with_decay": True + } + + param_out, moment1_out, \ + moment2_out = adamw_step(self.inputs, self.attrs) + + self.outputs = { + 'Moment1Out': moment1_out, + 'Moment2Out': moment2_out, + 'ParamOut': param_out, + 'Beta1PowOut': np.array([beta1_pow]).astype("float32") * beta1, + 'Beta2PowOut': np.array([beta2_pow]).astype("float32") * beta2 + } + + def set_npu(self): + self.__class__.use_npu = True + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place, atol=1e-5) + + +class TestAdamOpWithSkipUpdate(OpTest): + def setUp(self): + self.set_npu() + self.place = paddle.NPUPlace(0) + self.op_type = "adamw" + param = np.random.uniform(-1, 1, (102, 105)).astype("float32") + grad = np.random.uniform(-1, 1, (102, 105)).astype("float32") + moment1 = np.random.uniform(-1, 1, (102, 105)).astype("float32") + # The second moment is positive + moment2 = np.random.random((102, 105)).astype("float32") + + learning_rate = 0.004 + beta1 = 0.78 + beta2 = 0.836 + epsilon = 1e-4 + beta1_pow = beta1**10 + beta2_pow = beta2**10 + + self.inputs = { + 'Param': param, + 'Grad': grad, + 'Moment1': moment1, + 'Moment2': moment2, + 'LearningRate': np.array([learning_rate]).astype("float32"), + 'Beta1Pow': np.array([beta1_pow]).astype("float32"), + 'Beta2Pow': np.array([beta2_pow]).astype("float32"), + 'Beta1Tensor': np.array([beta1]).astype("float32"), + 'Beta2Tensor': np.array([beta2]).astype("float32"), + 'EpsilonTensor': np.array([epsilon]).astype("float32"), + "SkipUpdate": np.array([True]).astype("bool"), + } + + self.attrs = {'epsilon': epsilon, "coeff": 0.02, "with_decay": True} + + self.outputs = { + 'Moment1Out': moment1, + 'Moment2Out': moment2, + 'ParamOut': param, + 'Beta1PowOut': self.inputs['Beta1Pow'], + 'Beta2PowOut': self.inputs['Beta2Pow'], + } + + def set_npu(self): + self.__class__.use_npu = True + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place, atol=1e-5) + + +class TestAdamOpWithoutDecay(OpTest): + def setUp(self): + self.set_npu() + self.place = paddle.NPUPlace(0) + self.op_type = "adamw" + param = np.random.uniform(-1, 1, (102, 105)).astype("float32") + grad = np.random.uniform(-1, 1, (102, 105)).astype("float32") + moment1 = np.random.uniform(-1, 1, (102, 105)).astype("float32") + # The second moment is positive 
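+        # (np.random.random draws from [0, 1); the second moment estimates E[g^2] and
+        #  must stay nonnegative, otherwise the sqrt in the update rule yields NaN.)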
+ moment2 = np.random.random((102, 105)).astype("float32") + + learning_rate = 0.004 + beta1 = 0.78 + beta2 = 0.836 + epsilon = 1e-4 + beta1_pow = beta1**10 + beta2_pow = beta2**10 + + self.inputs = { + 'Param': param, + 'Grad': grad, + 'Moment1': moment1, + 'Moment2': moment2, + 'LearningRate': np.array([learning_rate]).astype("float32"), + 'Beta1Pow': np.array([beta1_pow]).astype("float32"), + 'Beta2Pow': np.array([beta2_pow]).astype("float32"), + 'Beta1Tensor': np.array([beta1]).astype("float32"), + 'Beta2Tensor': np.array([beta2]).astype("float32"), + 'EpsilonTensor': np.array([epsilon]).astype("float32"), + "SkipUpdate": np.array([True]).astype("bool"), + } + + self.attrs = {'epsilon': epsilon, "coeff": 0.02, "with_decay": False} + + self.outputs = { + 'Moment1Out': moment1, + 'Moment2Out': moment2, + 'ParamOut': param, + 'Beta1PowOut': self.inputs['Beta1Pow'], + 'Beta2PowOut': self.inputs['Beta2Pow'], + } + + def set_npu(self): + self.__class__.use_npu = True + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place, atol=1e-5) + + +class TestNet(unittest.TestCase): + def _test(self, run_npu=True): + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = SEED + startup_prog.random_seed = SEED + np.random.seed(SEED) + + a_np = np.random.random(size=(32, 32)).astype('float32') + b_np = np.random.random(size=(32, 32)).astype('float32') + label_np = np.random.randint(2, size=(32, 1)).astype('int64') + + with paddle.static.program_guard(main_prog, startup_prog): + a = paddle.static.data(name="a", shape=[32, 32], dtype='float32') + b = paddle.static.data(name="b", shape=[32, 32], dtype='float32') + label = paddle.static.data( + name="label", shape=[32, 1], dtype='int64') + + sum = paddle.add(a, b) + z = paddle.pow(sum, 2.0) + + fc_1 = fluid.layers.fc(input=z, size=128) + prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax') + + cost = fluid.layers.cross_entropy(input=prediction, label=label) + loss = fluid.layers.reduce_mean(cost) + adam = paddle.optimizer.AdamW(learning_rate=0.01, weight_decay=0.02) + adam.minimize(loss) + + if run_npu: + place = paddle.NPUPlace(0) + else: + place = paddle.CPUPlace() + + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + print("Start run on {}".format(place)) + for epoch in range(100): + + pred_res, loss_res = exe.run( + main_prog, + feed={"a": a_np, + "b": b_np, + "label": label_np}, + fetch_list=[prediction, loss]) + if epoch % 10 == 0: + print("Epoch {} | Prediction[0]: {}, Loss: {}".format( + epoch, pred_res[0], loss_res)) + + return pred_res, loss_res + + def test_npu(self): + npu_pred, npu_loss = self._test(True) + cpu_pred, cpu_loss = self._test(False) + self.assertTrue(np.allclose(npu_pred, cpu_pred, rtol=1e-3)) + self.assertTrue(np.allclose(npu_loss, cpu_loss, rtol=1e-3)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_adam_op.py b/python/paddle/fluid/tests/unittests/test_adam_op.py index 27b66c13aecf33..70109164960a33 100644 --- a/python/paddle/fluid/tests/unittests/test_adam_op.py +++ b/python/paddle/fluid/tests/unittests/test_adam_op.py @@ -215,6 +215,45 @@ def adam_step(inputs, attributes): return param_out, moment1_out, moment2_out +def adamw_step(inputs, attributes): + ''' + Simulate one step of the adam optimizer + :param inputs: dict of inputs + :param attributes: dict of attributes + :return tuple: tuple of output param, moment1, moment2, + 
beta1 power accumulator and beta2 power accumulator + ''' + param = inputs['Param'] + grad = inputs['Grad'] + moment1 = inputs['Moment1'] + moment2 = inputs['Moment2'] + lr = inputs['LearningRate'] + beta1_pow = inputs['Beta1Pow'] + beta2_pow = inputs['Beta2Pow'] + + epsilon = attributes['epsilon'] + coeff = attributes["coeff"] + if attributes.get("with_decay", False): + decay = 1.0 - lr * coeff + param2 = param * decay + param = param2.copy() + if 'beta1' in attributes: + beta1 = attributes['beta1'] + else: + beta1 = inputs['Beta1Tensor'][0] + if 'beta2' in attributes: + beta2 = attributes['beta2'] + else: + beta2 = inputs['Beta2Tensor'][0] + + moment1_out = beta1 * moment1 + (1 - beta1) * grad + moment2_out = beta2 * moment2 + (1 - beta2) * np.square(grad) + lr_t = lr * np.sqrt(1 - beta2_pow) / (1 - beta1_pow) + param_out = param - lr_t * (moment1_out / (np.sqrt(moment2_out) + epsilon)) + + return param_out, moment1_out, moment2_out + + def adam_step_sparse(inputs, attributes, height, rows, row_numel, np_grad, lazy_mode): ''' diff --git a/python/paddle/optimizer/adamw.py b/python/paddle/optimizer/adamw.py index 11ba49c0707a37..965785908979bb 100644 --- a/python/paddle/optimizer/adamw.py +++ b/python/paddle/optimizer/adamw.py @@ -16,9 +16,12 @@ from .adam import Adam from ..fluid import core from ..fluid import framework +from ..fluid.framework import Variable from ..fluid.dygraph import base as imperative_base import paddle +_C_ops = core.ops + __all__ = [] @@ -173,6 +176,23 @@ def __init__(self, multi_precision=multi_precision) self._default_dict = {'coeff': coeff} + self.type = "adamw" + + # now the adamw op doesn't support cuda + if core.is_compiled_with_cuda(): + self.type = "adam" + # Use _auxiliary_vars together with _set_auxiliary_var/_get_auxiliary_var to achieve that. + self._auxiliary_vars = dict() + + def _set_auxiliary_var(self, key, val): + self._auxiliary_vars[key] = val + + def _get_auxiliary_var(self, key): + if key in self._auxiliary_vars: + return self._auxiliary_vars[key] + else: + return None + def _append_decoupled_weight_decay(self, block, param_and_grad): """ Add decoupled weight decay op. @@ -228,8 +248,107 @@ def _append_decoupled_weight_decay(self, block, param_and_grad): paddle.fluid.layers.assign(input=scaled_param, output=param) def _append_optimize_op(self, block, param_and_grad): - self._append_decoupled_weight_decay(block, param_and_grad) - return super(AdamW, self)._append_optimize_op(block, param_and_grad) + if not core.is_compiled_with_npu(): + self._append_decoupled_weight_decay(block, param_and_grad) + return super(AdamW, self)._append_optimize_op(block, param_and_grad) + + assert isinstance(block, framework.Block) + if isinstance(param_and_grad, dict): + param_and_grad = self._update_param_group(param_and_grad) + param, grad = param_and_grad + + # Whether we should do weight decay for the parameter. 
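+        # Example: with apply_decay_param_fun=lambda name: "bias" not in name passed to
+        # paddle.optimizer.AdamW, the check below sets with_decay=False for every bias
+        # parameter, so those parameters get the plain Adam update without decoupled decay.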
+ with_decay = True + if self._apply_decay_param_fun is not None \ + and not self._apply_decay_param_fun(param.name): + with_decay = False + + moment1 = self._get_accumulator(self._moment1_acc_str, + param_and_grad[0]) + moment2 = self._get_accumulator(self._moment2_acc_str, + param_and_grad[0]) + beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str, + param_and_grad[0]) + beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str, + param_and_grad[0]) + find_master = self._multi_precision and param_and_grad[ + 0].dtype == core.VarDesc.VarType.FP16 + master_weight = (self._master_weights[param_and_grad[0].name] + if find_master else None) + lr = self._create_param_lr(param_and_grad) + + # create the adam optimize op + if framework.in_dygraph_mode(): + + _beta1 = self._beta1 if not isinstance( + self._beta1, Variable) else self._beta1.numpy().item(0) + _beta2 = self._beta2 if not isinstance( + self._beta2, Variable) else self._beta2.numpy().item(0) + _, _, _, _, _ = _C_ops.adam( + param_and_grad[0], param_and_grad[1], lr, moment1, moment2, + beta1_pow_acc, beta2_pow_acc, param_and_grad[0], moment1, + moment2, beta1_pow_acc, beta2_pow_acc, 'epsilon', self._epsilon, + 'lazy_mode', self._lazy_mode, 'min_row_size_to_use_multithread', + 1000, 'beta1', _beta1, 'beta2', _beta2) + + return None + + inputs = { + "Param": [param_and_grad[0]], + "Grad": [param_and_grad[1]], + "LearningRate": [lr], + "Moment1": [moment1], + "Moment2": [moment2], + "Beta1Pow": [beta1_pow_acc], + "Beta2Pow": [beta2_pow_acc], + } + + # Pass found_inf to adamw, to skip update for not only param, but also momentum and beta_pow + found_inf = self._get_auxiliary_var('found_inf') + + if found_inf: + inputs['SkipUpdate'] = found_inf + + outputs = { + "ParamOut": [param_and_grad[0]], + "Moment1Out": [moment1], + "Moment2Out": [moment2], + "Beta1PowOut": [beta1_pow_acc], + "Beta2PowOut": [beta2_pow_acc], + } + attrs = { + "lazy_mode": self._lazy_mode, + "min_row_size_to_use_multithread": 1000, + "multi_precision": find_master, + "with_decay": with_decay, + "coeff": self._coeff, + } + + if isinstance(self._beta1, Variable): + inputs['Beta1Tensor'] = self._beta1 + else: + attrs['beta1'] = self._beta1 + if isinstance(self._beta2, Variable): + inputs['Beta2Tensor'] = self._beta2 + else: + attrs['beta2'] = self._beta2 + if isinstance(self._epsilon, Variable): + inputs['EpsilonTensor'] = self._epsilon + else: + attrs['epsilon'] = self._epsilon + + if find_master: + inputs["MasterParam"] = master_weight + outputs["MasterParamOut"] = master_weight + + adamw_op = block.append_op( + type=self.type, + inputs=inputs, + outputs=outputs, + attrs=attrs, + stop_gradient=True) + + return adamw_op def _create_optimization_pass(self, parameters_and_grads): optimize_ops = super( From c7070cb8eaa59e2b715f5bec9a8c2d80e8d54631 Mon Sep 17 00:00:00 2001 From: Wangzheee <634486483@qq.com> Date: Wed, 18 Aug 2021 09:23:13 +0800 Subject: [PATCH 079/126] [Paddle-TRT] unitest_quant_dequant (#34929) * unitest_quant_dequant * fix * fix * deleted: test_trt_quant_conv2d_dequant_fuse_pass.py * fix --- .../ir/inference/inference_pass_test.py | 114 ++---- .../ir/inference/quant_dequant_test.py | 371 ++++++++++++++++++ .../ir/inference/test_trt_fc_fuse_pass.py | 52 +++ ...test_trt_quant_conv2d_dequant_fuse_pass.py | 88 ----- 4 files changed, 460 insertions(+), 165 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/quant_dequant_test.py delete mode 100644 
python/paddle/fluid/tests/unittests/ir/inference/test_trt_quant_conv2d_dequant_fuse_pass.py diff --git a/python/paddle/fluid/tests/unittests/ir/inference/inference_pass_test.py b/python/paddle/fluid/tests/unittests/ir/inference/inference_pass_test.py index 1d9f989782962f..b5a3e1a257ef6a 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/inference_pass_test.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/inference_pass_test.py @@ -28,10 +28,6 @@ from paddle.fluid.core import AnalysisConfig from paddle.fluid.core import create_paddle_predictor -from paddle.fluid.framework import IrGraph -from paddle.fluid.contrib.slim.quantization import QuantizationTransformPass -from paddle.fluid.contrib.slim.quantization import QuantizationFreezePass - class InferencePassTest(unittest.TestCase): def __init__(self, methodName='runTest'): @@ -58,25 +54,27 @@ def __init__(self, methodName='runTest'): def _get_place(self): return set([False, core.is_compiled_with_cuda()]) - def _save_models(self, executor, program, scope): + def _save_models(self, dirname, feeded_var_names, target_vars, executor, + program, scope): with fluid.scope_guard(scope): - outs = executor.run(program=program, - feed=self.feeds, - fetch_list=self.fetch_list, - return_numpy=False) # save models as combined to ensure that # there won't be too many useless files # after finishing a couple of tests. - fluid.io.save_inference_model( - dirname=self.path, - feeded_var_names=list(self.feeds.keys()), - target_vars=self.fetch_list, - executor=executor, - main_program=program) + fluid.io.save_inference_model(dirname, feeded_var_names, + target_vars, executor, program) + def _get_paddle_outs(self, executor, program, scope): + ''' + Return PaddlePaddle outputs. + ''' + with fluid.scope_guard(scope): + outs = executor.run(program=program, + feed=self.feeds, + fetch_list=self.fetch_list, + return_numpy=False) return outs - def _get_analysis_outputs(self, config): + def _get_inference_outs(self, config): ''' Return AnalysisPredictor outputs. 
''' @@ -170,113 +168,75 @@ def check_output_with_option(self, device = "GPU" if use_gpu else "CPU" with fluid.scope_guard(scope): executor.run(self.startup_program) - - if quant: - main_graph = IrGraph( - core.Graph(self.main_program.desc), for_test=True) - - transform_pass = QuantizationTransformPass( - scope=scope, - place=place, - activation_quantize_type=self.activation_quant_type, - weight_quantize_type=self.weight_quant_type, - quantizable_op_type=[ - 'conv2d', 'mul', 'depthwise_conv2d', 'conv2d_transpose' - ]) - transform_pass.apply(main_graph) - weight_scale_map = { - "conv2d": "conv2d_0.w_0.scale", - "mul": "fc_0.w_0.scale" - } - - weight_scale_tensor = scope.var(weight_scale_map[ - self.quantized_op_type]).get_tensor() - weight_scale = np.ones(self.channels).astype("float32") - weight_scale_tensor.set(weight_scale, place) - - op_nodes = main_graph.all_op_nodes() - for op_node in op_nodes: - if op_node.name() in [self.quantized_op_type, "relu"]: - op_node.op()._set_attr("out_threshold", 0.5) - - with fluid.scope_guard(scope): - executor.run(program=self.main_program, - feed=self.feeds, - fetch_list=self.fetch_list) - - freeze_pass = QuantizationFreezePass( - scope=scope, - place=place, - weight_quantize_type=self.weight_quant_type) - freeze_pass.apply(main_graph) - self.main_program = main_graph.to_program() - - outs = self._save_models(executor, self.main_program, scope) - - analysis_outputs = self._get_analysis_outputs( + self._save_models(self.path, + list(self.feeds.keys()), self.fetch_list, executor, + self.main_program, scope) + paddle_outs = self._get_paddle_outs(executor, self.main_program, scope) + inference_outs = self._get_inference_outs( self._get_analysis_config(use_gpu=use_gpu)) # Check whether the results calculated on CPU and on GPU are the same. self.assertTrue( - len(outs) == len(analysis_outputs), + len(paddle_outs) == len(inference_outs), "The number of outputs is different between inference and training forward at {}". format(device)) - for out, analysis_output in zip(outs, analysis_outputs): - out = np.array(out) + for out, inference_out in zip(paddle_outs, inference_outs): + paddle_out = np.array(out) if flatten: - out = out.flatten() - analysis_output = analysis_output.flatten() + paddle_out = paddle_out.flatten() + inference_out = inference_out.flatten() self.assertTrue( np.allclose( - out, analysis_output, atol=atol), + paddle_out, inference_out, atol=atol), "Output has diff between inference and training forward at {} ". format(device)) # Check whether the trt results and the GPU results are the same. if use_gpu and self.enable_trt: - tensorrt_outputs = self._get_analysis_outputs( + tensorrt_outputs = self._get_inference_outs( self._get_analysis_config( use_gpu=use_gpu, use_trt=self.enable_trt)) if self.trt_parameters.use_static: #deserialize - tensorrt_outputs = self._get_analysis_outputs( + tensorrt_outputs = self._get_inference_outs( self._get_analysis_config( use_gpu=use_gpu, use_trt=self.enable_trt)) self.assertTrue( - len(tensorrt_outputs) == len(outs), + len(tensorrt_outputs) == len(paddle_outs), "The number of outputs is different between GPU and TensorRT. 
") - for out, tensorrt_output in zip(outs, tensorrt_outputs): - out = np.array(out) + for paddle_out, tensorrt_output in zip(paddle_outs, + tensorrt_outputs): + paddle_out = np.array(paddle_out) if flatten: - out = out.flatten() + paddle_out = paddle_out.flatten() tensorrt_output = tensorrt_output.flatten() self.assertTrue( np.allclose( - out, tensorrt_output, rtol=rtol, atol=atol), + paddle_out, tensorrt_output, rtol=rtol, atol=atol), "Output has diff between GPU and TensorRT. ") # Check whether the mkldnn results and the CPU results are the same. if (not use_gpu) and self.enable_mkldnn: - mkldnn_outputs = self._get_analysis_outputs( + mkldnn_outputs = self._get_inference_outs( self._get_analysis_config( use_gpu=use_gpu, use_mkldnn=self.enable_mkldnn)) self.assertTrue( - len(outs) == len(mkldnn_outputs), + len(paddle_outs) == len(mkldnn_outputs), "The number of outputs is different between CPU and MKLDNN. ") if self.enable_mkldnn_bfloat16: atol = 0.01 - for out, mkldnn_output in zip(outs, mkldnn_outputs): + for paddle_out, mkldnn_output in zip(paddle_outs, mkldnn_outputs): self.assertTrue( np.allclose( - np.array(out), mkldnn_output, atol=atol), + np.array(paddle_out), mkldnn_output, atol=atol), "Output has diff between CPU and MKLDNN. ") class TensorRTParam: diff --git a/python/paddle/fluid/tests/unittests/ir/inference/quant_dequant_test.py b/python/paddle/fluid/tests/unittests/ir/inference/quant_dequant_test.py new file mode 100644 index 00000000000000..a75911232c50a1 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/quant_dequant_test.py @@ -0,0 +1,371 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import unittest +import random +import numpy as np +import six +import paddle.fluid as fluid +import paddle +import warnings +from paddle.fluid.framework import IrGraph +from paddle.fluid.contrib.slim.quantization import QuantizationTransformPass +from paddle.fluid.contrib.slim.quantization import QuantizationFreezePass +from paddle.fluid.contrib.slim.quantization import OutScaleForTrainingPass +from paddle.fluid.contrib.slim.quantization import OutScaleForInferencePass +from paddle.fluid.contrib.slim.quantization import AddQuantDequantPass +from paddle.fluid import (core, Program, Variable, program_guard, layers) +from paddle.fluid.io import prepend_feed_ops, append_fetch_ops +from inference_pass_test import InferencePassTest +from paddle.fluid.core import create_paddle_predictor +from paddle.fluid.core import AnalysisConfig + + +class QuantDequantTest(unittest.TestCase): + def __init__(self, methodName='runTest'): + super(QuantDequantTest, self).__init__(methodName) + paddle.enable_static() + self.main_program = fluid.Program() + self.startup_program = fluid.Program() + self.test_main_program = fluid.Program() + self.test_startup_program = fluid.Program() + self.feeds = None + self.fetch_list = None + self.enable_mkldnn = False + self.enable_mkldnn_bfloat16 = False + self.enable_trt = False + self.enable_tensorrt_oss = True + self.trt_parameters = None + self.dynamic_shape_params = None + self.enable_lite = False + self.lite_parameters = None + self.path = "./inference_pass/" + self.__class__.__name__ + "/" + self.data = None + self.label = None + self.result = None + np.random.seed(1) + random.seed(1) + + # from Paddle release2.1 + def _normalize_program(self, program, feed_vars, fetch_vars): + if not isinstance(program, Program): + raise TypeError( + "program type must be `fluid.Program`, but received `%s`" % + type(program)) + if not isinstance(feed_vars, list): + feed_vars = [feed_vars] + if not all(isinstance(v, Variable) for v in feed_vars): + raise TypeError( + "feed_vars type must be a Variable or a list of Variable.") + if not isinstance(fetch_vars, list): + fetch_vars = [fetch_vars] + if not all(isinstance(v, Variable) for v in fetch_vars): + raise TypeError( + "fetch_vars type must be a Variable or a list of Variable.") + + # remind users to set auc_states to 0 if auc op were found. 
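+        # Note: besides warning about auc, the loop below also clears each op's device attribute.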
+ for op in program.global_block().ops: + # clear device of Op + device_attr_name = core.op_proto_and_checker_maker.kOpDeviceAttrName( + ) + op._set_attr(device_attr_name, "") + if op.type == 'auc': + warnings.warn("Be sure that you have set auc states to 0 " + "before saving inference model.") + break + + # serialize program + copy_program = program.clone() + global_block = copy_program.global_block() + remove_op_idx = [] + for i, op in enumerate(global_block.ops): + op.desc.set_is_target(False) + if op.type == "feed" or op.type == "fetch": + remove_op_idx.append(i) + for idx in remove_op_idx[::-1]: + global_block._remove_op(idx) + copy_program.desc.flush() + + feed_var_names = [var.name for var in feed_vars] + copy_program = copy_program._prune_with_input( + feeded_var_names=feed_var_names, targets=fetch_vars) + copy_program = copy_program._inference_optimize(prune_read_op=True) + fetch_var_names = [var.name for var in fetch_vars] + prepend_feed_ops(copy_program, feed_var_names) + append_fetch_ops(copy_program, fetch_var_names) + copy_program.desc._set_version() + return copy_program + + def _save_models(self, dirname, feeded_var_names, target_vars, executor, + program, scope): + with fluid.scope_guard(scope): + fluid.io.save_inference_model(dirname, feeded_var_names, + target_vars, executor, program) + + def _get_paddle_outs(self, feed, fetch_list, executor, program, scope): + ''' + Return PaddlePaddle outputs. + ''' + with fluid.scope_guard(scope): + outs = executor.run(program=program, + feed=feed, + fetch_list=fetch_list, + return_numpy=True) + return outs + + def _get_inference_outs(self, config): + ''' + Return AnalysisPredictor outputs. + ''' + predictor = create_paddle_predictor(config) + tensor_shapes = predictor.get_input_tensor_shape() + names = predictor.get_input_names() + for i, name in enumerate(names): + shape = tensor_shapes[name] + shape[0] = 1 + tensor = predictor.get_input_tensor(name) + feed_data = list(self.feeds.values())[i] + tensor.copy_from_cpu(np.array(feed_data)) + if type(feed_data) == fluid.LoDTensor: + tensor.set_lod(feed_data.lod()) + + predictor.zero_copy_run() + + output_names = predictor.get_output_names() + outs = [ + predictor.get_output_tensor(out_name).copy_to_cpu() + for out_name in output_names + ] + return outs + + def _get_analysis_config(self, + use_gpu=False, + use_trt=False, + use_mkldnn=False): + ''' + Return a new object of AnalysisConfig. 
+ ''' + config = AnalysisConfig(self.path) + config.disable_gpu() + config.switch_specify_input_names(True) + config.switch_ir_optim(True) + config.switch_use_feed_fetch_ops(False) + if use_gpu: + config.enable_use_gpu(100, 0) + if use_trt: + config.enable_tensorrt_engine( + self.trt_parameters.workspace_size, + self.trt_parameters.max_batch_size, + self.trt_parameters.min_subgraph_size, + self.trt_parameters.precision, + self.trt_parameters.use_static, + self.trt_parameters.use_calib_mode) + + if self.dynamic_shape_params: + config.set_trt_dynamic_shape_info( + self.dynamic_shape_params.min_input_shape, + self.dynamic_shape_params.max_input_shape, + self.dynamic_shape_params.optim_input_shape, + self.dynamic_shape_params.disable_trt_plugin_fp16) + if self.enable_tensorrt_oss: + config.enable_tensorrt_oss() + + elif use_mkldnn: + config.enable_mkldnn() + if self.enable_mkldnn_bfloat16: + config.enable_mkldnn_bfloat16() + print('config summary:', config.summary()) + return config + + def check_output_with_option(self, + use_gpu, + atol=1e-5, + flatten=False, + quant=False, + rtol=1e-5): + ''' + Check whether calculating on CPU and GPU, enable TensorRT + or disable TensorRT, enable MKLDNN or disable MKLDNN + are all the same. + ''' + place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace() + executor = fluid.Executor(place) + scope = fluid.Scope() + device = "GPU" if use_gpu else "CPU" + + with fluid.scope_guard(scope): + executor.run(self.startup_program) + executor.run(self.test_startup_program) + main_graph = IrGraph(core.Graph(self.main_program.desc), for_test=False) + test_graph = IrGraph( + core.Graph(self.test_main_program.desc), for_test=True) + + transform_pass = QuantizationTransformPass( + scope=scope, + place=place, + activation_quantize_type=self.activation_quantize_type, + weight_quantize_type=self.weight_quantize_type) + transform_pass.apply(main_graph) + transform_pass.apply(test_graph) + + add_quant_dequant_pass = AddQuantDequantPass(scope=scope, place=place) + add_quant_dequant_pass.apply(main_graph) + add_quant_dequant_pass.apply(test_graph) + + scale_training_pass = OutScaleForTrainingPass(scope=scope, place=place) + scale_training_pass.apply(main_graph) + + build_strategy = fluid.BuildStrategy() + build_strategy.memory_optimize = False + build_strategy.enable_inplace = False + build_strategy.fuse_all_reduce_ops = False + binary = fluid.CompiledProgram(main_graph.graph) + + iters = 10 + batch_size = 1 + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.mnist.train(), buf_size=500), + batch_size=batch_size) + feeder = fluid.DataFeeder( + feed_list=[self.data, self.label], place=place) + with fluid.scope_guard(scope): + for _ in range(iters): + data = next(train_reader()) + loss_v = executor.run(binary, + feed=feeder.feed(data), + fetch_list=[self.loss]) + + scale_inference_pass = OutScaleForInferencePass(scope=scope) + scale_inference_pass.apply(test_graph) + + # Freeze graph for inference, but the weight of fc/conv is still float type. 
+ freeze_pass = QuantizationFreezePass( + scope=scope, + place=place, + weight_quantize_type=self.weight_quantize_type) + freeze_pass.apply(test_graph) + + self.main_program = test_graph.to_program() + + with fluid.scope_guard(scope): + self.main_program = self._normalize_program( + self.main_program, self.data, self.fetch_list) + + self._save_models(self.path, + list(self.feeds.keys()), self.fetch_list, executor, + self.main_program, scope) + + paddle_outs = self._get_paddle_outs(self.feeds, self.fetch_list, + executor, self.main_program, scope) + inference_outs = self._get_inference_outs( + self._get_analysis_config(use_gpu=use_gpu)) + + # Check whether the results calculated on CPU and on GPU are the same. + self.assertTrue( + len(paddle_outs) == len(inference_outs), + "The number of outputs is different between inference and training forward at {}". + format(device)) + + for out, inference_out in zip(paddle_outs, inference_outs): + paddle_out = np.array(out) + + if flatten: + paddle_out = paddle_out.flatten() + inference_out = inference_out.flatten() + + self.assertTrue( + np.allclose( + paddle_out, inference_out, atol=atol), + "Output has diff between inference and training forward at {} ". + format(device)) + + # Check whether the trt results and the GPU results are the same. + if use_gpu and self.enable_trt: + tensorrt_outputs = self._get_inference_outs( + self._get_analysis_config( + use_gpu=use_gpu, use_trt=self.enable_trt)) + + if self.trt_parameters.use_static: + #deserialize + tensorrt_outputs = self._get_inference_outs( + self._get_analysis_config( + use_gpu=use_gpu, use_trt=self.enable_trt)) + + self.assertTrue( + len(tensorrt_outputs) == len(paddle_outs), + "The number of outputs is different between GPU and TensorRT. ") + + for paddle_out, tensorrt_output in zip(paddle_outs, + tensorrt_outputs): + paddle_out = np.array(paddle_out) + + if flatten: + paddle_out = paddle_out.flatten() + tensorrt_output = tensorrt_output.flatten() + + self.assertTrue( + np.allclose( + paddle_out, tensorrt_output, rtol=rtol, atol=atol), + "Output has diff between GPU and TensorRT. ") + + # Check whether the mkldnn results and the CPU results are the same. + if (not use_gpu) and self.enable_mkldnn: + mkldnn_outputs = self._get_inference_outs( + self._get_analysis_config( + use_gpu=use_gpu, use_mkldnn=self.enable_mkldnn)) + + self.assertTrue( + len(paddle_outs) == len(mkldnn_outputs), + "The number of outputs is different between CPU and MKLDNN. ") + + if self.enable_mkldnn_bfloat16: + atol = 0.01 + for paddle_out, mkldnn_output in zip(paddle_outs, mkldnn_outputs): + self.assertTrue( + np.allclose( + np.array(paddle_out), mkldnn_output, atol=atol), + "Output has diff between CPU and MKLDNN. ") + + class TensorRTParam: + ''' + Prepare TensorRT subgraph engine parameters. + ''' + + def __init__(self, workspace_size, max_batch_size, min_subgraph_size, + precision, use_static, use_calib_mode): + self.workspace_size = workspace_size + self.max_batch_size = max_batch_size + self.min_subgraph_size = min_subgraph_size + self.precision = precision + self.use_static = use_static + self.use_calib_mode = use_calib_mode + + class DynamicShapeParam: + ''' + Prepare TensorRT subgraph engine dynamic shape parameters. 
+ ''' + + def __init__(self, min_input_shape, max_input_shape, optim_input_shape, + disable_trt_plugin_fp16): + self.min_input_shape = min_input_shape + self.max_input_shape = max_input_shape + self.optim_input_shape = optim_input_shape + self.disable_trt_plugin_fp16 = disable_trt_plugin_fp16 + + def quant_dequant(self): + place = fluid.CPUPlace() + exe = fluid.Executor(place) + scope = fluid.Scope() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_fc_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_fc_fuse_pass.py index cde2fa412d7050..7adfb7574825d0 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_fc_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_fc_fuse_pass.py @@ -17,9 +17,11 @@ import unittest import numpy as np from inference_pass_test import InferencePassTest +from quant_dequant_test import QuantDequantTest import paddle.fluid as fluid import paddle.fluid.core as core from paddle.fluid.core import AnalysisConfig +from paddle.fluid.core import PassVersionChecker class FCFusePassTRTTest(InferencePassTest): @@ -283,5 +285,55 @@ def test_check_output(self): self.check_output_with_option(use_gpu[i]) +class FcQuantDequantFusePassTRTTest(QuantDequantTest): + def setUp(self): + def network(): + self.data = fluid.data( + name='data', shape=[1, 28, 28], dtype='float32') + self.label = fluid.data(name='label', shape=[1, 1], dtype='int64') + fc_out = fluid.layers.fc(input=self.data, + size=10, + num_flatten_dims=1, + bias_attr=False, + act=None) + result = fluid.layers.relu(fc_out) + loss = fluid.layers.cross_entropy(input=result, label=self.label) + avg_loss = fluid.layers.mean(loss) + return avg_loss, result + + self.main_program.random_seed = 2 + self.startup_program.random_seed = 2 + self.test_main_program.random_seed = 2 + #self.test_startup_program.random_seed = 2 + with fluid.unique_name.guard(): + with fluid.program_guard(self.main_program, self.startup_program): + self.loss, result = network() + opt = fluid.optimizer.Adam(learning_rate=0.0001) + opt.minimize(self.loss) + with fluid.unique_name.guard(): + with fluid.program_guard(self.test_main_program, + self.startup_program): + network() + self.feeds = {"data": np.random.random((1, 28, 28)).astype("float32")} + self.fetch_list = [result] + + self.enable_trt = True + + self.trt_parameters = FcQuantDequantFusePassTRTTest.TensorRTParam( + 1 << 30, 32, 0, AnalysisConfig.Precision.Int8, False, False) + self.activation_quantize_type = 'moving_average_abs_max' + self.weight_quantize_type = 'channel_wise_abs_max' + + def test_check_output(self): + #self.quant_dequant() + if core.is_compiled_with_cuda(): + use_gpu = True + self.check_output_with_option( + use_gpu, atol=1e-2, flatten=False, rtol=1e-2) + self.assertTrue( + PassVersionChecker.IsCompatible( + 'quant_conv2d_dequant_fuse_pass')) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_quant_conv2d_dequant_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_quant_conv2d_dequant_fuse_pass.py deleted file mode 100644 index 8d6f9a23af3fa5..00000000000000 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_quant_conv2d_dequant_fuse_pass.py +++ /dev/null @@ -1,88 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest -import numpy as np -from inference_pass_test import InferencePassTest -import paddle.fluid as fluid -import paddle.fluid.core as core -from paddle.fluid.framework import IrGraph -from paddle.fluid.core import PassVersionChecker -from paddle.fluid.contrib.slim.quantization import QuantizationTransformPass -from paddle.fluid.contrib.slim.quantization import QuantizationFreezePass -from paddle.fluid.core import AnalysisConfig - - -class QuantDequantTest(InferencePassTest): - def setUp(self): - with fluid.program_guard(self.main_program, self.startup_program): - data = fluid.data( - name="data", shape=[-1, 3, 32, 32], dtype="float32") - param_attr = fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.0), - trainable=False) - quantized_op_out = self.append_quantized_op(data, param_attr) - relu_out = fluid.layers.relu(quantized_op_out) - self.set_quant_pattern() - - self.feeds = { - "data": np.random.random([1, 3, 32, 32]).astype("float32"), - } - self.enable_trt = True - self.trt_parameters = QuantDequantTest.TensorRTParam( - 1 << 30, 32, 0, AnalysisConfig.Precision.Int8, False, False) - self.fetch_list = [relu_out] - - def append_quantized_op(self, x, param_attr): - return fluid.layers.conv2d( - input=x, - num_filters=3, - filter_size=3, - param_attr=param_attr, - bias_attr=False, - act=None) - - def set_quant_pattern(self): - self.activation_quant_type = 'moving_average_abs_max' - self.weight_quant_type = 'channel_wise_abs_max' - self.quantized_op_type = 'conv2d' - self.channels = 3 - - def test_check_output(self): - if core.is_compiled_with_cuda(): - use_gpu = True - self.check_output_with_option(use_gpu, flatten=True, quant=True) - self.assertTrue( - PassVersionChecker.IsCompatible( - 'quant_conv2d_dequant_fuse_pass')) - - -class QuantFcDequantTest(QuantDequantTest): - def append_quantized_op(self, x, param_attr): - return fluid.layers.fc(x, - size=100, - num_flatten_dims=1, - param_attr=param_attr, - bias_attr=False, - act=None) - - def set_quant_pattern(self): - self.activation_quant_type = 'moving_average_abs_max' - self.weight_quant_type = 'abs_max' - self.quantized_op_type = 'mul' - self.channels = 1 - - -if __name__ == "__main__": - unittest.main() From 100db44fc52324949f51aecd6d95e504621b0348 Mon Sep 17 00:00:00 2001 From: Guoxia Wang Date: Wed, 18 Aug 2021 10:28:32 +0800 Subject: [PATCH 080/126] support class center sample of PartialFC (#34106) * support class center sample of PartialFC --- .../fluid/operators/class_center_sample_op.cc | 147 ++++++ .../fluid/operators/class_center_sample_op.cu | 486 ++++++++++++++++++ .../fluid/operators/class_center_sample_op.h | 114 ++++ .../fluid/tests/unittests/CMakeLists.txt | 6 +- .../unittests/parallel_class_center_sample.py | 110 ++++ .../unittests/test_class_center_sample_op.py | 222 ++++++++ .../test_parallel_class_center_sample.py | 29 ++ .../white_list/no_check_set_white_list.py | 1 + python/paddle/nn/functional/__init__.py | 4 +- python/paddle/nn/functional/common.py | 153 ++++++ tools/static_mode_white_list.py | 1 + 11 files changed, 1271 insertions(+), 2 deletions(-) create mode 100644 
paddle/fluid/operators/class_center_sample_op.cc create mode 100644 paddle/fluid/operators/class_center_sample_op.cu create mode 100644 paddle/fluid/operators/class_center_sample_op.h create mode 100644 python/paddle/fluid/tests/unittests/parallel_class_center_sample.py create mode 100644 python/paddle/fluid/tests/unittests/test_class_center_sample_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_parallel_class_center_sample.py diff --git a/paddle/fluid/operators/class_center_sample_op.cc b/paddle/fluid/operators/class_center_sample_op.cc new file mode 100644 index 00000000000000..6a1df7ec62c603 --- /dev/null +++ b/paddle/fluid/operators/class_center_sample_op.cc @@ -0,0 +1,147 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/class_center_sample_op.h" + +namespace paddle { +namespace operators { + +class ClassCenterSampleOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("Label"), "Input", "Label", + "ClassCenterSample"); + OP_INOUT_CHECK(ctx->HasOutput("RemappedLabel"), "Output", "RemappedLabel", + "ClassCenterSample"); + OP_INOUT_CHECK(ctx->HasOutput("SampledLocalClassCenter"), "Output", + "SampledLocalClassCenter", "ClassCenterSample"); + + auto x_dims = ctx->GetInputDim("Label"); + PADDLE_ENFORCE_EQ(x_dims.size(), 1, + platform::errors::InvalidArgument( + "Rank of Input(Label) should be equal to 1, " + "but the value given is %d.", + x_dims.size())); + + ctx->SetOutputDim("RemappedLabel", x_dims); + auto num_samples = ctx->Attrs().Get("num_samples"); + ctx->SetOutputDim("SampledLocalClassCenter", + framework::make_ddim({num_samples})); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "Label"), + ctx.device_context()); + } +}; + +class ClassCenterSampleOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput( + "Label", + "(Tensor) The input of ClassCenterSample op. Each value " + "of Label is an integer label."); + AddOutput("RemappedLabel", + "(Tensor) Output tensor with same shape as Label. " + "Each label is remap using sampled class."); + AddOutput("SampledLocalClassCenter", + "(Tensor) The sampled class center for local rank," + "value in [0, num_classes)."); + AddAttr( + "num_classes", + "A positive integer to specify the number of classes at local rank. 
" + "Note that num_classes of each GPU can be different."); + AddAttr( + "num_samples", + "A positive integer to specify the number of class center to sample."); + AddAttr("ring_id", "(int default 0) nccl communication ring id.") + .SetDefault(0); + AddAttr("nranks", "(int default 1) The total number of GPUs.") + .SetDefault(1); + AddAttr("rank", "(int default 0) The rank id in nranks.") + .SetDefault(0); + AddAttr("fix_seed", + "A flag indicating whether to use a fixed seed to generate " + "random negative class center. NOTE: DO NOT set this flag to" + "true in training. Setting this flag to true is only useful " + "in unittest or for debug") + .SetDefault(false); + AddAttr("seed", + "Random seed used to generate random negative class center. " + "[default 0].") + .SetDefault(0); + AddComment(R"DOC( + Class center sample method is proposed from the paper PartialFC that only sample a subset of the class centers. + The process of sampling subset class centers is straightforward: 1) First select the positive class centers; + 2) Randomly sample negative class centers. Specifically, given a Label tensor, shape [batch_size], select all + the positive class centers and randomly sample negative class centers, then remap the input label tensor using + the sampled class centers. Note that if the number of the positive class centers is greater than the input + num_samples, it keeps all the positive class centers and the shape of SampledLocalClassCenter will be + [num_positive_class_centers]. The op supports CPU, single GPU and multi GPU. + + For more information, Partial FC: Training 10 Million Identities on a Single Machine + arxiv: https://arxiv.org/abs/2010.05222 + + Examples: + For CPU or only one GPU + Given: + Label: [11, 5 , 1 , 3 , 12, 2 , 15, 19, 18, 19] + num_classes = 20 + num_samples = 6 + Then: + RemappedLabel: [4, 3, 0, 2, 5, 1, 6, 8, 7, 8] + SampledLocalClassCenter: [1 , 2 , 3 , 5 , 11, 12, 15, 18, 19] + + For multi GPU + Given: + rank0: + Label: [10, 17, 15, 11, 9 , 12, 18, 18, 17, 18, 19, 2 , 8 , 13, 11, 13, 9 , 10, 0 , 4 ] + num_classes = 10 + num_samples = 6 + ring_id = 0 + nranks = 2 + rank = 0 + rank1: + Label: [10, 17, 15, 11, 9 , 12, 18, 18, 17, 18, 19, 2 , 8 , 13, 11, 13, 9 , 10, 0 , 4 ] + num_classes = 10 + num_samples = 6 + ring_id = 0 + nranks = 2 + rank = 1 + Then: + rank0: + RemappedLabel: [6 , 11, 10, 7 , 4 , 8 , 12, 12, 11, 12, 13, 1 , 3 , 9 , 7 , 9 , 4 , 6 , 0 , 2 ] + SampledLocalClassCenter: [0, 2, 4, 8, 9, 3] + rank1: + RemappedLabel: [6 , 11, 10, 7 , 4 , 8 , 12, 12, 11, 12, 13, 1 , 3 , 9 , 7 , 9 , 4 , 6 , 0 , 2 ] + SampledLocalClassCenter: [0, 1, 2, 3, 5, 7, 8] +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; +REGISTER_OP_WITHOUT_GRADIENT(class_center_sample, ops::ClassCenterSampleOp, + ops::ClassCenterSampleOpMaker); +REGISTER_OP_CPU_KERNEL(class_center_sample, + ops::ClassCenterSampleCPUKernel, + ops::ClassCenterSampleCPUKernel); diff --git a/paddle/fluid/operators/class_center_sample_op.cu b/paddle/fluid/operators/class_center_sample_op.cu new file mode 100644 index 00000000000000..cfcfd04e6fc7c2 --- /dev/null +++ b/paddle/fluid/operators/class_center_sample_op.cu @@ -0,0 +1,486 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifdef PADDLE_WITH_HIP +#include +#include +#include +typedef hiprandState curandState; +namespace cub = hipcub; +#else +#include +#include +#include +#endif + +#include +#include +#include "paddle/fluid/operators/class_center_sample_op.h" + +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/nccl_helper.h" +#endif + +namespace paddle { +namespace operators { +#define CUDA_KERNEL_LOOP(i, n) \ + for (int32_t i = blockIdx.x * blockDim.x + threadIdx.x, \ + step = blockDim.x * gridDim.x; \ + i < (n); i += step) + +using Tensor = framework::Tensor; + +static constexpr int kNumCUDAThreads = 512; +static constexpr int kNumMaxinumNumBlocks = 4096; + +inline int32_t NumBlocks(const int32_t n) { + return std::min((n + kNumCUDAThreads - 1) / kNumCUDAThreads, + kNumMaxinumNumBlocks); +} + +template +__global__ void RandomSampleClassCenter(const int64_t n, int64_t seed, + int64_t increment, + const int64_t max_val, T* buffer) { + const int id = blockIdx.x * blockDim.x + threadIdx.x; + curandState localState; + size_t local_seed = + (static_cast(seed) + 0x9E3779B9U + + (static_cast(id) << 6U) + (static_cast(id) >> 2U)); +#ifdef PADDLE_WITH_HIP + hiprand_init(local_seed, id, increment, &localState); + CUDA_KERNEL_LOOP(i, n) { + buffer[i] = static_cast(hiprand(&localState) % max_val); + } +#else + curand_init(local_seed, id, increment, &localState); + CUDA_KERNEL_LOOP(i, n) { + buffer[i] = static_cast(curand(&localState) % max_val); + } +#endif +} + +template +__global__ void Range(const int64_t n, T* out) { + CUDA_KERNEL_LOOP(i, n) { out[i] = static_cast(i); } +} + +template +__global__ void MarkPositiveClassCenter(const int64_t n, const int64_t rank, + const T* class_interval_ptr, + const int num_classes, const T* labels, + T* out) { + CUDA_KERNEL_LOOP(i, n) { + T label = labels[i] - class_interval_ptr[rank]; + if (label >= 0 && label < num_classes) { + out[label] = label - num_classes; + } + } +} + +template +__device__ void FindIntervalIndex(const T* class_interval_ptr, + const int64_t nranks, const T value, + int64_t* find_index) { + int64_t start = 0; + int64_t end = nranks; + int64_t mid = ((end - start) >> 1) + start + 1; + while (start < end) { + if (class_interval_ptr[mid] == value) break; + if (class_interval_ptr[mid] > value) + end = mid - 1; + else + start = mid; + mid = ((end - start) >> 1) + start + 1; + } + *find_index = min(mid, end); +} + +template +__global__ void GetClassCenterBound(const int64_t n, const int64_t nranks, + const T* class_interval_ptr, + const T* key_ptr, const T* value_ptr, + T* bound_index, T* bound_value) { + CUDA_KERNEL_LOOP(i, n) { + if (i != 0) { + int64_t cur_index, pre_index; + FindIntervalIndex(class_interval_ptr, nranks, key_ptr[i], &cur_index); + FindIntervalIndex(class_interval_ptr, nranks, key_ptr[i - 1], &pre_index); + if (cur_index > pre_index) { + assert(cur_index < nranks); +#pragma unroll + for (int32_t j = pre_index + 1; j <= cur_index; ++j) { + bound_index[j] = static_cast(i); + bound_value[j] = value_ptr[i]; + } + } + } + } + 
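+  // The loop above only writes bound entries where consecutive sorted labels cross
+  // from one rank's class interval into another; the loop below initializes the
+  // remaining entries at the two ends (boundaries at or before the first label's
+  // interval, and boundaries past the last label's interval).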
CUDA_KERNEL_LOOP(i, nranks + 1) { + int64_t first_index, last_index; + FindIntervalIndex(class_interval_ptr, nranks, key_ptr[0], &first_index); + FindIntervalIndex(class_interval_ptr, nranks, key_ptr[n - 1], &last_index); + if (i <= first_index) { + bound_index[i] = 0; + bound_value[i] = value_ptr[0]; + } else if (i > last_index) { + bound_index[i] = n; + bound_value[i] = value_ptr[n - 1] + 1; + } + } +} + +template +__global__ void GetRemappedLabel(const int64_t n, const int64_t nranks, + const T* sampled_class_interval_ptr, + const T* bound_index, const T* bound_value, + const T* label_map_key, T* label_map_value, + T* mapped_label) { + CUDA_KERNEL_LOOP(i, n) { +#pragma unroll + for (int64_t j = 0; j < nranks; j++) { + if (i >= bound_index[j] && i < bound_index[j + 1]) { + label_map_value[i] = + label_map_value[i] - bound_value[j] + sampled_class_interval_ptr[j]; + } + } + mapped_label[label_map_key[i]] = label_map_value[i]; + } +} + +// aligned vector generates vectorized load/store on CUDA +template +struct alignas(sizeof(T) * Size) AlignedVector { + T val[Size]; +}; + +template +inline int VectorizedSize(const T* pointer) { + uint64_t address = reinterpret_cast(pointer); + constexpr int vec4 = std::alignment_of>::value; // NOLINT + if (address % vec4 == 0) { + return 4; + } + return 1; +} + +#undef CUDA_KERNEL_LOOP + +template +class NotEqualToPreviousAdjacentIterator { + public: + using self_type = NotEqualToPreviousAdjacentIterator; + using value_type = T; + using difference_type = std::ptrdiff_t; + using pointer = T*; + using reference = T; + using iterator_category = std::input_iterator_tag; + + public: + __host__ __device__ __forceinline__ + NotEqualToPreviousAdjacentIterator(const T* arr, int64_t offset) + : arr_(arr), offset_(offset) {} + + __host__ __device__ __forceinline__ reference operator*() const { + return offset_ == 0 ? 0 : (arr_[offset_] == arr_[offset_ - 1] ? 
0 : 1); + } + + template + __host__ __device__ __forceinline__ self_type operator+(Distance n) const { + self_type ret(arr_, offset_ + n); + return ret; + } + + template + __host__ __device__ __forceinline__ reference operator[](Distance n) const { + return *(*this + n); + } + + private: + const T* arr_; + int64_t offset_; +}; + +template +struct ActualNumSampledFunctor { + __host__ __device__ __forceinline__ T operator()(const T& a, + const T& b) const { + return max(num_samples, (b - a)); + } + T num_samples; + explicit ActualNumSampledFunctor(const T num) : num_samples(num) {} +}; + +template +class MemoryBuffer { + public: + MemoryBuffer(const int num_buffer_ele, const int num_temp_ele, + const int nranks, const platform::Place& place) { + offset1 = 0; + offset2 = offset1 + num_buffer_ele; + offset3 = offset2 + num_buffer_ele; + offset4 = offset3 + num_buffer_ele; + offset5 = offset4 + num_buffer_ele; + offset6 = offset5 + (nranks + 1); + offset7 = offset6 + (nranks + 1); + offset8 = offset7 + (nranks + 1); + offset9 = offset8 + num_temp_ele; + + buffer_ptr = buffer.mutable_data( + {4 * num_buffer_ele + 3 * (nranks + 1) + num_temp_ele}, place); + } + + T* cub_sort_keys_ptr() { return buffer_ptr + offset1; } + T* cub_sort_keys_out_ptr() { return buffer_ptr + offset2; } + T* cub_sort_values_ptr() { return buffer_ptr + offset3; } + T* cub_sort_values_out_ptr() { return buffer_ptr + offset4; } + T* bound_index_ptr() { return buffer_ptr + offset5; } + T* bound_value_ptr() { return buffer_ptr + offset6; } + T* class_interval_ptr() { return buffer_ptr + offset7; } + void* cub_temp_storage_ptr() { + return reinterpret_cast(buffer_ptr + offset8); + } + + private: + Tensor buffer; + T* buffer_ptr; + int offset1; + int offset2; + int offset3; + int offset4; + int offset5; + int offset6; + int offset7; + int offset8; + int offset9; +}; + +template +class ClassCenterSampleCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* label = ctx.Input("Label"); + auto* remapped_label = ctx.Output("RemappedLabel"); + auto* sampled_local_class_center = + ctx.Output("SampledLocalClassCenter"); + int num_classes = ctx.Attr("num_classes"); + int num_samples = ctx.Attr("num_samples"); + + int rid = ctx.Attr("ring_id"); + int nranks = ctx.Attr("nranks"); + int rank = ctx.Attr("rank"); + + int seed = ctx.Attr("seed"); + bool fix_seed = ctx.Attr("fix_seed"); + PADDLE_ENFORCE_GT(num_classes, 0, + platform::errors::InvalidArgument( + "The value 'num_classes' for Op(class_center_sample) " + "must be greater than 0, " + "but the value given is %d.", + num_classes)); + + PADDLE_ENFORCE_GT(num_samples, 0, + platform::errors::InvalidArgument( + "The value 'num_samples' for Op(class_center_sample) " + "must be greater than 0, " + "but the value given is %d.", + num_samples)); + + PADDLE_ENFORCE_LE(num_samples, num_classes, + platform::errors::InvalidArgument( + "The value 'num_samples' for Op(class_center_sample) " + "must be less than or equal to %d, " + "but the value given is %d.", + num_classes, num_samples)); + + auto& dev_ctx = ctx.template device_context(); + auto place = BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()); + + int batch_size = label->numel(); + // Algorithm: + // We first randomly generate a value in [0, num_classes) on each position + // in a array(shape[num_classes]). Then, we mark the element as negative + // value in the array according input label. 
Now, we can sort the array + // by ascending to ensure that the positive class center always in the + // front of the sorted array. So, we can get the sampled class center + // index by sorted keys. Finally, we can get the rempped label by remap + // the input label according sampled class center. + + // step 1: Calculate num classes per device using nccl all reduce + std::vector shard_dim_vec(nranks + 1, 0); + shard_dim_vec[rank + 1] = num_classes; + Tensor num_classes_per_device; + framework::TensorFromVector(shard_dim_vec, ctx.cuda_device_context(), + &num_classes_per_device); + T* num_classes_per_device_ptr = num_classes_per_device.data(); + +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + if (nranks > 1) { + const auto& comm = + platform::NCCLCommContext::Instance().Get(rid, ctx.GetPlace()); + // use global calculate stream + const auto calcu_stream = + static_cast( + platform::DeviceContextPool::Instance().Get(ctx.GetPlace())) + ->stream(); + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( + num_classes_per_device_ptr, num_classes_per_device_ptr, + num_classes_per_device.numel(), + platform::ToNCCLDataType(num_classes_per_device.type()), ncclSum, + comm->comm(), calcu_stream)); + } +#endif + + // step 2: Determine temporary device storage requirements + int num_buffer_ele = std::max(batch_size, num_classes); + size_t cub_sort_temp_store_size = 0; + PADDLE_ENFORCE_CUDA_SUCCESS((cub::DeviceRadixSort::SortPairs( + nullptr, cub_sort_temp_store_size, nullptr, nullptr, nullptr, nullptr, + num_buffer_ele, 0, sizeof(T) * 8, ctx.cuda_device_context().stream()))); + + size_t cub_sum_temp_store_size = 0; + NotEqualToPreviousAdjacentIterator unique_counting_iter_temp(nullptr, 0); + PADDLE_ENFORCE_CUDA_SUCCESS( + (cub::DeviceScan::InclusiveSum, + T*>( + nullptr, cub_sum_temp_store_size, unique_counting_iter_temp, + nullptr, batch_size, ctx.cuda_device_context().stream()))); + + size_t cub_scan_temp_store_size = 0; + ActualNumSampledFunctor actual_num_sampled_op_temp(num_samples); + PADDLE_ENFORCE_CUDA_SUCCESS((cub::DeviceScan::InclusiveScan( + nullptr, cub_scan_temp_store_size, num_classes_per_device_ptr, + num_classes_per_device_ptr, actual_num_sampled_op_temp, nranks + 1, + ctx.cuda_device_context().stream()))); + + size_t cub_temp_storage_bytes = + std::max(std::max(cub_sort_temp_store_size, cub_scan_temp_store_size), + cub_sum_temp_store_size); + int num_temp_ele = cub_temp_storage_bytes / sizeof(T) + 1; + + // step 3: Alloc buffer memory so that we can reuse allocated memory + MemoryBuffer memory_buffer = + MemoryBuffer(num_buffer_ele, num_temp_ele, nranks, ctx.GetPlace()); + + T* cub_sort_keys_ptr = memory_buffer.cub_sort_keys_ptr(); + T* cub_sort_keys_out_ptr = memory_buffer.cub_sort_keys_out_ptr(); + T* cub_sort_values_ptr = memory_buffer.cub_sort_values_ptr(); + T* cub_sort_values_out_ptr = memory_buffer.cub_sort_values_out_ptr(); + T* bound_index_ptr = memory_buffer.bound_index_ptr(); + T* bound_value_ptr = memory_buffer.bound_value_ptr(); + T* class_interval_ptr = memory_buffer.class_interval_ptr(); + void* cub_temp_storage_ptr = memory_buffer.cub_temp_storage_ptr(); + + // step 4: Calculate class interval among nranks + PADDLE_ENFORCE_CUDA_SUCCESS((cub::DeviceScan::InclusiveSum( + cub_temp_storage_ptr, cub_temp_storage_bytes, + num_classes_per_device_ptr, class_interval_ptr, nranks + 1, + ctx.cuda_device_context().stream()))); + + // step 5: random sample negative class center + int vec_size = VectorizedSize(cub_sort_keys_ptr); + int increment = 
((num_classes - 1) / + (NumBlocks(num_classes) * kNumCUDAThreads * vec_size) + + 1) * + vec_size; + if (!fix_seed) { + std::random_device rnd; + seed = rnd(); + } + RandomSampleClassCenter<<>>( + num_classes, seed + rank, increment, num_classes, cub_sort_keys_ptr); + + // step 6: mark positive class center as negative value + // fill the sort values to index 0, 1, ..., batch_size-1 + MarkPositiveClassCenter<<>>( + batch_size, rank, class_interval_ptr, num_classes, label->data(), + cub_sort_keys_ptr); + Range<<>>(num_buffer_ele, + cub_sort_values_ptr); + + // step 7: sort class center by ascending, so that positive class center + // always be sampled. + PADDLE_ENFORCE_CUDA_SUCCESS((cub::DeviceRadixSort::SortPairs( + cub_temp_storage_ptr, cub_temp_storage_bytes, cub_sort_keys_ptr, + cub_sort_keys_out_ptr, cub_sort_values_ptr, cub_sort_values_out_ptr, + num_classes, 0, sizeof(T) * 8, ctx.cuda_device_context().stream()))); + + // step 8: sort input label ascending + PADDLE_ENFORCE_CUDA_SUCCESS((cub::DeviceRadixSort::SortPairs( + cub_temp_storage_ptr, cub_temp_storage_bytes, label->data(), + cub_sort_keys_out_ptr, cub_sort_values_ptr, cub_sort_keys_ptr, + batch_size, 0, sizeof(T) * 8, ctx.cuda_device_context().stream()))); + + // step 9: Calculate new index using InclusiveSum on ascending sorted input + // label + NotEqualToPreviousAdjacentIterator unique_counting_iter( + cub_sort_keys_out_ptr, 0); + PADDLE_ENFORCE_CUDA_SUCCESS((cub::DeviceScan::InclusiveSum< + NotEqualToPreviousAdjacentIterator, T*>( + cub_temp_storage_ptr, cub_temp_storage_bytes, unique_counting_iter, + cub_sort_values_ptr, batch_size, ctx.cuda_device_context().stream()))); + + // step 10: Calculate new class center bound among ranks + GetClassCenterBound<<>>( + batch_size, nranks, class_interval_ptr, cub_sort_keys_out_ptr, + cub_sort_values_ptr, bound_index_ptr, bound_value_ptr); + + // step 11: Calculate actual number of sampled class per device. + // Since maybe num_positive_class_center > num_samples, + // we need to ensure all positive class center per device are sampled. 
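    // Illustrative note (editorial, not part of the original patch), a
    // small worked example of step 11 assuming nranks = 2 and
    // num_samples = 6: if rank 0 holds 4 positive class centers for this
    // batch and rank 1 holds 9, each device samples
    // max(num_samples, num_positive) centers, i.e. 6 on rank 0
    // (4 positive + 2 random negative) and all 9 on rank 1 (no negatives
    // added), so no positive class center is ever dropped.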
+ ActualNumSampledFunctor actual_num_sampled_op(num_samples); + PADDLE_ENFORCE_CUDA_SUCCESS((cub::DeviceScan::InclusiveScan( + cub_temp_storage_ptr, cub_temp_storage_bytes, bound_value_ptr, + num_classes_per_device_ptr, actual_num_sampled_op, nranks + 1, + ctx.cuda_device_context().stream()))); + + // step 12: Calculate actual sampled class interval among nranks + PADDLE_ENFORCE_CUDA_SUCCESS((cub::DeviceScan::InclusiveSum( + cub_temp_storage_ptr, cub_temp_storage_bytes, + num_classes_per_device_ptr, class_interval_ptr, nranks + 1, + ctx.cuda_device_context().stream()))); + + // step 13: Get remapped label for output + GetRemappedLabel<<>>( + batch_size, nranks, class_interval_ptr, bound_index_ptr, + bound_value_ptr, cub_sort_keys_ptr, cub_sort_values_ptr, + remapped_label->mutable_data(ctx.GetPlace())); + + // step 14: Get sampled class center for output + framework::TensorCopySync(num_classes_per_device, platform::CPUPlace(), + &num_classes_per_device); + T actual_num_samples = num_classes_per_device.data()[rank + 1]; + T* sampled_local_class_center_ptr = + sampled_local_class_center->mutable_data({actual_num_samples}, + ctx.GetPlace()); + memory::Copy(place, sampled_local_class_center_ptr, place, + cub_sort_values_out_ptr, actual_num_samples * sizeof(T), + nullptr); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + class_center_sample, + ops::ClassCenterSampleCUDAKernel, + ops::ClassCenterSampleCUDAKernel); diff --git a/paddle/fluid/operators/class_center_sample_op.h b/paddle/fluid/operators/class_center_sample_op.h new file mode 100644 index 00000000000000..24ce9ace3bf113 --- /dev/null +++ b/paddle/fluid/operators/class_center_sample_op.h @@ -0,0 +1,114 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
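//
// Illustrative example (editorial note; the concrete values below are
// made up): with num_classes = 10, num_samples = 4 and Label = [2, 7, 2],
// the positive class centers {2, 7} are always kept and two extra
// negative centers are sampled at random (say 0 and 5), so
// SampledLocalClassCenter becomes [2, 7, 0, 5] and RemappedLabel becomes
// [0, 1, 0] (each label replaced by the index of its class center in the
// sampled list).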
+ +#pragma once +#include +#include +#include +#include "paddle/fluid/framework/generator.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +template +class ClassCenterSampleCPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* label = ctx.Input("Label"); + auto* remapped_label = ctx.Output("RemappedLabel"); + auto* sampled_local_class_center = + ctx.Output("SampledLocalClassCenter"); + int num_classes = ctx.Attr("num_classes"); + int num_samples = ctx.Attr("num_samples"); + + int seed = ctx.Attr("seed"); + bool fix_seed = ctx.Attr("fix_seed"); + PADDLE_ENFORCE_GT(num_classes, 0, + platform::errors::InvalidArgument( + "The value 'num_classes' for Op(class_center_sample) " + "must be greater than 0, " + "but the value given is %d.", + num_classes)); + + PADDLE_ENFORCE_GT(num_samples, 0, + platform::errors::InvalidArgument( + "The value 'num_samples' for Op(class_center_sample) " + "must be greater than 0, " + "but the value given is %d.", + num_samples)); + + PADDLE_ENFORCE_LE(num_samples, num_classes, + platform::errors::InvalidArgument( + "The value 'num_samples' for Op(class_center_sample) " + "must be less than or equal to %d, " + "but the value given is %d.", + num_classes, num_samples)); + + int64_t numel = label->numel(); + auto* label_ptr = label->data(); + + // get unique positive class center by ascending + std::set> unique_label; + for (int64_t i = 0; i < numel; ++i) { + unique_label.insert(label_ptr[i]); + } + + // constrcut a lookup table and get sampled_local_class_center + std::vector actual_sampled; + std::map new_class_dict; + T idx = 0; + for (auto& t : unique_label) { + new_class_dict[t] = idx; + actual_sampled.push_back(t); + idx++; + } + + if (!fix_seed) { + std::random_device rnd; + seed = rnd(); + } + std::uniform_int_distribution dist(0, num_classes - 1); + auto engine = framework::GetCPURandomEngine(seed); + // sample negative class center randomly + while (unique_label.size() < static_cast(num_samples)) { + T neg = dist(*engine); + if (unique_label.find(neg) == unique_label.end()) { + unique_label.insert(neg); + // unorder for negative class center + actual_sampled.push_back(neg); + } + } + + int actual_num_samples = unique_label.size(); + T* sampled_local_class_center_ptr = + sampled_local_class_center->mutable_data({actual_num_samples}, + ctx.GetPlace()); + idx = 0; + for (auto& t : actual_sampled) { + sampled_local_class_center_ptr[idx] = t; + idx++; + } + + // remap the input label to sampled class + auto* remmaped_label_ptr = remapped_label->mutable_data(ctx.GetPlace()); + for (int64_t i = 0; i < numel; ++i) { + remmaped_label_ptr[i] = new_class_dict[label_ptr[i]]; + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 9d8b5fb699e33a..5b9a37cfb6231d 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -1,4 +1,5 @@ -file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") +file(GLOB TEST_OPS RELATIVE +"${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") set(GC_ENVS FLAGS_eager_delete_tensor_gb=0.0 FLAGS_fast_eager_deletion_mode=1 FLAGS_memory_fraction_of_eager_deletion=1.0) set(dist_ENVS http_proxy="" https_proxy="") @@ -28,6 +29,7 @@ list(APPEND 
DIST_TEST_OPS test_parallel_dygraph_pipeline_parallel) list(APPEND DIST_TEST_OPS test_parallel_dygraph_tensor_parallel) list(APPEND DIST_TEST_OPS test_parallel_dygraph_sharding_parallel) list(APPEND DIST_TEST_OPS test_parallel_dygraph_mp_layers) +list(APPEND DIST_TEST_OPS test_parallel_class_center_sample) list(APPEND DIST_TEST_OPS test_parallel_margin_cross_entropy) set(MIXED_DIST_TEST_OPS ${DIST_TEST_OPS}) #remove distribute unittests. @@ -196,6 +198,7 @@ if ((NOT WITH_GPU) AND (NOT WITH_ROCM)) LIST(REMOVE_ITEM TEST_OPS test_mixed_precision) LIST(REMOVE_ITEM TEST_OPS test_fleet_base_single) LIST(REMOVE_ITEM TEST_OPS test_dygraph_recompute) + list(REMOVE_ITEM TEST_OPS test_parallel_class_center_sample) LIST(REMOVE_ITEM TEST_OPS test_parallel_margin_cross_entropy) elseif(WITH_GPU) if (${CUDNN_VERSION} VERSION_LESS 7100) @@ -908,6 +911,7 @@ if(WITH_DISTRIBUTE AND WITH_GPU AND WITH_NCCL) set_tests_properties(test_parallel_dygraph_tensor_parallel PROPERTIES TIMEOUT 200) set_tests_properties(test_parallel_dygraph_sharding_parallel PROPERTIES TIMEOUT 120) set_tests_properties(test_parallel_dygraph_mp_layers PROPERTIES TIMEOUT 120) + set_tests_properties(test_parallel_class_center_sample PROPERTIES TIMEOUT 120) set_tests_properties(test_parallel_margin_cross_entropy PROPERTIES TIMEOUT 120) if(${NCCL_VERSION} VERSION_GREATER_EQUAL 2212) set_tests_properties(test_parallel_dygraph_sparse_embedding PROPERTIES TIMEOUT 120) diff --git a/python/paddle/fluid/tests/unittests/parallel_class_center_sample.py b/python/paddle/fluid/tests/unittests/parallel_class_center_sample.py new file mode 100644 index 00000000000000..e1126138eac841 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/parallel_class_center_sample.py @@ -0,0 +1,110 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
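# Editorial note: this script is not meant to be run directly; it is
# launched on two GPUs by test_parallel_class_center_sample.py (via the
# TestMultipleGpus harness) and checks the distributed results of
# paddle.nn.functional.class_center_sample against the pure numpy
# reference implemented in class_center_sample_numpy() below.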
+ +from __future__ import division +from __future__ import print_function + +import unittest + +import paddle +import numpy as np +import random +import paddle.distributed as dist +import paddle.fluid as fluid +import paddle.distributed.fleet as fleet +from paddle import framework + + +def set_random_seed(seed): + """Set random seed for reproducability.""" + random.seed(seed) + np.random.seed(seed) + paddle.seed(seed) + fleet.meta_parallel.model_parallel_random_seed(seed) + + +def class_center_sample_numpy(label, classes_list, num_samples): + unique_label = np.unique(label) + nranks = len(classes_list) + class_interval = np.cumsum(np.insert(classes_list, 0, 0)) + pos_class_center_per_device = [] + unique_label_per_device = [] + + for i in range(nranks): + index = np.logical_and(unique_label >= class_interval[i], + unique_label < class_interval[i + 1]) + pos_class_center_per_device.append(unique_label[index] - class_interval[ + i]) + unique_label_per_device.append(unique_label[index]) + + num_samples_per_device = [] + for pos_class_center in pos_class_center_per_device: + num_samples_per_device.append(max(len(pos_class_center), num_samples)) + sampled_class_interval = np.cumsum(np.insert(num_samples_per_device, 0, 0)) + + remapped_dict = {} + for i in range(nranks): + for idx, v in enumerate(unique_label_per_device[i], + sampled_class_interval[i]): + remapped_dict[v] = idx + + remapped_label = [] + for l in label: + remapped_label.append(remapped_dict[l]) + + return remapped_label, pos_class_center_per_device + + +class TestParallelClassCenterSampleOp(unittest.TestCase): + def setUp(self): + strategy = fleet.DistributedStrategy() + fleet.init(is_collective=True, strategy=strategy) + + def test_class_center_sample(self): + + rank_id = dist.get_rank() + nranks = dist.get_world_size() + + seed = 1025 + set_random_seed(seed) + paddle.seed(rank_id * 10) + random.seed(seed) + np.random.seed(seed) + + batch_size = 20 + num_samples = 6 + + for dtype in ('int32', 'int64'): + for _ in range(5): + classes_list = np.random.randint(10, 15, (nranks, )) + num_class = np.sum(classes_list) + + np_label = np.random.randint( + 0, num_class, (batch_size, ), dtype=dtype) + label = paddle.to_tensor(np_label, dtype=dtype) + np_remapped_label, np_sampled_class_center_per_device = class_center_sample_numpy( + np_label, classes_list, num_samples) + remapped_label, sampled_class_index = paddle.nn.functional.class_center_sample( + label, classes_list[rank_id], num_samples) + np.testing.assert_allclose(remapped_label.numpy(), + np_remapped_label) + np_sampled_class_index = np_sampled_class_center_per_device[ + rank_id] + np.testing.assert_allclose( + sampled_class_index.numpy()[:len(np_sampled_class_index)], + np_sampled_class_index) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_class_center_sample_op.py b/python/paddle/fluid/tests/unittests/test_class_center_sample_op.py new file mode 100644 index 00000000000000..752ca307dd81ee --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_class_center_sample_op.py @@ -0,0 +1,222 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import math +import random +import paddle +import paddle.fluid.core as core +from op_test import OpTest +from paddle.fluid import Program, program_guard + + +def class_center_sample_numpy(label, classes_list, num_samples): + unique_label = np.unique(label) + nranks = len(classes_list) + class_interval = np.cumsum(np.insert(classes_list, 0, 0)) + pos_class_center_per_device = [] + unique_label_per_device = [] + + for i in range(nranks): + index = np.logical_and(unique_label >= class_interval[i], + unique_label < class_interval[i + 1]) + pos_class_center_per_device.append(unique_label[index] - class_interval[ + i]) + unique_label_per_device.append(unique_label[index]) + + num_samples_per_device = [] + for pos_class_center in pos_class_center_per_device: + num_samples_per_device.append(max(len(pos_class_center), num_samples)) + sampled_class_interval = np.cumsum(np.insert(num_samples_per_device, 0, 0)) + + remapped_dict = {} + for i in range(nranks): + for idx, v in enumerate(unique_label_per_device[i], + sampled_class_interval[i]): + remapped_dict[v] = idx + + remapped_label = [] + for l in label: + remapped_label.append(remapped_dict[l]) + + return np.array(remapped_label), np.array(pos_class_center_per_device) + + +class TestClassCenterSampleOp(OpTest): + def initParams(self): + self.op_type = "class_center_sample" + self.batch_size = 20 + self.num_samples = 6 + self.num_classes = 10 + self.seed = 2021 + + def init_dtype(self): + self.dtype = np.int64 + + def init_fix_seed(self): + self.fix_seed = True + + def setUp(self): + self.initParams() + self.init_dtype() + self.init_fix_seed() + label = np.random.randint( + 0, self.num_classes, (self.batch_size, ), dtype=self.dtype) + + remapped_label, sampled_class_center = class_center_sample_numpy( + label, [self.num_classes], self.num_samples) + + self.inputs = {'Label': label} + self.outputs = { + 'RemappedLabel': remapped_label.astype(self.dtype), + 'SampledLocalClassCenter': sampled_class_center.astype(self.dtype) + } + + self.attrs = { + 'num_classes': self.num_classes, + 'num_samples': self.num_samples, + 'seed': self.seed, + 'fix_seed': self.fix_seed, + } + + def test_check_output(self): + self.check_output(no_check_set=['SampledLocalClassCenter']) + + +class TestClassCenterSampleOpINT32(TestClassCenterSampleOp): + def init_dtype(self): + self.dtype = np.int32 + + +class TestClassCenterSampleOpFixSeed(TestClassCenterSampleOp): + def init_fix_seed(self): + self.fix_seed = True + + +class TestClassCenterSampleV2(unittest.TestCase): + def setUp(self): + self.initParams() + np.random.seed(self.seed) + paddle.framework.random._manual_program_seed(2021) + self.places = [paddle.fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + self.places.append(paddle.fluid.CUDAPlace(0)) + + def initParams(self): + self.batch_size = 10 + self.num_samples = 6 + self.num_classes = 20 + self.seed = 0 + self.init_dtype() + + def init_dtype(self): + self.dtype = np.int64 + + def test_static(self): + for place in self.places: + self.check_static_result(place=place) + + def 
check_static_result(self, place): + with program_guard(Program(), Program()): + label_np = np.random.randint( + 0, self.num_classes, (self.batch_size, ), dtype=self.dtype) + + label = paddle.static.data( + name='label', shape=[self.batch_size], dtype=self.dtype) + remapped_label, sampled_class_index = paddle.nn.functional.class_center_sample( + label, self.num_classes, self.num_samples, seed=self.seed) + + remapped_label_np, sampled_class_center_np = class_center_sample_numpy( + label_np, [self.num_classes], self.num_samples) + exe = paddle.fluid.Executor(place) + [remapped_label_res, sampled_class_index_res] = exe.run( + paddle.fluid.default_main_program(), + feed={'label': label_np}, + fetch_list=[remapped_label, sampled_class_index]) + np.testing.assert_allclose(remapped_label_res, remapped_label_np) + np.testing.assert_allclose( + sampled_class_index_res[:len(sampled_class_center_np[0])], + sampled_class_center_np[0]) + + def test_dynamic(self): + for place in self.places: + self.check_dynamic_result(place=place) + + def check_dynamic_result(self, place): + with paddle.fluid.dygraph.guard(place): + label_np = np.random.randint( + 0, self.num_classes, (self.batch_size, ), dtype=self.dtype) + label = paddle.to_tensor(label_np, dtype=self.dtype) + + remapped_label, sampled_class_index = paddle.nn.functional.class_center_sample( + label, self.num_classes, self.num_samples, seed=self.seed) + + remapped_label_np, sampled_class_center_np = class_center_sample_numpy( + label_np, [self.num_classes], self.num_samples) + + remapped_label_res = remapped_label.numpy() + sampled_class_index_res = sampled_class_index.numpy() + np.testing.assert_allclose(remapped_label_res, remapped_label_np) + np.testing.assert_allclose( + sampled_class_index_res[:len(sampled_class_center_np[0])], + sampled_class_center_np[0]) + + +class TestClassCenterSampleV2INT32(TestClassCenterSampleV2): + def init_dtype(self): + self.dtype = np.int32 + + +class TestClassCenterSampleAPIError(unittest.TestCase): + def setUp(self): + self.initParams() + np.random.seed(self.seed) + self.places = [paddle.fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + self.places.append(paddle.fluid.CUDAPlace(0)) + + def initParams(self): + self.batch_size = 20 + self.num_samples = 15 + self.num_classes = 10 + self.seed = 2021 + self.init_dtype() + + def init_dtype(self): + self.dtype = np.int64 + + def test_dynamic_errors(self): + def test_num_samples(): + for place in self.places: + with paddle.fluid.dygraph.guard(place): + label_np = np.random.randint( + 0, + self.num_classes, (self.batch_size, ), + dtype=self.dtype) + label = paddle.to_tensor(label_np) + + remapped_label, sampled_class_index = paddle.nn.functional.class_center_sample( + label, + self.num_classes, + self.num_samples, + seed=self.seed) + + self.assertRaises(ValueError, test_num_samples) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_class_center_sample.py b/python/paddle/fluid/tests/unittests/test_parallel_class_center_sample.py new file mode 100644 index 00000000000000..19fc617ea25cd5 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_parallel_class_center_sample.py @@ -0,0 +1,29 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import paddle.fluid as fluid + +from test_parallel_dygraph_dataparallel import TestMultipleGpus + + +class TestParallelClassCenterSample(TestMultipleGpus): + def test_parallel_class_center_sample(self): + self.run_mnist_2gpu('parallel_class_center_sample.py') + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/white_list/no_check_set_white_list.py b/python/paddle/fluid/tests/unittests/white_list/no_check_set_white_list.py index 32ac4f412a8f5a..2492caff2f91ef 100644 --- a/python/paddle/fluid/tests/unittests/white_list/no_check_set_white_list.py +++ b/python/paddle/fluid/tests/unittests/white_list/no_check_set_white_list.py @@ -31,4 +31,5 @@ 'rnn', 'fusion_lstm', 'softmax_with_cross_entropy', + 'class_center_sample', ] diff --git a/python/paddle/nn/functional/__init__.py b/python/paddle/nn/functional/__init__.py index 04e0b7c140d7fa..e10f0f1686dc5c 100644 --- a/python/paddle/nn/functional/__init__.py +++ b/python/paddle/nn/functional/__init__.py @@ -55,6 +55,7 @@ from .common import interpolate # noqa: F401 from .common import upsample # noqa: F401 from .common import bilinear # noqa: F401 +from .common import class_center_sample # noqa: F401 from .conv import conv1d # noqa: F401 from .conv import conv1d_transpose # noqa: F401 from .common import linear # noqa: F401 @@ -200,5 +201,6 @@ 'temporal_shift', 'batch_norm', 'layer_norm', - 'instance_norm' + 'instance_norm', + 'class_center_sample', ] diff --git a/python/paddle/nn/functional/common.py b/python/paddle/nn/functional/common.py index 4bc137222d2efa..aee8ea2a3cc59a 100644 --- a/python/paddle/nn/functional/common.py +++ b/python/paddle/nn/functional/common.py @@ -1564,3 +1564,156 @@ def label_smooth(label, prior_dist=None, epsilon=0.1, name=None): outputs={"Out": smooth_label}, attrs={"epsilon": float(epsilon)}) return smooth_label + + +def class_center_sample(label, num_classes, num_samples, group=None, seed=None): + """ + Class center sample method is proposed from the paper PartialFC that only sample a subset of the class centers. + The process of sampling subset class centers is straightforward: + + 1. First select the positive class centers; + 2. Then randomly sample negative class centers. + + Specifically, given a label tensor, shape [batch_size], select all the positive class centers and randomly + sample negative class centers, then remap the input label tensor using the sampled class centers. + + For more information, Partial FC: Training 10 Million Identities on a Single Machine + arxiv: https://arxiv.org/abs/2010.05222 + + .. hint:: + If the number of the positive class centers is greater than the input num_samples, it keeps all the positive + class centers and the shape of sampled_class_center will be [num_positive_class_centers]. + + The API supports CPU, single GPU and multi GPU. + + Args: + label (Tensor): 1-D tensor with shape [N], each label in [0, num_classes) + num_classes (int): A positive integer to specify the number of classes at local rank. + Note that num_classes of each GPU can be different. 
+ num_samples (int): A positive integer to specify the number of class center to sample. + group (Group, optional): The abstract representation of group. + See paddle.distributed.collective.Group. Default is ``None``. + seed (int, optional): Random seed. Default is ``None``. + + Returns: + Tuple of two ``Tensor`` : (remapped_label, sampled_class_center), remapped label using sampled class center, + sampled class center from [0, num_classes). + + Examples: + + .. code-block:: python + + # CPU or single GPU + import paddle + num_classes = 20 + batch_size = 10 + num_samples = 6 + label = paddle.randint(low=0, high=num_classes, shape=[batch_size], dtype='int64') + remapped_label, sampled_class_index = paddle.nn.functional.class_center_sample(label, num_classes, num_samples) + + print(label) + print(remapped_label) + print(sampled_class_index) + + # the output is + #Tensor(shape=[10], dtype=int64, place=CPUPlace, stop_gradient=True, + # [11, 5 , 1 , 3 , 12, 2 , 15, 19, 18, 19]) + #Tensor(shape=[10], dtype=int64, place=CPUPlace, stop_gradient=True, + # [4, 3, 0, 2, 5, 1, 6, 8, 7, 8]) + #Tensor(shape=[9], dtype=int64, place=CPUPlace, stop_gradient=True, + # [1 , 2 , 3 , 5 , 11, 12, 15, 18, 19]) + + .. code-block:: python + + # required: distributed + # Multi GPU, test_class_center_sample.py + import paddle + import paddle.distributed as dist + strategy = dist.fleet.DistributedStrategy() + dist.fleet.init(is_collective=True, strategy=strategy) + batch_size = 10 + num_samples = 6 + rank_id = dist.get_rank() + # num_classes of each GPU can be different, e.g num_classes_list = [10, 8] + num_classes_list = [10, 10] + num_classes = paddle.sum(paddle.to_tensor(num_classes_list)) + label = paddle.randint(low=0, high=num_classes.item(), shape=[batch_size], dtype='int64') + label_list = [] + dist.all_gather(label_list, label) + label = paddle.concat(label_list, axis=0) + remapped_label, sampled_class_index = paddle.nn.functional.class_center_sample(label, num_classes_list[rank_id], num_samples) + + print(label) + print(remapped_label) + print(sampled_class_index) + + #python -m paddle.distributed.launch --gpus=0,1 test_class_center_sample.py + # rank 0 output: + #Tensor(shape=[20], dtype=int64, place=CUDAPlace(0), stop_gradient=True, + # [10, 17, 15, 11, 9 , 12, 18, 18, 17, 18, 19, 2 , 8 , 13, 11, 13, 9 , 10, 0 , 4 ]) + #Tensor(shape=[20], dtype=int64, place=CUDAPlace(0), stop_gradient=True, + # [6 , 11, 10, 7 , 4 , 8 , 12, 12, 11, 12, 13, 1 , 3 , 9 , 7 , 9 , 4 , 6 , 0 , 2 ]) + #Tensor(shape=[6], dtype=int64, place=CUDAPlace(0), stop_gradient=True, + # [0, 2, 4, 8, 9, 3]) + + # rank 1 output: + #Tensor(shape=[20], dtype=int64, place=CUDAPlace(1), stop_gradient=True, + # [10, 17, 15, 11, 9 , 12, 18, 18, 17, 18, 19, 2 , 8 , 13, 11, 13, 9 , 10, 0 , 4 ]) + #Tensor(shape=[20], dtype=int64, place=CUDAPlace(1), stop_gradient=True, + # [6 , 11, 10, 7 , 4 , 8 , 12, 12, 11, 12, 13, 1 , 3 , 9 , 7 , 9 , 4 , 6 , 0 , 2 ]) + #Tensor(shape=[7], dtype=int64, place=CUDAPlace(1), stop_gradient=True, + # [0, 1, 2, 3, 5, 7, 8]) + """ + if group is not None and not group.is_member(): + return + + ring_id = 0 if group is None else group.id + rank = 0 + nranks = 1 + if core.is_compiled_with_dist(): + parallel_env = paddle.distributed.ParallelEnv() + global_rank = parallel_env.rank + rank = global_rank if group is None else group.get_group_rank( + global_rank) + nranks = parallel_env.world_size if group is None else group.nranks + + if num_samples > num_classes: + raise ValueError( + 'Expected num_samples less than or equal to {}, 
got num_samples {}'. + format(num_classes, num_samples)) + + if (seed is None or seed == 0) and default_main_program().random_seed != 0: + seed = default_main_program().random_seed + + if in_dygraph_mode(): + remapped_label, sampled_class_center = core.ops.class_center_sample( + label, 'num_classes', num_classes, 'num_samples', num_samples, + 'ring_id', ring_id, 'nranks', nranks, 'rank', rank, 'fix_seed', + seed is not None, 'seed', seed if seed is not None else 0) + return remapped_label, sampled_class_center + + check_variable_and_dtype(label, 'label', ['int64', 'int32'], + 'class_center_sample') + op_type = 'class_center_sample' + helper = LayerHelper(op_type, **locals()) + remapped_label = helper.create_variable_for_type_inference( + dtype=label.dtype) + sampled_class_center = helper.create_variable_for_type_inference( + dtype=label.dtype) + helper.append_op( + type=op_type, + inputs={'Label': label}, + outputs={ + 'RemappedLabel': remapped_label, + 'SampledLocalClassCenter': sampled_class_center + }, + attrs={ + 'num_classes': num_classes, + 'num_samples': num_samples, + 'ring_id': ring_id, + 'nranks': nranks, + 'rank': rank, + 'fix_seed': seed is not None, + 'seed': seed if seed is not None else 0 + }) + return remapped_label, sampled_class_center diff --git a/tools/static_mode_white_list.py b/tools/static_mode_white_list.py index d2f95c235b04c1..4255e1f4e440d0 100644 --- a/tools/static_mode_white_list.py +++ b/tools/static_mode_white_list.py @@ -719,5 +719,6 @@ 'test_sgd_op_bf16', 'test_marker_op', 'test_c_embedding_op', + 'test_class_center_sample_op', 'test_margin_cross_entropy_op', ] From 52a7b0c4e893d1444ac6067ea7f6b0e31f484cb5 Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Wed, 18 Aug 2021 10:54:49 +0800 Subject: [PATCH 081/126] [NPU] add retry on HcclGetRootInfo to fix "bind fail" (#34977) * add retry for HcclGetRootInfo * refine code * reduce retry interval --- .../operators/collective/c_gen_hccl_id_op.cc | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/collective/c_gen_hccl_id_op.cc b/paddle/fluid/operators/collective/c_gen_hccl_id_op.cc index 9ab7d90efaa9f3..63a783720e01cf 100644 --- a/paddle/fluid/operators/collective/c_gen_hccl_id_op.cc +++ b/paddle/fluid/operators/collective/c_gen_hccl_id_op.cc @@ -32,9 +32,24 @@ namespace operators { #ifdef PADDLE_WITH_ASCEND_CL static void GenHCCLID(std::vector* hccl_ids) { + constexpr int timeout = 2 * 60 + 10; // 2MSL+10s + constexpr int retry_time = 1; for (size_t i = 0; i < hccl_ids->size(); ++i) { - PADDLE_ENFORCE_NPU_SUCCESS( - platform::dynload::HcclGetRootInfo(&(*hccl_ids)[i])); + bool failed = true; + for (auto retry_times = 0; retry_times * retry_time < timeout; + ++retry_times) { + auto err = platform::dynload::HcclGetRootInfo(&(*hccl_ids)[i]); + if (err == 0) { + failed = false; + break; + } + std::this_thread::sleep_for(std::chrono::seconds(retry_time)); + LOG(WARNING) << "HcclGetRootInfo failed, err is: " << err << ", retry " + << retry_times << " times"; + } + if (failed) { + PADDLE_THROW(platform::errors::External("HcclGetRootInfo failed!")); + } } } From 12bf046b951b7d6a357ebdbbfabf9412c7c29332 Mon Sep 17 00:00:00 2001 From: wawltor Date: Wed, 18 Aug 2021 11:13:42 +0800 Subject: [PATCH 082/126] add the safe check for the some ops (#34978) --- paddle/fluid/operators/diag_embed_op.cc | 26 ++++++++++++++++++------ paddle/fluid/operators/matmul_op.cc | 6 ++++++ paddle/fluid/operators/matmul_v2_op.cc | 8 ++++++++ paddle/fluid/operators/metrics/auc_op.cc | 16 
++++++++++++++- 4 files changed, 49 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/operators/diag_embed_op.cc b/paddle/fluid/operators/diag_embed_op.cc index 6d8bc4d219ece9..7e0990df264aca 100644 --- a/paddle/fluid/operators/diag_embed_op.cc +++ b/paddle/fluid/operators/diag_embed_op.cc @@ -36,22 +36,36 @@ class DiagEmbedOp : public framework::OperatorWithKernel { auto x_dims = ctx->GetInputDim("Input"); - int dim1_ = dim1 < 0 ? x_dims.size() + dim1 + 1 : dim1; - int dim2_ = dim2 < 0 ? x_dims.size() + dim2 + 1 : dim2; - int offset_ = std::abs(offset); - + PADDLE_ENFORCE_GE( + dim1, -(x_dims.size() + 1), + platform::errors::OutOfRange( + "Dim1 is out of range (expected to be in range of [%ld, " + "%ld], but got %ld).", + -(x_dims.size() + 1), x_dims.size(), dim1)); PADDLE_ENFORCE_LE( - dim1_, x_dims.size(), + dim1, x_dims.size(), platform::errors::OutOfRange( "Dim1 is out of range (expected to be in range of [%ld, " "%ld], but got %ld).", -(x_dims.size() + 1), x_dims.size(), dim1)); + + PADDLE_ENFORCE_GE( + dim2, -(x_dims.size() + 1), + platform::errors::OutOfRange( + "Dim2 is out of range (expected to be in range of [%ld, " + "%ld], but got %ld).", + -(x_dims.size() + 1), x_dims.size(), dim2)); PADDLE_ENFORCE_LE( - dim2_, x_dims.size(), + dim2, x_dims.size(), platform::errors::OutOfRange( "Dim2 is out of range (expected to be in range of [%ld, " "%ld], but got %ld).", -(x_dims.size() + 1), x_dims.size(), dim2)); + + int dim1_ = dim1 < 0 ? x_dims.size() + dim1 + 1 : dim1; + int dim2_ = dim2 < 0 ? x_dims.size() + dim2 + 1 : dim2; + int offset_ = std::abs(offset); + PADDLE_ENFORCE_NE(dim1_, dim2_, platform::errors::InvalidArgument( "diagonal dimensions should not be identical " diff --git a/paddle/fluid/operators/matmul_op.cc b/paddle/fluid/operators/matmul_op.cc index 988a6c4f7da997..78747108d44f53 100644 --- a/paddle/fluid/operators/matmul_op.cc +++ b/paddle/fluid/operators/matmul_op.cc @@ -330,6 +330,12 @@ framework::DDim GetDimForInput(const framework::InferShapeContext &ctx, auto axis = ctx.Attrs().Get>("fused_transpose_" + input_name); auto dim = ctx.GetInputDim(input_name); + + PADDLE_ENFORCE_GT(dim.size(), 0, + platform::errors::InvalidArgument( + "The Input(%s) has not been initialized properly. The " + "shape of Input(%s) = [%s].", + dim)); if (!shape.empty() && !axis.empty()) { PADDLE_ENFORCE_GE( shape.size(), 2, diff --git a/paddle/fluid/operators/matmul_v2_op.cc b/paddle/fluid/operators/matmul_v2_op.cc index d39eac0759cdb6..4ec9a052bb2e51 100644 --- a/paddle/fluid/operators/matmul_v2_op.cc +++ b/paddle/fluid/operators/matmul_v2_op.cc @@ -35,6 +35,14 @@ class MatMulV2Op : public framework::OperatorWithKernel { paddle::framework::vectorize(ctx->GetInputDim("Y")); auto ndims_x = dims_x.size(); auto ndims_y = dims_y.size(); + PADDLE_ENFORCE_GT(ndims_x, 0, + platform::errors::InvalidArgument( + "The Input(X) dims size must be greater than 0," + " but reviced dims size is 0. ")); + PADDLE_ENFORCE_GT(ndims_y, 0, + platform::errors::InvalidArgument( + "The Input(Y) dims size must be greater than 0," + " but reviced dims size is 0. 
")); bool x_broadcasted = false, y_broadcasted = false; if (ndims_x == 1) { diff --git a/paddle/fluid/operators/metrics/auc_op.cc b/paddle/fluid/operators/metrics/auc_op.cc index 1dfb22718e446f..4f2f1d0722c9ce 100644 --- a/paddle/fluid/operators/metrics/auc_op.cc +++ b/paddle/fluid/operators/metrics/auc_op.cc @@ -25,7 +25,21 @@ class AucOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext *ctx) const override { OP_INOUT_CHECK(ctx->HasInput("Predict"), "Input", "Predict", "Auc"); OP_INOUT_CHECK(ctx->HasInput("Label"), "Input", "Label", "Auc"); - auto predict_width = ctx->GetInputDim("Predict")[1]; + auto predict_dims = ctx->GetInputDim("Predict"); + auto label_dims = ctx->GetInputDim("Label"); + auto predict_width = predict_dims[1]; + PADDLE_ENFORCE_NE( + framework::product(predict_dims), 0, + platform::errors::InvalidArgument( + "The Input(Predict) has not been initialized properly. The " + "shape of Input(Predict) = [%s], the shape can not involes 0.", + predict_dims)); + PADDLE_ENFORCE_NE( + framework::product(label_dims), 0, + platform::errors::InvalidArgument( + "The Input(Label) has not been initialized properly. The " + "shape of Input(Label) = [%s], the shape can not involes 0.", + label_dims)); if (ctx->IsRuntime()) { PADDLE_ENFORCE_LE(predict_width, 2, platform::errors::InvalidArgument( From a1373714f76bf85d79baad02d29f6f27cb9b7a8e Mon Sep 17 00:00:00 2001 From: WangXi Date: Wed, 18 Aug 2021 11:16:14 +0800 Subject: [PATCH 083/126] NPU use squared_l2_norm in GradientClipByGlobalNorm (#34836) --- python/paddle/fluid/clip.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/clip.py b/python/paddle/fluid/clip.py index 04fb45cd3ae22d..d48cea48a76fd4 100644 --- a/python/paddle/fluid/clip.py +++ b/python/paddle/fluid/clip.py @@ -40,7 +40,7 @@ def _squared_l2_norm(x): This OP returns the squared L2 norm of a tensor. """ - if core.is_compiled_with_npu() or core.is_compiled_with_xpu(): + if core.is_compiled_with_xpu(): square = layers.square(x) sum_square = layers.reduce_sum(square) return sum_square From 2e9a31eb5da2a5b4415c87cd8d1400a1e030c1bb Mon Sep 17 00:00:00 2001 From: lilong12 Date: Wed, 18 Aug 2021 11:18:58 +0800 Subject: [PATCH 084/126] Fix bug in alltoall (#34975) --- python/paddle/distributed/collective.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/distributed/collective.py b/python/paddle/distributed/collective.py index e5dfb34f24304d..70e16d67fb9f17 100644 --- a/python/paddle/distributed/collective.py +++ b/python/paddle/distributed/collective.py @@ -1524,7 +1524,7 @@ def alltoall(in_tensor_list, out_tensor_list, group=None, use_calc_stream=True): inputs={'X': [temp]}, outputs={'Out': [out]}, attrs={ - 'ring_id': group, + 'ring_id': ring_id, 'use_calc_stream': use_calc_stream, }) out_tensor_list.extend(paddle.split(out, nranks, 0)) From dd533dd3e014a9d27463332ff19320f9b423d365 Mon Sep 17 00:00:00 2001 From: Zhanlue Yang Date: Wed, 18 Aug 2021 11:51:20 +0800 Subject: [PATCH 085/126] Add function to disable paddle signal handler (#34577) * Add function to disable paddle signal handler Paddle used google::InstallFaultSignalHandler to handle selected system signals, mainly for debugging and bug report purposes. However, this can be conflicted with other python packages whoever captures similar signals. 
Such python package involves tvm and more To resolve this issue, we support a function to disable signal handler * Remove signal test from WIN32 platform * Remove redundant return from disable_signal_handler() function * Add detailed messages to en_doc --- paddle/fluid/platform/init.cc | 28 ++++++++--- paddle/fluid/platform/init.h | 2 + paddle/fluid/pybind/pybind.cc | 2 + python/paddle/__init__.py | 2 + python/paddle/fluid/framework.py | 25 ++++++++++ .../fluid/tests/unittests/CMakeLists.txt | 1 + .../unittests/test_disable_signal_handler.py | 48 +++++++++++++++++++ 7 files changed, 102 insertions(+), 6 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/test_disable_signal_handler.py diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc index 3ee5a578601045..afae0465311433 100644 --- a/paddle/fluid/platform/init.cc +++ b/paddle/fluid/platform/init.cc @@ -11,6 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include #include #include @@ -245,15 +246,16 @@ void InitDevices(const std::vector devices) { // Description Quoted from // https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html const struct { + int signal_number; const char *name; const char *error_string; } SignalErrorStrings[] = { - {"SIGSEGV", "Segmentation fault"}, - {"SIGILL", "Illegal instruction"}, - {"SIGFPE", "Erroneous arithmetic operation"}, - {"SIGABRT", "Process abort signal"}, - {"SIGBUS", "Access to an undefined portion of a memory object"}, - {"SIGTERM", "Termination signal"}, + {SIGSEGV, "SIGSEGV", "Segmentation fault"}, + {SIGILL, "SIGILL", "Illegal instruction"}, + {SIGFPE, "SIGFPE", "Erroneous arithmetic operation"}, + {SIGABRT, "SIGABRT", "Process abort signal"}, + {SIGBUS, "SIGBUS", "Access to an undefined portion of a memory object"}, + {SIGTERM, "SIGTERM", "Termination signal"}, }; bool StartsWith(const char *str, const char *prefix) { @@ -319,7 +321,21 @@ void SignalHandle(const char *data, int size) { // will Kill program by the default signal handler } } +#endif // _WIN32 + +void DisableSignalHandler() { +#ifndef _WIN32 + for (size_t i = 0; + i < (sizeof(SignalErrorStrings) / sizeof(*(SignalErrorStrings))); ++i) { + int signal_number = SignalErrorStrings[i].signal_number; + struct sigaction sig_action; + memset(&sig_action, 0, sizeof(sig_action)); + sigemptyset(&sig_action.sa_mask); + sig_action.sa_handler = SIG_DFL; + sigaction(signal_number, &sig_action, NULL); + } #endif +} #ifdef WITH_WIN_DUMP_DBG typedef BOOL(WINAPI *MINIDUMP_WRITE_DUMP)( diff --git a/paddle/fluid/platform/init.h b/paddle/fluid/platform/init.h index cd5ef843fa8f7d..b52456b19ac662 100644 --- a/paddle/fluid/platform/init.h +++ b/paddle/fluid/platform/init.h @@ -61,5 +61,7 @@ class SignalMessageDumper { void SignalHandle(const char* data, int size); #endif +void DisableSignalHandler(); + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 0663da88ac75f1..235a06833fc675 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -507,6 +507,8 @@ PYBIND11_MODULE(core_noavx, m) { m.def("set_num_threads", &platform::SetNumThreads); + m.def("disable_signal_handler", &DisableSignalHandler); + #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) m.def("cudnn_version", 
&platform::CudnnVersion); #endif diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index 907a667cb6ba78..27a414e092802d 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -272,6 +272,7 @@ from .device import get_device # noqa: F401 from .fluid.framework import is_compiled_with_cuda # noqa: F401 from .fluid.framework import is_compiled_with_rocm # noqa: F401 +from .fluid.framework import disable_signal_handler # noqa: F401 from .device import is_compiled_with_xpu # noqa: F401 from .device import is_compiled_with_npu # noqa: F401 from .device import XPUPlace # noqa: F401 @@ -485,6 +486,7 @@ 'enable_static', 'scatter_nd', 'set_default_dtype', + 'disable_signal_handler', 'expand_as', 'stack', 'sqrt', diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 343ce352c3eaaf..12aa0c9391019a 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -395,6 +395,31 @@ def is_compiled_with_xpu(): return core.is_compiled_with_xpu() +def disable_signal_handler(): + """ + Reset signal handler registered by Paddle. + + Paddle installs signal handlers at C++ level to log debug information upon failing. + However, conflicts can happen if another python module is making use of such signal. + Such being the case, one may disblae paddle signal handler via this interface. + + Known frameworks that require disabling signal handler includes: + 1. TVM + 2. ADLIK + + Make sure you called paddle.disable_signal_handler() before using above mentioned frameworks. + + Returns: None + + Examples: + .. code-block:: python + + import paddle + paddle.disable_signal_handler() + """ + core.disable_signal_handler() + + def is_compiled_with_cuda(): """ Whether this whl package can be used to run the model on GPU. diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 5b9a37cfb6231d..a98ea618d373fb 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -113,6 +113,7 @@ if(((NOT WITH_ROCM) AND (NOT WITH_GPU)) OR WIN32) LIST(REMOVE_ITEM TEST_OPS test_memcpy_op) LIST(REMOVE_ITEM TEST_OPS test_raw_program_optimizer) LIST(REMOVE_ITEM TEST_OPS test_fleet_gradient_scale) + LIST(REMOVE_ITEM TEST_OPS test_disable_signal_handler) endif() if(WIN32) diff --git a/python/paddle/fluid/tests/unittests/test_disable_signal_handler.py b/python/paddle/fluid/tests/unittests/test_disable_signal_handler.py new file mode 100644 index 00000000000000..dbe9dcb7f823d7 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_disable_signal_handler.py @@ -0,0 +1,48 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
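# Editorial note: each case below starts a fresh Python subprocess that
# calls paddle.disable_signal_handler() and then sends itself one of the
# fatal signals. If Paddle's own handler were still installed, its
# "paddle::framework::SignalHandle" banner would appear in the captured
# output, and the test raises an exception in that case.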
+ +from __future__ import print_function + +import unittest +import numpy as np +import signal, os +import paddle +import subprocess + +SignalsToTest = { + signal.SIGTERM, signal.SIGBUS, signal.SIGABRT, signal.SIGSEGV, + signal.SIGILL, signal.SIGFPE +} + + +class TestSignOpError(unittest.TestCase): + def test_errors(self): + try: + for sig in SignalsToTest: + output = subprocess.check_output( + [ + "python", "-c", + f"import paddle; import signal,os; paddle.disable_signal_handler(); os.kill(os.getpid(), {sig})" + ], + stderr=subprocess.STDOUT) + except Exception as e: + # If paddle signal handler is enabled + # One would expect "paddle::framework::SignalHandle" in STDERR + stdout_message = str(e.output) + if "paddle::framework::SignalHandle" in stdout_message: + raise Exception("Paddle signal handler not disabled") + + +if __name__ == "__main__": + unittest.main() From 4d88cdb88197a8d20f84e05cf54f8ca8d94f3bb6 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Wed, 18 Aug 2021 12:02:18 +0800 Subject: [PATCH 086/126] [CustomOp] Fix ext_tensor.cast failed bug (#34884) * fix ext_tensor.cast failed bug * remove useless deps * fix windows cmake failed * try to fix windows make failed * fix make error on windwos --- paddle/fluid/extension/src/ext_tensor.cu | 1 + paddle/fluid/framework/CMakeLists.txt | 14 +++++++++++--- paddle/fluid/framework/custom_tensor_test.cc | 7 +++++++ 3 files changed, 19 insertions(+), 3 deletions(-) create mode 120000 paddle/fluid/extension/src/ext_tensor.cu diff --git a/paddle/fluid/extension/src/ext_tensor.cu b/paddle/fluid/extension/src/ext_tensor.cu new file mode 120000 index 00000000000000..23ae523331877f --- /dev/null +++ b/paddle/fluid/extension/src/ext_tensor.cu @@ -0,0 +1 @@ +ext_tensor.cc \ No newline at end of file diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index d0c64f44af3e2f..6853b03c61288d 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -17,7 +17,7 @@ function(windows_symbolic TARGET) add_custom_command(OUTPUT ${final_path}/.${src}.cu COMMAND ${CMAKE_COMMAND} -E copy_if_different "${final_path}/${src}.cc" "${final_path}/.${src}.cu" COMMENT "create hidden file of ${src}.cu") - add_custom_target(${TARGET} ALL DEPENDS .${src}.cu) + add_custom_target(${TARGET} ALL DEPENDS ${final_path}/.${src}.cu) endforeach() endfunction() @@ -413,8 +413,16 @@ include_directories(${PADDLE_SOURCE_DIR}/paddle/fluid/platform) include_directories(${PADDLE_SOURCE_DIR}/paddle/fluid/extension/include) include_directories(${PADDLE_SOURCE_DIR}/paddle/utils) -if(WITH_ROCM) - hip_library(custom_tensor SRCS ../extension/src/ext_tensor.cc DEPS lod_tensor memory enforce) +if (WITH_GPU) + if (WIN32) + windows_symbolic(ext_tensor_cu SRCS ext_tensor.cu PATH ../extension/src) + nv_library(custom_tensor SRCS ../extension/src/.ext_tensor.cu DEPS lod_tensor memory enforce) + add_dependencies(custom_tensor ext_tensor_cu) + else() + nv_library(custom_tensor SRCS ../extension/src/ext_tensor.cu DEPS lod_tensor memory enforce) + endif(WIN32) +elseif (WITH_ROCM) + hip_library(custom_tensor SRCS ../extension/src/ext_tensor.cu DEPS lod_tensor memory enforce) else() cc_library(custom_tensor SRCS ../extension/src/ext_tensor.cc DEPS lod_tensor memory enforce) endif() diff --git a/paddle/fluid/framework/custom_tensor_test.cc b/paddle/fluid/framework/custom_tensor_test.cc index 7fbc4f554ba653..5d181bfb53bc91 100644 --- a/paddle/fluid/framework/custom_tensor_test.cc +++ b/paddle/fluid/framework/custom_tensor_test.cc 
@@ -144,6 +144,13 @@ void TestCast(paddle::DataType data_type) { t1.template mutable_data(); auto t2 = t1.cast(data_type); CHECK(t2.type() == data_type); +#ifdef PADDLE_WITH_CUDA + auto tg1 = paddle::Tensor(paddle::PlaceType::kGPU); + tg1.reshape(tensor_shape); + tg1.template mutable_data(); + auto tg2 = tg1.cast(data_type); + CHECK(tg2.type() == data_type); +#endif } void GroupTestCopy() { From 8967a66a21758680fda8f6d658ed627e471429ae Mon Sep 17 00:00:00 2001 From: XGZhang <46363693+XGZhang11@users.noreply.github.com> Date: Wed, 18 Aug 2021 14:18:23 +0800 Subject: [PATCH 087/126] support quantization of conv2d_transpose (#34547) --- .../slim/quantization/imperative/qat.py | 101 +++++++++++++---- .../slim/quantization/imperative/utils.py | 19 +++- .../contrib/slim/tests/test_imperative_qat.py | 10 +- .../tests/test_imperative_qat_user_defined.py | 19 ++++ python/paddle/nn/quant/quant_layers.py | 107 ++++++++++++++++++ tools/sampcd_processor.py | 1 + 6 files changed, 225 insertions(+), 32 deletions(-) diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py index b8c0e47e9bbc26..6208b43c9e9e48 100644 --- a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py +++ b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py @@ -42,17 +42,18 @@ class ImperativeQuantAware(object): Applying quantization aware training (QAT) to the dgraph model. """ - def __init__(self, - quantizable_layer_type=['Conv2D', 'Linear'], - weight_quantize_type='abs_max', - activation_quantize_type='moving_average_abs_max', - weight_bits=8, - activation_bits=8, - moving_rate=0.9, - weight_preprocess_layer=None, - act_preprocess_layer=None, - weight_quantize_layer=None, - act_quantize_layer=None): + def __init__( + self, + quantizable_layer_type=['Conv2D', 'Linear', 'Conv2DTranspose'], + weight_quantize_type='abs_max', + activation_quantize_type='moving_average_abs_max', + weight_bits=8, + activation_bits=8, + moving_rate=0.9, + weight_preprocess_layer=None, + act_preprocess_layer=None, + weight_quantize_layer=None, + act_quantize_layer=None): """ The constructor for ImperativeQuantAware. @@ -212,9 +213,44 @@ def quantize(self, model): the out_scale value of outputs would be calculated. Args: - model(fluid.dygraph.Layer): the model to be quantized. + model(paddle.nn.Layer): the model to be quantized. Returns: None + + Examples: + .. code-block:: python + + import paddle + from paddle.fluid.contrib.slim.quantization \ + import ImperativeQuantAware + + class ImperativeModel(paddle.nn.Layer): + def __init__(self): + super(ImperativeModel, self).__init__() + # self.linear_0 would skip the quantization. + self.linear_0 = paddle.nn.Linear(784, 400) + self.linear_0.skip_quant = True + + # self.linear_1 would not skip the quantization. + self.linear_1 = paddle.nn.Linear(400, 10) + self.linear_1.skip_quant = False + + def forward(self, inputs): + x = self.linear_0(inputs) + x = self.linear_1(inputs) + return x + + model = ImperativeModel() + imperative_qat = ImperativeQuantAware( + weight_quantize_type='abs_max', + activation_quantize_type='moving_average_abs_max') + + # Add the fake quant logical. + # The original model will be rewrite. + # + # There is only one Layer(self.linear1) would be added the + # fake quant logical. + imperative_qat.quantize(model) """ assert isinstance(model, dygraph.Layer), \ "The model must be the instance of dygraph.Layer." 
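# Editorial note: with this change the default quantizable_layer_type is
# ['Conv2D', 'Linear', 'Conv2DTranspose'], so a model that contains
# paddle.nn.Conv2DTranspose layers should have them wrapped into
# QuantizedConv2DTranspose (added to quant_layers.py further below) when
# quantize() is called; no configuration beyond the docstring example
# above is needed.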
@@ -232,17 +268,18 @@ class ImperativeQuantizeInputs(object): logic both for activation inputs and weight inputs. """ - def __init__(self, - quantizable_layer_type=['Conv2D', 'Linear'], - weight_quantize_type='abs_max', - activation_quantize_type='moving_average_abs_max', - weight_bits=8, - activation_bits=8, - moving_rate=0.9, - weight_preprocess_layer=None, - act_preprocess_layer=None, - weight_quantize_layer=None, - act_quantize_layer=None): + def __init__( + self, + quantizable_layer_type=['Conv2D', 'Linear', 'Conv2DTranspose'], + weight_quantize_type='abs_max', + activation_quantize_type='moving_average_abs_max', + weight_bits=8, + activation_bits=8, + moving_rate=0.9, + weight_preprocess_layer=None, + act_preprocess_layer=None, + weight_quantize_layer=None, + act_quantize_layer=None): """ The constructor for ImperativeQuantizeInputs. @@ -303,6 +340,18 @@ def __init__(self, } def apply(self, model): + """ + Quantize the weights and activations to calculate for specific + layers. + + Args: + model(paddle.nn.Layer): The target model which would + calculate the input quantization scale. + + Returns: + None + """ + assert isinstance(model, dygraph.Layer), \ "The model must be the instance of dygraph.Layer." @@ -354,7 +403,7 @@ def apply(self, model): output scales for specific layers in the dygraph model. Args: - model(fluid.dygraph.Layer): The target model which would be + model(paddle.nn.Layer): The target model which would be calculate the output quantization scale. Returns: @@ -544,7 +593,9 @@ def _is_skip_quant_op(self, block, in_op): 1. the type of input op should be conv2d, depthwise_conv2d or matmul 2. the previous ops of the input op are not fake_quantize_dequantize ops """ - target_op_types = ["conv2d", "depthwise_conv2d", "matmul"] + target_op_types = [ + "conv2d", "depthwise_conv2d", "matmul", "conv2d_transpose" + ] if in_op.type not in target_op_types: return False diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py b/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py index a9d52c5a87ad36..009ce372b4f29c 100644 --- a/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py +++ b/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py @@ -24,6 +24,7 @@ from ..quantization_pass import _get_input_name_index layer_name_map = { + 'Conv2DTranspose': paddle.nn.Conv2DTranspose, 'Conv2D': paddle.nn.Conv2D, 'Linear': paddle.nn.Linear, 'AdaptiveAvgPool2D': paddle.nn.AdaptiveAvgPool2D, @@ -46,8 +47,9 @@ } # Apply fake quant for the inputs of these layers -# TODO (jc): support paddle.nn.Conv2DTranspose -fake_quant_input_layers = [paddle.nn.Conv2D, paddle.nn.Linear] +fake_quant_input_layers = [ + paddle.nn.Conv2D, paddle.nn.Linear, paddle.nn.Conv2DTranspose +] # Apply fake quant for the output of these layers # TODO(jc): fix the problem of adding duplicate fake_quant ops @@ -65,7 +67,8 @@ ] fake_quant_wrap_layers = [ - quant_layers.QuantizedConv2D, quant_layers.QuantizedLinear + quant_layers.QuantizedConv2D, quant_layers.QuantizedLinear, + quant_layers.QuantizedConv2DTranspose ] # The weight format of these layers is Cin * Cout * H * W @@ -84,9 +87,9 @@ def load_variable_data(scope, var_name): - ''' + """ Load variable value from scope - ''' + """ var_node = scope.find_var(var_name) assert var_node is not None, \ "Can not find " + var_name + " in the scope." @@ -120,6 +123,12 @@ def find_parent_layer_and_sub_name(model, name): the sub_name of the layer. 
For example, if name is 'block_1/convbn_1/conv_1', the parent layer is 'block_1/convbn_1' and the sub_name is `conv_1`. + Args: + model(paddle.nn.Layer): the model to be quantized. + name(string): the name of a layer + + Returns: + parent_layer, subname """ assert isinstance(model, paddle.nn.Layer), \ "The model must be the instance of paddle.nn.Layer." diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py index 14fa291ee077c6..677ccb52e242cf 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py +++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py @@ -28,10 +28,10 @@ from paddle.fluid.optimizer import AdamOptimizer from paddle.fluid.contrib.slim.quantization import ImperativeQuantAware from paddle.fluid.dygraph.container import Sequential -from paddle.nn import Linear, Conv2D, Softmax +from paddle.nn import Linear, Conv2D, Softmax, Conv2DTranspose from paddle.fluid.log_helper import get_logger from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX -from paddle.nn.quant.quant_layers import QuantizedConv2D +from paddle.nn.quant.quant_layers import QuantizedConv2D, QuantizedConv2DTranspose from imperative_test_utils import fix_model_dict, ImperativeLenet @@ -75,6 +75,12 @@ def test_qat(self): data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32') quant_conv1(fluid.dygraph.to_variable(data)) + conv_transpose = Conv2DTranspose(4, 6, (3, 3)) + quant_conv_transpose = QuantizedConv2DTranspose(conv_transpose) + x_var = paddle.uniform( + (2, 4, 8, 8), dtype='float32', min=-1.0, max=1.0) + quant_conv_transpose(x_var) + seed = 1 np.random.seed(seed) fluid.default_main_program().random_seed = seed diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_user_defined.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_user_defined.py index 621213beb31cd7..270e8ee566ab57 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_user_defined.py +++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_user_defined.py @@ -28,6 +28,7 @@ from paddle.fluid.dygraph import Conv2D from paddle.fluid.dygraph import Pool2D from paddle.fluid.dygraph import Linear +from paddle.nn.quant.quant_layers import QuantizedConv2DTranspose from paddle.fluid.log_helper import get_logger os.environ["CPU_NUM"] = "1" @@ -100,6 +101,19 @@ def dequantize(x, lower_bound, delta, interval): return x +class ModelForConv2dT(nn.Layer): + def __init__(self, num_classes=10): + super(ModelForConv2dT, self).__init__() + self.features = nn.Conv2DTranspose(4, 6, (3, 3)) + self.fc = Linear(input_dim=600, output_dim=num_classes) + + def forward(self, inputs): + x = self.features(inputs) + x = paddle.flatten(x, 1) + x = self.fc(x) + return x + + class ImperativeLenet(paddle.nn.Layer): def __init__(self, num_classes=10, classifier_activation='softmax'): super(ImperativeLenet, self).__init__() @@ -168,6 +182,11 @@ def test_quant_aware_training(self): imperative_qat.quantize(lenet) adam = Adam(learning_rate=0.001, parameters=lenet.parameters()) dynamic_loss_rec = [] + #for CI coverage + conv_transpose = ModelForConv2dT() + imperative_qat.quantize(conv_transpose) + x_var = paddle.uniform((2, 4, 8, 8), dtype='float32', min=-1., max=1.) 
+ conv_transpose(x_var) def train(model): adam = Adam(learning_rate=0.001, parameters=model.parameters()) diff --git a/python/paddle/nn/quant/quant_layers.py b/python/paddle/nn/quant/quant_layers.py index 5573683ebd0458..040b04f5e7bf1e 100644 --- a/python/paddle/nn/quant/quant_layers.py +++ b/python/paddle/nn/quant/quant_layers.py @@ -31,6 +31,7 @@ 'FakeQuantMovingAverageAbsMax', 'FakeQuantChannelWiseAbsMax', 'QuantizedConv2D', + 'QuantizedConv2DTranspose', 'QuantizedLinear', 'MovingAverageAbsMaxScale', 'MAOutputScaleLayer', @@ -481,6 +482,112 @@ def forward(self, input): data_format=self._data_format) +class QuantizedConv2DTranspose(layers.Layer): + """ + The computational logic of QuantizedConv2DTranspose is the same with Conv2DTranspose. + The only difference is that its inputs are all fake quantized. + + Examples: + .. code-block:: python + import paddle + import paddle.nn as nn + from paddle.nn.quant.quant_layers import QuantizedConv2DTranspose + x_var = paddle.uniform((2, 4, 8, 8), dtype='float32', min=-1., max=1.) + conv = nn.Conv2DTranspose(4, 6, (3, 3)) + conv_quantized = QuantizedConv2DTranspose(conv) + y_quantized = conv_quantized(x_var) + y_var = conv(x_var) + y_quantized_np = y_quantized.numpy() + y_np = y_var.numpy() + print(y_np.shape, y_quantized_np.shape) + # (2, 6, 10, 10), (2, 6, 10, 10) + """ + + def __init__(self, + layer, + weight_bits=8, + activation_bits=8, + moving_rate=0.9, + weight_quantize_type='abs_max', + activation_quantize_type='abs_max', + weight_pre_layer=None, + act_pre_layer=None, + weight_quant_layer=None, + act_quant_layer=None): + r""" + Constructor. + + The arguments are the same as ImperativeQuantAware. + """ + super(QuantizedConv2DTranspose, self).__init__() + # For Conv2DTranspose + self._groups = getattr(layer, '_groups') + self._stride = getattr(layer, '_stride') + self._padding = getattr(layer, '_padding') + self._output_padding = getattr(layer, 'output_padding') + self._dilation = getattr(layer, '_dilation') + self._data_format = getattr(layer, '_data_format') + self.weight = getattr(layer, 'weight') + self.bias = getattr(layer, 'bias') + # For FakeQuant + self._conv2d_transpose_quant_axis = 1 + if weight_quant_layer is not None: + self._fake_quant_weight = weight_quant_layer() + else: + self._fake_quant_weight = _get_fake_quant_type( + weight_quantize_type, + name=self.weight.name, + moving_rate=moving_rate, + quant_bits=weight_bits, + dtype=self._dtype, + quant_on_weight=True, + channel_num=self.weight.shape[ + self._conv2d_transpose_quant_axis], + quant_axis=self._conv2d_transpose_quant_axis) + if act_quant_layer is not None: + self._fake_quant_input = act_quant_layer() + else: + self._fake_quant_input = _get_fake_quant_type( + activation_quantize_type, + name=layer.full_name(), + moving_rate=moving_rate, + quant_bits=activation_bits, + dtype=self._dtype, + quant_on_weight=False) + + self._act_preprocess = act_pre_layer( + ) if act_pre_layer is not None else None + self._weight_preprocess = weight_pre_layer( + ) if weight_pre_layer is not None else None + + def forward(self, input, output_size=None): + if self._act_preprocess is not None: + input = self._act_preprocess(input) + quant_input = self._fake_quant_input(input) + + weight = self.weight + if self._weight_preprocess is not None: + weight = self._weight_preprocess(self.weight) + quant_weight = self._fake_quant_weight(weight) + + if output_size is None: + output_padding = self._output_padding + else: + output_padding = 0 + + return F.conv2d_transpose( + quant_input, + quant_weight, + 
bias=self.bias, + padding=self._padding, + output_padding=output_padding, + stride=self._stride, + dilation=self._dilation, + groups=self._groups, + output_size=output_size, + data_format=self._data_format) + + class QuantizedLinear(layers.Layer): """ The computational logic of QuantizedLinear is the same with Linear. diff --git a/tools/sampcd_processor.py b/tools/sampcd_processor.py index 3ec12c11a7045a..d8cb70c9dd107b 100644 --- a/tools/sampcd_processor.py +++ b/tools/sampcd_processor.py @@ -440,6 +440,7 @@ def get_filenames(full_test=False): ''' global whl_error import paddle + import paddle.fluid.contrib.slim.quantization whl_error = [] if full_test: get_full_api_from_pr_spec() From 209075a44a90c51c450963cecafbdc8da74e4085 Mon Sep 17 00:00:00 2001 From: Fan Zhang Date: Wed, 18 Aug 2021 15:25:29 +0800 Subject: [PATCH 088/126] [CPU-PSLIB] Add consistency insepection of use_var_list and data_generator data, test=develop (#34463) --- .../distributed/fleet/dataset/dataset.py | 65 +++ .../test_dataset_consistency_inspection.py | 406 ++++++++++++++++++ 2 files changed, 471 insertions(+) create mode 100644 python/paddle/fluid/tests/unittests/test_dataset_consistency_inspection.py diff --git a/python/paddle/distributed/fleet/dataset/dataset.py b/python/paddle/distributed/fleet/dataset/dataset.py index 8bc16dfbbae300..25a1d98cb11218 100644 --- a/python/paddle/distributed/fleet/dataset/dataset.py +++ b/python/paddle/distributed/fleet/dataset/dataset.py @@ -255,6 +255,71 @@ def _dynamic_adjust_before_train(self, thread_num): def _dynamic_adjust_after_train(self): pass + def _check_use_var_with_data_generator(self, var_list, data_generator_class, + test_file): + """ + Var consistency insepection of use_var_list and data_generator data. + + Examples: + .. code-block:: python + + # required: skiptest + import paddle + from dataset_generator import CTRDataset + dataset = paddle.distributed.fleet.DatasetBase() + generator_class = CTRDataset() + dataset._check_use_var_with_data_generator([data, label], generator_class, "data/part-00000") + + Args: + var_list(list): variable list + data_generator_class(class): data_generator class + test_file(str): local test file path + """ + + f = open(test_file, "r") + var_len = len(var_list) + + while True: + line = f.readline() + if line: + line_iter = data_generator_class.generate_sample(line) + for user_parsed_line in line_iter(): + data_gen_len = len(user_parsed_line) + if var_len != data_gen_len: + raise ValueError( + "var length mismatch error: var_list = %s vs data_generator = %s" + % (var_len, data_gen_len)) + + for i, ele in enumerate(user_parsed_line): + if len(ele[1]) == 0: + raise ValueError( + "var length error: var %s's length in data_generator is 0" + % ele[0]) + + if var_list[ + i].dtype == core.VarDesc.VarType.FP32 and not all( + isinstance(ele, float) for ele in ele[1]): + raise TypeError( + "var dtype mismatch error: var name = %s, var type in var_list = %s, while var in data_generator contains non-float value, which is %s \n" + "Please check if order of var_list and data_generator are aligned. \n" + "Please check if var's type in data_generator is correct." 
+ % (ele[0], "float", ele[1])) + + if (var_list[i].dtype == core.VarDesc.VarType.INT64 or + var_list[i].dtype == core.VarDesc.VarType.INT32 + ) and not all( + isinstance(ele, int) for ele in ele[1]): + raise TypeError( + "var dtype mismatch error: var name = %s, var type in var_list = %s, while var in data_generator contains non-int value, which is %s \n" + "Please check if order of var_list and data_generator are aligned. \n" + "Please check if var's type in data_generator is correct." + % (ele[0], "int", ele[1])) + + else: + break + + f.close() + class InMemoryDataset(DatasetBase): """ diff --git a/python/paddle/fluid/tests/unittests/test_dataset_consistency_inspection.py b/python/paddle/fluid/tests/unittests/test_dataset_consistency_inspection.py new file mode 100644 index 00000000000000..5911ada1817b60 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_dataset_consistency_inspection.py @@ -0,0 +1,406 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +TestCases for Dataset consistency insepection of use_var_list and data_generator. +""" + +from __future__ import print_function +import paddle +import paddle.fluid as fluid +import paddle.compat as cpt +import paddle.fluid.core as core +import numpy as np +import random +import math +import os +import shutil +import unittest +import paddle.fluid.incubate.data_generator as dg + +#paddle.enable_static() +# fluid.disable_dygraph() +fluid.disable_dygraph() +url_schema_len = 5 +query_schema = [ + 'Q_query_basic', 'Q_query_phrase', 'Q_quq', 'Q_timelevel', + 'Q_context_title_basic1', 'Q_context_title_basic2', + 'Q_context_title_basic3', 'Q_context_title_basic4', + 'Q_context_title_basic5', 'Q_context_title_phrase1', + 'Q_context_title_phrase2', 'Q_context_title_phrase3', + 'Q_context_title_phrase4', 'Q_context_title_phrase5', 'Q_context_site1', + 'Q_context_site2', 'Q_context_site3', 'Q_context_site4', 'Q_context_site5' +] + + +class CTRDataset(dg.MultiSlotDataGenerator): + def __init__(self, mode): + self.test = mode + + def generate_sample(self, line): + def reader(): + ins = line.strip().split(';') + label_pos_num = int(ins[1].split(' ')[0]) + label_neg_num = int(ins[1].split(' ')[1]) + + #query fea parse + bias = 2 + query_len = 0 + sparse_query_feature = [] + for index in range(len(query_schema)): + pos = index + bias + sparse_query_feature.append( + [int(x) for x in ins[pos].split(' ')]) + if index == 0: + query_len = len(ins[pos].split(' ')) + query_len = 1.0 / (1 + pow(2.7182818, 3 - 1.0 * query_len)) + + #positive url fea parse + bias = 2 + len(query_schema) + pos_url_feas = [] + pos_click_feas = [] + pos_context_feas = [] + for k in range(label_pos_num): + pos_url_fea = [] + pos = 0 + for index in range(url_schema_len - 1): + pos = bias + k * (url_schema_len) + index + pos_url_fea.append([int(x) for x in ins[pos].split(' ')]) + #click info + if (ins[pos + 1] == ''): + continue + item = ins[pos + 1].split(' ') + if len(item) != 17: + continue + stat_fea = 
[[max(float(item[i]), 0.0)] for i in range(len(item)) \ + if not (i == 5 or i == 9 or i == 13 or i == 14 or i ==15 or i ==16)] + pos_url_feas.append(pos_url_fea) + pos_click_feas.append(stat_fea) + + query_serach = float(item[5]) + if query_serach > 0.0: + query_serach = min(math.log(query_serach), 10.0) / 10.0 + pos_context_fea = [[query_serach], [query_len]] + pos_context_feas.append(pos_context_fea) + + #negative url fea parse + bias = 2 + len(query_schema) + label_pos_num * (url_schema_len) + neg_url_feas = [] + neg_click_feas = [] + neg_context_feas = [] + for k in range(label_neg_num): + neg_url_fea = [] + pos = 0 + for index in range(url_schema_len - 1): + pos = bias + k * (url_schema_len) + index + neg_url_fea.append([int(x) for x in ins[pos].split(' ')]) + if (ins[pos + 1] == ''): + continue + item = ins[pos + 1].split(' ') + #zdf_tmp + if len(item) != 17: + continue + #print ins[pos + 1] + stat_fea = [[max(float(item[i]), 0.0)] for i in range(len(item)) \ + if not (i == 5 or i == 9 or i == 13 or i == 14 or i == 15 or i == 16)] + neg_click_feas.append(stat_fea) + neg_url_feas.append(neg_url_fea) + + query_serach = float(item[5]) + if query_serach > 0.0: + query_serach = min(math.log(query_serach), 10.0) / 10.0 + neg_context_fea = [[query_serach], [query_len]] + neg_context_feas.append(neg_context_fea) + + #make train data + if self.test == 1: + for p in range(len(pos_url_feas)): + # feature_name = ["click"] + query_schema + url_schema[:4] + click_info_schema[:11] + context_schema[:2] + feature_name = ["click"] + for i in range(1, 54): + feature_name.append(str(i)) + pos_url_fea = pos_url_feas[p] + pos_click_fea = pos_click_feas[p] + pos_context_fea = pos_context_feas[p] + yield zip(feature_name, [[1]] + sparse_query_feature + + pos_url_fea + pos_click_fea + pos_context_fea + + pos_url_fea + pos_click_fea + pos_context_fea) + for n in range(len(neg_url_feas)): + feature_name = ["click"] + for i in range(1, 54): + feature_name.append(str(i)) + neg_url_fea = neg_url_feas[n] + neg_click_fea = neg_click_feas[n] + neg_context_fea = neg_context_feas[n] + yield zip(feature_name, [[0]] + sparse_query_feature + + neg_url_fea + neg_click_fea + neg_context_fea + + neg_url_fea + neg_click_fea + neg_context_fea) + elif self.test == 0: + for p in range(len(pos_url_feas)): + #feature_name = ["click"] + query_schema + url_schema[:4] + click_info_schema[:11] + context_schema[:2] + url_schema[4:] + click_info_schema[11:] + context_schema[2:] + feature_name = ["click"] + for i in range(1, 54): + feature_name.append(str(i)) + #print("#######") + #print(feature_name) + #print("#######") + pos_url_fea = pos_url_feas[p] + pos_click_fea = pos_click_feas[p] + pos_context_fea = pos_context_feas[p] + for n in range(len(neg_url_feas)): + # prob = get_rand() + # if prob < sample_rate: + neg_url_fea = neg_url_feas[n] + neg_click_fea = neg_click_feas[n] + neg_context_fea = neg_context_feas[n] + #print("q:", query_feas) + #print("pos:", pos_url_fea) + #print("neg:", neg_url_fea) + # yield zip(feature_name[:3], sparse_query_feature[:3]) + yield list(zip(feature_name, [[1]] + sparse_query_feature + pos_url_fea + pos_click_fea + pos_context_fea + \ + neg_url_fea + neg_click_fea + neg_context_fea)) + elif self.test == 2: + for p in range(len(pos_url_feas)): + #feature_name = ["click"] + query_schema + url_schema[:4] + click_info_schema[:11] + context_schema[:2] + url_schema[4:] + click_info_schema[11:] + context_schema[2:] + feature_name = ["click"] + for i in range(1, 54): + feature_name.append(str(i)) + 
#print("#######") + #print(feature_name) + #print("#######") + pos_url_fea = pos_url_feas[p] + pos_click_fea = pos_click_feas[p] + pos_context_fea = pos_context_feas[p] + for n in range(len(neg_url_feas)): + # prob = get_rand() + # if prob < sample_rate: + neg_url_fea = neg_url_feas[n] + neg_click_fea = neg_click_feas[n] + neg_context_fea = neg_context_feas[n] + #print("q:", query_feas) + #print("pos:", pos_url_fea) + #print("neg:", neg_url_fea) + # yield zip(feature_name[:3], sparse_query_feature[:3]) + yield list(zip(feature_name, [[1], [2]] + sparse_query_feature + pos_url_fea + pos_click_fea + pos_context_fea + \ + neg_url_fea + neg_click_fea + neg_context_fea)) + elif self.test == 3: + for p in range(len(pos_url_feas)): + #feature_name = ["click"] + query_schema + url_schema[:4] + click_info_schema[:11] + context_schema[:2] + url_schema[4:] + click_info_schema[11:] + context_schema[2:] + feature_name = ["click"] + for i in range(1, 54): + feature_name.append(str(i)) + #print("#######") + #print(feature_name) + #print("#######") + pos_url_fea = pos_url_feas[p] + pos_click_fea = pos_click_feas[p] + pos_context_fea = pos_context_feas[p] + for n in range(len(neg_url_feas)): + # prob = get_rand() + # if prob < sample_rate: + neg_url_fea = neg_url_feas[n] + neg_click_fea = neg_click_feas[n] + neg_context_fea = neg_context_feas[n] + #print("q:", query_feas) + #print("pos:", pos_url_fea) + #print("neg:", neg_url_fea) + # yield zip(feature_name[:3], sparse_query_feature[:3]) + yield list(zip(feature_name, [[1], [2.0]] + sparse_query_feature + pos_url_fea + pos_click_fea + pos_context_fea + \ + neg_url_fea + neg_click_fea + neg_context_fea)) + elif self.test == 4: + for p in range(len(pos_url_feas)): + #feature_name = ["click"] + query_schema + url_schema[:4] + click_info_schema[:11] + context_schema[:2] + url_schema[4:] + click_info_schema[11:] + context_schema[2:] + feature_name = ["click"] + for i in range(1, 54): + feature_name.append(str(i)) + #print("#######") + #print(feature_name) + #print("#######") + pos_url_fea = pos_url_feas[p] + pos_click_fea = pos_click_feas[p] + pos_context_fea = pos_context_feas[p] + for n in range(len(neg_url_feas)): + # prob = get_rand() + # if prob < sample_rate: + neg_url_fea = neg_url_feas[n] + neg_click_fea = neg_click_feas[n] + neg_context_fea = neg_context_feas[n] + #print("q:", query_feas) + #print("pos:", pos_url_fea) + #print("neg:", neg_url_fea) + # yield zip(feature_name[:3], sparse_query_feature[:3]) + yield list(zip(feature_name, [[], [2.0]] + sparse_query_feature + pos_url_fea + pos_click_fea + pos_context_fea + \ + neg_url_fea + neg_click_fea + neg_context_fea)) + elif self.test == 5: + for p in range(len(pos_url_feas)): + #feature_name = ["click"] + query_schema + url_schema[:4] + click_info_schema[:11] + context_schema[:2] + url_schema[4:] + click_info_schema[11:] + context_schema[2:] + feature_name = ["click"] + for i in range(1, 54): + feature_name.append(str(i)) + #print("#######") + #print(feature_name) + #print("#######") + pos_url_fea = pos_url_feas[p] + pos_click_fea = pos_click_feas[p] + pos_context_fea = pos_context_feas[p] + for n in range(len(neg_url_feas)): + # prob = get_rand() + # if prob < sample_rate: + neg_url_fea = neg_url_feas[n] + neg_click_fea = neg_click_feas[n] + neg_context_fea = neg_context_feas[n] + #print("q:", query_feas) + #print("pos:", pos_url_fea) + #print("neg:", neg_url_fea) + # yield zip(feature_name[:3], sparse_query_feature[:3]) + yield list(zip(feature_name, sparse_query_feature + pos_url_fea + 
pos_click_fea + pos_context_fea + \ + neg_url_fea + neg_click_fea + neg_context_fea)) + + return reader + + +class TestDataset(unittest.TestCase): + """ TestCases for Dataset. """ + + def setUp(self): + pass + # use_data_loader = False + # epoch_num = 10 + # drop_last = False + + def test_var_consistency_insepection(self): + """ + Testcase for InMemoryDataset of consistency insepection of use_var_list and data_generator. + """ + with open("test_run_with_dump_a.txt", "w") as f: + # data = "\n" + # data += "\n" + data = "2 1;1 9;20002001 20001240 20001860 20003611 20000723;20002001 20001240 20001860 20003611 20000723;0;40000001;20002001 20001240 20001860 20003611 20000157 20000723 20000070 20002616 20000157 20000005;20002001 20001240 20001860 20003611 20000157 20001776 20000070 20002616 20000157 20000005;20002001 20001240 20001860 20003611 20000723 20000070 20002001 20001240 20001860 20003611 20012788 20000157;20002001 20001240 20001860 20003611 20000623 20000251 20000157 20000723 20000070 20000001 20000057;20002640 20004695 20000157 20000723 20000070 20002001 20001240 20001860 20003611;20002001 20001240 20001860 20003611 20000157 20000723 20000070 20003519 20000005;20002001 20001240 20001860 20003611 20000157 20001776 20000070 20003519 20000005;20002001 20001240 20001860 20003611 20000723 20000070 20002001 20001240 20001860 20003611 20131464;20002001 20001240 20001860 20003611 20018820 20000157 20000723 20000070 20000001 20000057;20002640 20034154 20000723 20000070 20002001 20001240 20001860 20003611;10000200;10000200;10063938;10000008;10000177;20002001 20001240 20001860 20003611 20010833 20000210 20000500 20000401 20000251 20012198 20001023 20000157;20002001 20001240 20001860 20003611 20012396 20000500 20002513 20012198 20001023 20000157;10000123;30000004;0.623 0.233 0.290 0.208 0.354 49.000 0.000 0.000 0.000 -1.000 0.569 0.679 0.733 53 17 2 0;20002001 20001240 20001860 20003611 20000723;20002001 20001240 20001860 20003611 20000723;10000047;30000004;0.067 0.000 0.161 0.005 0.000 49.000 0.000 0.000 0.000 -1.000 0.000 0.378 0.043 0 6 0 0;20002001 20001240 20001860 20003611 20000157 20000723 20000070 20002616 20000157 20000005;20002001 20001240 20001860 20003611 20000157 20000723 20000070 20003519 20000005;10000200;30000001;0.407 0.111 0.196 0.095 0.181 49.000 0.000 0.000 0.000 -1.000 0.306 0.538 0.355 48 8 0 0;20002001 20001240 20001860 20003611 20000157 20001776 20000070 20002616 20000157 20000005;20002001 20001240 20001860 20003611 20000157 20001776 20000070 20003519 20000005;10000200;30000001;0.226 0.029 0.149 0.031 0.074 49.000 0.000 0.000 0.000 -1.000 0.220 0.531 0.286 26 6 0 0;20002001 20001240 20001860 20003611 20000723 20000070 20002001 20001240 20001860 20003611 20012788 20000157;20002001 20001240 20001860 20003611 20000723 20000070 20002001 20001240 20001860 20003611 20131464;10063938;30000001;0.250 0.019 0.138 0.012 0.027 49.000 0.000 0.000 0.000 -1.000 0.370 0.449 0.327 7 2 0 0;20002001 20001240 20001860 20003611 20000723;20002001 20001240 20001860 20003611 20000723;10000003;30000002;0.056 0.000 0.139 0.003 0.000 49.000 0.000 0.000 0.000 -1.000 0.000 0.346 0.059 15 3 0 0;20002001 20001240 20001860 20003611 20000623 20000251 20000157 20000723 20000070 20000001 20000057;20002001 20001240 20001860 20003611 20018820 20000157 20000723 20000070 20000001 20000057;10000008;30000001;0.166 0.004 0.127 0.001 0.004 49.000 0.000 0.000 0.000 -1.000 0.103 0.417 0.394 10 3 0 0;20002640 20004695 20000157 20000723 20000070 20002001 20001240 20001860 20003611;20002640 20034154 20000723 20000070 
20002001 20001240 20001860 20003611;10000177;30000001;0.094 0.008 0.157 0.012 0.059 49.000 0.000 0.000 0.000 -1.000 0.051 0.382 0.142 21 0 0 0;20002001 20001240 20001860 20003611 20000157 20001776 20000070 20000157;20002001 20001240 20001860 20003611 20000157 20001776 20000070 20000157;10000134;30000001;0.220 0.016 0.181 0.037 0.098 49.000 0.000 0.000 0.000 -1.000 0.192 0.453 0.199 17 1 0 0;20002001 20001240 20001860 20003611 20002640 20004695 20000157 20000723 20000070 20002001 20001240 20001860 20003611;20002001 20001240 20001860 20003611 20002640 20034154 20000723 20000070 20002001 20001240 20001860 20003611;10000638;30000001;0.000 0.000 0.000 0.000 0.000 49.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0 0 0 0;\n" + data += "2 1;1 11;20000025 20000404;20001923;20000002 20000157 20000028 20004205 20000500 20028809 20000571 20000007 20027523 20004940 20000651 20000043 20000051 20000520 20015398 20000066 20004720 20000070 20001648;40000001;20000025 20000404 20000571 20004940 20000001 20000017;20000025 20000404 20000029 20000500 20001408 20000404 20000001 20000017;0;0;0;20001923 20011130 20000027;20001923 20000029 20000500 20001408 20000404 20000027;0;0;0;10000005;10000005;0;0;0;20003316 20000392 20001979 20000474 20000025 20000194 20000025 20000404 20000019 20000109;20016528 20024913 20004748 20001923 20000019 20000109;10000015;30000002;0.572 0.043 0.401 0.352 0.562 32859.000 0.005 0.060 0.362 -1.000 0.448 0.673 0.222 16316 991 89 0;20000025 20000404 20000571 20004940 20000001 20000017;20001923 20011130 20000027;10000005;30000001;0.495 0.024 0.344 0.285 0.379 32859.000 0.002 0.050 0.362 -1.000 0.423 0.764 0.254 19929 896 72 0;20000202 20000026 20001314 20004289 20000025 20000404 20000451 20000089 20000007;20000202 20000026 20014094 20001314 20004289 20001923 20000451 20000089 20000007;10000035;30000003;0.133 0.006 0.162 0.042 0.174 32859.000 0.003 0.037 0.362 -1.000 0.363 0.542 0.122 14763 664 53 0;20000202 20000026 20001314 20004289 20000025 20000404;20000202 20000026 20014094 20001314 20004289 20001923;10000021;30000001;0.058 0.004 0.133 0.017 0.120 32859.000 0.000 0.006 0.362 -1.000 0.168 0.437 0.041 -1 -1 -1 -1;20000025 20000404 20000018 20012461 20001699 20000446 20000174 20000062 20000133 20003172 20000240 20007877 20067375 20000111 20000164 20001410 20000204 20016958;20001923 20000018 20012461 20001699 20007717 20000062 20000133 20003172 20000240 20007877 20067375 20000111 20000164 20001410 20000204 20016958;10000002;30000001;0.017 0.000 0.099 0.004 0.072 32859.000 0.000 0.009 0.362 -1.000 0.058 0.393 0.025 -1 -1 -1 -1;20000025 20000404;20001923;10000133;30000005;0.004 0.000 0.122 0.000 0.000 32859.000 0.000 0.000 0.362 -1.000 0.000 0.413 0.020 0 444 35 0;20000025 20000404;20001923;10005297;30000004;0.028 0.000 0.138 0.002 0.000 32859.000 0.000 0.000 0.362 -1.000 0.000 0.343 0.024 0 600 48 0;20000025 20000404;20001923;10000060;30000005;0.107 0.000 0.110 0.027 0.077 32859.000 0.000 0.005 0.362 -1.000 0.095 0.398 0.062 1338 491 39 0;20002960 20005534 20000043 20000025 20000404 20000025 20000007;20002960 20005534 20000043 20001923 20000025 20000007;10000020;30000003;0.041 0.000 0.122 0.012 0.101 32859.000 0.001 0.025 0.362 -1.000 0.302 0.541 0.065 9896 402 35 0;20000025 20000404 20000259 20000228 20000235 20000142;20001923 20000259 20000264 20000142;10000024;30000003;0.072 0.002 0.156 0.026 0.141 32859.000 0.002 0.032 0.362 -1.000 0.386 0.569 0.103 9896 364 35 0;20000025 20000404 20000029 20000500 20001408 20000404 20000001 20000017;20001923 20000029 20000500 20001408 20000404 
20000027;10000005;30000001;0.328 0.006 0.179 0.125 0.181 32859.000 0.003 0.058 0.362 -1.000 0.300 0.445 0.141 9896 402 32 0;20000025 20000404;20001923;10012839;30000002;0.012 0.000 0.108 0.002 0.048 32859.000 0.000 0.000 0.362 -1.000 0.021 0.225 0.016 2207 120 12 0;\n" + # data += "" + f.write(data) + + slot_data = [] + label = fluid.layers.data( + name="click", + shape=[-1, 1], + dtype="int64", + lod_level=0, + append_batch_size=False) + slot_data.append(label) + + # sprase_query_feat_names + len_sparse_query = 19 + for feat_name in range(1, len_sparse_query + 1): + slot_data.append( + fluid.layers.data( + name=str(feat_name), shape=[1], dtype='int64', lod_level=1)) + + # sparse_url_feat_names + for feat_name in range(len_sparse_query + 1, len_sparse_query + 5): + slot_data.append( + fluid.layers.data( + name=str(feat_name), shape=[1], dtype='int64', lod_level=1)) + + # dense_feat_names + for feat_name in range(len_sparse_query + 5, len_sparse_query + 16): + slot_data.append( + fluid.layers.data( + name=str(feat_name), shape=[1], dtype='float32')) + + # context_feat_namess + for feat_name in range(len_sparse_query + 16, len_sparse_query + 18): + slot_data.append( + fluid.layers.data( + name=str(feat_name), shape=[1], dtype='float32')) + + # neg sparse_url_feat_names + for feat_name in range(len_sparse_query + 18, len_sparse_query + 22): + slot_data.append( + fluid.layers.data( + name=str(feat_name), shape=[1], dtype='int64', lod_level=1)) + + # neg dense_feat_names + for feat_name in range(len_sparse_query + 22, len_sparse_query + 33): + slot_data.append( + fluid.layers.data( + name=str(feat_name), shape=[1], dtype='float32')) + + # neg context_feat_namess + for feat_name in range(len_sparse_query + 33, len_sparse_query + 35): + slot_data.append( + fluid.layers.data( + name=str(feat_name), shape=[1], dtype='float32')) + + dataset = paddle.distributed.InMemoryDataset() + + print("========================================") + generator_class = CTRDataset(mode=0) + try: + dataset._check_use_var_with_data_generator( + slot_data, generator_class, "test_run_with_dump_a.txt") + print("case 1: check passed!") + except Exception as e: + print("warning: catch expected error") + print(e) + print("========================================") + print("\n") + + print("========================================") + generator_class = CTRDataset(mode=2) + try: + dataset._check_use_var_with_data_generator( + slot_data, generator_class, "test_run_with_dump_a.txt") + except Exception as e: + print("warning: case 2 catch expected error") + print(e) + print("========================================") + print("\n") + + print("========================================") + generator_class = CTRDataset(mode=3) + try: + dataset._check_use_var_with_data_generator( + slot_data, generator_class, "test_run_with_dump_a.txt") + except Exception as e: + print("warning: case 3 catch expected error") + print(e) + print("========================================") + print("\n") + + print("========================================") + generator_class = CTRDataset(mode=4) + try: + dataset._check_use_var_with_data_generator( + slot_data, generator_class, "test_run_with_dump_a.txt") + except Exception as e: + print("warning: case 4 catch expected error") + print(e) + print("========================================") + print("\n") + + print("========================================") + generator_class = CTRDataset(mode=5) + try: + dataset._check_use_var_with_data_generator( + slot_data, generator_class, "test_run_with_dump_a.txt") + 
except Exception as e: + print("warning: case 5 catch expected error") + print(e) + print("========================================") + + os.remove("./test_run_with_dump_a.txt") + + +if __name__ == '__main__': + unittest.main() From 51939c83e4bbc308b921541c9091b25416ccae86 Mon Sep 17 00:00:00 2001 From: ronnywang <524019753@qq.com> Date: Wed, 18 Aug 2021 15:46:48 +0800 Subject: [PATCH 089/126] Fix the parameter name for atan2 API (#34812) --- python/paddle/tensor/math.py | 50 ++++++++++++++++++------------------ 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index 394d46b9161903..026224ce7a69ed 100755 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -2491,25 +2491,25 @@ def neg(x, name=None): return layers.scale(x, scale=-1.0, bias=0.0, bias_after_scale=True, act=None, name=name) -def atan2(y, x, name=None): +def atan2(x, y, name=None): r""" - Element-wise arctangent of y/x with consideration of the quadrant. + Element-wise arctangent of x/y with consideration of the quadrant. Equation: .. math:: - atan2(y,x)=\left\{\begin{matrix} - & tan^{-1}(\frac{y}{x}) & x > 0 \\ - & tan^{-1}(\frac{y}{x}) + \pi & y>=0, x < 0 \\ - & tan^{-1}(\frac{y}{x}) - \pi & y<0, x < 0 \\ - & +\frac{\pi}{2} & y>0, x = 0 \\ - & -\frac{\pi}{2} & y<0, x = 0 \\ - &\text{undefined} & y=0, x = 0 - \end{matrix}\right. + atan2(x,y)=\left\{\begin{matrix} + & tan^{-1}(\frac{x}{y}) & y > 0 \\ + & tan^{-1}(\frac{x}{y}) + \pi & x>=0, y < 0 \\ + & tan^{-1}(\frac{x}{y}) - \pi & x<0, y < 0 \\ + & +\frac{\pi}{2} & x>0, y = 0 \\ + & -\frac{\pi}{2} & x<0, y = 0 \\ + &\text{undefined} & x=0, y = 0 + \end{matrix}\right. Args: - y (Tensor): An N-D Tensor, the data type is int32, int64, float16, float32, float64. - x (Tensor): An N-D Tensor, must have the same type as `x`. + x (Tensor): An N-D Tensor, the data type is int32, int64, float16, float32, float64. + y (Tensor): An N-D Tensor, must have the same type as `x`. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: @@ -2518,30 +2518,30 @@ def atan2(y, x, name=None): Examples: .. 
code-block:: python - import paddle + import paddle - y = paddle.to_tensor([-1, +1, +1, -1]).astype('float32') - #Tensor(shape=[4], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # [-1, 1, 1, -1]) + x = paddle.to_tensor([-1, +1, +1, -1]).astype('float32') + #Tensor(shape=[4], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # [-1, 1, 1, -1]) - x = paddle.to_tensor([-1, -1, +1, +1]).astype('float32') - #Tensor(shape=[4], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # [-1, -1, 1, 1]) + y = paddle.to_tensor([-1, -1, +1, +1]).astype('float32') + #Tensor(shape=[4], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # [-1, -1, 1, 1]) - out = paddle.atan2(y, x) - #Tensor(shape=[4], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # [-2.35619450, 2.35619450, 0.78539819, -0.78539819]) + out = paddle.atan2(x, y) + #Tensor(shape=[4], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # [-2.35619450, 2.35619450, 0.78539819, -0.78539819]) """ if in_dygraph_mode(): - return _C_ops.atan2(y, x) + return _C_ops.atan2(x, y) else: - check_variable_and_dtype(y, 'y', ['int32', 'int64', 'float16', 'float32', 'float64'], 'atan2') check_variable_and_dtype(x, 'x', ['int32', 'int64', 'float16', 'float32', 'float64'], 'atan2') + check_variable_and_dtype(y, 'y', ['int32', 'int64', 'float16', 'float32', 'float64'], 'atan2') helper = LayerHelper('atan2', **locals()) - inputs = {'X1' : y, 'X2' : x} + inputs = {'X1' : x, 'X2' : y} out = helper.create_variable_for_type_inference(dtype=x.dtype) helper.append_op( type='atan2', inputs=inputs, outputs={'Out': out}) From a9673b44b8fc0d92e9b48bc14f1f9bf63a90c71a Mon Sep 17 00:00:00 2001 From: WangXi Date: Wed, 18 Aug 2021 16:09:02 +0800 Subject: [PATCH 090/126] [Hybrid Performance] Move the cast op of AMP which cast fp32 param to fp16 param to the optimizer (#34965) --- .../framework/distributed_strategy.proto | 1 + .../fleet/base/distributed_strategy.py | 3 + .../sharding/offload_helper.py | 48 +++-- .../meta_optimizers/sharding_optimizer.py | 8 + .../test_fleet_sharding_meta_optimizer.py | 191 ++++++++++++++++++ 5 files changed, 236 insertions(+), 15 deletions(-) diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto index 1de6d26d05b9e4..546b9d2601df57 100644 --- a/paddle/fluid/framework/distributed_strategy.proto +++ b/paddle/fluid/framework/distributed_strategy.proto @@ -42,6 +42,7 @@ message ShardingConfig { optional bool optimize_offload = 9 [ default = false ]; optional bool pp_allreduce_in_optimize = 10 [ default = false ]; optional int32 pp_degree = 11 [ default = 1 ]; + optional bool optimize_cast = 12 [ default = false ]; } message HybridConfig { diff --git a/python/paddle/distributed/fleet/base/distributed_strategy.py b/python/paddle/distributed/fleet/base/distributed_strategy.py index 051f6b11c2609a..d43292ddbd32e9 100644 --- a/python/paddle/distributed/fleet/base/distributed_strategy.py +++ b/python/paddle/distributed/fleet/base/distributed_strategy.py @@ -888,6 +888,9 @@ def sharding_configs(self): pp_allreduce_in_optimize(bool, optional): [Hybrid parallelism ONLY] move the allreduce operations from backward stage to update(optimize) stage when pipeline parallelsim is on. This configuration will affect the communication speed of Hybrid parallelism training depeneded on network topology. this strategy is experimental by now.. Default is False. 
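A typical way to enable the new optimize_cast option together with pipeline parallelism, sketched after the unit test added in this patch (the degree and batch-size values are placeholders, not recommendations):

    import paddle.distributed.fleet as fleet

    strategy = fleet.DistributedStrategy()
    strategy.amp = True
    strategy.sharding = True
    strategy.sharding_configs = {
        "sharding_degree": 1,
        "mp_degree": 1,
        "pp_degree": 2,
        "dp_degree": 2,
        "optimize_cast": True,  # persist fp16 params; fp32->fp16 cast moved into the optimizer
    }
    strategy.pipeline = True
    strategy.pipeline_configs = {
        "schedule_mode": "1F1B",
        "micro_batch_size": 2,
        "accumulate_steps": 4,
    }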
+ optimize_cast(bool, optional): [Hybrid parallelism ONLY] Move the cast op of AMP which cast fp32 param to fp16 param to optimizer. optimize_cast will persist fp16 param, it + will take more memory, but will be faster, trade space for time. Recommend to turn on only when using pipeline or gradient_merge_acc_step large. + Examples: diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding/offload_helper.py b/python/paddle/distributed/fleet/meta_optimizers/sharding/offload_helper.py index f6741b165ce072..a96705b09e835e 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/sharding/offload_helper.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding/offload_helper.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from ..common import is_optimizer_op, OP_ROLE_KEY, OpRole +from ..common import is_optimizer_op, OP_ROLE_KEY, OpRole, is_update_op from paddle.fluid import core, unique_name __all__ = [] @@ -84,7 +84,7 @@ def _create_offload_var(self, var_name, offload_var_name, blocks): dtype=var.dtype, persistable=True) - def offload_fp32param(self, block, startup_block): + def offload_fp32param(self, block, startup_block, offload=True): """ (p_fp16) = cast(p) (p_fp16_recompute) = cast(p) @@ -113,11 +113,12 @@ def remove_param(input_name): # step1: record param for idx, op in reversed(list(enumerate(block.ops))): - if op.type in ('adam', 'momentum', 'lars', 'lamb'): + if is_update_op(op): param = op.desc.input("Param")[0] param_to_idx[param] = idx - # step2: remove param which can't offload + # step2: remove param which can't offload and + # record param->fp16param, fp16param->recompute_var for idx, op in enumerate(block.ops): if is_optimizer_op(op): break @@ -125,7 +126,7 @@ def remove_param(input_name): if input_name not in param_to_idx: continue - # param is real used by fp32 op + # param which will be used by fp32 op if op.type != 'cast': remove_param(input_name) continue @@ -154,17 +155,19 @@ def remove_param(input_name): # step3: main_block add offload, cast op # change recompute to fp16, remove cast(param) to fp16 for idx, op in reversed(list(enumerate(block.ops))): - if op.type in ('adam', 'momentum', 'lars', 'lamb'): + if is_update_op(op): param = op.desc.input("Param")[0] if param not in param_to_idx: continue # step3.1: create offload_var offload_var_name = self._get_offload_var_name(param) param_name_to_offload_name[param] = offload_var_name - self._create_offload_var(param, offload_var_name, - [block, startup_block]) + if offload: + self._create_offload_var(param, offload_var_name, + [block, startup_block]) - # step3.2: insert cast op and offload op - self._insert_offload_op(block, idx + 1, param, offload_var_name) + # step3.2: insert cast op and offload op + self._insert_offload_op(block, idx + 1, param, + offload_var_name) assert param in param_to_fp16 fp16_param_name = param_to_fp16[param] @@ -173,8 +176,9 @@ def remove_param(input_name): self._insert_cast_op(block, idx + 1, param, param_to_fp16[param]) - # step3.3: insert fetch op - self._insert_fetch_op(block, idx, offload_var_name, param) + if offload: + # step3.3: insert fetch op + self._insert_fetch_op(block, idx, offload_var_name, param) continue # step3.4: remove cast op @@ -206,9 +210,10 @@ def remove_param(input_name): if out_name in param_name_to_offload_name: var_name = out_name - offload_var_name = param_name_to_offload_name[var_name] - self._insert_offload_op(startup_block, idx + 1, var_name, - offload_var_name) + 
if offload: + offload_var_name = param_name_to_offload_name[var_name] + self._insert_offload_op(startup_block, idx + 1, + var_name, offload_var_name) self._insert_cast_op(startup_block, idx + 1, var_name, param_to_fp16[var_name]) @@ -217,6 +222,19 @@ def remove_param(input_name): block._sync_with_cpp() startup_block._sync_with_cpp() + def cast_fp32param_in_optimize(self, block, startup_block): + """ + (p_fp16) = cast(p) + (p_fp16_recompute) = cast(p) + (pout,) = adam(p) + ===========================> + rename(p_fp16_recompute, p_fp16) + + (pout,) = adam(p) + (p_fp16) = cast(p) + """ + self.offload_fp32param(block, startup_block, offload=False) + def offload(self, block, startup_block): """ (m1, m2) = prefetch(m1@offload, m2@offload) diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py index 93901b38873b95..5c2f24054f835c 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py @@ -400,7 +400,14 @@ def _apply_optimize_offload_pass(self): logger.info("Sharding with optimize offload !") offload_helper = OffloadHelper() offload_helper.offload(main_block, startup_block) + # The optimize_cast is already included in offload_fp32param offload_helper.offload_fp32param(main_block, startup_block) + elif sharding_configs['optimize_cast']: + logger.info("Sharding with optimize cast !") + # NOTE(wangxi): optimize_cast will persist fp16 param, it + # will take more memory, but will be faster. Trade space for time. + offload_helper = OffloadHelper() + offload_helper.cast_fp32param_in_optimize(main_block, startup_block) def _dump_program_for_debug(self): main_block = self._main_program.global_block() @@ -444,6 +451,7 @@ def minimize_impl(self, # loss div dp_degree self._insert_loss_grad_scale_op() + # apply optimize offload or optimize cast self._apply_optimize_offload_pass() # step6: (optional) sharding gradient merge diff --git a/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py index d70a58c7d8ab41..5a981a470cb4ef 100755 --- a/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py @@ -859,6 +859,197 @@ def test_hybrid_with_sharding_pp_amp_fp16allreduce_in_optimize(self): self.assertEqual(pp_group_waiting_ports, ['127.0.0.1:36002']) + def test_hybrid_with_pp_dp_amp_fp16allreduce_optimize_cast(self): + train_prog, startup_prog = paddle.fluid.Program(), paddle.fluid.Program( + ) + avg_cost, strategy = self.pp_net(train_prog, startup_prog) + strategy.amp = True + strategy.amp_configs = {'custom_black_varnames': ['fc_6.b_0'], } + strategy.sharding = True + strategy.sharding_configs = { + "sharding_degree": 1, + "mp_degree": 1, + "pp_degree": 2, + "dp_degree": 2, + "optimize_cast": True, + } + strategy.pipeline = True + strategy.pipeline_configs = { + "schedule_mode": "1F1B", + "micro_batch_size": 2, + "accumulate_steps": 4, + } + strategy.fp16_allreduce = True + self.optimizer(avg_cost, strategy, train_prog, startup_prog) + train_prog = train_prog._pipeline_opt['section_program'] + startup_prog = startup_prog._pipeline_opt['startup_program'] + + startup_prog_ops = startup_prog.global_block().ops + main_prog_ops = train_prog.global_block().ops + + # check program + startup_prog_op_types = [op.type for op in 
startup_prog_ops] + main_prog_op_types = [op.type for op in main_prog_ops] + + # ring: mp, pp_group, pp_pair, pp_pair + self.assertEqual(startup_prog_op_types, [ + 'uniform_random', 'cast', 'fill_constant', 'cast', 'uniform_random', + 'cast', 'fill_constant', 'cast', 'uniform_random', 'cast', + 'fill_constant', 'cast', 'uniform_random', 'cast', 'fill_constant', + 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', + 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', + 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', + 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', + 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', + 'c_sync_comm_stream' + ]) + + self.assertEqual(main_prog_op_types, [ + 'recv_v2', 'mul', 'elementwise_add', 'tanh', 'mul', + 'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'tanh', 'mul', + 'cast', 'elementwise_add', 'softmax', 'cross_entropy2', 'mean', + 'elementwise_mul', 'fill_constant', 'scale', 'scale', + 'elementwise_mul_grad', 'mean_grad', 'cross_entropy_grad2', + 'softmax_grad', 'elementwise_add_grad', 'cast', 'mul_grad', + 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad', + 'elementwise_add_grad', 'mul_grad', 'tanh_grad', + 'elementwise_add_grad', 'mul_grad', 'c_sync_calc_stream', 'send_v2', + 'fill_constant', 'cast', 'sum', 'fill_constant', 'sum', + 'fill_constant', 'sum', 'fill_constant', 'sum', 'fill_constant', + 'sum', 'fill_constant', 'sum', 'fill_constant', 'sum', + 'fill_constant', 'sum', 'coalesce_tensor', 'c_allreduce_sum', + 'cast', 'cast', 'cast', 'cast', 'cast', 'cast', 'cast', 'cast', + 'c_sync_comm_stream', 'check_finite_and_unscale', 'cast', + 'c_allreduce_max', 'cast', 'update_loss_scaling', 'momentum', + 'cast', 'momentum', 'cast', 'momentum', 'cast', 'momentum', 'cast', + 'momentum', 'cast', 'momentum', 'cast', 'momentum', 'momentum', + 'cast' + ]) + + # amp check_finite_and_unscale, allreduce(pp) + self.assertEqual(main_prog_op_types.count('c_allreduce_max'), 1) + + # should has ring id for pp + created_ring_ids = [ + op.desc.attr("ring_id") for op in startup_prog_ops + if op.type == "c_comm_init" + ] + self.assertIn(self.pp_pair_ring_id, created_ring_ids) + self.assertIn(self.dp_ring_id, created_ring_ids) + + # check correctness of pp group + for op in startup_prog_ops: + if op.type == "c_gen_nccl_id" and op.desc.output_arg_names()[ + 0] == "comm_id_0": + pp_group_waiting_ports = op.desc.attr("other_endpoints") + + self.assertEqual(pp_group_waiting_ports, ['127.0.0.1:36003']) + + # check correctness of dp group + for op in startup_prog_ops: + if op.type == "c_gen_nccl_id" and op.desc.output_arg_names()[ + 0] == "comm_id_3": + dp_group_waiting_ports = op.desc.attr("other_endpoints") + + self.assertEqual(dp_group_waiting_ports, ['127.0.0.1:36002']) + + def test_hybrid_with_pp_dp_amp_fp16allreduce_optimize_offload(self): + train_prog, startup_prog = paddle.fluid.Program(), paddle.fluid.Program( + ) + avg_cost, strategy = self.pp_net(train_prog, startup_prog) + strategy.amp = True + strategy.amp_configs = {'custom_black_varnames': ['fc_6.b_0'], } + strategy.sharding = True + strategy.sharding_configs = { + "sharding_degree": 1, + "mp_degree": 1, + "pp_degree": 2, + "dp_degree": 2, + "optimize_offload": True, + } + strategy.pipeline = True + strategy.pipeline_configs = { + "schedule_mode": "1F1B", + "micro_batch_size": 2, + "accumulate_steps": 4, + } + strategy.fp16_allreduce = True + self.optimizer(avg_cost, strategy, train_prog, startup_prog) + 
train_prog = train_prog._pipeline_opt['section_program'] + startup_prog = startup_prog._pipeline_opt['startup_program'] + + startup_prog_ops = startup_prog.global_block().ops + main_prog_ops = train_prog.global_block().ops + + # check program + startup_prog_op_types = [op.type for op in startup_prog_ops] + main_prog_op_types = [op.type for op in main_prog_ops] + + # ring: mp, pp_group, pp_pair, pp_pair + self.assertEqual(startup_prog_op_types, [ + 'uniform_random', 'cast', 'memcpy', 'fill_constant', 'cast', + 'memcpy', 'uniform_random', 'cast', 'memcpy', 'fill_constant', + 'cast', 'memcpy', 'uniform_random', 'cast', 'memcpy', + 'fill_constant', 'cast', 'memcpy', 'uniform_random', 'cast', + 'memcpy', 'fill_constant', 'fill_constant', 'fill_constant', + 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', + 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', + 'fill_constant', 'fill_constant', 'c_gen_nccl_id', 'c_comm_init', + 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', + 'c_gen_nccl_id', 'c_comm_init', 'c_sync_comm_stream' + ]) + + self.assertEqual(main_prog_op_types, [ + 'recv_v2', 'mul', 'elementwise_add', 'tanh', 'mul', + 'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'tanh', 'mul', + 'cast', 'elementwise_add', 'softmax', 'cross_entropy2', 'mean', + 'elementwise_mul', 'fill_constant', 'scale', 'scale', + 'elementwise_mul_grad', 'mean_grad', 'cross_entropy_grad2', + 'softmax_grad', 'elementwise_add_grad', 'cast', 'mul_grad', + 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad', + 'elementwise_add_grad', 'mul_grad', 'tanh_grad', + 'elementwise_add_grad', 'mul_grad', 'c_sync_calc_stream', 'send_v2', + 'fill_constant', 'cast', 'sum', 'fill_constant', 'sum', + 'fill_constant', 'sum', 'fill_constant', 'sum', 'fill_constant', + 'sum', 'fill_constant', 'sum', 'fill_constant', 'sum', + 'fill_constant', 'sum', 'coalesce_tensor', 'c_allreduce_sum', + 'cast', 'cast', 'cast', 'cast', 'cast', 'cast', 'cast', 'cast', + 'c_sync_comm_stream', 'check_finite_and_unscale', 'cast', + 'c_allreduce_max', 'cast', 'update_loss_scaling', 'memcpy', + 'momentum', 'cast', 'memcpy', 'memcpy', 'momentum', 'cast', + 'memcpy', 'memcpy', 'momentum', 'cast', 'memcpy', 'memcpy', + 'momentum', 'cast', 'memcpy', 'memcpy', 'momentum', 'cast', + 'memcpy', 'memcpy', 'momentum', 'cast', 'memcpy', 'momentum', + 'memcpy', 'momentum', 'cast', 'memcpy' + ]) + + # amp check_finite_and_unscale, allreduce(pp) + self.assertEqual(main_prog_op_types.count('c_allreduce_max'), 1) + + # should has ring id for pp + created_ring_ids = [ + op.desc.attr("ring_id") for op in startup_prog_ops + if op.type == "c_comm_init" + ] + self.assertIn(self.pp_pair_ring_id, created_ring_ids) + self.assertIn(self.dp_ring_id, created_ring_ids) + + # check correctness of pp group + for op in startup_prog_ops: + if op.type == "c_gen_nccl_id" and op.desc.output_arg_names()[ + 0] == "comm_id_0": + pp_group_waiting_ports = op.desc.attr("other_endpoints") + + self.assertEqual(pp_group_waiting_ports, ['127.0.0.1:36003']) + + # check correctness of dp group + for op in startup_prog_ops: + if op.type == "c_gen_nccl_id" and op.desc.output_arg_names()[ + 0] == "comm_id_3": + dp_group_waiting_ports = op.desc.attr("other_endpoints") + + self.assertEqual(dp_group_waiting_ports, ['127.0.0.1:36002']) + if __name__ == "__main__": unittest.main() From 40f627370ba0a1ea75f864a9107b5d7a979a911d Mon Sep 17 00:00:00 2001 From: Jackwaterveg <87408988+Jackwaterveg@users.noreply.github.com> Date: Wed, 18 Aug 2021 
16:44:38 +0800 Subject: [PATCH 091/126] [NPU] Add leaky Relu (#34894) * test=develop * test=develop --- paddle/fluid/operators/activation_op_npu.cc | 53 +++++++ .../unittests/npu/test_leaky_relu_op_npu.py | 141 ++++++++++++++++++ 2 files changed, 194 insertions(+) create mode 100644 python/paddle/fluid/tests/unittests/npu/test_leaky_relu_op_npu.py diff --git a/paddle/fluid/operators/activation_op_npu.cc b/paddle/fluid/operators/activation_op_npu.cc index 8f6af4260dcc96..d815a3eeb4d81c 100755 --- a/paddle/fluid/operators/activation_op_npu.cc +++ b/paddle/fluid/operators/activation_op_npu.cc @@ -207,6 +207,47 @@ class SqrtNPUKernel : public framework::OpKernel { } }; +template +class LeakyReluNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); + auto alpha = ctx.Attr("alpha"); + + out->mutable_data(ctx.GetPlace()); + + auto stream = + ctx.template device_context() + .stream(); + + const auto& runner = + NpuOpRunner("LeakyRelu", {*x}, {*out}, {{"negative_slope", alpha}}); + runner.Run(stream); + } +}; + +template +class LeakyReluGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto alpha = ctx.Attr("alpha"); + + auto stream = + ctx.template device_context() + .stream(); + + dx->mutable_data(ctx.GetPlace()); + const auto& runner = NpuOpRunner("LeakyReluGrad", {*dout, *x}, {*dx}, + {{"negative_slope", alpha}}); + + runner.Run(stream); + } +}; + template class SqrtGradNPUKernel : public framework::OpKernel { public: @@ -778,6 +819,18 @@ REGISTER_OP_NPU_KERNEL( ops::Relu6GradNPUKernel); +REGISTER_OP_NPU_KERNEL( + leaky_relu, + ops::LeakyReluNPUKernel, + ops::LeakyReluNPUKernel); + +REGISTER_OP_NPU_KERNEL( + leaky_relu_grad, + ops::LeakyReluGradNPUKernel, + ops::LeakyReluGradNPUKernel); + REGISTER_OP_NPU_KERNEL( sqrt, ops::SqrtNPUKernel, ops::SqrtNPUKernel Date: Wed, 18 Aug 2021 16:44:58 +0800 Subject: [PATCH 092/126] [NPU] Add square grad (#34889) * test=develop * test=develop --- paddle/fluid/operators/activation_op_npu.cc | 35 +++++++++++++++++++ .../tests/unittests/npu/test_square_op_npu.py | 10 +++--- 2 files changed, 39 insertions(+), 6 deletions(-) mode change 100755 => 100644 paddle/fluid/operators/activation_op_npu.cc diff --git a/paddle/fluid/operators/activation_op_npu.cc b/paddle/fluid/operators/activation_op_npu.cc old mode 100755 new mode 100644 index d815a3eeb4d81c..5f2925784e4244 --- a/paddle/fluid/operators/activation_op_npu.cc +++ b/paddle/fluid/operators/activation_op_npu.cc @@ -386,6 +386,35 @@ class SquareNPUKernel : public framework::OpKernel { } }; +template +class SquareGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + + auto factor = static_cast(2.0); + + auto place = ctx.GetPlace(); + auto stream = + ctx.template device_context() + .stream(); + // Step 1: Compute x_muls_factor = factor * x + Tensor x_muls_factor(x->type()); + x_muls_factor.mutable_data(x->dims(), place); + const auto& runner_muls_1 = + NpuOpRunner("Muls", {*x}, {x_muls_factor}, {{"value", factor}}); + runner_muls_1.Run(stream); + + // Step 2: 
Compute dx = dout * factor * x + dx->mutable_data(place); + const auto& runner_mul_2 = + NpuOpRunner("Mul", {*dout, x_muls_factor}, {*dx}, {}); + runner_mul_2.Run(stream); + } +}; + template class SigmoidNPUKernel : public framework::OpKernel { public: @@ -869,6 +898,12 @@ REGISTER_OP_NPU_KERNEL( paddle::platform::float16>, ops::SquareNPUKernel); +REGISTER_OP_NPU_KERNEL( + square_grad, + ops::SquareGradNPUKernel, + ops::SquareNPUKernel); + REGISTER_OP_NPU_KERNEL( sigmoid, ops::SigmoidNPUKernel, ops::SigmoidNPUKernel Date: Wed, 18 Aug 2021 17:20:51 +0800 Subject: [PATCH 093/126] add paddle detection model in pr-ci-inference (#34986) --- paddle/fluid/inference/api/analysis_config.cc | 2 +- paddle/fluid/inference/tests/infer_ut/run.sh | 122 ++++++++------ .../tests/infer_ut/test_ppyolo_mbv3.cc | 156 ++++++++++++++++++ .../tests/infer_ut/test_ppyolov2_r50vd.cc | 155 +++++++++++++++++ .../inference/tests/infer_ut/test_resnet50.cc | 29 ++++ .../inference/tests/infer_ut/test_yolov3.cc | 155 +++++++++++++++++ 6 files changed, 571 insertions(+), 48 deletions(-) create mode 100644 paddle/fluid/inference/tests/infer_ut/test_ppyolo_mbv3.cc create mode 100644 paddle/fluid/inference/tests/infer_ut/test_ppyolov2_r50vd.cc create mode 100644 paddle/fluid/inference/tests/infer_ut/test_yolov3.cc diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index b515f7050e510b..bf7199491343e8 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -740,7 +740,7 @@ std::string AnalysisConfig::Summary() { // cpu info os.InsertRow( {"cpu_math_thread", std::to_string(cpu_math_library_num_threads_)}); - os.InsertRow({"enable_mkdlnn", use_mkldnn_ ? "true" : "false"}); + os.InsertRow({"enable_mkldnn", use_mkldnn_ ? "true" : "false"}); os.InsertRow( {"mkldnn_cache_capacity", std::to_string(mkldnn_cache_capacity_)}); os.InsetDivider(); diff --git a/paddle/fluid/inference/tests/infer_ut/run.sh b/paddle/fluid/inference/tests/infer_ut/run.sh index 7d17bb647a1103..c1694c76a7d2c8 100755 --- a/paddle/fluid/inference/tests/infer_ut/run.sh +++ b/paddle/fluid/inference/tests/infer_ut/run.sh @@ -24,8 +24,14 @@ MSVC_STATIC_CRT=$6 inference_install_dir=${PADDLE_ROOT}/build/paddle_inference_install_dir EXIT_CODE=0 # init default exit code +export RED='\033[0;31m' # red color +export NC='\033[0m' # no color +export YELLOW='\033[33m' # yellow color + cd `dirname $0` current_dir=`pwd` +build_dir=${current_dir}/build +log_dir=${current_dir}/log if [ $2 == ON ]; then # You can export yourself if move the install path MKL_LIB=${inference_install_dir}/third_party/install/mklml/lib @@ -83,24 +89,42 @@ for model_name in $nlp_download_list; do download $url_prefix $model_name done +det_download_list='yolov3 ppyolo_mbv3 ppyolov2_r50vd' +for model_name in $det_download_list; do + url_prefix="https://paddle-qa.bj.bcebos.com/inference_model/2.1.1/detection" + download $url_prefix $model_name +done + +function compile_test() { + mkdir -p ${build_dir} + cd ${build_dir} + TEST_NAME=$1 + cmake .. 
-DPADDLE_LIB=${inference_install_dir} \ + -DWITH_MKL=$TURN_ON_MKL \ + -DDEMO_NAME=${TEST_NAME} \ + -DWITH_GPU=$TEST_GPU_CPU \ + -DWITH_STATIC_LIB=OFF \ + -DUSE_TENSORRT=$USE_TENSORRT \ + -DTENSORRT_ROOT=$TENSORRT_ROOT_DIR \ + -DWITH_GTEST=ON + make -j$(nproc) + cd - +} + + # compile and run test cd $current_dir -mkdir -p build -cd build +mkdir -p ${build_dir} +mkdir -p ${log_dir} +cd ${build_dir} rm -rf * -# ---------tensorrt resnet50 on linux--------- +# ---------tensorrt gpu tests on linux--------- if [ $USE_TENSORRT == ON -a $TEST_GPU_CPU == ON ]; then rm -rf * - cmake .. -DPADDLE_LIB=${inference_install_dir} \ - -DWITH_MKL=$TURN_ON_MKL \ - -DDEMO_NAME=test_resnet50 \ - -DWITH_GPU=$TEST_GPU_CPU \ - -DWITH_STATIC_LIB=OFF \ - -DUSE_TENSORRT=$USE_TENSORRT \ - -DTENSORRT_ROOT=$TENSORRT_ROOT_DIR \ - -DWITH_GTEST=ON - make -j$(nproc) + + printf "${YELLOW} start test_resnet50 ${NC} \n"; + compile_test "test_resnet50" ./test_resnet50 \ --modeldir=$DATA_DIR/resnet50/resnet50 \ --gtest_output=xml:test_resnet50.xml @@ -108,18 +132,9 @@ if [ $USE_TENSORRT == ON -a $TEST_GPU_CPU == ON ]; then echo "test_resnet50 runs failed" >> ${current_dir}/build/test_summary.txt EXIT_CODE=1 fi -fi -# ---------tensorrt det_mv3_db on linux--------- -if [ $USE_TENSORRT == ON -a $TEST_GPU_CPU == ON ]; then - cmake .. -DPADDLE_LIB=${inference_install_dir} \ - -DWITH_MKL=$TURN_ON_MKL \ - -DDEMO_NAME=test_det_mv3_db \ - -DWITH_GPU=$TEST_GPU_CPU \ - -DWITH_STATIC_LIB=OFF \ - -DUSE_TENSORRT=$USE_TENSORRT \ - -DTENSORRT_ROOT=$TENSORRT_ROOT_DIR \ - -DWITH_GTEST=ON + printf "${YELLOW} start test_det_mv3_db ${NC} \n"; + compile_test "test_det_mv3_db" make -j$(nproc) ./test_det_mv3_db \ --modeldir=$DATA_DIR/ocr_det_mv3_db/ocr_det_mv3_db \ @@ -128,19 +143,9 @@ if [ $USE_TENSORRT == ON -a $TEST_GPU_CPU == ON ]; then echo "test_det_mv3_db runs failed" >> ${current_dir}/build/test_summary.txt EXIT_CODE=1 fi -fi -# ---------tensorrt LeViT on linux--------- -if [ $USE_TENSORRT == ON -a $TEST_GPU_CPU == ON ]; then - cmake .. -DPADDLE_LIB=${inference_install_dir} \ - -DWITH_MKL=$TURN_ON_MKL \ - -DDEMO_NAME=test_LeViT \ - -DWITH_GPU=$TEST_GPU_CPU \ - -DWITH_STATIC_LIB=OFF \ - -DUSE_TENSORRT=$USE_TENSORRT \ - -DTENSORRT_ROOT=$TENSORRT_ROOT_DIR \ - -DWITH_GTEST=ON - make -j$(nproc) + printf "${YELLOW} start test_LeViT ${NC} \n"; + compile_test "test_LeViT" ./test_LeViT \ --modeldir=$DATA_DIR/LeViT/LeViT \ --gtest_output=xml:test_LeViT.xml @@ -148,19 +153,9 @@ if [ $USE_TENSORRT == ON -a $TEST_GPU_CPU == ON ]; then echo "test_LeViT runs failed" >> ${current_dir}/build/test_summary.txt EXIT_CODE=1 fi -fi -# ---------gpu ernie_text_cls on linux--------- -if [ $USE_TENSORRT == ON -a $TEST_GPU_CPU == ON ]; then - cmake .. -DPADDLE_LIB=${inference_install_dir} \ - -DWITH_MKL=$TURN_ON_MKL \ - -DDEMO_NAME=test_ernie_text_cls \ - -DWITH_GPU=$TEST_GPU_CPU \ - -DWITH_STATIC_LIB=OFF \ - -DUSE_TENSORRT=$USE_TENSORRT \ - -DTENSORRT_ROOT=$TENSORRT_ROOT_DIR \ - -DWITH_GTEST=ON - make -j$(nproc) + printf "${YELLOW} start test_ernie_text_cls ${NC} \n"; + compile_test "test_ernie_text_cls" ./test_ernie_text_cls \ --modeldir=$DATA_DIR/ernie_text_cls/ernie_text_cls \ --gtest_output=xml:test_ernie_text_cls.xml @@ -168,8 +163,41 @@ if [ $USE_TENSORRT == ON -a $TEST_GPU_CPU == ON ]; then echo "test_ernie_text_cls runs failed" >> ${current_dir}/build/test_summary.txt EXIT_CODE=1 fi + + printf "${YELLOW} start test_yolov3 ${NC} \n"; + compile_test "test_yolov3" + ./test_yolov3 \ + --modeldir=$DATA_DIR/yolov3/yolov3 \ + --gtest_output=xml:test_yolov3.xml + if [ $? 
-ne 0 ]; then + echo "test_yolov3 runs failed" >> ${current_dir}/build/test_summary.txt + EXIT_CODE=1 + fi + + printf "${YELLOW} start test_ppyolo_mbv3 ${NC} \n"; + compile_test "test_ppyolo_mbv3" + ./test_ppyolo_mbv3 \ + --modeldir=$DATA_DIR/ppyolo_mbv3/ppyolo_mbv3 \ + --gtest_output=xml:test_ppyolo_mbv3.xml + if [ $? -ne 0 ]; then + echo "test_ppyolo_mbv3 runs failed" >> ${current_dir}/build/test_summary.txt + EXIT_CODE=1 + fi + + printf "${YELLOW} start test_ppyolov2_r50vd ${NC} \n"; + compile_test "test_ppyolov2_r50vd" + ./test_ppyolov2_r50vd \ + --modeldir=$DATA_DIR/ppyolov2_r50vd/ppyolov2_r50vd \ + --gtest_output=xml:test_ppyolov2_r50vd.xml + if [ $? -ne 0 ]; then + echo "test_ppyolov2_r50vd runs failed" >> ${current_dir}/build/test_summary.txt + EXIT_CODE=1 + fi + + cp ./*.xml ${log_dir}; fi + if [[ -f ${current_dir}/build/test_summary.txt ]];then echo "=====================test summary======================" cat ${current_dir}/build/test_summary.txt diff --git a/paddle/fluid/inference/tests/infer_ut/test_ppyolo_mbv3.cc b/paddle/fluid/inference/tests/infer_ut/test_ppyolo_mbv3.cc new file mode 100644 index 00000000000000..ae99cd8cff51a1 --- /dev/null +++ b/paddle/fluid/inference/tests/infer_ut/test_ppyolo_mbv3.cc @@ -0,0 +1,156 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
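+// Test outline for the new detection tests (this file, test_ppyolov2_r50vd.cc
+// and test_yolov3.cc): PrepareInput() builds synthetic "image", "im_shape" and
+// "scale_factor" tensors, a predictor with IR optimization disabled produces
+// the ground-truth outputs, and multi-threaded TensorRT or MKLDNN predictors
+// are then checked against it with CompareRecord(). Tolerances are loosened
+// (or the comparison skipped) where the synthetic inputs make precision
+// unstable; see the TODO notes below.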
+ +#include "test_suite.h" // NOLINT + +DEFINE_string(modeldir, "", "Directory of the inference model."); + +namespace paddle_infer { + +std::map PrepareInput(int batch_size) { + // init input data + int channel = 3; + int width = 320; + int height = 320; + paddle::test::Record image, im_shape, scale_factor; + int input_num = batch_size * channel * width * height; + int shape_num = batch_size * 2; + std::vector image_data(input_num, 1); + for (int i = 1; i < input_num + 1; ++i) { + image_data[i] = i % 10 * 0.5; + } + std::vector im_shape_data(shape_num, 1); + std::vector scale_factor_data(shape_num, 1); + + image.data = std::vector(image_data.begin(), image_data.end()); + image.shape = std::vector{batch_size, channel, width, height}; + image.type = paddle::PaddleDType::FLOAT32; + + im_shape.data = + std::vector(im_shape_data.begin(), im_shape_data.end()); + im_shape.shape = std::vector{batch_size, 2}; + im_shape.type = paddle::PaddleDType::FLOAT32; + + scale_factor.data = + std::vector(scale_factor_data.begin(), scale_factor_data.end()); + scale_factor.shape = std::vector{batch_size, 2}; + scale_factor.type = paddle::PaddleDType::FLOAT32; + + std::map input_data_map; + input_data_map.insert({"image", image}); + input_data_map.insert({"im_shape", im_shape}); + input_data_map.insert({"scale_factor", scale_factor}); + + return input_data_map; +} + +TEST(test_ppyolo_mbv3, multi_thread4_trt_fp32_bz2) { + int thread_num = 4; + // init input data + auto input_data_map = PrepareInput(2); + // init output data + std::map infer_output_data, + truth_output_data; + // prepare groudtruth config + paddle_infer::Config config, config_no_ir; + config_no_ir.SetModel(FLAGS_modeldir + "/model.pdmodel", + FLAGS_modeldir + "/model.pdiparams"); + config_no_ir.EnableUseGpu(100, 0); + config_no_ir.SwitchIrOptim(false); + // prepare inference config + config.SetModel(FLAGS_modeldir + "/model.pdmodel", + FLAGS_modeldir + "/model.pdiparams"); + config.EnableUseGpu(100, 0); + config.EnableTensorRtEngine( + 1 << 20, 2, 3, paddle_infer::PrecisionType::kFloat32, false, false); + LOG(INFO) << config.Summary(); + // get groudtruth by disbale ir + paddle_infer::services::PredictorPool pred_pool_no_ir(config_no_ir, 1); + SingleThreadPrediction(pred_pool_no_ir.Retrive(0), &input_data_map, + &truth_output_data, 1); + + // get infer results from multi threads + std::vector threads; + services::PredictorPool pred_pool(config, thread_num); + for (int i = 0; i < thread_num; ++i) { + threads.emplace_back(paddle::test::SingleThreadPrediction, + pred_pool.Retrive(i), &input_data_map, + &infer_output_data, 2); + } + + // thread join & check outputs + for (int i = 0; i < thread_num; ++i) { + LOG(INFO) << "join tid : " << i; + threads[i].join(); + CompareRecord(&truth_output_data, &infer_output_data, 1e-2); + // TODO(OliverLPH): precision set to 1e-2 since input is fake, change to + // real input later + } + + std::cout << "finish multi-thread test" << std::endl; +} + +TEST(DISABLED_test_ppyolo_mbv3, multi_thread4_mkl_bz2) { + // TODO(OliverLPH): mkldnn multi thread will fail + int thread_num = 4; + // init input data + auto input_data_map = PrepareInput(2); + // init output data + std::map infer_output_data, + truth_output_data; + // prepare groudtruth config + paddle_infer::Config config, config_no_ir; + config_no_ir.SetModel(FLAGS_modeldir + "/model.pdmodel", + FLAGS_modeldir + "/model.pdiparams"); + config_no_ir.DisableGpu(); + config_no_ir.SwitchIrOptim(false); + // prepare inference config + config.SetModel(FLAGS_modeldir + 
"/model.pdmodel", + FLAGS_modeldir + "/model.pdiparams"); + config.DisableGpu(); + config.EnableMKLDNN(); + config.SetMkldnnCacheCapacity(10); + config.SetCpuMathLibraryNumThreads(10); + LOG(INFO) << config.Summary(); + // get groudtruth by disbale ir + paddle_infer::services::PredictorPool pred_pool_no_ir(config_no_ir, 1); + SingleThreadPrediction(pred_pool_no_ir.Retrive(0), &input_data_map, + &truth_output_data, 1); + + // get infer results from multi threads + std::vector threads; + services::PredictorPool pred_pool(config, thread_num); + for (int i = 0; i < thread_num; ++i) { + threads.emplace_back(paddle::test::SingleThreadPrediction, + pred_pool.Retrive(i), &input_data_map, + &infer_output_data, 2); + } + + // thread join & check outputs + for (int i = 0; i < thread_num; ++i) { + LOG(INFO) << "join tid : " << i; + threads[i].join(); + CompareRecord(&truth_output_data, &infer_output_data, 1e-4); + } + + std::cout << "finish multi-thread test" << std::endl; +} + +} // namespace paddle_infer + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + ::google::ParseCommandLineFlags(&argc, &argv, true); + return RUN_ALL_TESTS(); +} diff --git a/paddle/fluid/inference/tests/infer_ut/test_ppyolov2_r50vd.cc b/paddle/fluid/inference/tests/infer_ut/test_ppyolov2_r50vd.cc new file mode 100644 index 00000000000000..b2cb4ca32238c6 --- /dev/null +++ b/paddle/fluid/inference/tests/infer_ut/test_ppyolov2_r50vd.cc @@ -0,0 +1,155 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "test_suite.h" // NOLINT + +DEFINE_string(modeldir, "", "Directory of the inference model."); + +namespace paddle_infer { + +std::map PrepareInput(int batch_size) { + // init input data + int channel = 3; + int width = 640; + int height = 640; + paddle::test::Record image, im_shape, scale_factor; + int input_num = batch_size * channel * width * height; + int shape_num = batch_size * 2; + std::vector image_data(input_num, 1); + for (int i = 1; i < input_num + 1; ++i) { + image_data[i] = i % 10 * 0.5; + } + std::vector im_shape_data(shape_num, 1); + std::vector scale_factor_data(shape_num, 1); + + image.data = std::vector(image_data.begin(), image_data.end()); + image.shape = std::vector{batch_size, channel, width, height}; + image.type = paddle::PaddleDType::FLOAT32; + + im_shape.data = + std::vector(im_shape_data.begin(), im_shape_data.end()); + im_shape.shape = std::vector{batch_size, 2}; + im_shape.type = paddle::PaddleDType::FLOAT32; + + scale_factor.data = + std::vector(scale_factor_data.begin(), scale_factor_data.end()); + scale_factor.shape = std::vector{batch_size, 2}; + scale_factor.type = paddle::PaddleDType::FLOAT32; + + std::map input_data_map; + input_data_map.insert({"image", image}); + input_data_map.insert({"im_shape", im_shape}); + input_data_map.insert({"scale_factor", scale_factor}); + + return input_data_map; +} + +TEST(test_ppyolov2_r50vd, multi_thread2_trt_fp32_bz1) { + int thread_num = 2; // thread > 2 may OOM + // init input data + auto input_data_map = PrepareInput(1); + // init output data + std::map infer_output_data, + truth_output_data; + // prepare groudtruth config + paddle_infer::Config config, config_no_ir; + config_no_ir.SetModel(FLAGS_modeldir + "/model.pdmodel", + FLAGS_modeldir + "/model.pdiparams"); + config_no_ir.EnableUseGpu(100, 0); + config_no_ir.SwitchIrOptim(false); + // prepare inference config + config.SetModel(FLAGS_modeldir + "/model.pdmodel", + FLAGS_modeldir + "/model.pdiparams"); + config.EnableUseGpu(100, 0); + config.EnableTensorRtEngine( + 1 << 20, 2, 10, paddle_infer::PrecisionType::kFloat32, false, false); + LOG(INFO) << config.Summary(); + // get groudtruth by disbale ir + paddle_infer::services::PredictorPool pred_pool_no_ir(config_no_ir, 1); + SingleThreadPrediction(pred_pool_no_ir.Retrive(0), &input_data_map, + &truth_output_data, 1); + + // get infer results from multi threads + std::vector threads; + services::PredictorPool pred_pool(config, thread_num); + for (int i = 0; i < thread_num; ++i) { + threads.emplace_back(paddle::test::SingleThreadPrediction, + pred_pool.Retrive(i), &input_data_map, + &infer_output_data, 2); + } + + // thread join & check outputs + for (int i = 0; i < thread_num; ++i) { + LOG(INFO) << "join tid : " << i; + threads[i].join(); + // CompareRecord(&truth_output_data, &infer_output_data, 1e-2); + // TODO(OliverLPH): disable comparison since precsion is low + } + + std::cout << "finish multi-thread test" << std::endl; +} + +TEST(test_ppyolov2_r50vd, multi_thread2_mkl_bz2) { + int thread_num = 2; + // init input data + auto input_data_map = PrepareInput(2); + // init output data + std::map infer_output_data, + truth_output_data; + // prepare groudtruth config + paddle_infer::Config config, config_no_ir; + config_no_ir.SetModel(FLAGS_modeldir + "/model.pdmodel", + FLAGS_modeldir + "/model.pdiparams"); + config_no_ir.DisableGpu(); + config_no_ir.SwitchIrOptim(false); + // prepare inference config + config.SetModel(FLAGS_modeldir + "/model.pdmodel", + FLAGS_modeldir + "/model.pdiparams"); + 
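+  // CPU inference path: GPU is disabled and oneDNN (MKLDNN) is turned on.
+  // Roughly speaking, SetMkldnnCacheCapacity bounds how many shape-specific
+  // oneDNN primitives are cached per predictor and SetCpuMathLibraryNumThreads
+  // sets the intra-op math-library thread count; the values here match the
+  // other new detection tests.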
config.DisableGpu(); + config.EnableMKLDNN(); + config.SetMkldnnCacheCapacity(10); + config.SetCpuMathLibraryNumThreads(10); + LOG(INFO) << config.Summary(); + // get groudtruth by disbale ir + paddle_infer::services::PredictorPool pred_pool_no_ir(config_no_ir, 1); + SingleThreadPrediction(pred_pool_no_ir.Retrive(0), &input_data_map, + &truth_output_data, 1); + + // get infer results from multi threads + std::vector threads; + services::PredictorPool pred_pool(config, thread_num); + for (int i = 0; i < thread_num; ++i) { + threads.emplace_back(paddle::test::SingleThreadPrediction, + pred_pool.Retrive(i), &input_data_map, + &infer_output_data, 2); + } + + // thread join & check outputs + for (int i = 0; i < thread_num; ++i) { + LOG(INFO) << "join tid : " << i; + threads[i].join(); + // CompareRecord(&truth_output_data, &infer_output_data, 1e-4); + // TODO(OliverLPH): disable comparison since precsion is low + } + + std::cout << "finish multi-thread test" << std::endl; +} + +} // namespace paddle_infer + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + ::google::ParseCommandLineFlags(&argc, &argv, true); + return RUN_ALL_TESTS(); +} diff --git a/paddle/fluid/inference/tests/infer_ut/test_resnet50.cc b/paddle/fluid/inference/tests/infer_ut/test_resnet50.cc index f497acc4b166ca..035bc3f34f3e47 100644 --- a/paddle/fluid/inference/tests/infer_ut/test_resnet50.cc +++ b/paddle/fluid/inference/tests/infer_ut/test_resnet50.cc @@ -170,6 +170,35 @@ TEST(test_resnet50, multi_thread4_trt_fp32_bz2) { std::cout << "finish multi-thread test" << std::endl; } +TEST(test_resnet50, trt_int8_bz2) { + // init input data + std::map my_input_data_map; + my_input_data_map["inputs"] = PrepareInput(2); + // init output data + std::map infer_output_data, + truth_output_data; + // prepare inference config + paddle_infer::Config config; + config.SetModel(FLAGS_modeldir + "/inference.pdmodel", + FLAGS_modeldir + "/inference.pdiparams"); + config.EnableUseGpu(100, 0); + config.EnableTensorRtEngine(1 << 20, 2, 3, paddle_infer::PrecisionType::kInt8, + true, true); + + // get first time prediction int8 results + paddle_infer::services::PredictorPool pred_pool(config, 1); + SingleThreadPrediction(pred_pool.Retrive(0), &my_input_data_map, + &truth_output_data, 1); + + // get repeat 5 times prediction int8 results + SingleThreadPrediction(pred_pool.Retrive(0), &my_input_data_map, + &infer_output_data, 5); + + // check outputs + CompareRecord(&truth_output_data, &infer_output_data); + std::cout << "finish test" << std::endl; +} + } // namespace paddle_infer int main(int argc, char** argv) { diff --git a/paddle/fluid/inference/tests/infer_ut/test_yolov3.cc b/paddle/fluid/inference/tests/infer_ut/test_yolov3.cc new file mode 100644 index 00000000000000..845bcbc5c5b5f8 --- /dev/null +++ b/paddle/fluid/inference/tests/infer_ut/test_yolov3.cc @@ -0,0 +1,155 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "test_suite.h" // NOLINT + +DEFINE_string(modeldir, "", "Directory of the inference model."); + +namespace paddle_infer { + +std::map PrepareInput(int batch_size) { + // init input data + int channel = 3; + int width = 608; + int height = 608; + paddle::test::Record image, im_shape, scale_factor; + int input_num = batch_size * channel * width * height; + int shape_num = batch_size * 2; + std::vector image_data(input_num, 1); + for (int i = 1; i < input_num + 1; ++i) { + image_data[i] = i % 10 * 0.5; + } + std::vector im_shape_data(shape_num, 1); + std::vector scale_factor_data(shape_num, 1); + + image.data = std::vector(image_data.begin(), image_data.end()); + image.shape = std::vector{batch_size, channel, width, height}; + image.type = paddle::PaddleDType::FLOAT32; + + im_shape.data = + std::vector(im_shape_data.begin(), im_shape_data.end()); + im_shape.shape = std::vector{batch_size, 2}; + im_shape.type = paddle::PaddleDType::FLOAT32; + + scale_factor.data = + std::vector(scale_factor_data.begin(), scale_factor_data.end()); + scale_factor.shape = std::vector{batch_size, 2}; + scale_factor.type = paddle::PaddleDType::FLOAT32; + + std::map input_data_map; + input_data_map.insert({"image", image}); + input_data_map.insert({"im_shape", im_shape}); + input_data_map.insert({"scale_factor", scale_factor}); + + return input_data_map; +} + +TEST(test_yolov3, multi_thread3_trt_fp32_bz2) { + int thread_num = 3; + // init input data + auto input_data_map = PrepareInput(2); + // init output data + std::map infer_output_data, + truth_output_data; + // prepare groudtruth config + paddle_infer::Config config, config_no_ir; + config_no_ir.SetModel(FLAGS_modeldir + "/model.pdmodel", + FLAGS_modeldir + "/model.pdiparams"); + config_no_ir.EnableUseGpu(100, 0); + config_no_ir.SwitchIrOptim(false); + // prepare inference config + config.SetModel(FLAGS_modeldir + "/model.pdmodel", + FLAGS_modeldir + "/model.pdiparams"); + config.EnableUseGpu(100, 0); + config.EnableTensorRtEngine( + 1 << 20, 2, 3, paddle_infer::PrecisionType::kFloat32, false, false); + LOG(INFO) << config.Summary(); + // get groudtruth by disbale ir + paddle_infer::services::PredictorPool pred_pool_no_ir(config_no_ir, 1); + SingleThreadPrediction(pred_pool_no_ir.Retrive(0), &input_data_map, + &truth_output_data, 1); + + // get infer results from multi threads + std::vector threads; + services::PredictorPool pred_pool(config, thread_num); + for (int i = 0; i < thread_num; ++i) { + threads.emplace_back(paddle::test::SingleThreadPrediction, + pred_pool.Retrive(i), &input_data_map, + &infer_output_data, 2); + } + + // thread join & check outputs + for (int i = 0; i < thread_num; ++i) { + LOG(INFO) << "join tid : " << i; + threads[i].join(); + CompareRecord(&truth_output_data, &infer_output_data, 1e-2); + // TODO(OliverLPH): precision set to 1e-2 since input is fake, change to + // real input later + } + + std::cout << "finish multi-thread test" << std::endl; +} + +TEST(test_yolov3, multi_thread4_mkl_bz2) { + int thread_num = 4; + // init input data + auto input_data_map = PrepareInput(2); + // init output data + std::map infer_output_data, + truth_output_data; + // prepare groudtruth config + paddle_infer::Config config, config_no_ir; + config_no_ir.SetModel(FLAGS_modeldir + "/model.pdmodel", + FLAGS_modeldir + "/model.pdiparams"); + config_no_ir.DisableGpu(); + config_no_ir.SwitchIrOptim(false); + // prepare inference config + config.SetModel(FLAGS_modeldir + "/model.pdmodel", + FLAGS_modeldir + "/model.pdiparams"); + 
config.DisableGpu(); + config.EnableMKLDNN(); + config.SetMkldnnCacheCapacity(10); + config.SetCpuMathLibraryNumThreads(10); + LOG(INFO) << config.Summary(); + // get groudtruth by disbale ir + paddle_infer::services::PredictorPool pred_pool_no_ir(config_no_ir, 1); + SingleThreadPrediction(pred_pool_no_ir.Retrive(0), &input_data_map, + &truth_output_data, 1); + + // get infer results from multi threads + std::vector threads; + services::PredictorPool pred_pool(config, thread_num); + for (int i = 0; i < thread_num; ++i) { + threads.emplace_back(paddle::test::SingleThreadPrediction, + pred_pool.Retrive(i), &input_data_map, + &infer_output_data, 2); + } + + // thread join & check outputs + for (int i = 0; i < thread_num; ++i) { + LOG(INFO) << "join tid : " << i; + threads[i].join(); + CompareRecord(&truth_output_data, &infer_output_data, 1e-4); + } + + std::cout << "finish multi-thread test" << std::endl; +} + +} // namespace paddle_infer + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + ::google::ParseCommandLineFlags(&argc, &argv, true); + return RUN_ALL_TESTS(); +} From 40d4d834b077408aa445879bb7bef94eaaf4577d Mon Sep 17 00:00:00 2001 From: wanghuancoder Date: Wed, 18 Aug 2021 18:31:44 +0800 Subject: [PATCH 094/126] code refactoring for new executor (#34970) * code refactoring, test=develop * refine, test=develop * refine, test=develop * refine, test=develop --- paddle/fluid/framework/CMakeLists.txt | 1 + paddle/fluid/framework/new_exec.h | 629 ------------------ paddle/fluid/framework/new_exec_test.cc | 88 --- .../framework/new_executor/CMakeLists.txt | 4 + .../framework/new_executor/interpretercore.cc | 471 +++++++++++++ .../framework/new_executor/interpretercore.h | 84 +++ .../interpretercore_util.h} | 2 +- .../new_executor/new_executor_defs.h | 79 +++ .../new_executor/standalone_executor.cc | 106 +++ .../new_executor/standalone_executor.h | 67 ++ .../new_executor/standalone_executor_test.cc | 64 ++ paddle/fluid/pybind/CMakeLists.txt | 1 + paddle/fluid/pybind/pybind.cc | 24 +- ...rpreter.py => test_standalone_executor.py} | 16 +- 14 files changed, 901 insertions(+), 735 deletions(-) delete mode 100644 paddle/fluid/framework/new_exec.h delete mode 100644 paddle/fluid/framework/new_exec_test.cc create mode 100644 paddle/fluid/framework/new_executor/CMakeLists.txt create mode 100644 paddle/fluid/framework/new_executor/interpretercore.cc create mode 100644 paddle/fluid/framework/new_executor/interpretercore.h rename paddle/fluid/framework/{new_exec_util.h => new_executor/interpretercore_util.h} (99%) create mode 100644 paddle/fluid/framework/new_executor/new_executor_defs.h create mode 100644 paddle/fluid/framework/new_executor/standalone_executor.cc create mode 100644 paddle/fluid/framework/new_executor/standalone_executor.h create mode 100644 paddle/fluid/framework/new_executor/standalone_executor_test.cc rename python/paddle/fluid/tests/unittests/interpreter/{test_interpreter.py => test_standalone_executor.py} (76%) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 6853b03c61288d..cf1b5c10bb5e50 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -25,6 +25,7 @@ add_subdirectory(ir) add_subdirectory(details) add_subdirectory(fleet) add_subdirectory(io) +add_subdirectory(new_executor) #ddim lib proto_library(framework_proto SRCS framework.proto) diff --git a/paddle/fluid/framework/new_exec.h b/paddle/fluid/framework/new_exec.h deleted file mode 100644 index 
defa7a967336b5..00000000000000 --- a/paddle/fluid/framework/new_exec.h +++ /dev/null @@ -1,629 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -#pragma once - -#include -#include - -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/executor_gc_helper.h" -#include "paddle/fluid/framework/garbage_collector.h" -#include "paddle/fluid/framework/new_exec_util.h" -#include "paddle/fluid/framework/op_info.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/framework/variable.h" -#include "paddle/fluid/framework/variable_helper.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/init.h" - -// USE_OP(fill_constant); -// USE_OP(elementwise_add); - -// using namespace std; - -namespace paddle { -namespace framework { - -using std::cerr; -using std::endl; - -using OpKernelComputeFunc = std::function; -using OpKernelMap = - std::unordered_map; - -framework::ProgramDesc load_from_file(const std::string& file_name) { - std::ifstream fin(file_name, std::ios::in | std::ios::binary); - fin.seekg(0, std::ios::end); - std::string buffer(fin.tellg(), ' '); - fin.seekg(0, std::ios::beg); - fin.read(&buffer[0], buffer.size()); - fin.close(); - - ProgramDesc program_desc(buffer); - return program_desc; -} - -struct OpKernelFunc { - OpKernelComputeFunc compute_func_; - OperatorBase* operator_base_; -}; - -struct VariableMetaInfo { - int var_ref_count_; -}; - -struct VariableScope { - std::vector var_list; - std::map name2id; - std::vector vec_meta_info_; -}; - -struct NextInstruction { - std::vector direct_run_; -}; - -struct EventInter {}; - -struct InstructionInfo { - std::vector dependecy_count_; -}; - -struct EventRun { - EventInter event_inter; - std::vector same_device_run_; - std::vector synchronized_run; -}; - -struct Instruction { - OpKernelFunc kernel_func_; - std::map> input_index_; - std::map> output_index_; - - std::vector gc_check_var_list; - NextInstruction next_instruction_; - std::vector vec_event_list_; -}; - -struct OpFuncNode { - // int unsed; - std::map> input_index; - std::map> output_index; - - OpKernelComputeFunc kernel_func_; -}; - -int convert(const platform::Place& place) { - if (is_cpu_place(place)) { - return 0; - } - if (is_gpu_place(place)) { - return 1; - } - - return -1; -} - -std::vector merge_vec(const std::vector& first, - const std::vector& second) { - std::vector out(first.size() + second.size()); - std::merge(first.begin(), first.end(), second.begin(), second.end(), - out.begin()); - - std::vector::iterator it; - it = std::unique(out.begin(), out.end()); - - out.resize(std::distance(out.begin(), it)); - - return out; -} - -void build_variable_outer_scope(const framework::ProgramDesc& pdesc, - VariableScope* var_scope, 
Scope* outer_scope) { - auto& global_block = pdesc.Block(0); - - for (auto& var : global_block.AllVars()) { - if (var->Name() == framework::kEmptyVarName) { - continue; - } - auto v = outer_scope->Var(var->Name()); - - if (var_scope->name2id.find(var->Name()) == var_scope->name2id.end()) { - var_scope->name2id[var->Name()] = var_scope->var_list.size(); - } - - InitializeVariable(v, var->GetType()); - var_scope->var_list.push_back(v); - } -} - -void build_variable_scope(const framework::ProgramDesc& pdesc, - VariableScope* var_scope) { - auto& global_block = pdesc.Block(0); - - for (auto& var : global_block.AllVars()) { - if (var->Name() == framework::kEmptyVarName) { - continue; - } - - if (var_scope->name2id.find(var->Name()) == var_scope->name2id.end()) { - var_scope->name2id[var->Name()] = var_scope->var_list.size(); - } - - auto v = new Variable(); - InitializeVariable(v, var->GetType()); - var_scope->var_list.push_back(v); - } -} - -void build_op_func_list(const framework::ProgramDesc& pdesc, - std::vector* op_list, - std::vector* vec_func_list, - VariableScope* var_scope, - const platform::Place& place) { - auto& global_block = pdesc.Block(0); - - for (auto& op : global_block.AllOps()) { - VLOG(3) << op->Type(); - // << op->Type() << endl; - - auto& info = OpInfoMap::Instance().Get(op->Type()); - - const VariableNameMap& inputs_names = op->Inputs(); - const VariableNameMap& outputs_names = op->Outputs(); - AttributeMap op_attr_map = op->GetAttrMap(); - - if (info.Checker() != nullptr) { - info.Checker()->Check(&op_attr_map); - } - auto op_base = - info.Creator()(op->Type(), inputs_names, outputs_names, op_attr_map); - - OpFuncNode op_func_node; - - VariableValueMap ins_map; - std::map> ins_name2id; - for (auto& var_name_item : inputs_names) { - std::vector input_vars; - std::vector vec_ids; - input_vars.reserve(var_name_item.second.size()); - for (auto& var_name : var_name_item.second) { - auto it = var_scope->name2id.find(var_name); - assert(it != var_scope->name2id.end()); - input_vars.push_back(var_scope->var_list[it->second]); - vec_ids.push_back(it->second); - } - ins_map[var_name_item.first] = input_vars; - ins_name2id[var_name_item.first] = vec_ids; - } - - VariableValueMap outs_map; - std::map> outs_name2id; - for (auto& var_name_item : outputs_names) { - std::vector output_vars; - std::vector vec_ids; - output_vars.reserve(var_name_item.second.size()); - for (auto& var_name : var_name_item.second) { - auto it = var_scope->name2id.find(var_name); - assert(it != var_scope->name2id.end()); - output_vars.push_back(var_scope->var_list[it->second]); - vec_ids.push_back(it->second); - } - outs_map[var_name_item.first] = output_vars; - outs_name2id[var_name_item.first] = vec_ids; - } - - op_func_node.input_index = ins_name2id; - op_func_node.output_index = outs_name2id; - RuntimeContext runtime_context({}, {}); - runtime_context.inputs.swap(ins_map); - runtime_context.outputs.swap(outs_map); - RuntimeInferShapeContext infer_shape_ctx(*op_base, runtime_context); - static_cast(op_base)->InferShape( - &infer_shape_ctx); - auto& all_op_kernels = OperatorWithKernel::AllOpKernels(); - auto kernels_iter = all_op_kernels.find(op->Type()); - PADDLE_ENFORCE_NE( - kernels_iter, all_op_kernels.end(), - platform::errors::Unavailable( - "There are no kernels which are registered in the %s operator.", - op->Type())); - - OpKernelMap& kernels = kernels_iter->second; - // auto place = platform::CPUPlace(); - // auto place = platform::CUDAPlace(0); - platform::DeviceContextPool& pool = 
platform::DeviceContextPool::Instance(); - auto* dev_ctx = pool.Get(place); - Scope scope; - auto exec_ctx = - ExecutionContext(*op_base, scope, *dev_ctx, runtime_context); - auto expected_kernel_key = - dynamic_cast(op_base) - ->GetExpectedKernelType(exec_ctx); - - VariableValueMap& ins_map_temp = runtime_context.inputs; - - for (auto& var_name_item : ins_map_temp) { - for (size_t i = 0; i < var_name_item.second.size(); ++i) { - auto var = var_name_item.second[i]; - auto tensor_in = static_cast(&(var->Get())); - if (!tensor_in->IsInitialized()) { - continue; - } - auto kernel_type_for_var = - static_cast(op_base) - ->GetKernelTypeForVar(var_name_item.first, *tensor_in, - expected_kernel_key); - if (!platform::is_same_place(kernel_type_for_var.place_, - expected_kernel_key.place_)) { - // need trans place - // 1. add var in scope - // 2. add copy op - std::string new_var_name = - "temp_1" + std::to_string(var_scope->var_list.size() + 1); - auto v = new Variable(); - v->GetMutable(); - var_scope->name2id[new_var_name] = var_scope->var_list.size(); - var_scope->var_list.push_back(v); - - VariableNameMap copy_in_map; - auto x_iter = inputs_names.find(var_name_item.first); - copy_in_map["X"] = {x_iter->second[i]}; - VariableNameMap copy_out_map; - copy_out_map["Out"] = {new_var_name}; - AttributeMap attr_map; - attr_map["dst_place_type"] = convert(place); - - std::map> copy_ins_name2id; - copy_ins_name2id["X"] = ins_name2id[var_name_item.first]; - std::map> copy_out_name2id; - copy_out_name2id["Out"] = {var_scope->name2id[new_var_name]}; - - op_func_node.input_index[var_name_item.first][i] = - var_scope->name2id[new_var_name]; - - VariableValueMap copy_ins_value_map; - copy_ins_value_map["X"] = {var}; - VariableValueMap copy_outs_value_map; - copy_outs_value_map["Out"] = {v}; - - auto& copy_info = OpInfoMap::Instance().Get("memcpy"); - auto copy_op = copy_info.Creator()("memcpy", copy_in_map, - copy_out_map, attr_map); - OpFuncNode copy_op_func_node; - copy_op_func_node.input_index = copy_ins_name2id; - copy_op_func_node.output_index = copy_out_name2id; - - RuntimeContext copy_runtime_context({}, {}); - copy_runtime_context.inputs.swap(copy_ins_value_map); - copy_runtime_context.outputs.swap(copy_outs_value_map); - RuntimeInferShapeContext copy_infer_shape_ctx(*copy_op, - copy_runtime_context); - static_cast(copy_op) - ->InferShape(©_infer_shape_ctx); - auto& all_op_kernels = OperatorWithKernel::AllOpKernels(); - auto kernels_iter = all_op_kernels.find("memcpy"); - PADDLE_ENFORCE_NE(kernels_iter, all_op_kernels.end(), - platform::errors::Unavailable( - "There are no kernels which are registered in " - "the memcpy operator.")); - - OpKernelMap& kernels = kernels_iter->second; - platform::DeviceContextPool& pool = - platform::DeviceContextPool::Instance(); - auto* dev_ctx = pool.Get(place); - Scope scope; - auto copy_exec_ctx = - ExecutionContext(*copy_op, scope, *dev_ctx, copy_runtime_context); - auto expected_kernel_key = - dynamic_cast(copy_op) - ->GetExpectedKernelType(copy_exec_ctx); - auto kernel_iter = kernels.find(expected_kernel_key); - copy_op_func_node.kernel_func_ = - OpKernelComputeFunc(kernel_iter->second); - copy_op_func_node.kernel_func_(copy_exec_ctx); - op_list->push_back(copy_op); - vec_func_list->push_back(copy_op_func_node); - - var_name_item.second[i] = v; - } - } - } - - op_list->push_back(op_base); - - auto kernel_iter = kernels.find(expected_kernel_key); - PADDLE_ENFORCE_NE(kernel_iter, kernels.end(), - platform::errors::NotFound( - "Operator (%s) does not have kernel 
for %s.", - op->Type(), KernelTypeToString(expected_kernel_key))); - - op_func_node.kernel_func_ = OpKernelComputeFunc(kernel_iter->second); - op_func_node.kernel_func_(exec_ctx); - vec_func_list->push_back(op_func_node); - } -} - -class InterpreterCore { - public: - InterpreterCore(const platform::Place& place, const ProgramDesc& prog, - const ProgramDesc& startup_prog, Scope* scope) - : place_(place), prog_(prog), outer_scope_(scope) { - paddle::framework::InitDevices(); - - is_build_ = false; - - if (outer_scope_ != nullptr) { - auto name_list = outer_scope_->LocalVarNames(); - for (auto name : name_list) { - auto v = outer_scope_->Var(name); - if (global_scope.name2id.find(name) == global_scope.name2id.end()) { - global_scope.name2id[name] = global_scope.var_list.size(); - } - - global_scope.var_list.push_back(v); - } - } - - paddle::framework::build_variable_outer_scope(startup_prog, &global_scope, - outer_scope_); - - std::vector vec_func_list; - std::vector op_list; - paddle::framework::build_op_func_list( - startup_prog, &op_list, &vec_func_list, &global_scope, place_); - // add variable to outer_scope - } - void run(const std::vector& vec_name, - const std::vector& vec_tensor, - const std::vector& vec_fetch_name, - std::vector* vec_out) { - if (is_build_ == false) { - paddle::framework::build_variable_scope(prog_, &global_scope); - } - for (size_t i = 0; i < vec_name.size(); ++i) { - auto it = global_scope.name2id.find(vec_name[i]); - assert(it != global_scope.name2id.end()); - - auto feed_tensor = - global_scope.var_list[it->second]->GetMutable(); - feed_tensor->ShareDataWith(vec_tensor[i]); - } - - if (is_build_ == false) { - paddle::framework::build_op_func_list(prog_, &op_list, &vec_func_list, - &global_scope, place_); - is_build_ = true; - // convert vec func_list to graph - convert(); - } else { - exec_instruction_list(vec_instruction_, global_scope, place_); - } - - for (size_t i = 0; i < vec_fetch_name.size(); ++i) { - auto it = global_scope.name2id.find(vec_fetch_name[i]); - assert(it != global_scope.name2id.end()); - PADDLE_ENFORCE_NE(it, global_scope.name2id.end(), - platform::errors::NotFound( - "Can't find (%d) the fetch var (%s) in scope", i, - vec_fetch_name[i])); - - auto fetch_tensor = - global_scope.var_list[it->second]->GetMutable(); - - if (platform::is_gpu_place(fetch_tensor->place())) { - Tensor out; - platform::DeviceContextPool& pool = - platform::DeviceContextPool::Instance(); - auto* dev_ctx = pool.Get(place_); - dev_ctx->Wait(); - TensorCopySync(*fetch_tensor, platform::CPUPlace(), &out); - dev_ctx->Wait(); - vec_out->push_back(out); - } else { - Tensor out; - TensorCopySync(*fetch_tensor, platform::CPUPlace(), &out); - vec_out->push_back(out); - } - } - } - - private: - void convert() { - input_var2op_info_.resize(global_scope.var_list.size()); - - vec_instruction_.reserve(vec_func_list.size()); - dependecy_count_.resize(vec_func_list.size()); - global_scope.vec_meta_info_.resize(global_scope.var_list.size()); - for (size_t i = 0; i < vec_func_list.size(); ++i) { - Instruction temp_inst; - temp_inst.kernel_func_.compute_func_ = vec_func_list[i].kernel_func_; - temp_inst.kernel_func_.operator_base_ = op_list[i]; - temp_inst.input_index_ = vec_func_list[i].input_index; - temp_inst.output_index_ = vec_func_list[i].output_index; - - std::vector gc_check_input_list; - for (auto& item : vec_func_list[i].input_index) { - for (auto id : item.second) { - input_var2op_info_[id].push_back(i); - gc_check_input_list.push_back(id); - } - } - 
std::sort(gc_check_input_list.begin(), gc_check_input_list.end()); - auto last = - std::unique(gc_check_input_list.begin(), gc_check_input_list.end()); - gc_check_input_list.erase(last, gc_check_input_list.end()); - for (auto var_id : gc_check_input_list) { - global_scope.vec_meta_info_[var_id].var_ref_count_++; - } - - temp_inst.gc_check_var_list.swap(gc_check_input_list); - - vec_instruction_.push_back(temp_inst); - } - - for (size_t i = 0; i < vec_instruction_.size(); ++i) { - std::vector vec_temp; - for (auto& item : vec_instruction_[i].output_index_) { - for (auto id : item.second) { - vec_temp = merge_vec(vec_temp, input_var2op_info_[id]); - } - } - - // In Program, op order is a very import information. - // Op can noly add op after it as next as next ops. - std::vector filter_next; - filter_next.reserve(vec_temp.size()); - for (auto item : vec_temp) { - if (item > i) { - filter_next.push_back(item); - } - } - vec_instruction_[i].next_instruction_.direct_run_ = filter_next; - - // checkout ouput - for (auto& item : vec_instruction_[i].output_index_) { - for (auto id : item.second) { - if (input_var2op_info_[id].size() == 0) { - // output var not be used by any kernel - vec_instruction_[i].gc_check_var_list.push_back(id); - global_scope.vec_meta_info_[id].var_ref_count_++; - } - } - } - - for (auto inst_id : filter_next) { - dependecy_count_[inst_id]++; - } - } - } - - void run_instr(const Instruction& instr_node, const VariableScope& var_scope, - const platform::Place& place) { - auto op_base = instr_node.kernel_func_.operator_base_; - // build runtime cost - VariableValueMap ins_map; - for (auto& var_name_item : instr_node.input_index_) { - std::vector input_vars; - - input_vars.reserve(var_name_item.second.size()); - for (auto& id : var_name_item.second) { - input_vars.emplace_back(var_scope.var_list[id]); - } - ins_map.emplace(var_name_item.first, std::move(input_vars)); - } - - VariableValueMap outs_map; - for (auto& var_name_item : instr_node.output_index_) { - std::vector out_vars; - - out_vars.reserve(var_name_item.second.size()); - for (auto& id : var_name_item.second) { - out_vars.emplace_back(var_scope.var_list[id]); - } - outs_map.emplace(var_name_item.first, std::move(out_vars)); - } - - RuntimeContext runtime_context({}, {}); - runtime_context.inputs.swap(ins_map); - runtime_context.outputs.swap(outs_map); - - RuntimeInferShapeContext infer_shape_ctx(*op_base, runtime_context); - - static_cast(op_base)->InferShape( - &infer_shape_ctx); - - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto* dev_ctx = pool.Get(place); - Scope scope; - - auto exec_context = - ExecutionContext(*op_base, scope, *dev_ctx, runtime_context); - - instr_node.kernel_func_.compute_func_(exec_context); - } - - void exec_instruction_list(const std::vector& vec_instr, - const VariableScope& var_scope, - const platform::Place& place) { - std::queue working_queue; - auto working_dependecy_count = dependecy_count_; - for (size_t i = 0; i < dependecy_count_.size(); ++i) { - if (dependecy_count_[i] == 0) { - working_queue.push(i); - } - } - - auto working_var_ref = global_scope.vec_meta_info_; - - size_t run_op_number = 0; - while (!working_queue.empty()) { - auto instr_id = working_queue.front(); - working_queue.pop(); - auto& instr_node = vec_instr[instr_id]; - run_instr(instr_node, var_scope, place); - - auto& next_instr = instr_node.next_instruction_.direct_run_; - ++run_op_number; - - for (auto next_i : next_instr) { - --working_dependecy_count[next_i]; - if 
(working_dependecy_count[next_i] == 0) { - working_queue.push(next_i); - } - } - - // GC infomation - - auto& gc_check_list = instr_node.gc_check_var_list; - for (auto var_id : gc_check_list) { - --working_var_ref[var_id].var_ref_count_; - } - } - - for (size_t i = 0; i < working_var_ref.size(); ++i) { - if (working_var_ref[i].var_ref_count_ != 0) { - cerr << " var ref is not zero " << i << endl; - } - } - } - - const platform::Place& place_; - const ProgramDesc& prog_; - paddle::framework::VariableScope global_scope; - std::vector vec_func_list; - std::vector op_list; - - bool is_build_; - - std::vector vec_instruction_; - - InstructionInfo instruction_info_; - - std::vector dependecy_count_; - std::vector ref_coun_info; - std::vector> input_var2op_info_; - - Scope* outer_scope_; -}; -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/new_exec_test.cc b/paddle/fluid/framework/new_exec_test.cc deleted file mode 100644 index 7bfb6b6540cff8..00000000000000 --- a/paddle/fluid/framework/new_exec_test.cc +++ /dev/null @@ -1,88 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include - -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/executor_gc_helper.h" -#include "paddle/fluid/framework/garbage_collector.h" -#include "paddle/fluid/framework/op_info.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/framework/variable.h" -#include "paddle/fluid/platform/device_context.h" - -#include "paddle/fluid/pybind/pybind.h" - -#include "gperftools/profiler.h" -#include "paddle/fluid/framework/new_exec.h" -#include "paddle/fluid/platform/init.h" - -int main() { - paddle::framework::InitDevices(); - paddle::framework::VariableScope global_scope; - auto place = paddle::platform::CUDAPlace(0); - auto test_prog = paddle::framework::load_from_file("lm_startup_program"); - { - paddle::framework::build_variable_scope(test_prog, &global_scope); - - std::vector vec_func_list; - std::vector op_list; - paddle::framework::build_op_func_list(test_prog, op_list, vec_func_list, - &global_scope, place); - - // paddle::framework::exec_op_func_list( vec_func_list, op_list, - // global_scope, place ); - } - - cerr << "run main" << endl; - auto main_prog = paddle::framework::load_from_file("lm_main_program"); - - paddle::framework::build_variable_scope(main_prog, &global_scope); - - std::vector vec_main_func_list; - std::vector op_main_list; - paddle::framework::build_op_func_list( - main_prog, op_main_list, vec_main_func_list, &global_scope, place); - paddle::framework::Scope scope; - paddle::framework::InterpreterCore interp_core(place, main_prog, test_prog, - &scope); - auto start = std::chrono::steady_clock::now(); - ProfilerStart("new_executor.prof"); - 
for (size_t i = 0; i < 2320; ++i) { - if (i % 200 == 0) { - cerr << i << endl; - } - // paddle::framework::exec_op_func_list( vec_main_func_list, op_main_list, - // global_scope, place ); - std::vector vec_out; - interp_core.run({}, {}, {}, vec_out); - } - ProfilerStop(); - auto end = std::chrono::steady_clock::now(); - std::chrono::duration diff = end - start; - - cerr << "time cost " << diff.count() << endl; - - return 1; -} diff --git a/paddle/fluid/framework/new_executor/CMakeLists.txt b/paddle/fluid/framework/new_executor/CMakeLists.txt new file mode 100644 index 00000000000000..80f9d343de0556 --- /dev/null +++ b/paddle/fluid/framework/new_executor/CMakeLists.txt @@ -0,0 +1,4 @@ +cc_library(interpretercore SRCS interpretercore.cc DEPS operator op_registry executor ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} ${PYBIND_DEPS} profiler) +cc_library(standalone_executor SRCS standalone_executor.cc DEPS interpretercore operator op_registry executor ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} ${PYBIND_DEPS} profiler) + +# cc_binary(standalone_executor_test SRCS standalone_executor_test.cc DEPS interpretercore standalone_executor operator op_registry executor ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} profiler) diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc new file mode 100644 index 00000000000000..7f6091742f02be --- /dev/null +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -0,0 +1,471 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
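+// InterpreterCore executes a ProgramDesc as a flat instruction list: the first
+// Run() builds the variable scope and the per-op kernel list (BuildVariableScope
+// and BuildOpFuncList), Convert() then derives Instructions with dependency
+// counts and GC-check variable lists, and later Run() calls only feed inputs,
+// replay ExecuteInstructionList() and fetch outputs.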
+#include "paddle/fluid/framework/new_executor/interpretercore.h" + +namespace paddle { +namespace framework { + +InterpreterCore::InterpreterCore(const platform::Place& place, + const ProgramDesc& main_prog, + VariableScope* global_scope, + const std::vector& feed_names, + const std::vector& fetch_names) + : place_(place), main_program_(main_prog), global_scope_(global_scope) { + is_build_ = false; + feed_names_ = feed_names; + fetch_names_ = fetch_names; + // add feedop and fetchop to main_program + + // prune + + // optmize graph pass + + // convert to run graph +} + +void InterpreterCore::Run(const std::vector& feed_tensors, + std::vector* fetch_tensors) { + if (is_build_ == false) { + BuildVariableScope(main_program_, global_scope_); + } + for (size_t i = 0; i < feed_names_.size(); ++i) { + auto it = global_scope_->name2id.find(feed_names_[i]); + assert(it != global_scope_->name2id.end()); + + auto feed_tensor = + global_scope_->var_list[it->second]->GetMutable(); + feed_tensor->ShareDataWith(feed_tensors[i]); + } + + if (is_build_ == false) { + BuildOpFuncList(place_, main_program_, &op_list_, &vec_func_list_, + global_scope_); + is_build_ = true; + // convert vec func_list to graph + Convert(); + } else { + ExecuteInstructionList(vec_instruction_, *global_scope_, place_); + } + + for (size_t i = 0; i < fetch_names_.size(); ++i) { + auto it = global_scope_->name2id.find(fetch_names_[i]); + assert(it != global_scope_->name2id.end()); + PADDLE_ENFORCE_NE( + it, global_scope_->name2id.end(), + platform::errors::NotFound( + "Can't find (%d) the fetch var (%s) in scope", i, fetch_names_[i])); + + auto fetch_tensor = + global_scope_->var_list[it->second]->GetMutable(); + + if (platform::is_gpu_place(fetch_tensor->place())) { + Tensor out; + platform::DeviceContextPool& pool = + platform::DeviceContextPool::Instance(); + auto* dev_ctx = pool.Get(place_); + dev_ctx->Wait(); + TensorCopySync(*fetch_tensor, platform::CPUPlace(), &out); + dev_ctx->Wait(); + fetch_tensors->push_back(out); + } else { + Tensor out; + TensorCopySync(*fetch_tensor, platform::CPUPlace(), &out); + fetch_tensors->push_back(out); + } + } +} + +void InterpreterCore::Convert() { + input_var2op_info_.resize(global_scope_->var_list.size()); + + vec_instruction_.reserve(vec_func_list_.size()); + dependecy_count_.resize(vec_func_list_.size()); + vec_meta_info_.resize(global_scope_->var_list.size()); + for (size_t i = 0; i < vec_func_list_.size(); ++i) { + Instruction temp_inst; + temp_inst.kernel_func_.compute_func_ = vec_func_list_[i].kernel_func_; + temp_inst.kernel_func_.operator_base_ = op_list_[i]; + temp_inst.input_index_ = vec_func_list_[i].input_index; + temp_inst.output_index_ = vec_func_list_[i].output_index; + + std::vector gc_check_input_list; + for (auto& item : vec_func_list_[i].input_index) { + for (auto id : item.second) { + input_var2op_info_[id].push_back(i); + gc_check_input_list.push_back(id); + } + } + std::sort(gc_check_input_list.begin(), gc_check_input_list.end()); + auto last = + std::unique(gc_check_input_list.begin(), gc_check_input_list.end()); + gc_check_input_list.erase(last, gc_check_input_list.end()); + for (auto var_id : gc_check_input_list) { + vec_meta_info_[var_id].var_ref_count_++; + } + + temp_inst.gc_check_var_list.swap(gc_check_input_list); + + vec_instruction_.push_back(temp_inst); + } + + for (size_t i = 0; i < vec_instruction_.size(); ++i) { + std::vector vec_temp; + for (auto& item : vec_instruction_[i].output_index_) { + for (auto id : item.second) { + vec_temp = 
MergeVector(vec_temp, input_var2op_info_[id]); + } + } + + // In Program, op order is a very import information. + // Op can noly add op after it as next as next ops. + std::vector filter_next; + filter_next.reserve(vec_temp.size()); + for (auto item : vec_temp) { + if (item > i) { + filter_next.push_back(item); + } + } + vec_instruction_[i].next_instruction_.direct_run_ = filter_next; + + // checkout ouput + for (auto& item : vec_instruction_[i].output_index_) { + for (auto id : item.second) { + if (input_var2op_info_[id].size() == 0) { + // output var not be used by any kernel + vec_instruction_[i].gc_check_var_list.push_back(id); + vec_meta_info_[id].var_ref_count_++; + } + } + } + + for (auto inst_id : filter_next) { + dependecy_count_[inst_id]++; + } + } +} + +void InterpreterCore::RunInstruction(const Instruction& instr_node, + const VariableScope& var_scope, + const platform::Place& place) { + auto op_base = instr_node.kernel_func_.operator_base_; + // build runtime cost + VariableValueMap ins_map; + for (auto& var_name_item : instr_node.input_index_) { + std::vector input_vars; + + input_vars.reserve(var_name_item.second.size()); + for (auto& id : var_name_item.second) { + input_vars.emplace_back(var_scope.var_list[id]); + } + ins_map.emplace(var_name_item.first, std::move(input_vars)); + } + + VariableValueMap outs_map; + for (auto& var_name_item : instr_node.output_index_) { + std::vector out_vars; + + out_vars.reserve(var_name_item.second.size()); + for (auto& id : var_name_item.second) { + out_vars.emplace_back(var_scope.var_list[id]); + } + outs_map.emplace(var_name_item.first, std::move(out_vars)); + } + + RuntimeContext runtime_context({}, {}); + runtime_context.inputs.swap(ins_map); + runtime_context.outputs.swap(outs_map); + + RuntimeInferShapeContext infer_shape_ctx(*op_base, runtime_context); + + static_cast(op_base)->InferShape( + &infer_shape_ctx); + + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto* dev_ctx = pool.Get(place); + Scope scope; + + auto exec_context = + ExecutionContext(*op_base, scope, *dev_ctx, runtime_context); + + instr_node.kernel_func_.compute_func_(exec_context); +} + +void InterpreterCore::ExecuteInstructionList( + const std::vector& vec_instr, const VariableScope& var_scope, + const platform::Place& place) { + std::queue working_queue; + auto working_dependecy_count = dependecy_count_; + for (size_t i = 0; i < dependecy_count_.size(); ++i) { + if (dependecy_count_[i] == 0) { + working_queue.push(i); + } + } + + auto working_var_ref = vec_meta_info_; + + size_t run_op_number = 0; + while (!working_queue.empty()) { + auto instr_id = working_queue.front(); + working_queue.pop(); + auto& instr_node = vec_instr[instr_id]; + RunInstruction(instr_node, var_scope, place); + + auto& next_instr = instr_node.next_instruction_.direct_run_; + ++run_op_number; + + for (auto next_i : next_instr) { + --working_dependecy_count[next_i]; + if (working_dependecy_count[next_i] == 0) { + working_queue.push(next_i); + } + } + + // GC infomation + + auto& gc_check_list = instr_node.gc_check_var_list; + for (auto var_id : gc_check_list) { + --working_var_ref[var_id].var_ref_count_; + } + } + + for (size_t i = 0; i < working_var_ref.size(); ++i) { + if (working_var_ref[i].var_ref_count_ != 0) { + std::cerr << " var ref is not zero " << i << std::endl; + } + } +} + +std::vector InterpreterCore::MergeVector( + const std::vector& first, const std::vector& second) { + std::vector out(first.size() + second.size()); + 
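+  // std::merge expects both id lists to be sorted; std::unique below then
+  // drops the duplicates so every op index appears at most once in the result.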
std::merge(first.begin(), first.end(), second.begin(), second.end(), + out.begin()); + + std::vector::iterator it; + it = std::unique(out.begin(), out.end()); + + out.resize(std::distance(out.begin(), it)); + + return out; +} + +void InterpreterCore::BuildVariableScope(const framework::ProgramDesc& pdesc, + VariableScope* var_scope) { + auto& global_block = pdesc.Block(0); + + for (auto& var : global_block.AllVars()) { + if (var->Name() == framework::kEmptyVarName) { + continue; + } + + if (var_scope->name2id.find(var->Name()) == var_scope->name2id.end()) { + var_scope->name2id[var->Name()] = var_scope->var_list.size(); + auto v = new Variable(); + InitializeVariable(v, var->GetType()); + var_scope->var_list.push_back(v); + } + } +} + +void InterpreterCore::BuildOpFuncList(const platform::Place& place, + const framework::ProgramDesc& pdesc, + std::vector* op_list, + std::vector* vec_func_list, + VariableScope* var_scope) { + auto& global_block = pdesc.Block(0); + + for (auto& op : global_block.AllOps()) { + VLOG(3) << op->Type(); + // << op->Type() << endl; + + auto& info = OpInfoMap::Instance().Get(op->Type()); + + const VariableNameMap& inputs_names = op->Inputs(); + const VariableNameMap& outputs_names = op->Outputs(); + AttributeMap op_attr_map = op->GetAttrMap(); + + if (info.Checker() != nullptr) { + info.Checker()->Check(&op_attr_map); + } + auto op_base = + info.Creator()(op->Type(), inputs_names, outputs_names, op_attr_map); + + OpFuncNode op_func_node; + + VariableValueMap ins_map; + std::map> ins_name2id; + for (auto& var_name_item : inputs_names) { + std::vector input_vars; + std::vector vec_ids; + input_vars.reserve(var_name_item.second.size()); + for (auto& var_name : var_name_item.second) { + auto it = var_scope->name2id.find(var_name); + assert(it != var_scope->name2id.end()); + input_vars.push_back(var_scope->var_list[it->second]); + vec_ids.push_back(it->second); + } + ins_map[var_name_item.first] = input_vars; + ins_name2id[var_name_item.first] = vec_ids; + } + + VariableValueMap outs_map; + std::map> outs_name2id; + for (auto& var_name_item : outputs_names) { + std::vector output_vars; + std::vector vec_ids; + output_vars.reserve(var_name_item.second.size()); + for (auto& var_name : var_name_item.second) { + auto it = var_scope->name2id.find(var_name); + assert(it != var_scope->name2id.end()); + output_vars.push_back(var_scope->var_list[it->second]); + vec_ids.push_back(it->second); + } + outs_map[var_name_item.first] = output_vars; + outs_name2id[var_name_item.first] = vec_ids; + } + + op_func_node.input_index = ins_name2id; + op_func_node.output_index = outs_name2id; + RuntimeContext runtime_context({}, {}); + runtime_context.inputs.swap(ins_map); + runtime_context.outputs.swap(outs_map); + RuntimeInferShapeContext infer_shape_ctx(*op_base, runtime_context); + static_cast(op_base)->InferShape( + &infer_shape_ctx); + auto& all_op_kernels = OperatorWithKernel::AllOpKernels(); + auto kernels_iter = all_op_kernels.find(op->Type()); + PADDLE_ENFORCE_NE( + kernels_iter, all_op_kernels.end(), + platform::errors::Unavailable( + "There are no kernels which are registered in the %s operator.", + op->Type())); + + OpKernelMap& kernels = kernels_iter->second; + // auto place = platform::CPUPlace(); + // auto place = platform::CUDAPlace(0); + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto* dev_ctx = pool.Get(place); + Scope scope; + auto exec_ctx = + ExecutionContext(*op_base, scope, *dev_ctx, runtime_context); + auto expected_kernel_key = + 
dynamic_cast(op_base) + ->GetExpectedKernelType(exec_ctx); + + VariableValueMap& ins_map_temp = runtime_context.inputs; + + for (auto& var_name_item : ins_map_temp) { + for (size_t i = 0; i < var_name_item.second.size(); ++i) { + auto var = var_name_item.second[i]; + auto tensor_in = static_cast(&(var->Get())); + if (!tensor_in->IsInitialized()) { + continue; + } + auto kernel_type_for_var = + static_cast(op_base) + ->GetKernelTypeForVar(var_name_item.first, *tensor_in, + expected_kernel_key); + if (!platform::is_same_place(kernel_type_for_var.place_, + expected_kernel_key.place_)) { + // need trans place + // 1. add var in scope + // 2. add copy op + std::string new_var_name = + "temp_1" + std::to_string(var_scope->var_list.size() + 1); + auto v = new Variable(); + v->GetMutable(); + var_scope->name2id[new_var_name] = var_scope->var_list.size(); + var_scope->var_list.push_back(v); + + VariableNameMap copy_in_map; + auto x_iter = inputs_names.find(var_name_item.first); + copy_in_map["X"] = {x_iter->second[i]}; + VariableNameMap copy_out_map; + copy_out_map["Out"] = {new_var_name}; + AttributeMap attr_map; + attr_map["dst_place_type"] = + is_cpu_place(place) ? 0 : is_gpu_place(place) ? 1 : -1; + + std::map> copy_ins_name2id; + copy_ins_name2id["X"] = ins_name2id[var_name_item.first]; + std::map> copy_out_name2id; + copy_out_name2id["Out"] = {var_scope->name2id[new_var_name]}; + + op_func_node.input_index[var_name_item.first][i] = + var_scope->name2id[new_var_name]; + + VariableValueMap copy_ins_value_map; + copy_ins_value_map["X"] = {var}; + VariableValueMap copy_outs_value_map; + copy_outs_value_map["Out"] = {v}; + + auto& copy_info = OpInfoMap::Instance().Get("memcpy"); + auto copy_op = copy_info.Creator()("memcpy", copy_in_map, + copy_out_map, attr_map); + OpFuncNode copy_op_func_node; + copy_op_func_node.input_index = copy_ins_name2id; + copy_op_func_node.output_index = copy_out_name2id; + + RuntimeContext copy_runtime_context({}, {}); + copy_runtime_context.inputs.swap(copy_ins_value_map); + copy_runtime_context.outputs.swap(copy_outs_value_map); + RuntimeInferShapeContext copy_infer_shape_ctx(*copy_op, + copy_runtime_context); + static_cast(copy_op) + ->InferShape(©_infer_shape_ctx); + auto& all_op_kernels = OperatorWithKernel::AllOpKernels(); + auto kernels_iter = all_op_kernels.find("memcpy"); + PADDLE_ENFORCE_NE(kernels_iter, all_op_kernels.end(), + platform::errors::Unavailable( + "There are no kernels which are registered in " + "the memcpy operator.")); + + OpKernelMap& kernels = kernels_iter->second; + platform::DeviceContextPool& pool = + platform::DeviceContextPool::Instance(); + auto* dev_ctx = pool.Get(place); + Scope scope; + auto copy_exec_ctx = + ExecutionContext(*copy_op, scope, *dev_ctx, copy_runtime_context); + auto expected_kernel_key = + dynamic_cast(copy_op) + ->GetExpectedKernelType(copy_exec_ctx); + auto kernel_iter = kernels.find(expected_kernel_key); + copy_op_func_node.kernel_func_ = + OpKernelComputeFunc(kernel_iter->second); + copy_op_func_node.kernel_func_(copy_exec_ctx); + op_list->push_back(copy_op); + vec_func_list->push_back(copy_op_func_node); + + var_name_item.second[i] = v; + } + } + } + + op_list->push_back(op_base); + + auto kernel_iter = kernels.find(expected_kernel_key); + PADDLE_ENFORCE_NE(kernel_iter, kernels.end(), + platform::errors::NotFound( + "Operator (%s) does not have kernel for %s.", + op->Type(), KernelTypeToString(expected_kernel_key))); + + op_func_node.kernel_func_ = OpKernelComputeFunc(kernel_iter->second); + 
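+    // BuildOpFuncList is an eager build pass: besides caching the chosen
+    // kernel in op_func_node, it runs the kernel once right here (as it also
+    // does for any inserted memcpy op), so building the function list
+    // executes the program on the first pass.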
op_func_node.kernel_func_(exec_ctx); + vec_func_list->push_back(op_func_node); + } +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/new_executor/interpretercore.h b/paddle/fluid/framework/new_executor/interpretercore.h new file mode 100644 index 00000000000000..4d3369c8947419 --- /dev/null +++ b/paddle/fluid/framework/new_executor/interpretercore.h @@ -0,0 +1,84 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/new_executor/interpretercore_util.h" +#include "paddle/fluid/framework/new_executor/new_executor_defs.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/variable.h" + +namespace paddle { +namespace framework { + +class InterpreterCore { + public: + InterpreterCore(const platform::Place& place, const ProgramDesc& main_prog, + VariableScope* global_scope, + const std::vector& feed_names, + const std::vector& fetch_names); + + void Run(const std::vector& feed_tensors, + std::vector* fetch_tensors); + + static void BuildOpFuncList(const platform::Place& place, + const framework::ProgramDesc& pdesc, + std::vector* op_list, + std::vector* vec_func_list, + VariableScope* var_scope); + + private: + void Convert(); + + void RunInstruction(const Instruction& instr_node, + const VariableScope& var_scope, + const platform::Place& place); + + void ExecuteInstructionList(const std::vector& vec_instr, + const VariableScope& var_scope, + const platform::Place& place); + + std::vector MergeVector(const std::vector& first, + const std::vector& second); + + void BuildVariableScope(const framework::ProgramDesc& pdesc, + VariableScope* var_scope); + + const platform::Place& place_; + const ProgramDesc& main_program_; + VariableScope* global_scope_; + std::vector vec_meta_info_; + + std::vector vec_func_list_; + std::vector op_list_; + + std::vector vec_instruction_; + InstructionInfo instruction_info_; + std::vector dependecy_count_; + std::vector ref_coun_info_; + std::vector> input_var2op_info_; + + bool is_build_; + + std::vector feed_names_; + std::vector fetch_names_; +}; +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/new_exec_util.h b/paddle/fluid/framework/new_executor/interpretercore_util.h similarity index 99% rename from paddle/fluid/framework/new_exec_util.h rename to paddle/fluid/framework/new_executor/interpretercore_util.h index 1783b9be74becf..e6651f38d91273 100644 --- a/paddle/fluid/framework/new_exec_util.h +++ b/paddle/fluid/framework/new_executor/interpretercore_util.h @@ -13,7 +13,7 @@ // limitations under the License. 
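A rough usage sketch of the InterpreterCore declared above; the names "a" and
"c", plus `place`, `main_prog`, and the feed/fetch tensor vectors, are
placeholders and not part of the patch:

    paddle::framework::VariableScope scope;
    paddle::framework::InterpreterCore core(place, main_prog, &scope,
                                            /*feed_names=*/{"a"},
                                            /*fetch_names=*/{"c"});
    // feed_tensors / fetch_tensors use whatever element type Run() expects
    core.Run(feed_tensors, &fetch_tensors);  // the first call is expected to
                                             // build the op list (is_build_);
                                             // later calls reuse it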
/************************************************************************* - > File Name: new_exec_util.h + > File Name: interpretercore_util.h > Author: guanshanshan@baidu.com > Created Time: Fri 23 Jul 2021 06:19:19 AM UTC ************************************************************************/ diff --git a/paddle/fluid/framework/new_executor/new_executor_defs.h b/paddle/fluid/framework/new_executor/new_executor_defs.h new file mode 100644 index 00000000000000..fb8a96aaca403d --- /dev/null +++ b/paddle/fluid/framework/new_executor/new_executor_defs.h @@ -0,0 +1,79 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include +#include +#include +#include + +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace framework { + +using OpKernelComputeFunc = std::function; +using OpKernelMap = + std::unordered_map; + +struct OpKernelFunc { + OpKernelComputeFunc compute_func_; + OperatorBase* operator_base_; +}; + +struct VariableMetaInfo { + int var_ref_count_; +}; + +struct VariableScope { + std::vector var_list; + std::map name2id; +}; + +struct NextInstruction { + std::vector direct_run_; +}; + +struct EventInter {}; + +struct InstructionInfo { + std::vector dependecy_count_; +}; + +struct EventRun { + EventInter event_inter; + std::vector same_device_run_; + std::vector synchronized_run; +}; + +struct Instruction { + OpKernelFunc kernel_func_; + std::map> input_index_; + std::map> output_index_; + + std::vector gc_check_var_list; + NextInstruction next_instruction_; + std::vector vec_event_list_; +}; + +struct OpFuncNode { + // int unsed; + std::map> input_index; + std::map> output_index; + + OpKernelComputeFunc kernel_func_; +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/new_executor/standalone_executor.cc b/paddle/fluid/framework/new_executor/standalone_executor.cc new file mode 100644 index 00000000000000..c312195feb5180 --- /dev/null +++ b/paddle/fluid/framework/new_executor/standalone_executor.cc @@ -0,0 +1,106 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
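+// The executor-side structs from new_executor_defs.h fit together roughly as
+// follows: VariableScope::name2id maps a variable name to an index into
+// VariableScope::var_list; Instruction::input_index_/output_index_ map a
+// kernel slot name such as "X" or "Out" to those variable indices; and
+// NextInstruction::direct_run_ lists downstream instruction ids, so the
+// interpreter can schedule and garbage-collect by integer id instead of name.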
+#include "paddle/fluid/framework/new_executor/standalone_executor.h" + +namespace paddle { +namespace framework { +StandaloneExecutor::StandaloneExecutor(const platform::Place& place, + const ProgramDesc& startup_prog, + const ProgramDesc& main_prog, + Scope* scope) + : place_(place), + startup_prog_(startup_prog), + main_prog_(main_prog), + outer_scope_(scope) { + paddle::framework::InitDevices(); + + // init scope + BuildVariableOuterScope(startup_prog, &global_scope_, scope); + + if (outer_scope_ != nullptr) { + auto name_list = outer_scope_->LocalVarNames(); + for (auto name : name_list) { + auto v = outer_scope_->Var(name); + if (global_scope_.name2id.find(name) == global_scope_.name2id.end()) { + global_scope_.name2id[name] = global_scope_.var_list.size(); + } + + global_scope_.var_list.push_back(v); + } + } + + // run startup program + std::vector vec_func_list; + std::vector op_list; + InterpreterCore::BuildOpFuncList(place_, startup_prog, &op_list, + &vec_func_list, &global_scope_); +} + +int StandaloneExecutor::Run(const std::vector& feed_names, + const std::vector& feed_tensors, + const std::vector& fetch_names, + std::vector* fetch_tensors) { + auto core = GetInterpreterCore(feed_names, fetch_names); + + core->Run(feed_tensors, fetch_tensors); + + return 0; +} + +void StandaloneExecutor::BuildVariableOuterScope( + const framework::ProgramDesc& pdesc, VariableScope* var_scope, + Scope* outer_scope) { + auto& global_block = pdesc.Block(0); + + for (auto& var : global_block.AllVars()) { + if (var->Name() == framework::kEmptyVarName) { + continue; + } + + if (var_scope->name2id.find(var->Name()) == var_scope->name2id.end()) { + var_scope->name2id[var->Name()] = var_scope->var_list.size(); + auto v = outer_scope->Var(var->Name()); + InitializeVariable(v, var->GetType()); + var_scope->var_list.push_back(v); + } + } +} + +std::shared_ptr StandaloneExecutor::GetInterpreterCore( + const std::vector& feed_names, + const std::vector& fetch_names) { + std::ostringstream oss; + oss << "feed:"; + for (auto& feedname : feed_names) { + oss << feedname << ","; + } + oss << "fetch:"; + for (auto& fetchname : fetch_names) { + oss << fetchname << ","; + } + + auto iter = interpretercores_.find(oss.str()); + + if (iter == interpretercores_.end()) { + auto core = std::make_shared( + place_, main_prog_, &global_scope_, feed_names, fetch_names); + interpretercores_.emplace(oss.str(), core); + return core; + } else { + return iter->second; + } +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/new_executor/standalone_executor.h b/paddle/fluid/framework/new_executor/standalone_executor.h new file mode 100644 index 00000000000000..8526f64c6bcfbc --- /dev/null +++ b/paddle/fluid/framework/new_executor/standalone_executor.h @@ -0,0 +1,67 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
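+// StandaloneExecutor keys its InterpreterCore cache on the concatenated
+// feed/fetch names, so a hypothetical call
+//   exec.Run({"a"}, feed_tensors, {"c"}, &fetch_tensors);
+// builds one core under the key "feed:a,fetch:c," on the first run and
+// reuses it on every later run with the same signature.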
+#pragma once + +#include +#include +#include +#include + +#include "paddle/fluid/framework/new_executor/interpretercore.h" + +namespace paddle { +namespace framework { + +class ExecutorBase { + public: + virtual ~ExecutorBase() {} + virtual int Run(const std::vector& feed_names, + const std::vector& feed_tensors, + const std::vector& fetch_names, + std::vector* fetch_tensors) = 0; +}; + +class StandaloneExecutor : public ExecutorBase { + public: + StandaloneExecutor(const platform::Place& place, + const ProgramDesc& startup_prog, + const ProgramDesc& main_prog, Scope* scope); + + ~StandaloneExecutor() {} + + virtual int Run(const std::vector& feed_names, + const std::vector& feed_tensors, + const std::vector& fetch_names, + std::vector* fetch_tensors); + + private: + void BuildVariableOuterScope(const framework::ProgramDesc& pdesc, + VariableScope* var_scope, Scope* outer_scope); + + std::shared_ptr GetInterpreterCore( + const std::vector& feed_names, + const std::vector& fetch_names); + + const platform::Place& place_; + const ProgramDesc& startup_prog_; + const ProgramDesc& main_prog_; + Scope* outer_scope_; + VariableScope global_scope_; + + std::unordered_map> + interpretercores_; +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/new_executor/standalone_executor_test.cc b/paddle/fluid/framework/new_executor/standalone_executor_test.cc new file mode 100644 index 00000000000000..9e831147903674 --- /dev/null +++ b/paddle/fluid/framework/new_executor/standalone_executor_test.cc @@ -0,0 +1,64 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
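+// load_from_file below deserializes a ProgramDesc from protobuf bytes saved
+// from Python; the benchmark loop then runs with empty feed/fetch lists, so
+// GetInterpreterCore caches a single core under the key "feed:fetch:".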
+ +#include +#include + +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/new_executor/standalone_executor.h" + +paddle::framework::ProgramDesc load_from_file(const std::string& file_name) { + std::ifstream fin(file_name, std::ios::in | std::ios::binary); + fin.seekg(0, std::ios::end); + std::string buffer(fin.tellg(), ' '); + fin.seekg(0, std::ios::beg); + fin.read(&buffer[0], buffer.size()); + fin.close(); + + paddle::framework::ProgramDesc program_desc(buffer); + return program_desc; +} + +int main() { + paddle::framework::InitDevices(); + auto place = paddle::platform::CUDAPlace(0); + auto test_prog = load_from_file("lm_startup_program"); + + auto main_prog = load_from_file("lm_main_program"); + + paddle::framework::Scope scope; + paddle::framework::StandaloneExecutor exec(place, test_prog, main_prog, + &scope); + + auto start = std::chrono::steady_clock::now(); + for (size_t i = 0; i < 2320; ++i) { + if (i % 200 == 0) { + std::cout << i << std::endl; + } + + std::vector vec_out; + exec.Run({}, {}, {}, &vec_out); + } + auto end = std::chrono::steady_clock::now(); + std::chrono::duration diff = end - start; + + std::cout << "time cost " << diff.count() << std::endl; + + return 1; +} diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index b8774f429632e2..ca0ed68a13f2fa 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -200,6 +200,7 @@ if(WITH_PYTHON) endif(WIN32) add_custom_target(op_function_generator_cmd ALL DEPENDS ${impl_file}) + list(APPEND PYBIND_DEPS interpretercore standalone_executor) cc_library(paddle_pybind SHARED SRCS ${PYBIND_SRCS} DEPS ${PYBIND_DEPS} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS}) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 235a06833fc675..426b539e80c76c 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -42,7 +42,7 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_rank_table.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor_array.h" -#include "paddle/fluid/framework/new_exec.h" +#include "paddle/fluid/framework/new_executor/standalone_executor.h" #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" @@ -1945,30 +1945,30 @@ All parameter, weight, gradient are variables in Paddle. 
fetch_vars); }); - py::class_(m, "InterpreterCore") + py::class_(m, "StandaloneExecutor") .def(py::init()) .def("run", - [](InterpreterCore &self, + [](StandaloneExecutor &self, const std::unordered_map &input_dict, - std::vector vec_fetch_name) { + std::vector fetch_names) { pybind11::gil_scoped_release release; - std::vector vec_tensor; - std::vector vec_name; + std::vector feed_tensors; + std::vector feed_names; for (auto &item : input_dict) { framework::LoDTensor t; SetTensorFromPyArray( &t, item.second, platform::CPUPlace(), false); - vec_name.push_back(item.first); - vec_tensor.push_back(t); + feed_names.push_back(item.first); + feed_tensors.push_back(t); } - std::vector vec_out; - self.run(vec_name, vec_tensor, vec_fetch_name, &vec_out); + std::vector fetch_tensors; + self.Run(feed_names, feed_tensors, fetch_names, &fetch_tensors); std::vector vec_ret; - for (size_t i = 0; i < vec_out.size(); ++i) { - vec_ret.push_back(TensorToPyArray(vec_out[i], true)); + for (size_t i = 0; i < fetch_tensors.size(); ++i) { + vec_ret.push_back(TensorToPyArray(fetch_tensors[i], true)); } return vec_ret; }); diff --git a/python/paddle/fluid/tests/unittests/interpreter/test_interpreter.py b/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py similarity index 76% rename from python/paddle/fluid/tests/unittests/interpreter/test_interpreter.py rename to python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py index bb18d28e48b67d..bfed9621c94d85 100644 --- a/python/paddle/fluid/tests/unittests/interpreter/test_interpreter.py +++ b/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py @@ -15,7 +15,7 @@ import unittest import paddle from paddle.fluid import core -from paddle.fluid.core import InterpreterCore +from paddle.fluid.core import StandaloneExecutor import numpy as np @@ -37,19 +37,25 @@ def test_interp_base(self): startup_program = paddle.fluid.default_startup_program() p = core.Place() p.set_place(self.place) - inter_core = InterpreterCore(p, main_program.desc, startup_program.desc, - core.Scope()) + standaloneexecutor = StandaloneExecutor(p, startup_program.desc, + main_program.desc, core.Scope()) - out = inter_core.run({ + out = standaloneexecutor.run({ "a": np.ones( [2, 2], dtype="float32") * 2 }, [c.name]) for i in range(10): - out = inter_core.run({ + out = standaloneexecutor.run({ "a": np.ones( [2, 2], dtype="float32") * i }, [c.name]) + for i in range(10): + out = standaloneexecutor.run({ + "a": np.ones( + [2, 2], dtype="float32") * i + }, [a.name, c.name]) + if __name__ == "__main__": unittest.main() From 248e27b7c27db12df71bc85dd8138ded3e635f6d Mon Sep 17 00:00:00 2001 From: littletomatodonkey Date: Wed, 18 Aug 2021 18:50:47 +0800 Subject: [PATCH 095/126] fix pad outliers err (#34979) * fix pad outliers err * fix pad api input type and doc * fix example of pad * add unittest for pad3d * fix unittest * fix error format * fix pad doc --- paddle/fluid/operators/pad3d_op.cc | 7 ++ paddle/fluid/operators/pad3d_op.cu | 7 ++ .../fluid/tests/unittests/test_pad3d_op.py | 64 ++++++++++++------- python/paddle/nn/functional/common.py | 32 ++++++++-- python/paddle/nn/layer/common.py | 2 +- 5 files changed, 81 insertions(+), 31 deletions(-) diff --git a/paddle/fluid/operators/pad3d_op.cc b/paddle/fluid/operators/pad3d_op.cc index 0751cf25587889..c2be9ac97ff89b 100644 --- a/paddle/fluid/operators/pad3d_op.cc +++ b/paddle/fluid/operators/pad3d_op.cc @@ -567,6 +567,13 @@ class Pad3dCPUKernel : public framework::OpKernel { in_width, 
pads[1])); } + if (mode == "circular") { + PADDLE_ENFORCE_NE( + in_depth * in_height * in_width, 0, + platform::errors::InvalidArgument( + "The input tensor size can not be 0 for circular padding mode.")); + } + const int pad_left = pads[0]; const int pad_top = pads[2]; const int pad_front = pads[4]; diff --git a/paddle/fluid/operators/pad3d_op.cu b/paddle/fluid/operators/pad3d_op.cu index 672a75389ccf18..ed936c10755f07 100644 --- a/paddle/fluid/operators/pad3d_op.cu +++ b/paddle/fluid/operators/pad3d_op.cu @@ -620,6 +620,13 @@ class Pad3dCUDAKernel : public framework::OpKernel { in_width, pads[1])); } + if (mode == "circular") { + PADDLE_ENFORCE_NE( + in_depth * in_height * in_width, 0, + platform::errors::InvalidArgument( + "The input tensor size can not be 0 for circular padding mode.")); + } + const int pad_left = pads[0]; const int pad_top = pads[2]; const int pad_front = pads[4]; diff --git a/python/paddle/fluid/tests/unittests/test_pad3d_op.py b/python/paddle/fluid/tests/unittests/test_pad3d_op.py index 8dc825e60bc4de..5ec7bdc66fe495 100644 --- a/python/paddle/fluid/tests/unittests/test_pad3d_op.py +++ b/python/paddle/fluid/tests/unittests/test_pad3d_op.py @@ -682,46 +682,64 @@ def test_class(self): class TestPad3dOpError(unittest.TestCase): + def setUp(self): + self.places = [paddle.CPUPlace()] + if core.is_compiled_with_cuda(): + self.places.append(paddle.CUDAPlace(0)) + def test_errors(self): def test_variable(): input_shape = (1, 2, 3, 4, 5) data = np.random.rand(*input_shape).astype(np.float32) - F.pad(x=data, paddings=[1, 1, 1, 1, 1, 1]) + y = F.pad(x=data, pad=[1, 1, 1, 1, 1, 1], data_format="NCDHW") def test_reflect_1(): input_shape = (1, 2, 3, 4, 5) data = np.random.rand(*input_shape).astype(np.float32) - x = paddle.fluid.data(name="x", shape=input_shape) - y = F.pad(x, pad=[5, 6, 1, 1, 1, 1], value=1, mode='reflect') - place = paddle.CPUPlace() - exe = Executor(place) - outputs = exe.run(feed={'x': data}, fetch_list=[y.name]) + x = paddle.to_tensor(data) + y = F.pad(x, + pad=[5, 6, 1, 1, 1, 1], + value=1, + mode='reflect', + data_format="NCDHW") def test_reflect_2(): input_shape = (1, 2, 3, 4, 5) data = np.random.rand(*input_shape).astype(np.float32) - x = paddle.fluid.data(name="x", shape=input_shape) - y = F.pad(x, pad=[1, 1, 4, 3, 1, 1], value=1, mode='reflect') - place = paddle.CPUPlace() - exe = Executor(place) - outputs = exe.run(feed={'x': data}, fetch_list=[y.name]) + x = paddle.to_tensor(data) + y = F.pad(x, + pad=[1, 1, 4, 3, 1, 1], + value=1, + mode='reflect', + data_format="NCDHW") def test_reflect_3(): input_shape = (1, 2, 3, 4, 5) data = np.random.rand(*input_shape).astype(np.float32) - x = paddle.fluid.data(name="x", shape=input_shape) - y = F.pad(x, pad=[1, 1, 1, 1, 2, 3], value=1, mode='reflect') - place = paddle.CPUPlace() - exe = Executor(place) - outputs = exe.run(feed={'x': data}, fetch_list=[y.name]) - - self.assertRaises(TypeError, test_variable) - - self.assertRaises(Exception, test_reflect_1) - - self.assertRaises(Exception, test_reflect_2) + x = paddle.to_tensor(data) + y = F.pad(x, + pad=[1, 1, 1, 1, 2, 3], + value=1, + mode='reflect', + data_format="NCDHW") + + def test_circular_1(): + input_shape = (1, 2, 0, 4, 5) + data = np.random.rand(*input_shape).astype(np.float32) + x = paddle.to_tensor(data) + y = F.pad(x, + pad=[1, 1, 1, 1, 2, 3], + mode='circular', + data_format="NCDHW") - self.assertRaises(Exception, test_reflect_3) + paddle.disable_static() + for place in self.places: + self.assertRaises(ValueError, test_variable) + 
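+            # The reflect cases must fail because reflect padding requires the
+            # pad size to stay below the corresponding input dim, while
+            # test_circular_1 covers the new PADDLE_ENFORCE_NE check: circular
+            # padding cannot wrap an input whose spatial size is zero.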
self.assertRaises(Exception, test_reflect_1) + self.assertRaises(Exception, test_reflect_2) + self.assertRaises(Exception, test_reflect_3) + self.assertRaises(Exception, test_circular_1) + paddle.enable_static() class TestPadDataformatError(unittest.TestCase): diff --git a/python/paddle/nn/functional/common.py b/python/paddle/nn/functional/common.py index aee8ea2a3cc59a..4ead5f49d409f5 100644 --- a/python/paddle/nn/functional/common.py +++ b/python/paddle/nn/functional/common.py @@ -1160,12 +1160,13 @@ def pad(x, pad, mode='constant', value=0, data_format="NCHW", name=None): Parameters: x (Tensor): The input tensor with data type float32/double/int32/int64_t. - pad (Tensor | List[int32]): The padding size with data type int32. [len(padding)/2] dimensions - of input will be padded. 1. If input dimension is 3, then the pad has the form (pad_left, + pad (Tensor | List[int] | Tuple[int]): The padding size with data type int. + If mode is 'constant' and length of pad is twice as length of x dimension, then x will + be padded from the first dimension to the last dimension. + Else: 1. If input dimension is 3, then the pad has the form (pad_left, pad_right). 2. If the input dimension is 4, then the pad has the form (pad_left, pad_right, pad_top, pad_bottom). 3. If the input dimension is 5, then the pad has the form (pad_left, pad_right, pad_top, pad_bottom, pad_front, pad_back). - mode (str): Four modes: 'constant' (default), 'reflect', 'replicate', 'circular'. When in 'constant' mode, this op uses a constant value to pad the input tensor. When in 'reflect' mode, uses reflection of the input boundaries to pad the input tensor. @@ -1189,6 +1190,15 @@ def pad(x, pad, mode='constant', value=0, data_format="NCHW", name=None): [4., 5., 6.]]]]] Case 0: + pad = [0, 0, 0, 0, 0, 0, 1, 1, 0, 0], + mode = 'constant' + value = 0 + Out = [[[[[0., 0., 0.], + [1., 2., 3.], + [4., 5., 6.], + [0., 0., 0.]]]]] + + Case 1: pad = [2, 2, 1, 1, 0, 0], mode = 'constant' value = 0 @@ -1197,7 +1207,7 @@ def pad(x, pad, mode='constant', value=0, data_format="NCHW", name=None): [0. 0. 4. 5. 6. 0. 0.] [0. 0. 0. 0. 0. 0. 0.]]]]] - Case 1: + Case 2: pad = [2, 2, 1, 1, 0, 0], mode = 'reflect' Out = [[[[[6. 5. 4. 5. 6. 5. 4.] @@ -1205,7 +1215,7 @@ def pad(x, pad, mode='constant', value=0, data_format="NCHW", name=None): [6. 5. 4. 5. 6. 5. 4.] [3. 2. 1. 2. 3. 2. 1.]]]]] - Case 2: + Case 3: pad = [2, 2, 1, 1, 0, 0], mode = 'replicate' Out = [[[[[1. 1. 1. 2. 3. 3. 3.] @@ -1213,7 +1223,7 @@ def pad(x, pad, mode='constant', value=0, data_format="NCHW", name=None): [4. 4. 4. 5. 6. 6. 6.] [4. 4. 4. 5. 6. 6. 6.]]]]] - Case 3: + Case 4: pad = [2, 2, 1, 1, 0, 0], mode = 'circular' Out = [[[[[5. 6. 4. 5. 6. 4. 5.] @@ -1231,11 +1241,18 @@ def pad(x, pad, mode='constant', value=0, data_format="NCHW", name=None): # example 1 x_shape = (1, 1, 3) x = paddle.arange(np.prod(x_shape), dtype="float32").reshape(x_shape) + 1 - y = F.pad(x, [2, 3], value=1, mode='constant', data_format="NCL") + y = F.pad(x, [0, 0, 0, 0, 2, 3], value=1, mode='constant', data_format="NCL") print(y) # [[[1. 1. 1. 2. 3. 1. 1. 1.]]] # example 2 + x_shape = (1, 1, 3) + x = paddle.arange(np.prod(x_shape), dtype="float32").reshape(x_shape) + 1 + y = F.pad(x, [2, 3], value=1, mode='constant', data_format="NCL") + print(y) + # [[[1. 1. 1. 2. 3. 1. 1. 
1.]]] + + # example 3 x_shape = (1, 1, 2, 3) x = paddle.arange(np.prod(x_shape), dtype="float32").reshape(x_shape) + 1 y = F.pad(x, [1, 2, 1, 1], value=1, mode='circular') @@ -1295,6 +1312,7 @@ def pad(x, pad, mode='constant', value=0, data_format="NCHW", name=None): unsqueezed_dim = [1] x = unsqueeze(x, axis=unsqueezed_dim) else: + pad = list(pad) if data_format in ["NCL", "NCHW", "NCDHW"]: data_format = "NCDHW" if x_dim == 3: diff --git a/python/paddle/nn/layer/common.py b/python/paddle/nn/layer/common.py index b88dc4bfe9538d..357c9bc7d401e1 100644 --- a/python/paddle/nn/layer/common.py +++ b/python/paddle/nn/layer/common.py @@ -24,7 +24,7 @@ def _npairs(x, n): - if isinstance(x, (paddle.Tensor, list)): + if isinstance(x, (paddle.Tensor, list, tuple)): return x x = [x] * (n * 2) return x From 755c8a195cdbe1c9600d8f9b2dd55cc97014f9fd Mon Sep 17 00:00:00 2001 From: xiongkun <807377414@qq.com> Date: Wed, 18 Aug 2021 19:00:24 +0800 Subject: [PATCH 096/126] Add NPU kernel for norm Op: float16 and float32 (#34609) * Add NPU kernel for norm Op: float16 and float32 * fix code for code review * fix for code review * add type for paddle_throw * remove unnecessary head file.\nAdd more testcase * remove a broadcast --- paddle/fluid/operators/norm_op_npu.cc | 67 +++++++++++ .../tests/unittests/npu/test_norm_op_npu.py | 112 ++++++++++++++++++ 2 files changed, 179 insertions(+) create mode 100644 paddle/fluid/operators/norm_op_npu.cc create mode 100644 python/paddle/fluid/tests/unittests/npu/test_norm_op_npu.py diff --git a/paddle/fluid/operators/norm_op_npu.cc b/paddle/fluid/operators/norm_op_npu.cc new file mode 100644 index 00000000000000..17b0fca2bb0971 --- /dev/null +++ b/paddle/fluid/operators/norm_op_npu.cc @@ -0,0 +1,67 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/norm_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +template +class NormNPUKernel : public framework::OpKernel { + private: + void CheckAxis(int axis, int rank) const { + // check the axis is in [-rank, rank-1] + if (axis <= rank - 1 && axis >= -rank) return; + PADDLE_THROW(platform::errors::InvalidArgument( + "axis in norm operator must between (%d) and (%d)" + "but got (%d).", + -rank, rank - 1, axis)); + } + + public: + void Compute(const framework::ExecutionContext& ctx) const override { + VLOG(4) << "Launch Norm Op Kernel on NPU." 
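+    // Computes Norm = ||X||_2 reduced over `axis` (keepdim = true, with the
+    // epsilon attribute forwarded to CANN's LpNorm) and then Out = X / Norm,
+    // intended to match the l2_norm() reference in the unit test below.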
<< std::endl; + auto* in_x = ctx.Input("X"); + auto* out_y = ctx.Output("Out"); + auto* out_norm = ctx.Output("Norm"); + out_y->mutable_data(ctx.GetPlace()); + out_norm->mutable_data(ctx.GetPlace()); + auto xdim = in_x->dims(); + float eps = ctx.Attr("epsilon"); + int axis = ctx.Attr("axis"); + CheckAxis(axis, xdim.size()); + if (axis < 0) axis = xdim.size() + axis; + + framework::NPUAttributeMap attr_input_norm; + attr_input_norm["axes"] = std::vector({axis}); + attr_input_norm["p"] = 2; + attr_input_norm["keepdim"] = true; + attr_input_norm["epsilon"] = eps; + const auto& runner = + NpuOpRunner("LpNorm", {*in_x}, {*out_norm}, attr_input_norm); + auto stream = + ctx.template device_context() + .stream(); + runner.Run(stream); + NpuOpRunner("Div", {*in_x, *out_norm}, {*out_y}, {}).Run(stream); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; +REGISTER_OP_NPU_KERNEL( + norm, ops::NormNPUKernel, + ops::NormNPUKernel) diff --git a/python/paddle/fluid/tests/unittests/npu/test_norm_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_norm_op_npu.py new file mode 100644 index 00000000000000..2c946bb893127a --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_norm_op_npu.py @@ -0,0 +1,112 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import sys +sys.path.append("..") +import unittest +import numpy as np +import paddle +import paddle.fluid as fluid +from op_test import OpTest, skip_check_grad_ci + +SEED = 2021 + + +def l2_norm(x, axis, epsilon): + x2 = x**2 + s = np.sum(x2, axis=axis, keepdims=True) + r = np.sqrt(s) + epsilon + y = x / np.broadcast_to(r, x.shape) + return y, r + + +class TestNorm(OpTest): + def setUp(self): + paddle.enable_static() + self.set_npu() + self.place = paddle.NPUPlace(0) + self.op_type = "norm" + self.init_dtype() + + x = np.random.random(self.shape).astype(self.dtype) + y, norm = l2_norm(x, self.axis, self.epsilon) + self.inputs = {'X': x} + self.attrs = {'epsilon': self.epsilon, 'axis': self.axis} + self.outputs = {'Out': y, 'Norm': norm} + + def set_npu(self): + self.__class__.use_npu = True + + def init_dtype(self): + self.dtype = np.float32 + self.axis = 1 + self.epsilon = 1e-10 + self.shape = (2, 3, 4, 5) + + def test_check_output(self): + self.check_output_with_place(self.place) + + +class TestNormOp2(TestNorm): + def init_test_case(self): + self.shape = [5, 3, 9, 7] + self.axis = 0 + self.epsilon = 1e-8 + self.dtype = np.float32 + + +class TestNormOp3(TestNorm): + def init_test_case(self): + self.shape = [5, 3, 2, 7] + self.axis = -1 + self.epsilon = 1e-8 + self.dtype = np.float32 + + +class TestNormOp4(TestNorm): + def init_test_case(self): + self.shape = [128, 1024, 14, 14] + self.axis = 2 + self.epsilon = 1e-8 + self.dtype = np.float32 + + +class API_NormTest(unittest.TestCase): + def test_errors(self): + paddle.enable_static() + with fluid.program_guard(fluid.Program()): + + def test_norm_x_type(): + data = fluid.data(name="x", shape=[3, 3], dtype="float64") + out = fluid.layers.l2_normalize(data) + + self.assertRaises(TypeError, test_norm_x_type) + + +class TestNormFP16(TestNorm): + def set_npu(self): + self.__class__.use_npu = True + self.__class__.no_need_check_grad = True + + def init_dtype(self): + self.dtype = np.float16 + self.axis = -1 + self.epsilon = 1e-10 + self.shape = (2, 3, 100) + + +if __name__ == '__main__': + unittest.main() From 9cbba97b3d3fcd4c2f4ca1bf8b6088df93af2cf9 Mon Sep 17 00:00:00 2001 From: lzzyzlbb <287246233@qq.com> Date: Wed, 18 Aug 2021 19:49:14 +0800 Subject: [PATCH 097/126] [NPU]add rmsprop op (#34864) * [npu]add rmsprop op --- .../operators/optimizers/rmsprop_op_npu.cc | 101 ++++++++++++ .../unittests/npu/test_rmsprop_op_npu.py | 152 ++++++++++++++++++ 2 files changed, 253 insertions(+) create mode 100644 paddle/fluid/operators/optimizers/rmsprop_op_npu.cc create mode 100644 python/paddle/fluid/tests/unittests/npu/test_rmsprop_op_npu.py diff --git a/paddle/fluid/operators/optimizers/rmsprop_op_npu.cc b/paddle/fluid/operators/optimizers/rmsprop_op_npu.cc new file mode 100644 index 00000000000000..2edde1dd9cb4cb --- /dev/null +++ b/paddle/fluid/operators/optimizers/rmsprop_op_npu.cc @@ -0,0 +1,101 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/optimizers/rmsprop_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +template +class RMSPROPNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto *grad_var = ctx.InputVar("Grad"); + auto *param_out = ctx.Output("ParamOut"); + auto *moment_out = ctx.Output("MomentOut"); + auto *mean_square_out = ctx.Output("MeanSquareOut"); + + param_out->mutable_data(ctx.GetPlace()); + moment_out->mutable_data(ctx.GetPlace()); + mean_square_out->mutable_data(ctx.GetPlace()); + + auto epsilon = static_cast(ctx.Attr("epsilon")); + auto rho = static_cast(ctx.Attr("decay")); + auto momentum = static_cast(ctx.Attr("momentum")); + auto *p_tensor = ctx.Input("Param"); + auto *ms_tensor = ctx.Input("MeanSquare"); + auto *lr_tensor = ctx.Input("LearningRate"); + auto *mom_tensor = ctx.Input("Moment"); + bool centered = ctx.Attr("centered"); + + auto stream = + ctx.template device_context() + .stream(); + if (grad_var->IsType()) { + auto *grad_tensor = ctx.Input("Grad"); + if (centered) { + framework::NPUAttributeMap attr_input = {{"use_locking", false}}; + const Tensor *rho_tensor = nullptr; + const Tensor *momentum_tensor = nullptr; + const Tensor *epsilon_tensor = nullptr; + Tensor rho_tmp(framework::proto::VarType::FP32); + rho_tmp.mutable_data({1}, ctx.GetPlace()); + FillNpuTensorWithConstant(&rho_tmp, rho); + rho_tensor = &rho_tmp; + Tensor momentum_tmp(framework::proto::VarType::FP32); + momentum_tmp.mutable_data({1}, ctx.GetPlace()); + FillNpuTensorWithConstant(&momentum_tmp, momentum); + momentum_tensor = &momentum_tmp; + Tensor epsilon_tmp(framework::proto::VarType::FP32); + epsilon_tmp.mutable_data({1}, ctx.GetPlace()); + FillNpuTensorWithConstant(&epsilon_tmp, epsilon); + epsilon_tensor = &epsilon_tmp; + auto *mg_tensor = ctx.Input("MeanGrad"); + auto *mean_grad_out = ctx.Output("MeanGradOut"); + mean_grad_out->mutable_data(ctx.GetPlace()); + const auto &runner_applycenterrmsprop = NpuOpRunner( + std::string("ApplyCenteredRMSPropD"), + {*p_tensor, *mg_tensor, *ms_tensor, *mom_tensor, *lr_tensor, + *rho_tensor, *momentum_tensor, *epsilon_tensor, *grad_tensor}, + {*param_out, *mean_grad_out, *mean_square_out, *moment_out}, + {attr_input}); + runner_applycenterrmsprop.Run(stream); + } else { + framework::NPUAttributeMap attr_input = { + {"rho", rho}, {"momentum", momentum}, {"epsilon", epsilon}}; + const auto &runner_applyrmsprop = NpuOpRunner( + std::string("ApplyRMSPropD"), + {*p_tensor, *ms_tensor, *mom_tensor, *lr_tensor, *grad_tensor}, + {*param_out, *mean_square_out, *moment_out}, {attr_input}); + runner_applyrmsprop.Run(stream); + } + } else { + PADDLE_ENFORCE_EQ(false, true, + platform::errors::PermissionDenied( + "Unsupported Variable Type of Grad " + "in RmspropOp. 
Excepted LodTensor, " + "But received [%s]", + paddle::framework::ToTypeName(grad_var->Type()))); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL( + rmsprop, ops::RMSPROPNPUKernel) diff --git a/python/paddle/fluid/tests/unittests/npu/test_rmsprop_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_rmsprop_op_npu.py new file mode 100644 index 00000000000000..8bdf841c5cf181 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_rmsprop_op_npu.py @@ -0,0 +1,152 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import numpy as np +import paddle.fluid.core as core +from paddle.fluid.op import Operator +import paddle.fluid as fluid +import paddle + +paddle.enable_static() +SEED = 2021 + + +class TestNet(unittest.TestCase): + def _test(self, run_npu=True): + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = SEED + startup_prog.random_seed = SEED + np.random.seed(SEED) + + a_np = np.random.random(size=(32, 32)).astype('float32') + b_np = np.random.random(size=(32, 32)).astype('float32') + label_np = np.random.randint(2, size=(32, 1)).astype('int64') + + with paddle.static.program_guard(main_prog, startup_prog): + a = paddle.static.data(name="a", shape=[32, 32], dtype='float32') + b = paddle.static.data(name="b", shape=[32, 32], dtype='float32') + label = paddle.static.data( + name="label", shape=[32, 1], dtype='int64') + + sum = paddle.add(a, b) + z = paddle.pow(sum, 2.0) + + fc_1 = fluid.layers.fc(input=z, size=128) + prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax') + + cost = fluid.layers.cross_entropy(input=prediction, label=label) + loss = fluid.layers.reduce_mean(cost) + rmsprop = fluid.optimizer.RMSProp(learning_rate=0.01) + rmsprop.minimize(loss) + + if run_npu: + place = paddle.NPUPlace(0) + else: + place = paddle.CPUPlace() + + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + print("Start run on {}".format(place)) + for epoch in range(100): + + pred_res, loss_res = exe.run( + main_prog, + feed={"a": a_np, + "b": b_np, + "label": label_np}, + fetch_list=[prediction, loss]) + if epoch % 10 == 0: + print("Epoch {} | Prediction[0]: {}, Loss: {}".format( + epoch, pred_res[0], loss_res)) + + return pred_res, loss_res + + def test_npu(self): + cpu_pred, cpu_loss = self._test(False) + npu_pred, npu_loss = self._test(True) + + self.assertTrue(np.allclose(npu_pred, cpu_pred, rtol=1e-3)) + self.assertTrue(np.allclose(npu_loss, cpu_loss, rtol=1e-3)) + + +class TestCenteredNet(unittest.TestCase): + def _test(self, run_npu=True): + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = SEED + startup_prog.random_seed = SEED + np.random.seed(SEED) + + a_np = np.random.random(size=(32, 
32)).astype('float32') + b_np = np.random.random(size=(32, 32)).astype('float32') + label_np = np.random.randint(2, size=(32, 1)).astype('int64') + + with paddle.static.program_guard(main_prog, startup_prog): + a = paddle.static.data(name="a", shape=[32, 32], dtype='float32') + b = paddle.static.data(name="b", shape=[32, 32], dtype='float32') + label = paddle.static.data( + name="label", shape=[32, 1], dtype='int64') + + sum = paddle.add(a, b) + z = paddle.pow(sum, 2.0) + + fc_1 = fluid.layers.fc(input=z, size=128) + prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax') + + cost = fluid.layers.cross_entropy(input=prediction, label=label) + loss = fluid.layers.reduce_mean(cost) + rmsprop = fluid.optimizer.RMSProp(learning_rate=0.01, centered=True) + rmsprop.minimize(loss) + + if run_npu: + place = paddle.NPUPlace(0) + else: + place = paddle.CPUPlace() + + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + print("Start run on {}".format(place)) + for epoch in range(100): + + pred_res, loss_res = exe.run( + main_prog, + feed={"a": a_np, + "b": b_np, + "label": label_np}, + fetch_list=[prediction, loss]) + if epoch % 10 == 0: + print("Epoch {} | Prediction[0]: {}, Loss: {}".format( + epoch, pred_res[0], loss_res)) + + return pred_res, loss_res + + def test_npu(self): + cpu_pred, cpu_loss = self._test(False) + npu_pred, npu_loss = self._test(True) + + self.assertTrue(np.allclose(npu_pred, cpu_pred, rtol=1e-3)) + self.assertTrue(np.allclose(npu_loss, cpu_loss, rtol=1e-3)) + + +if __name__ == "__main__": + unittest.main() From 22da1907e4b9b89cb6acd58b437e765a810f4b20 Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Thu, 19 Aug 2021 09:46:38 +0800 Subject: [PATCH 098/126] Abstract DeviceEvent to manage cross-platform Event implementation (#34922) * add device_context * add gtest for device_event_gpu * Remvoe duplicate DeviceType * push for test * add unittest * fix macros * fix MSVC using usage --- paddle/fluid/platform/CMakeLists.txt | 5 + paddle/fluid/platform/device_context.h | 2 + paddle/fluid/platform/device_event.cc | 27 ++ paddle/fluid/platform/device_event.h | 277 +++++++++++++++++++++ paddle/fluid/platform/device_event_gpu.cc | 105 ++++++++ paddle/fluid/platform/device_event_test.cc | 78 ++++++ paddle/fluid/pybind/cuda_streams_py.cc | 2 +- 7 files changed, 495 insertions(+), 1 deletion(-) create mode 100644 paddle/fluid/platform/device_event.cc create mode 100644 paddle/fluid/platform/device_event.h create mode 100644 paddle/fluid/platform/device_event_gpu.cc create mode 100644 paddle/fluid/platform/device_event_test.cc diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 97c81568e673e8..fab0909c01c5fa 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -151,11 +151,16 @@ endif() cc_test(init_test SRCS init_test.cc DEPS device_context) +cc_library(device_event SRCS device_event.cc DEPS place enforce device_context op_registry) +cc_library(device_event_gpu SRCS device_event_gpu.cc DEPS device_event) + + if(WITH_GPU) nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info) nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda) nv_test(cudnn_desc_test SRCS cudnn_desc_test.cc DEPS dynload_cuda) nv_test(transform_test SRCS transform_test.cu DEPS memory place device_context) + nv_test(device_event_test SRCS device_event_test.cc DEPS device_event_gpu) endif() if(WITH_ROCM) diff --git a/paddle/fluid/platform/device_context.h 
b/paddle/fluid/platform/device_context.h index abac12ff266486..87ce7c438b65be 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -97,6 +97,8 @@ enum DeviceType { CUDA = 1, XPU = 2, NPU = 3, + + MAX_DEVICE_TYPES = 4, }; DeviceType Place2DeviceType(const platform::Place& place); diff --git a/paddle/fluid/platform/device_event.cc b/paddle/fluid/platform/device_event.cc new file mode 100644 index 00000000000000..2c96de163799f8 --- /dev/null +++ b/paddle/fluid/platform/device_event.cc @@ -0,0 +1,27 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/platform/device_event.h" + +namespace paddle { +namespace platform { + +EventCreateFunction DeviceEvent::event_creator_[MaxDeviceTypes]; +EventRecordFunction DeviceEvent::event_recorder_[MaxDeviceTypes]; +EventQueryFunction DeviceEvent::event_querier_[MaxDeviceTypes]; +EventFinishFunction DeviceEvent::event_finisher_[MaxDeviceTypes]; +EventWaitFunction DeviceEvent::event_waiter_[MaxDeviceTypes][MaxDeviceTypes]; + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device_event.h b/paddle/fluid/platform/device_event.h new file mode 100644 index 00000000000000..c1f0acc00e6c91 --- /dev/null +++ b/paddle/fluid/platform/device_event.h @@ -0,0 +1,277 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
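+// DeviceEvent dispatches through per-DeviceType tables of function pointers
+// (event_creator_/recorder_/querier_/finisher_ plus the two-dimensional
+// event_waiter_ table indexed by [waiter_type][event_type]); a backend opts
+// in by registering plain functions via the REGISTER_EVENT_*_FUNCTION macros
+// below. A minimal, hypothetical no-op backend might look roughly like:
+//
+//   static void FakeEventCreate(DeviceEvent* e, const DeviceOption&) {
+//     e->InitEvent(std::make_shared<int>(0));  // nothing real to allocate
+//   }
+//   static bool FakeEventQuery(const DeviceEvent*) { return true; }
+//   REGISTER_EVENT_CREATE_FUNCTION(kCPU, FakeEventCreate)
+//   REGISTER_EVENT_QUERY_FUNCTION(kCPU, FakeEventQuery)
+//
+// For the real CUDA backend registered in device_event_gpu.cc below,
+// Wait(kCUDA, ctx) maps to cudaStreamWaitEvent (the waiting stream blocks
+// asynchronously) and Wait(kCPU, ctx) falls back to cudaEventSynchronize.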
+#pragma once +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace platform { + +class DeviceOption; +class DeviceEvent; + +constexpr int MaxDeviceTypes = + static_cast(platform::DeviceType::MAX_DEVICE_TYPES); + +typedef void (*EventCreateFunction)(DeviceEvent*, const DeviceOption&); +typedef void (*EventRecordFunction)(DeviceEvent*, const platform::Place&, + const DeviceContext*); +typedef bool (*EventQueryFunction)(const DeviceEvent*); +typedef void (*EventFinishFunction)(const DeviceEvent*); +typedef void (*EventWaitFunction)(const DeviceEvent*, DeviceContext*); + +inline int DeviceTypeToId(const DeviceType& device_type) { + return static_cast(device_type); +} + +class DeviceOption { + public: + explicit DeviceOption(int device_type) : device_type_(device_type) {} + + DeviceOption(int device_type, int device_id) + : device_type_(device_type), device_id_(device_id) {} + + int device_type() const { return device_type_; } + + int device_id() const { return device_id_; } + + private: + int device_type_; + int device_id_; +}; + +class DeviceEvent { + public: + explicit DeviceEvent(const DeviceOption& device_option) + : event_(), + type_(device_option.device_type()), + device_option_(device_option) { + PADDLE_ENFORCE_LT(type_, MaxDeviceTypes, + platform::errors::PreconditionNotMet( + "Required type < %d, but received type = %d", + MaxDeviceTypes, type_)); + PADDLE_ENFORCE_NOT_NULL( + event_creator_[type_], + platform::errors::Unavailable( + "event_creator_[%d] shall not be nullptr.", type_)); + event_creator_[type_](this, device_option_); + } + + ~DeviceEvent() {} + + void Record(const platform::Place& place, const DeviceContext* dev_ctx) { + PADDLE_ENFORCE_NOT_NULL( + event_recorder_[type_], + platform::errors::Unavailable( + "event_recorder_[%d] shall not be nullptr.", type_)); + event_recorder_[type_](this, place, dev_ctx); + } + + bool Query() { + PADDLE_ENFORCE_NOT_NULL( + event_querier_[type_], + platform::errors::Unavailable( + "event_querier_[%d] shall not be nullptr.", type_)); + return event_querier_[type_](this); + } + + void Finish() const { + PADDLE_ENFORCE_NOT_NULL( + event_finisher_[type_], + platform::errors::Unavailable( + "event_finisher_[%d] shall not be nullptr.", type_)); + event_finisher_[type_](this); + } + + void Wait(const DeviceType& waiter_type, DeviceContext* context) const { + auto waiter_idx = DeviceTypeToId(waiter_type); + PADDLE_ENFORCE_NOT_NULL( + event_waiter_[waiter_idx][type_], + platform::errors::Unavailable( + "event_waiter_[%d][%d] shall not be nullptr.", waiter_idx, type_)); + event_waiter_[waiter_idx][type_](this, context); + } + + void InitEvent(std::shared_ptr event) { event_ = event; } + + std::shared_ptr GetEvent() const { return event_; } + + private: + std::shared_ptr event_; + int type_; + DeviceOption device_option_; + + static EventCreateFunction event_creator_[MaxDeviceTypes]; + static EventRecordFunction event_recorder_[MaxDeviceTypes]; + static EventQueryFunction event_querier_[MaxDeviceTypes]; + static EventFinishFunction event_finisher_[MaxDeviceTypes]; + static EventWaitFunction event_waiter_[MaxDeviceTypes][MaxDeviceTypes]; + + template + friend struct EventCreateFunctionRegisterer; + + template + friend struct EventRecordFunctionRegisterer; + + template + friend struct EventQueryFunctionRegisterer; + + template + friend struct EventFinishFunctionRegisterer; + + 
template + friend struct EventWaitFunctionRegisterer; +}; + +/** + * check if MACRO is used in GLOBAL NAMESPACE. + */ +#define STATIC_ASSERT_GLOBAL_NAMESPACE(uniq_name, msg) \ + struct __test_global_namespace_##uniq_name##__ {}; \ + static_assert(std::is_same<::__test_global_namespace_##uniq_name##__, \ + __test_global_namespace_##uniq_name##__>::value, \ + msg) + +// =============== Register for Create =============== +template +struct EventCreateFunctionRegisterer : public framework::Registrar { + explicit EventCreateFunctionRegisterer(EventCreateFunction func) { + auto type_idx = DeviceTypeToId(device_type); + VLOG(3) << "register event_creator with type_id :" << type_idx; + DeviceEvent::event_creator_[type_idx] = func; + } +}; + +#define REGISTER_EVENT_CREATE_FUNCTION(device_type, func) \ + STATIC_ASSERT_GLOBAL_NAMESPACE( \ + __reg_event_creator__##device_type, \ + "REGISTER_EVENT_CREATE_FUNCTION must be called in global namespace"); \ + static ::paddle::platform::EventCreateFunctionRegisterer \ + __reg_event_create_##device_type##__(func); \ + int TouchDeviceEventCreate##device_type() { \ + __reg_event_create_##device_type##__.Touch(); \ + return 0; \ + } + +// =============== Register for Record =============== +template +struct EventRecordFunctionRegisterer : public framework::Registrar { + explicit EventRecordFunctionRegisterer(EventRecordFunction func) { + auto type_idx = DeviceTypeToId(device_type); + VLOG(3) << "register event_recorder with type_id :" << type_idx; + DeviceEvent::event_recorder_[type_idx] = func; + } +}; + +#define REGISTER_EVENT_RECORD_FUNCTION(device_type, func) \ + STATIC_ASSERT_GLOBAL_NAMESPACE( \ + __reg_event_recorder__##device_type, \ + "REGISTER_EVENT_RECORD_FUNCTION must be called in global namespace"); \ + static ::paddle::platform::EventRecordFunctionRegisterer \ + __reg_event_record_##device_type##__(func); \ + int TouchDeviceEventRecord##device_type() { \ + __reg_event_record_##device_type##__.Touch(); \ + return 0; \ + } + +// =============== Register for Query =============== +template +struct EventQueryFunctionRegisterer : public framework::Registrar { + explicit EventQueryFunctionRegisterer(EventQueryFunction func) { + auto type_idx = DeviceTypeToId(device_type); + VLOG(3) << "register event_querier with type_id :" << type_idx; + DeviceEvent::event_querier_[type_idx] = func; + } +}; + +#define REGISTER_EVENT_QUERY_FUNCTION(device_type, func) \ + STATIC_ASSERT_GLOBAL_NAMESPACE( \ + __reg_event_querier__##device_type, \ + "REGISTER_EVENT_QUERY_FUNCTION must be called in global namespace"); \ + static ::paddle::platform::EventQueryFunctionRegisterer \ + __reg_event_query_##device_type##__(func); \ + int TouchDeviceEventQuery##device_type() { \ + __reg_event_query_##device_type##__.Touch(); \ + return 0; \ + } + +// =============== Register for Finish =============== +template +struct EventFinishFunctionRegisterer : public framework::Registrar { + explicit EventFinishFunctionRegisterer(EventFinishFunction func) { + auto type_idx = DeviceTypeToId(device_type); + VLOG(3) << "register event_finisher with type_id :" << type_idx; + DeviceEvent::event_finisher_[type_idx] = func; + } +}; + +#define REGISTER_EVENT_FINISH_FUNCTION(device_type, func) \ + STATIC_ASSERT_GLOBAL_NAMESPACE( \ + __reg_event_finishier__##device_type, \ + "REGISTER_EVENT_FINISH_FUNCTION must be called in global namespace"); \ + static ::paddle::platform::EventFinishFunctionRegisterer \ + __reg_event_finish_##device_type##__(func); \ + int TouchDeviceEventFinish##device_type() { \ + 
__reg_event_finish_##device_type##__.Touch(); \ + return 0; \ + } + +// =============== Register for Wait =============== +template +struct EventWaitFunctionRegisterer : public framework::Registrar { + explicit EventWaitFunctionRegisterer(EventWaitFunction func) { + auto waiter_idx = DeviceTypeToId(waiter_type); + auto event_idx = DeviceTypeToId(event_type); + VLOG(3) << "register event_finisher with waiter_idx : " << waiter_idx + << ", event_idx : " << event_idx; + DeviceEvent::event_waiter_[waiter_idx][event_idx] = func; + } +}; + +#define REGISTER_EVENT_WAIT_FUNCTION(waiter_type, event_type, func) \ + STATIC_ASSERT_GLOBAL_NAMESPACE( \ + __reg_event_waiter__##waiter_type##event_type, \ + "REGISTER_EVENT_WAIT_FUNCTION must be called in global namespace"); \ + static ::paddle::platform::EventWaitFunctionRegisterer \ + __reg_event_wait_##waiter_type##event_type##__(func); \ + int TouchDeviceEventWait##waiter_type##event_type() { \ + __reg_event_wait_##waiter_type##event_type##__.Touch(); \ + return 0; \ + } + +#define USE_EVENT(device_type) \ + extern int TouchDeviceEventCreate##device_type(); \ + extern int TouchDeviceEventRecord##device_type(); \ + extern int TouchDeviceEventQuery##device_type(); \ + extern int TouchDeviceEventFinish##device_type(); \ + UNUSED static int use_event_creator_##device_type = \ + TouchDeviceEventCreate##device_type(); \ + UNUSED static int use_event_recorder_##device_type = \ + TouchDeviceEventRecord##device_type(); \ + UNUSED static int use_event_querier_##device_type = \ + TouchDeviceEventQuery##device_type(); \ + UNUSED static int use_event_finisher_##device_type = \ + TouchDeviceEventFinish##device_type(); + +#define USE_EVENT_WAIT(waiter_type, event_type) \ + extern int TouchDeviceEventWait##waiter_type##event_type(); \ + UNUSED static int use_event_waiter_##waiter_type##event_type = \ + TouchDeviceEventWait##waiter_type##event_type(); + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device_event_gpu.cc b/paddle/fluid/platform/device_event_gpu.cc new file mode 100644 index 00000000000000..86bcfdad5ba952 --- /dev/null +++ b/paddle/fluid/platform/device_event_gpu.cc @@ -0,0 +1,105 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/platform/device_event.h" +#include "paddle/fluid/platform/event.h" + +#ifdef PADDLE_WITH_CUDA +namespace paddle { +namespace platform { +struct CUDADeviceEventWrapper { + explicit CUDADeviceEventWrapper(const DeviceOption& dev_opt) + : inner_event_() { + PADDLE_ENFORCE_EQ( + dev_opt.device_type(), static_cast(DeviceType::CUDA), + platform::errors::PreconditionNotMet( + "Required device type shall be CUDA, but received %d. ", + dev_opt.device_type())); + PADDLE_ENFORCE_GT( + dev_opt.device_id(), -1, + platform::errors::PreconditionNotMet( + "Required DeviceOption.device_id > -1, but received %d. 
", + dev_opt.device_id())); + device_id_ = dev_opt.device_id(); + } + + CudaEvent inner_event_; + int device_id_; +}; + +void DeviceEventCreateCUDA(DeviceEvent* event, const DeviceOption& dev_opt) { + event->InitEvent(std::make_shared(dev_opt)); +} + +void DeviceEventRecordCUDA(DeviceEvent* event, const platform::Place& place, + const DeviceContext* context) { + auto* wrapper = static_cast(event->GetEvent().get()); + + auto* cuda_dev_ctx = + dynamic_cast(context); + PADDLE_ENFORCE_NOT_NULL( + cuda_dev_ctx, + platform::errors::PreconditionNotMet( + "Failed to dynamic_cast context into CUDADeviceContext.")); + + wrapper->inner_event_.Record(*cuda_dev_ctx->context()->Stream()); +} + +bool DeviceEventQueryCUDA(const DeviceEvent* event) { + auto* wrapper = static_cast(event->GetEvent().get()); + PADDLE_ENFORCE_NOT_NULL( + wrapper, + platform::errors::PreconditionNotMet( + "Failed to dynamic_cast event into CUDADeviceEventWrapper.")); + + return wrapper->inner_event_.Query(); +} + +void DeviceEventFinishCUDA(const DeviceEvent* event) { + auto* wrapper = static_cast(event->GetEvent().get()); + // calling cudaEventSynchronize + wrapper->inner_event_.Synchronize(); +} + +void DeviceEventCUDAWaitCUDA(const DeviceEvent* event, DeviceContext* context) { + auto* wrapper = static_cast(event->GetEvent().get()); + auto* cuda_dev_ctx = + dynamic_cast(context); + PADDLE_ENFORCE_NOT_NULL( + cuda_dev_ctx, + platform::errors::PreconditionNotMet( + "Failed to dynamic_cast context into CUDADeviceContext.")); + // calling cudaStreamWaitEvent(stream, event, 0) + cuda_dev_ctx->context()->Stream()->WaitEvent( + wrapper->inner_event_.GetRawCudaEvent()); +} + +void DeviceEventCPUWaitCUDA(const DeviceEvent* event, DeviceContext* context) { + DeviceEventFinishCUDA(event); +} + +} // namespace platform +} // namespace paddle + +using ::paddle::platform::kCUDA; +using ::paddle::platform::kCPU; +REGISTER_EVENT_CREATE_FUNCTION(kCUDA, paddle::platform::DeviceEventCreateCUDA) +REGISTER_EVENT_RECORD_FUNCTION(kCUDA, paddle::platform::DeviceEventRecordCUDA) +REGISTER_EVENT_QUERY_FUNCTION(kCUDA, paddle::platform::DeviceEventQueryCUDA) +REGISTER_EVENT_FINISH_FUNCTION(kCUDA, paddle::platform::DeviceEventFinishCUDA) +REGISTER_EVENT_WAIT_FUNCTION(kCUDA, kCUDA, + paddle::platform::DeviceEventCUDAWaitCUDA) +REGISTER_EVENT_WAIT_FUNCTION(kCPU, kCUDA, + paddle::platform::DeviceEventCPUWaitCUDA) +#endif diff --git a/paddle/fluid/platform/device_event_test.cc b/paddle/fluid/platform/device_event_test.cc new file mode 100644 index 00000000000000..04288599c40a4e --- /dev/null +++ b/paddle/fluid/platform/device_event_test.cc @@ -0,0 +1,78 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/platform/device_event.h" +#include "glog/logging.h" +#include "gtest/gtest.h" + +#ifdef PADDLE_WITH_CUDA +#include +using ::paddle::platform::kCUDA; +using ::paddle::platform::kCPU; +USE_EVENT(kCUDA); +USE_EVENT_WAIT(kCUDA, kCUDA) +USE_EVENT_WAIT(kCPU, kCUDA) + +TEST(DeviceEvent, CUDA) { + VLOG(1) << "In Test"; + using paddle::platform::CUDAPlace; + using paddle::platform::DeviceOption; + using paddle::platform::DeviceEvent; + using paddle::platform::DeviceContextPool; + using paddle::platform::DeviceType; + + auto& pool = DeviceContextPool::Instance(); + auto place = CUDAPlace(0); + auto* context = + static_cast(pool.Get(place)); + int device_type = static_cast(DeviceType::CUDA); + DeviceOption dev_opt(device_type, place.device); + + ASSERT_NE(context, nullptr); + // case 1. test for event_creator + DeviceEvent event(dev_opt); + ASSERT_NE(event.GetEvent().get(), nullptr); + // case 2. test for event_recorder + event.Record(place, context); + bool status = event.Query(); + ASSERT_EQ(status, false); + // case 3. test for event_finisher + event.Finish(); + status = event.Query(); + ASSERT_EQ(status, true); + + // case 4. test for event_waiter + float *src_fp32, *dst_fp32; + int size = 1000000 * sizeof(float); + cudaMallocHost(reinterpret_cast(&src_fp32), size); + cudaMalloc(reinterpret_cast(&dst_fp32), size); + cudaMemcpyAsync(dst_fp32, src_fp32, size, cudaMemcpyHostToDevice, + context->stream()); + event.Record(place, context); // step 1. record it + status = event.Query(); + ASSERT_EQ(status, false); + + event.Wait(kCUDA, context); // step 2. add streamWaitEvent + status = event.Query(); + ASSERT_EQ(status, false); // async + + event.Wait(kCPU, context); // step 3. EventSynchornize + status = event.Query(); + ASSERT_EQ(status, true); // sync + + // release resource + cudaFree(dst_fp32); + cudaFreeHost(src_fp32); +} +#endif diff --git a/paddle/fluid/pybind/cuda_streams_py.cc b/paddle/fluid/pybind/cuda_streams_py.cc index 21c6e0a4f28caa..5ea0a2553f7516 100644 --- a/paddle/fluid/pybind/cuda_streams_py.cc +++ b/paddle/fluid/pybind/cuda_streams_py.cc @@ -105,7 +105,7 @@ void BindCudaStream(py::module *m_ptr) { .def("wait_stream", [](paddle::platform::stream::CUDAStream &self, paddle::platform::stream::CUDAStream &stream) { - auto event = paddle::platform::CudaEvent(); + paddle::platform::CudaEvent event; event.Record(stream); self.WaitEvent(event.GetRawCudaEvent()); From c4e05e1c2f733e31885c668317087129ed7db40f Mon Sep 17 00:00:00 2001 From: tianshuo78520a <707759223@qq.com> Date: Thu, 19 Aug 2021 09:47:02 +0800 Subject: [PATCH 099/126] Fix op-benchmark cpu/gpu error (#34997) --- tools/test_ci_op_benchmark.sh | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/tools/test_ci_op_benchmark.sh b/tools/test_ci_op_benchmark.sh index 6d5f88c1f57ee9..d217d7e50e9bc5 100644 --- a/tools/test_ci_op_benchmark.sh +++ b/tools/test_ci_op_benchmark.sh @@ -253,6 +253,17 @@ function check_op_benchmark_result { return $check_status_code } +function check_CHANGE_OP_MAP { + for op_name in ${!CHANGE_OP_MAP[@]} + do + if [ -z "${BENCHMARK_OP_MAP[$op_name]}" ] + then + exit_code=8 + LOG "[ERROR] Missing test script of \"${op_name}\"(${CHANGE_OP_MAP[$op_name]}) in benchmark." + fi + done +} + # diff benchmakr result and miss op function summary_problems { local op_name exit_code @@ -262,14 +273,7 @@ function summary_problems { check_op_benchmark_result exit_code=$? 
fi - for op_name in ${!CHANGE_OP_MAP[@]} - do - if [ -z "${BENCHMARK_OP_MAP[$op_name]}" ] - then - exit_code=8 - LOG "[ERROR] Missing test script of \"${op_name}\"(${CHANGE_OP_MAP[$op_name]}) in benchmark." - fi - done + check_CHANGE_OP_MAP if [ $exit_code -ne 0 ]; then LOG "[INFO] See https://github.com/PaddlePaddle/Paddle/wiki/PR-CI-OP-benchmark-Manual for details." LOG "[INFO] Or you can apply for one RD (Avin0323(Recommend), Xreki, luotao1) approval to pass this PR." @@ -285,7 +289,7 @@ function cpu_op_benchmark { load_CHANGE_OP_MAP load_BENCHMARK_OP_MAP build_whl - summary_problems + check_CHANGE_OP_MAP LOG "[INFO] Op benchmark run success and no error!" exit 0 } From 26213a7732f319b7a377c25d7d179fdf3373224b Mon Sep 17 00:00:00 2001 From: tianshuo78520a <707759223@qq.com> Date: Thu, 19 Aug 2021 09:47:31 +0800 Subject: [PATCH 100/126] Fix Inference CI CPU/GPU (#34931) * notest;test=gpu-inference * notest;test=gpu-inference * notest;test=gpu-inference * notest;test=gpu-inference * fix error * notest;test=gpu-inference * notest;test=gpu-inference * notest;test=gpu-inference * test=gpu-inference --- paddle/fluid/inference/goapi/test.sh | 2 +- paddle/scripts/paddle_build.sh | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/inference/goapi/test.sh b/paddle/fluid/inference/goapi/test.sh index b764e2ac72c70e..ae4b8d97d6cbda 100644 --- a/paddle/fluid/inference/goapi/test.sh +++ b/paddle/fluid/inference/goapi/test.sh @@ -21,7 +21,7 @@ if [ ! -d mobilenetv1 ]; then fi # 2. set LD_LIBRARY_PATH -export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:$PWD/paddle_inference_c/paddle/lib +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$PWD/paddle_inference_c/third_party/install/mklml/lib/:$PWD/paddle_inference_c/third_party/install/mkldnn/lib/:$PWD/paddle_inference_c/paddle/lib/ # 3. go test go test -v ./... 
diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index abaae9a361d6bb..0ace9568d49652 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -2422,6 +2422,11 @@ function main() { python ${PADDLE_ROOT}/tools/remove_grad_op_and_kernel.py gen_fluid_lib ${parallel_number} ;; + gpu_inference) + test_fluid_lib + test_go_inference_api + check_approvals_of_unittest 3 + ;; test_train) gen_fluid_lib ${parallel_number} test_fluid_lib_train From 255fc7d89dbc3f722dbb0f835d7b87c3b3377131 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E6=98=8E=E5=86=AC?= <78149749+winter-wang@users.noreply.github.com> Date: Thu, 19 Aug 2021 11:45:32 +0800 Subject: [PATCH 101/126] add the auto scan test for TensorRT convert,test=develop (#34980) --- .../unittests/ir/inference/CMakeLists.txt | 1 + .../unittests/ir/inference/auto_scan_test.py | 92 ++++++++++ .../unittests/ir/inference/program_config.py | 162 ++++++++++++++++++ .../ir/inference/test_trt_convert_conv2d.py | 67 ++++++++ .../ir/inference/trt_layer_auto_scan_test.py | 115 +++++++++++++ 5 files changed, 437 insertions(+) create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/auto_scan_test.py create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/program_config.py create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d.py create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/trt_layer_auto_scan_test.py diff --git a/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt b/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt index 45e392cd66e90a..0ea9b8d2fc62f0 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt @@ -38,4 +38,5 @@ set_tests_properties(test_trt_dynamic_shape PROPERTIES TIMEOUT 120) set_tests_properties(test_trt_pool_op PROPERTIES ENVIRONMENT FLAGS_fraction_of_gpu_memory_to_use=0.1 TIMEOUT 45) set_tests_properties(test_trt_reduce_mean_op PROPERTIES TIMEOUT 60) set_tests_properties(test_trt_tile_op PROPERTIES TIMEOUT 60) +set_tests_properties(test_trt_convert_conv2d PROPERTIES TIMEOUT 100) endif() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/auto_scan_test.py b/python/paddle/fluid/tests/unittests/ir/inference/auto_scan_test.py new file mode 100644 index 00000000000000..7d749cca5c2541 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/auto_scan_test.py @@ -0,0 +1,92 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
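+#
+# [Editorial note] How this auto-scan harness fits together (a summary of the
+# code below, not an official design document):
+#   1. A subclass implements sample_program_configs(), yielding ProgramConfig
+#      objects that describe the ops, weights, inputs and fetch targets.
+#   2. create_fake_model() turns each ProgramConfig into an in-memory
+#      serialized program plus parameter buffer.
+#   3. run_test() feeds random data through every predictor Config produced by
+#      sample_predictor_configs(), once per batch size in self.batch_size_set.
+#   4. assert_tensors_near() requires all predictor outputs to match the first
+#      one within self.threshold, so e.g. plain GPU and TensorRT results must
+#      agree.
+#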
+ +import numpy as np +import unittest +import abc +import paddle +import paddle.fluid as fluid +from paddle.fluid.initializer import NumpyArrayInitializer +import paddle.fluid.core as core +from paddle import compat as cpt +import paddle.inference as paddle_infer +from typing import Optional, List, Callable, Dict, Any, Set +from program_config import TensorConfig, OpConfig, ProgramConfig, create_fake_model + + +class AutoScanTest(unittest.TestCase): + def __init__(self, methodName='runTest'): + paddle.enable_static() + super(AutoScanTest, self).__init__(methodName) + self.threshold = 1e-5 + + @abc.abstractmethod + def sample_program_configs(self) -> List[ProgramConfig]: + ''' + Generate all config with the combination of different Input tensor shape and + different Attr values. + ''' + raise NotImplementedError + + @abc.abstractmethod + def sample_predictor_configs(self) -> List[paddle_infer.Config]: + raise NotImplementedError + + def run_test_config(self, model, params, prog_config, pred_config, + feed_data) -> Dict[str, np.ndarray]: + ''' + Test a single case. + ''' + pred_config.set_model_buffer(model, len(model), params, len(params)) + predictor = paddle_infer.create_predictor(pred_config) + + for name, _ in prog_config.inputs.items(): + input_tensor = predictor.get_input_handle(name) + input_tensor.copy_from_cpu(feed_data[name]) + predictor.run() + result = {} + for out_name in prog_config.outputs: + result[out_name] = predictor.get_output_handle( + out_name).copy_to_cpu() + return result + + def assert_tensors_near(self, + threshold: float, + tensors: List[Dict[str, np.array]]): + assert len(tensors) > 1 + first = tensors[0] + for group in tensors[1:]: + for key, arr in group.items(): + self.assertTrue( + np.allclose( + first[key], arr, atol=threshold), + "Output has diff between GPU and TensorRT. ") + + def run_test(self): + for prog_config in self.sample_program_configs(): + model, params = create_fake_model(prog_config) + for batch_size in self.batch_size_set: + feed_data = {} + for name, tensor_config in prog_config.inputs.items(): + tensor_shape = tensor_config.shape.copy() + tensor_shape[0] = batch_size + feed_data[name] = np.random.random(tensor_shape).astype( + tensor_config.dtype) + results: List[Dict[str, Tensor]] = [] + for pred_config in self.sample_predictor_configs(): + results.append( + self.run_test_config(model, params, prog_config, + pred_config, feed_data)) + self.assert_tensors_near( + threshold=self.threshold, tensors=results) diff --git a/python/paddle/fluid/tests/unittests/ir/inference/program_config.py b/python/paddle/fluid/tests/unittests/ir/inference/program_config.py new file mode 100644 index 00000000000000..399501618b6988 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/program_config.py @@ -0,0 +1,162 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
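+#
+# [Editorial note] Schematic usage of the builders defined in this file. The
+# example mirrors the conv2d auto-scan test added later in this patch; the
+# tensor names are placeholders, not fixed API values:
+#
+#   import numpy as np
+#   w = TensorConfig(shape=[24, 3, 3, 3],
+#                    data=np.random.randn(24, 3, 3, 3).astype("float32"))
+#   conv = OpConfig(type="conv2d",
+#                   inputs={"Input": ["input_data"], "Filter": ["conv2d_weight"]},
+#                   outputs={"Output": ["conv_output_data"]},
+#                   attrs={"data_format": "NCHW", "dilations": [1, 1],
+#                          "padding_algorithm": "EXPLICIT", "groups": 1,
+#                          "paddings": [0, 3], "strides": [1, 1]})
+#   prog = ProgramConfig(ops=[conv],
+#                        weights={"conv2d_weight": w},
+#                        inputs={"input_data": TensorConfig(shape=[1, 3, 64, 64])},
+#                        outputs=["conv_output_data"])
+#   model, params = create_fake_model(prog)  # serialized program + parameters
+#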
+ +from typing import Optional, List, Callable, Dict, Any, Set +import numpy as np +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle import compat as cpt +from paddle.fluid.initializer import NumpyArrayInitializer +from paddle.fluid.framework import convert_np_dtype_to_dtype_ + + +class TensorConfig: + ''' + A config builder for a input or a weight. + + InputVar's shape can be [-1, xxx], batch_size + ''' + + def __init__(self, + shape: [List[int]], + dtype: [str]="float32", + data: Optional[np.array]=None): + ''' + shape: The shape of the tensor. + dtype: The data type of the tensor. + data: The value of WeightVar. for input, it should be None + ''' + self.shape = shape + self.dtype = dtype + self.data = data + + +class OpConfig: + ''' A config builder for generating a Op. ''' + + def __init__(self, + type: str, + inputs: Dict[str, List[str]], + outputs: Dict[str, List[str]], + attrs: Dict[str, Any]): + self.type = type + self.inputs = inputs + self.outputs = outputs + self.attrs = attrs + + +class ProgramConfig: + ''' A config builder for generating a Program. ''' + + def __init__(self, + ops: List[OpConfig], + weights: Dict[str, TensorConfig], + inputs: Dict[str, TensorConfig], + outputs: List[str]): + self.ops = ops + self.weights = weights + self.inputs = inputs + self.outputs = outputs + + +def create_fake_model(program_config): + ''' Create a Paddle model(in memory) according to the given config. ''' + paddle.enable_static() + main_program_desc = core.ProgramDesc() + util_program = fluid.Program() + main_block_desc = main_program_desc.block(0) + + var_desc = main_block_desc.var(cpt.to_bytes("feed")) + var_desc.set_type(core.VarDesc.VarType.FEED_MINIBATCH) + var_desc.set_persistable(True) + + index = 0 + for name, tensor_config in program_config.inputs.items(): + var_desc = main_block_desc.var(cpt.to_bytes(name)) + var_desc.set_type(core.VarDesc.VarType.LOD_TENSOR) + var_desc.set_dtype(convert_np_dtype_to_dtype_(tensor_config.dtype)) + var_desc.set_shape(tensor_config.shape) + var_desc.set_need_check_feed(True) + op_desc = main_block_desc._prepend_op() + op_desc.set_type("feed") + op_desc.set_input('X', ["feed"]) + op_desc.set_output('Out', [name]) + op_desc._set_attr("col", index) + index = index + 1 + + save_var_map = {} + for name, tensor_config in program_config.weights.items(): + var_desc = main_block_desc.var(cpt.to_bytes(name)) + var_desc.set_type(core.VarDesc.VarType.LOD_TENSOR) + var_desc.set_dtype(convert_np_dtype_to_dtype_(tensor_config.dtype)) + var_desc.set_shape(tensor_config.shape) + var_desc.set_persistable(True) + + save_var_map[name] = util_program.global_block().create_parameter( + dtype=tensor_config.dtype, + shape=tensor_config.shape, + type=core.VarDesc.VarType.LOD_TENSOR, + name=name, + initializer=NumpyArrayInitializer(tensor_config.data)) + in_vars = [] + for name in sorted(save_var_map.keys()): + in_vars.append(save_var_map[name]) + + out_var = util_program.global_block().create_var( + type=core.VarDesc.VarType.RAW, name="out_var_0") + out_var.desc.set_persistable(True) + util_program.global_block().append_op( + type='save_combine', + inputs={'X': in_vars}, + outputs={'Y': out_var}, + attrs={'file_path': '', + 'save_to_memory': True}) + for op_config in program_config.ops: + op_desc = main_block_desc.append_op() + op_desc.set_type(op_config.type) + for name, values in op_config.inputs.items(): + op_desc.set_input(name, values) + for name, values in op_config.attrs.items(): + op_desc._set_attr(name, values) + for name, 
values in op_config.outputs.items(): + op_desc.set_output(name, values) + var_desc = main_block_desc.var(cpt.to_bytes(name)) + var_desc.set_type(core.VarDesc.VarType.LOD_TENSOR) + op_desc.infer_var_type(main_block_desc) + op_desc.infer_shape(main_block_desc) + + for index, name in enumerate(program_config.outputs): + var_desc = main_block_desc.var(cpt.to_bytes("fetch")) + var_desc.set_type(core.VarDesc.VarType.FETCH_LIST) + var_desc.set_need_check_feed(True) + op_desc = main_block_desc.append_op() + op_desc.set_type("fetch") + op_desc.set_input('X', [name]) + op_desc.set_output('Out', ["fetch"]) + op_desc._set_attr("col", index) + + main_program_desc._set_version() + paddle.fluid.core.save_op_version_info(main_program_desc) + + model = main_program_desc.serialize_to_string() + + util_program._sync_with_cpp() + place = fluid.CPUPlace() + executor = fluid.Executor(place) + scope = fluid.Scope() + with fluid.scope_guard(scope): + executor.run(util_program) + params = scope.find_var("out_var_0").get_bytes() + return model, params diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d.py new file mode 100644 index 00000000000000..98c3367b3f2c3b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d.py @@ -0,0 +1,67 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
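+# [Editorial note] `unittest` is used by the __main__ guard at the bottom of
+# this file but is not imported by the original change; import it here so the
+# test can be run standalone.
+import unittest
+# Note that every value in the "op_attrs" lists below is a set of candidates:
+# TrtLayerAutoScanTest expands them with itertools.product, so this one test
+# sweeps all paddings/strides combinations and every batch size in
+# batch_size_set.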
+ +from trt_layer_auto_scan_test import TrtLayerAutoScanTest +from program_config import TensorConfig +import numpy as np + + +class TrtConvertConv2dTest(TrtLayerAutoScanTest): + def setUp(self): + self.ops_config = [{ + "op_type": "conv2d", + "op_inputs": { + "Input": ["input_data"], + "Filter": ["conv2d_weight"] + }, + "op_outputs": { + "Output": ["conv_output_data"] + }, + "op_attrs": { + "data_format": ["NCHW"], + "dilations": [[1, 1]], + "padding_algorithm": ["EXPLICIT"], + "groups": [1], + "paddings": [[0, 3], [3, 1]], + "strides": [[1, 1], [2, 2]], + } + }, { + "op_type": "relu", + "op_inputs": { + "X": ["conv_output_data"] + }, + "op_outputs": { + "Out": ["relu_output_data"] + }, + "op_attrs": {} + }] + self.batch_size_set = [1, 2, 4] + + def update_program_input_and_weight_with_attr(self, op_attr_list): + weight = np.random.randn(24, 3, 3, 3).astype("float32") + filter = TensorConfig(shape=[24, 3, 3, 3], data=weight) + if op_attr_list[0]["data_format"] == "NCHW": + input_data = TensorConfig(shape=[-1, 3, 64, 64]) + else: + input_data = TensorConfig(shape=[-1, 64, 64, 3]) + self.program_weights = {"conv2d_weight": filter} + self.program_inputs = {"input_data": input_data} + self.program_outputs = ["relu_output_data"] + + def test_check_output(self): + self.run_test() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/trt_layer_auto_scan_test.py b/python/paddle/fluid/tests/unittests/ir/inference/trt_layer_auto_scan_test.py new file mode 100644 index 00000000000000..589916ad39003d --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/trt_layer_auto_scan_test.py @@ -0,0 +1,115 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import unittest +import itertools +import abc +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +import paddle.inference as paddle_infer + +from paddle import compat as cpt +from typing import * +from program_config import TensorConfig, OpConfig, ProgramConfig +from auto_scan_test import AutoScanTest + + +class TrtLayerAutoScanTest(AutoScanTest): + class TensorRTParam: + ''' + TensorRT subgraph engine parameters. 
+ ''' + + def __init__(self, workspace_size, max_batch_size, min_subgraph_size, + precision, use_static, use_calib_mode): + self.workspace_size = workspace_size + self.max_batch_size = max_batch_size + self.min_subgraph_size = min_subgraph_size + self.precision = precision + self.use_static = use_static + self.use_calib_mode = use_calib_mode + + def __init__(self, methodName='runTest'): + super(TrtLayerAutoScanTest, self).__init__(methodName) + self.trt_param = self.TensorRTParam( + workspace_size=0, + max_batch_size=4, + min_subgraph_size=0, + precision=paddle_infer.PrecisionType.Float32, + use_static=False, + use_calib_mode=False) + + def update_program_input_and_weight_with_attr(self, op_attr_list): + raise NotImplementedError + + @abc.abstractmethod + def sample_program_configs(self): + all_op_attrs_keys = [] + all_op_attrs_values = [] + for op_config in self.ops_config: + all_op_attrs_keys.append(list(op_config["op_attrs"].keys())) + all_op_attrs_values.extend(list(op_config["op_attrs"].values())) + if len(all_op_attrs_values) == 0: + all_op_attrs_values.append([None]) + for attrs_sample in itertools.product(*all_op_attrs_values): + op_attr_list = [] + index = 0 + ops = [] + for op_config in self.ops_config: + op_attr = dict( + zip( + list(op_config["op_attrs"].keys()), attrs_sample[ + index:index + len(op_config["op_attrs"])])) + op_attr_list.append(op_attr) + index = index + len(op_config["op_attrs"]) + ops.append( + OpConfig( + type=op_config["op_type"], + inputs=op_config["op_inputs"], + outputs=op_config["op_outputs"], + attrs=op_attr)) + + self.update_program_input_and_weight_with_attr(op_attr_list) + program_config = ProgramConfig( + ops=ops, + weights=self.program_weights, + inputs=self.program_inputs, + outputs=self.program_outputs) + yield program_config + + def create_program_config( + self, use_trt=True, + precision_mode=paddle_infer.PrecisionType.Float32): + config = paddle_infer.Config() + config.enable_use_gpu(100, 0) + if use_trt: + config.enable_tensorrt_engine( + max_batch_size=self.trt_param.max_batch_size, + workspace_size=self.trt_param.workspace_size, + min_subgraph_size=self.trt_param.min_subgraph_size, + precision_mode=precision_mode, + use_static=self.trt_param.use_static, + use_calib_mode=self.trt_param.use_calib_mode) + return config + + @abc.abstractmethod + def sample_predictor_configs(self): + yield self.create_program_config(use_trt=False) + yield self.create_program_config( + use_trt=True, precision_mode=self.trt_param.precision) + if self.trt_param.precision == paddle_infer.PrecisionType.Float32: + yield self.create_program_config( + use_trt=True, precision_mode=paddle_infer.PrecisionType.Half) From ca7f5208b8f3e58258136a154cb02ab3fdcfa4bf Mon Sep 17 00:00:00 2001 From: ceci3 Date: Thu, 19 Aug 2021 13:25:21 +0800 Subject: [PATCH 102/126] fix batch_norm and instance norm when input is [] (#34107) * fix batch_norm and instance norm when input is [] --- paddle/fluid/operators/batch_norm_op.cc | 10 ++++++++++ paddle/fluid/operators/instance_norm_op.cc | 7 +++++++ 2 files changed, 17 insertions(+) diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc index be17bf9a03fc19..4f22d28a450c1a 100644 --- a/paddle/fluid/operators/batch_norm_op.cc +++ b/paddle/fluid/operators/batch_norm_op.cc @@ -55,6 +55,16 @@ void BatchNormOp::InferShape(framework::InferShapeContext *ctx) const { "Variance and VarianceOut should share the same memory")); const auto x_dims = ctx->GetInputDim("X"); + + for (int i = 0; i < x_dims.size(); i++) 
{ + PADDLE_ENFORCE_EQ( + (x_dims[i] == -1) || (x_dims[i] > 0), true, + platform::errors::InvalidArgument( + "Each dimension of input tensor is expected to be -1 or a " + "positive number, but recieved %d. Input's shape is [%s].", + x_dims[i], x_dims)); + } + const DataLayout data_layout = framework::StringToDataLayout( ctx->Attrs().Get("data_layout")); diff --git a/paddle/fluid/operators/instance_norm_op.cc b/paddle/fluid/operators/instance_norm_op.cc index 28643ac1c0d832..0a850400686c49 100644 --- a/paddle/fluid/operators/instance_norm_op.cc +++ b/paddle/fluid/operators/instance_norm_op.cc @@ -32,6 +32,13 @@ void InstanceNormOp::InferShape(framework::InferShapeContext *ctx) const { "InstanceNorm"); const auto x_dims = ctx->GetInputDim("X"); + PADDLE_ENFORCE_NE(framework::product(x_dims), 0, + platform::errors::PreconditionNotMet( + "The Input variable X(%s) has not " + "been initialized. You may need to confirm " + "if you put exe.run(startup_program) " + "after optimizer.minimize function.", + ctx->Inputs("X").front())); PADDLE_ENFORCE_GE( x_dims.size(), 2, platform::errors::InvalidArgument( From a2e0865723219d008f1d0d231bff6b733944297c Mon Sep 17 00:00:00 2001 From: Yiqun Liu Date: Thu, 19 Aug 2021 16:57:50 +0800 Subject: [PATCH 103/126] Add dimension check for inverse to avoid dividing by 0 error when input's shape is [0, 0, 0]. (#34996) --- paddle/fluid/operators/inverse_op.cc | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/paddle/fluid/operators/inverse_op.cc b/paddle/fluid/operators/inverse_op.cc index e73e74ab856abb..8c1fd34ae87d2e 100644 --- a/paddle/fluid/operators/inverse_op.cc +++ b/paddle/fluid/operators/inverse_op.cc @@ -35,6 +35,14 @@ class InverseOp : public framework::OperatorWithKernel { "The dimension of Input(Input) is expected to be no less than 2. " "But recieved: Input(Input)'s dimension = %d, shape = [%s].", input_rank, input_dims)); + for (int64_t i = 0; i < input_rank; ++i) { + PADDLE_ENFORCE_EQ( + (input_dims[i] == -1) || (input_dims[i] > 0), true, + platform::errors::InvalidArgument( + "Each dimension of input tensor is expected to be -1 or a " + "positive number, but recieved %d. 
Input's shape is [%s].", + input_dims[i], input_dims)); + } if (input_dims[input_rank - 2] > 0 && input_dims[input_rank - 1] > 0) { PADDLE_ENFORCE_EQ(input_dims[input_rank - 2], input_dims[input_rank - 1], platform::errors::InvalidArgument( From 97cae5e878c1a6173509547b0fbbfb0e62b4433f Mon Sep 17 00:00:00 2001 From: Peihan Date: Thu, 19 Aug 2021 17:26:13 +0800 Subject: [PATCH 104/126] add resnet50_quant model in PR-CI-INFERENCE (#35012) * add slim resnet50 quant model in pr-ci-inference * enable resnet50_quant multi_thread4_trt_int8_bz1 * remove LOG(FATAL) --- paddle/fluid/inference/tests/infer_ut/run.sh | 18 ++ .../tests/infer_ut/test_resnet50_quant.cc | 170 ++++++++++++++++++ .../inference/tests/infer_ut/test_suite.h | 1 + 3 files changed, 189 insertions(+) create mode 100644 paddle/fluid/inference/tests/infer_ut/test_resnet50_quant.cc diff --git a/paddle/fluid/inference/tests/infer_ut/run.sh b/paddle/fluid/inference/tests/infer_ut/run.sh index c1694c76a7d2c8..627cd56f4830c9 100755 --- a/paddle/fluid/inference/tests/infer_ut/run.sh +++ b/paddle/fluid/inference/tests/infer_ut/run.sh @@ -95,6 +95,12 @@ for model_name in $det_download_list; do download $url_prefix $model_name done +unknown_download_list='resnet50_quant' +for model_name in $unknown_download_list; do + url_prefix="https://paddle-qa.bj.bcebos.com/inference_model/unknown" + download $url_prefix $model_name +done + function compile_test() { mkdir -p ${build_dir} cd ${build_dir} @@ -194,6 +200,18 @@ if [ $USE_TENSORRT == ON -a $TEST_GPU_CPU == ON ]; then EXIT_CODE=1 fi + printf "${YELLOW} start test_resnet50_quant ${NC} \n"; + compile_test "test_resnet50_quant" + ./test_resnet50_quant \ + --int8dir=$DATA_DIR/resnet50_quant/resnet50_quant/resnet50_quant \ + --modeldir=$DATA_DIR/resnet50/resnet50 \ + --datadir=$DATA_DIR/resnet50_quant/resnet50_quant/imagenet-eval-binary/9.data \ + --gtest_output=xml:test_resnet50_quant.xml + if [ $? -ne 0 ]; then + echo "test_resnet50_quant runs failed" >> ${current_dir}/build/test_summary.txt + EXIT_CODE=1 + fi + cp ./*.xml ${log_dir}; fi diff --git a/paddle/fluid/inference/tests/infer_ut/test_resnet50_quant.cc b/paddle/fluid/inference/tests/infer_ut/test_resnet50_quant.cc new file mode 100644 index 00000000000000..bf26f38c083fa2 --- /dev/null +++ b/paddle/fluid/inference/tests/infer_ut/test_resnet50_quant.cc @@ -0,0 +1,170 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
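+//
+// [Editorial note] Data layout assumed by PrepareInput() below: each *.data
+// file under --datadir is a raw binary record containing
+// batch_size * 3 * 224 * 224 float32 pixel values followed by one int
+// ground-truth label. The tests run the INT8 model through the TensorRT
+// engine, take the argmax over the 1000-class output and compare it with that
+// label, so a broken calibration fails the top-1 check outright instead of
+// merely drifting inside a tolerance.
+//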
+ +#include "test_suite.h" // NOLINT + +DEFINE_string(modeldir, "", "Directory of the inference model."); +DEFINE_string(int8dir, "", "Directory of the quant inference model."); +DEFINE_string(datadir, "", "Directory of the infer data."); + +namespace paddle_infer { + +paddle::test::Record PrepareInput(int batch_size) { + // init input data + int channel = 3; + int width = 224; + int height = 224; + paddle::test::Record image_Record; + int input_num = batch_size * channel * width * height; + + // load from binary data + std::ifstream fs(FLAGS_datadir, std::ifstream::binary); + EXPECT_TRUE(fs.is_open()); + CHECK(fs.is_open()); + + float* input = new float[input_num]; + memset(input, 0, input_num * sizeof(float)); + auto input_data_tmp = input; + for (int i = 0; i < input_num; ++i) { + fs.read(reinterpret_cast(input_data_tmp), sizeof(*input_data_tmp)); + input_data_tmp++; + } + int label = 0; + fs.read(reinterpret_cast(&label), sizeof(label)); + fs.close(); + + std::vector input_data{input, input + input_num}; + image_Record.data = input_data; + image_Record.shape = std::vector{batch_size, channel, width, height}; + image_Record.type = paddle::PaddleDType::FLOAT32; + image_Record.label = label; + return image_Record; +} + +TEST(test_resnet50_quant, multi_thread4_trt_int8_bz1) { + int thread_num = 4; + // init input data + std::map input_data_map; + input_data_map["image"] = PrepareInput(1); + // init output data + std::map infer_output_data; + // prepare inference config + paddle_infer::Config config; + config.SetModel(FLAGS_int8dir); + config.EnableUseGpu(1000, 0); + config.EnableTensorRtEngine(1 << 20, 10, 3, + paddle_infer::PrecisionType::kInt8, true, false); + // get infer results from multi threads + std::vector threads; + services::PredictorPool pred_pool(config, thread_num); + for (int i = 0; i < thread_num; ++i) { + threads.emplace_back(paddle::test::SingleThreadPrediction, + pred_pool.Retrive(i), &input_data_map, + &infer_output_data, 5); + } + + // thread join & check outputs + for (int i = 0; i < thread_num; ++i) { + LOG(INFO) << "join tid : " << i; + threads[i].join(); + + // check outputs + std::vector index(1000); + std::iota(index.begin(), index.end(), 0); + auto out_data = + infer_output_data["save_infer_model/scale_0.tmp_0"].data.data(); + std::sort(index.begin(), index.end(), [out_data](size_t i1, size_t i2) { + return out_data[i1] > out_data[i2]; + }); + // compare inference & groundtruth label + ASSERT_EQ(index[0], input_data_map["image"].label); + } + + std::cout << "finish test" << std::endl; +} + +TEST(test_resnet50_quant, multi_thread_multi_instance) { + int thread_num = 4; + // init input data + std::map input_data_fp32, input_data_quant; + input_data_quant["image"] = PrepareInput(1); + input_data_fp32["inputs"] = PrepareInput(1); + + // init output data + std::map infer_output_data; + // prepare inference config + paddle_infer::Config config_fp32, config_quant; + config_fp32.SetModel(FLAGS_modeldir + "/inference.pdmodel", + FLAGS_modeldir + "/inference.pdiparams"); + config_fp32.EnableUseGpu(1000, 0); + config_fp32.EnableTensorRtEngine( + 1 << 20, 10, 3, paddle_infer::PrecisionType::kFloat32, true, false); + + config_quant.SetModel(FLAGS_int8dir); + config_quant.EnableUseGpu(1000, 0); + config_quant.EnableTensorRtEngine( + 1 << 20, 10, 3, paddle_infer::PrecisionType::kInt8, true, false); + + // get infer results from multi threads + std::vector threads; + services::PredictorPool pred_pool_fp32(config_fp32, thread_num); + services::PredictorPool 
pred_pool_quant(config_quant, thread_num); + for (int i = 0; i < thread_num; ++i) { + if (i % 2 == 0) { + threads.emplace_back(paddle::test::SingleThreadPrediction, + pred_pool_fp32.Retrive(i), &input_data_fp32, + &infer_output_data, 5); + } else { + threads.emplace_back(paddle::test::SingleThreadPrediction, + pred_pool_quant.Retrive(i), &input_data_quant, + &infer_output_data, 5); + } + } + + // thread join & check outputs + for (int i = 0; i < thread_num; ++i) { + LOG(INFO) << "join tid : " << i; + std::vector index(1000); + threads[i].join(); + if (i % 2 == 0) { + // check outputs + std::iota(index.begin(), index.end(), 0); + auto out_data = + infer_output_data["save_infer_model/scale_0.tmp_0"].data.data(); + std::sort(index.begin(), index.end(), [out_data](size_t i1, size_t i2) { + return out_data[i1] > out_data[i2]; + }); + // compare inference & groundtruth label + ASSERT_EQ(index[0], input_data_fp32["inputs"].label); + } else { + // check outputs + std::iota(index.begin(), index.end(), 0); + auto out_data = + infer_output_data["save_infer_model/scale_0.tmp_0"].data.data(); + std::sort(index.begin(), index.end(), [out_data](size_t i1, size_t i2) { + return out_data[i1] > out_data[i2]; + }); + // compare inference & groundtruth label + ASSERT_EQ(index[0], input_data_quant["image"].label); + } + } +} + +} // namespace paddle_infer + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + ::google::ParseCommandLineFlags(&argc, &argv, true); + return RUN_ALL_TESTS(); +} diff --git a/paddle/fluid/inference/tests/infer_ut/test_suite.h b/paddle/fluid/inference/tests/infer_ut/test_suite.h index 2f1034d4df9a65..b2546b180b976a 100644 --- a/paddle/fluid/inference/tests/infer_ut/test_suite.h +++ b/paddle/fluid/inference/tests/infer_ut/test_suite.h @@ -35,6 +35,7 @@ class Record { std::vector data; std::vector shape; paddle::PaddleDType type; + int label; }; std::string read_file(std::string filename) { From ef024c89cca81d5544029c65b3eda80b0d95fbfb Mon Sep 17 00:00:00 2001 From: lilong12 Date: Thu, 19 Aug 2021 18:46:36 +0800 Subject: [PATCH 105/126] remove unused statements in test_dist_base.py (#35017) --- .../fluid/tests/unittests/test_dist_base.py | 30 ------------------- 1 file changed, 30 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index f5e67f2ddfaccd..eceb484a0184c9 100755 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -164,36 +164,6 @@ def run_pipeline_trainer(self, args): sys.stdout.buffer.write(pickle.dumps(out_losses)) - if args.save_model: - model_save_dir = "/tmp" - if fleet.worker_index() == 0: - model_save_dir_fluid = os.path.join(model_save_dir, - "fluid_persistables") - model_save_dir_fleet = os.path.join(model_save_dir, - "fleet_persistables") - infer_save_dir_fluid = os.path.join(model_save_dir, - "fluid_infer") - infer_save_dir_fleet = os.path.join(model_save_dir, - "fleet_infer") - else: - model_save_dir_fluid = os.path.join(model_save_dir, - "fluid_persistables_2") - model_save_dir_fleet = os.path.join(model_save_dir, - "fleet_persistables_2") - infer_save_dir_fluid = os.path.join(model_save_dir, - "fluid_infer_2") - infer_save_dir_fleet = os.path.join(model_save_dir, - "fleet_infer_2") - fluid.io.save_persistables(exe, model_save_dir_fluid, - fleet._origin_program) - fleet.save_persistables(executor=exe, dirname=model_save_dir_fleet) - feeded_var_names = [var.name for var in 
feed_var_list] - fluid.io.save_inference_model(infer_save_dir_fluid, - feeded_var_names, [avg_cost], exe, - fleet._origin_program) - fleet.save_inference_model(exe, infer_save_dir_fleet, - feeded_var_names, [avg_cost]) - def run_use_fleet_api_20_trainer(self, args): """ 1. remove codes for DistributedStrategy and leave the DistributedStrategy part to get_model() From ed9a14e49e1665dc2fd94110231fc4702439acd9 Mon Sep 17 00:00:00 2001 From: tianshuo78520a <707759223@qq.com> Date: Thu, 19 Aug 2021 18:47:26 +0800 Subject: [PATCH 106/126] Fix op-benchmark cpu/gpu; test=document_fix (#35027) --- tools/test_ci_op_benchmark.sh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tools/test_ci_op_benchmark.sh b/tools/test_ci_op_benchmark.sh index d217d7e50e9bc5..8dd5c9703c01af 100644 --- a/tools/test_ci_op_benchmark.sh +++ b/tools/test_ci_op_benchmark.sh @@ -262,6 +262,11 @@ function check_CHANGE_OP_MAP { LOG "[ERROR] Missing test script of \"${op_name}\"(${CHANGE_OP_MAP[$op_name]}) in benchmark." fi done + if [ $exit_code -ne 0 ]; then + LOG "[INFO] See https://github.com/PaddlePaddle/Paddle/wiki/PR-CI-OP-benchmark-Manual for details." + LOG "[INFO] Or you can apply for one RD (Avin0323(Recommend), Xreki, luotao1) approval to pass this PR." + exit $exit_code + fi } # diff benchmakr result and miss op @@ -274,11 +279,6 @@ function summary_problems { exit_code=$? fi check_CHANGE_OP_MAP - if [ $exit_code -ne 0 ]; then - LOG "[INFO] See https://github.com/PaddlePaddle/Paddle/wiki/PR-CI-OP-benchmark-Manual for details." - LOG "[INFO] Or you can apply for one RD (Avin0323(Recommend), Xreki, luotao1) approval to pass this PR." - exit $exit_code - fi } @@ -288,8 +288,8 @@ function cpu_op_benchmark { prepare_benchmark_environment load_CHANGE_OP_MAP load_BENCHMARK_OP_MAP - build_whl check_CHANGE_OP_MAP + build_whl LOG "[INFO] Op benchmark run success and no error!" 
exit 0 } From 866c1ea60b23b3aa2ebdb52d537bfadc56d03e4a Mon Sep 17 00:00:00 2001 From: parap1uie-s Date: Thu, 19 Aug 2021 19:10:59 +0800 Subject: [PATCH 107/126] fix reshape when is a number (#35016) --- python/paddle/fluid/layers/nn.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 6251bdf165a974..d0d15e92bfb635 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -6176,6 +6176,10 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=False, name=None): elif isinstance(shape, Variable): shape.stop_gradient = True out, _ = _C_ops.reshape2(x, shape) + else: + raise ValueError( + "shape must be an instance of `list`, `tuple` or `Variable`," + " got '{}.'".format(type(shape))) return dygraph_utils._append_activation_in_dygraph(out, act) From 4641e8fc5d8dba9b4b5ed6092d264d3a2024e4e0 Mon Sep 17 00:00:00 2001 From: JingZhuangzhuang <75348594+JZZ-NOTE@users.noreply.github.com> Date: Thu, 19 Aug 2021 19:26:49 +0800 Subject: [PATCH 108/126] [NPU] Support npu kernel for sin op (#34844) * add npu sin op * [NPU] Support npu kernel for sin op * modify support npu kernel for sin op * modify support npu kernel for sin op * modify nou sin op * modify npu sin op * add sin op npu --- paddle/fluid/operators/activation_op_npu.cc | 27 +++++++ .../tests/unittests/npu/test_sin_op_npu.py | 80 +++++++++++++++++++ 2 files changed, 107 insertions(+) create mode 100644 python/paddle/fluid/tests/unittests/npu/test_sin_op_npu.py diff --git a/paddle/fluid/operators/activation_op_npu.cc b/paddle/fluid/operators/activation_op_npu.cc index 5f2925784e4244..eb218507103dd6 100644 --- a/paddle/fluid/operators/activation_op_npu.cc +++ b/paddle/fluid/operators/activation_op_npu.cc @@ -811,6 +811,27 @@ class ExpGradNPUKernel : public framework::OpKernel { } }; +template +class SinNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + + auto* out = ctx.Output("Out"); + + auto place = ctx.GetPlace(); + + out->mutable_data(place); + + auto stream = + ctx.template device_context() + .stream(); + + const auto& runner = NpuOpRunner("Sin", {*x}, {*out}, {}); + runner.Run(stream); + } +}; + } // namespace operators } // namespace paddle @@ -975,3 +996,9 @@ REGISTER_OP_NPU_KERNEL( REGISTER_OP_NPU_KERNEL( exp_grad, ops::ExpGradNPUKernel, ops::ExpGradNPUKernel); + +REGISTER_OP_NPU_KERNEL( + sin, ops::SinNPUKernel, + ops::SinNPUKernel, + ops::SinNPUKernel); diff --git a/python/paddle/fluid/tests/unittests/npu/test_sin_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_sin_op_npu.py new file mode 100644 index 00000000000000..437f5c35e97022 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_sin_op_npu.py @@ -0,0 +1,80 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
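+#
+# [Editorial note] Minimal usage sketch for the NPU sin kernel registered in
+# activation_op_npu.cc above (assumes an Ascend NPU build of Paddle; mirrors
+# the dygraph case in this test):
+#
+#   import numpy as np
+#   import paddle
+#   import paddle.fluid as fluid
+#   with fluid.dygraph.guard(paddle.NPUPlace(0)):
+#       x = fluid.dygraph.to_variable(np.array([0.1], dtype="float32"))
+#       y = paddle.sin(x)     # dispatches to the "Sin" NPU kernel
+#       print(y.numpy())      # ~[0.0998]
+#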
+ +from __future__ import print_function +import unittest + +import numpy as np +from scipy.special import expit, erf + +from paddle.fluid.tests.unittests.op_test import OpTest, convert_float_to_uint16, skip_check_grad_ci +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid import compiler, Program, program_guard + +paddle.enable_static() + + +def test_class(op_type, typename): + class TestSin(OpTest): + def setUp(self): + self.op_type = "sin" + self.__class__.use_npu = True + self.place = paddle.NPUPlace(0) + self.__class__.no_need_check_grad = True + np.random.seed(1024) + x = np.random.uniform(-1, 1, [10, 12]).astype(typename) + out = np.sin(x) + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.outputs = {'Out': out} + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + pass + + def test_out_name(self): + with fluid.program_guard(fluid.Program()): + np_x = np.array([0.1]) + data = fluid.layers.data(name="X", shape=[1]) + out = eval("paddle.%s(data, name='Y')" % self.op_type) + place = fluid.NPUPlace(0) + exe = fluid.Executor(place) + result, = exe.run(feed={"X": np_x}, fetch_list=[out]) + expected = eval("np.%s(np_x)" % self.op_type) + self.assertEqual(result, expected) + + def test_dygraph(self): + with fluid.dygraph.guard(paddle.NPUPlace(0)): + np_x = np.array([0.1]) + x = fluid.dygraph.to_variable(np_x) + z = eval("paddle.%s(x).numpy()" % self.op_type) + z_expected = eval("np.%s(np_x)" % self.op_type) + self.assertEqual(z, z_expected) + + cls_name = "{0}_{1}_1".format(op_type, typename) + TestSin.__name__ = cls_name + globals()[cls_name] = TestSin + + +for _typename in {'float16', 'float32', 'float64'}: + test_class("sin", _typename) + +if __name__ == "__main__": + unittest.main() From 096b0f2e12ab2f58a67eb815f3a6bd0818015537 Mon Sep 17 00:00:00 2001 From: tianshuo78520a <707759223@qq.com> Date: Fri, 20 Aug 2021 10:39:38 +0800 Subject: [PATCH 109/126] Add op benchmark run function log (#35034) * Add run function log * test=document_fix --- tools/test_ci_op_benchmark.sh | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tools/test_ci_op_benchmark.sh b/tools/test_ci_op_benchmark.sh index 8dd5c9703c01af..b76013e224f087 100644 --- a/tools/test_ci_op_benchmark.sh +++ b/tools/test_ci_op_benchmark.sh @@ -36,6 +36,7 @@ function LOG { # Limit cu file directory function match_cu_file_directory { + LOG "[INFO] run function match_cu_file_directory" local sub_dir cu_file_dir cu_file_dir=$(dirname ${1}) for sub_dir in "" "/elementwise" "/reduce_ops" @@ -47,6 +48,7 @@ function match_cu_file_directory { # Load op files by header file function load_CHANGE_OP_FILES_by_header_file { + LOG "[INFO] run function load_CHANGE_OP_FILES_by_header_file" local change_file for change_file in $(grep -rl "${1}" paddle/fluid/operators) do @@ -68,6 +70,7 @@ function load_CHANGE_OP_FILES_by_header_file { # Load op files that PR changes function load_CHANGE_OP_FILES { + LOG "[INFO] run function load_CHANGE_OP_FILES" local sub_dir change_file # TODO(Avin0323): Need to filter the files added by the new OP. 
for change_file in $(git diff --name-only origin/develop) @@ -108,6 +111,7 @@ function prepare_benchmark_environment { # Load unique op name from CHANGE_OP_FILES function load_CHANGE_OP_MAP { + LOG "[INFO] run function load_CHANGE_OP_MAP" local op_name change_file change_file_name source benchmark/ci/scripts/op_benchmark.config for change_file in ${CHANGE_OP_FILES[@]} @@ -133,6 +137,7 @@ function load_CHANGE_OP_MAP { # Load ops that will run benchmark test function load_BENCHMARK_OP_MAP { + LOG "[INFO] run function load_BENCHMARK_OP_MAP" local line op_name api_name source benchmark/ci/scripts/op_benchmark.config for line in $(cat api_info.txt) @@ -173,6 +178,7 @@ function compile_install_paddlepaddle { } function build_whl { + LOG "[INFO] run function build_whl" for branch_name in "develop" "test" do git checkout ${branch_name} @@ -184,6 +190,7 @@ function build_whl { # run op benchmark test function run_op_benchmark_test { + LOG "[INFO] run function run_op_benchmark_test" [ ${#BENCHMARK_OP_MAP[*]} -eq 0 ] && return local logs_dir op_name branch_name api_info_file [ -z "$VISIBLE_DEVICES" ] && export VISIBLE_DEVICES=0 @@ -219,6 +226,7 @@ function run_op_benchmark_test { # check benchmark result function check_op_benchmark_result { + LOG "[INFO] run function check_op_benchmark_result" local logs_dir api_info_file check_status_code # default 3 times [ -z "${RETRY_TIMES}" ] && RETRY_TIMES=3 @@ -254,6 +262,7 @@ function check_op_benchmark_result { } function check_CHANGE_OP_MAP { + LOG "[INFO] run function check_CHANGE_OP_MAP" for op_name in ${!CHANGE_OP_MAP[@]} do if [ -z "${BENCHMARK_OP_MAP[$op_name]}" ] @@ -271,6 +280,7 @@ function check_CHANGE_OP_MAP { # diff benchmakr result and miss op function summary_problems { + LOG "[INFO] run function summary_problems" local op_name exit_code exit_code=0 if [ ${#BENCHMARK_OP_MAP[*]} -ne 0 ] @@ -297,6 +307,8 @@ function cpu_op_benchmark { function gpu_op_benchmark { LOG "[INFO] Start run op benchmark gpu test ..." + load_CHANGE_OP_FILES + load_CHANGE_OP_MAP load_BENCHMARK_OP_MAP run_op_benchmark_test summary_problems From 1aa2bde0e27fe7a1f1699e8aff0c6de5621844f2 Mon Sep 17 00:00:00 2001 From: shangliang Xu Date: Fri, 20 Aug 2021 11:21:25 +0800 Subject: [PATCH 110/126] [bug fix] fix spectral_norm bug (#35005) --- python/paddle/fluid/dygraph/nn.py | 6 ++++++ python/paddle/fluid/layers/nn.py | 4 ++++ 2 files changed, 10 insertions(+) diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py index 608e85acec3f27..d9a431990c10bd 100644 --- a/python/paddle/fluid/dygraph/nn.py +++ b/python/paddle/fluid/dygraph/nn.py @@ -3062,6 +3062,12 @@ def __init__(self, self._dtype = dtype self._weight_shape = list(weight_shape) + assert np.prod(self._weight_shape) > 0,\ + "Any dimension of `weight_shape` cannot be equal to 0." + assert dim < len(self._weight_shape), \ + ("The input `dim` should be less than the " + "length of `weight_shape`, but received dim=" + "{}".format(dim)) h = self._weight_shape[self._dim] w = np.prod(self._weight_shape) // h diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index d0d15e92bfb635..bd7ecfeee65e9e 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -3720,6 +3720,10 @@ def spectral_norm(weight, dim=0, power_iters=1, eps=1e-12, name=None): # create intput and parameters inputs = {'Weight': weight} input_shape = weight.shape + assert weight.numel() > 0, "Any dimension of input cannot be equal to 0." 
+ assert dim < len(input_shape), ("The input `dim` should be less than the " + "rank of `weight`, but received dim=" + "{}".format(dim)) h = input_shape[dim] w = np.prod(input_shape) // h From 46371515fb1c3a4085484852783ddde46b629ef6 Mon Sep 17 00:00:00 2001 From: JYChen Date: Fri, 20 Aug 2021 11:41:37 +0800 Subject: [PATCH 111/126] add (N,C,*) input support for GroupNorm (#34773) * add (N,C,*) input support for GroupNorm * --amend --- paddle/fluid/operators/group_norm_op.cc | 7 ++ paddle/fluid/operators/group_norm_op.cu | 25 ++++-- paddle/fluid/operators/group_norm_op.h | 25 ++++-- .../tests/unittests/test_group_norm_op_v2.py | 78 +++++++++++++++---- python/paddle/nn/layer/norm.py | 4 +- 5 files changed, 112 insertions(+), 27 deletions(-) diff --git a/paddle/fluid/operators/group_norm_op.cc b/paddle/fluid/operators/group_norm_op.cc index 978170b296b421..e076444626e6af 100644 --- a/paddle/fluid/operators/group_norm_op.cc +++ b/paddle/fluid/operators/group_norm_op.cc @@ -37,6 +37,13 @@ class GroupNormOp : public framework::OperatorWithKernel { "GroupNorm"); auto x_dim = ctx->GetInputDim("X"); + PADDLE_ENFORCE_GE( + x_dim.size(), 2, + platform::errors::InvalidArgument( + "The Input(X)'s dimension of Op(group_norm) must be " + "greater than 1. But received: %u-D Tensor, which shape is [%s].", + x_dim.size(), x_dim)); + const std::string data_layout_str = ctx->Attrs().Get("data_layout"); const framework::DataLayout data_layout = diff --git a/paddle/fluid/operators/group_norm_op.cu b/paddle/fluid/operators/group_norm_op.cu index 18a248f55314f7..f199bfeb9443b6 100644 --- a/paddle/fluid/operators/group_norm_op.cu +++ b/paddle/fluid/operators/group_norm_op.cu @@ -171,9 +171,16 @@ class GroupNormKernel const T* bias_data = nullptr; if (bias) bias_data = bias->data(); - int imsize = (data_layout == DataLayout::kNCHW ? x_dims[2] * x_dims[3] - : x_dims[1] * x_dims[2]); - + int imsize = 1; + if (data_layout == DataLayout::kNCHW) { + for (int i = 2; i < x_dims.size(); ++i) { + imsize *= x_dims[i]; + } + } else { + for (int i = 1; i < x_dims.size() - 1; ++i) { + imsize *= x_dims[i]; + } + } #ifdef __HIPCC__ int block_size = std::max(std::min(256, imsize), 64); #else @@ -349,8 +356,16 @@ class GroupNormGradKernel const T* bias_data = nullptr; if (bias) bias_data = bias->data(); - int imsize = (data_layout == DataLayout::kNCHW ? x_dims[2] * x_dims[3] - : x_dims[1] * x_dims[2]); + int imsize = 1; + if (data_layout == DataLayout::kNCHW) { + for (int i = 2; i < x_dims.size(); ++i) { + imsize *= x_dims[i]; + } + } else { + for (int i = 1; i < x_dims.size() - 1; ++i) { + imsize *= x_dims[i]; + } + } #ifdef __HIPCC__ int block_size = std::max(std::min(256, imsize), 64); diff --git a/paddle/fluid/operators/group_norm_op.h b/paddle/fluid/operators/group_norm_op.h index 2f0edd0451a3b7..f2388699e266f5 100644 --- a/paddle/fluid/operators/group_norm_op.h +++ b/paddle/fluid/operators/group_norm_op.h @@ -68,9 +68,16 @@ class GroupNormKernel : public framework::OpKernel { const T* bias_data = nullptr; if (bias) bias_data = bias->data(); - int imsize = (data_layout == DataLayout::kNCHW ? 
x_dims[2] * x_dims[3] - : x_dims[1] * x_dims[2]); - + int imsize = 1; + if (data_layout == DataLayout::kNCHW) { + for (int i = 2; i < x_dims.size(); ++i) { + imsize *= x_dims[i]; + } + } else { + for (int i = 1; i < x_dims.size() - 1; ++i) { + imsize *= x_dims[i]; + } + } auto* iter_x_data = x_data; auto* iter_y_data = y_data; for (int bid = 0; bid < x_dims[0]; bid++) { @@ -257,8 +264,16 @@ class GroupNormGradKernel : public framework::OpKernel { const T* bias_data = nullptr; if (bias) bias_data = bias->data(); - int imsize = (data_layout == DataLayout::kNCHW ? x_dims[2] * x_dims[3] - : x_dims[1] * x_dims[2]); + int imsize = 1; + if (data_layout == DataLayout::kNCHW) { + for (int i = 2; i < x_dims.size(); ++i) { + imsize *= x_dims[i]; + } + } else { + for (int i = 1; i < x_dims.size() - 1; ++i) { + imsize *= x_dims[i]; + } + } auto* iter_x_data = x_data; auto* iter_d_x_data = d_x_data; auto* iter_y_data = y_data; diff --git a/python/paddle/fluid/tests/unittests/test_group_norm_op_v2.py b/python/paddle/fluid/tests/unittests/test_group_norm_op_v2.py index 0e13ca17562025..fbdf4a1cfd1ac0 100644 --- a/python/paddle/fluid/tests/unittests/test_group_norm_op_v2.py +++ b/python/paddle/fluid/tests/unittests/test_group_norm_op_v2.py @@ -25,13 +25,29 @@ import paddle +def group_norm_naive_for_general_dimension(x, scale, bias, epsilon, groups): + # original version group norm only support 4-D tensor + # this function generalizes to support differnt dimensions tensor (>= 2-D) + input_shape = x.shape + N, C = x.shape[0], x.shape[1] + G = groups + x = x.reshape((N * G, -1)) + mean = np.mean(x, axis=1, keepdims=True) + var = np.var(x, axis=1, keepdims=True) + output = (x - mean) / np.sqrt(var + epsilon) + output = output.reshape(input_shape) * scale.reshape( + (-1, 1, 1)) + bias.reshape((-1, 1, 1)) + return output + + class TestDygraphGroupNormv2(unittest.TestCase): def test_dygraph(self): places = [fluid.CPUPlace()] if core.is_compiled_with_cuda() and core.op_support_gpu("group_norm"): places.append(fluid.CUDAPlace(0)) + shapes = [[2, 2, 2, 2], [2, 2, 4], [4, 2], [4, 2, 6, 6, 2], + [2, 2, 2, 2, 2, 2]] for p in places: - shape = [2, 2, 2, 2] def compute_v1(x): with fluid.dygraph.guard(p): @@ -62,23 +78,26 @@ def attr_data_format(): self.assertRaises(ValueError, attr_data_format) - x = np.random.randn(*shape).astype("float32") - y1 = compute_v1(x) - y2 = compute_v2(x) - result = np.allclose(y1, y2, atol=1e-5) - if not result: - print("y1:", y1, "\ty2:", y2) - self.assertTrue(result) - test_weight_bias_false() - test_nn_exception() + for shape in shapes: + x = np.random.randn(*shape).astype("float32") + y1 = compute_v1(x) + y2 = compute_v2(x) + result = np.allclose(y1, y2, atol=1e-5) + if not result: + print("y1:", y1, "\ty2:", y2) + self.assertTrue(result) + test_weight_bias_false() + test_nn_exception() def test_static(self): + paddle.enable_static() places = [fluid.CPUPlace()] if core.is_compiled_with_cuda() and core.op_support_gpu("group_norm"): places.append(fluid.CUDAPlace(0)) + shapes = [[2, 6, 2, 2], [2, 6, 4], [4, 6], [4, 6, 6, 6, 2], + [4, 6, 2, 2, 2, 2]] for p in places: exe = fluid.Executor(p) - shape = [2, 6, 2, 2] def compute_v1(x_np): with program_guard(Program(), Program()): @@ -98,10 +117,39 @@ def compute_v2(x_np): r = exe.run(feed={'x': x_np}, fetch_list=[y])[0] return r - x = np.random.randn(*shape).astype("float32") - y1 = compute_v1(x) - y2 = compute_v2(x) - self.assertTrue(np.allclose(y1, y2, atol=1e-5)) + for shape in shapes: + x = np.random.randn(*shape).astype("float32") + y1 = 
compute_v1(x) + y2 = compute_v2(x) + self.assertTrue(np.allclose(y1, y2, atol=1e-5)) + + +class TestGroupNormAPIV2_With_General_Dimensions(unittest.TestCase): + def test_numerical_accuracy(self): + paddle.disable_static() + shapes = [(2, 6), (2, 6, 4), (2, 6, 4, 4), (2, 6, 6, 6, 2), (2, 6, 6, 6, + 2, 3)] + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda() and core.op_support_gpu("group_norm"): + places.append(fluid.CUDAPlace(0)) + + for place in places: + for shape in shapes: + scale = np.array([1]).astype("float32") + bias = np.array([0]).astype("float32") + data = np.random.random(shape).astype("float32") + expect_res1 = group_norm_naive_for_general_dimension( + data, scale, bias, epsilon=1e-5, groups=6) + expect_res2 = group_norm_naive_for_general_dimension( + data, scale, bias, epsilon=1e-5, groups=2) + + gn1 = paddle.nn.GroupNorm(num_channels=6, num_groups=6) + gn2 = paddle.nn.GroupNorm(num_channels=6, num_groups=2) + data_pd = paddle.to_tensor(data) + result1 = gn1(data_pd).numpy() + result2 = gn2(data_pd).numpy() + self.assertTrue(np.allclose(result1, expect_res1, atol=1e-5)) + self.assertTrue(np.allclose(result2, expect_res2, atol=1e-5)) if __name__ == '__main__': diff --git a/python/paddle/nn/layer/norm.py b/python/paddle/nn/layer/norm.py index 41599809810ee7..147e7fca3ff19d 100644 --- a/python/paddle/nn/layer/norm.py +++ b/python/paddle/nn/layer/norm.py @@ -338,8 +338,8 @@ class GroupNorm(Layer): name(str, optional): Name for the GroupNorm, default is None. For more information, please refer to :ref:`api_guide_Name`.. Shape: - - x: 4-D tensor with shape: (batch, num_features, height, weight). - - output: 4-D tensor with same shape as input x. + - x: Tensor with shape: (batch, num_features, *). + - output: The same shape as input x. 
Returns: None From f927b65317ff0673eaa650bb77705ea5b4f8b2b9 Mon Sep 17 00:00:00 2001 From: Peihan Date: Fri, 20 Aug 2021 11:48:51 +0800 Subject: [PATCH 112/126] temporary disable resnet50-quant multi-thread test (#35035) --- paddle/fluid/inference/tests/infer_ut/test_resnet50_quant.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/inference/tests/infer_ut/test_resnet50_quant.cc b/paddle/fluid/inference/tests/infer_ut/test_resnet50_quant.cc index bf26f38c083fa2..64fe189de6ecc4 100644 --- a/paddle/fluid/inference/tests/infer_ut/test_resnet50_quant.cc +++ b/paddle/fluid/inference/tests/infer_ut/test_resnet50_quant.cc @@ -52,7 +52,7 @@ paddle::test::Record PrepareInput(int batch_size) { return image_Record; } -TEST(test_resnet50_quant, multi_thread4_trt_int8_bz1) { +TEST(DISABLED_test_resnet50_quant, multi_thread4_trt_int8_bz1) { int thread_num = 4; // init input data std::map input_data_map; @@ -94,7 +94,7 @@ TEST(test_resnet50_quant, multi_thread4_trt_int8_bz1) { std::cout << "finish test" << std::endl; } -TEST(test_resnet50_quant, multi_thread_multi_instance) { +TEST(DISABLED_test_resnet50_quant, multi_thread_multi_instance) { int thread_num = 4; // init input data std::map input_data_fp32, input_data_quant; From d082955e1c464d2a8a1912965b6ba823258e0fc0 Mon Sep 17 00:00:00 2001 From: zhaoyingli <86812880+lili0826@users.noreply.github.com> Date: Fri, 20 Aug 2021 13:30:07 +0800 Subject: [PATCH 113/126] [NPU] Support npu op where and where grad (#34587) * [NPU] Support npu op where and where grad * fix use const_cast * delete a test --- paddle/fluid/operators/where_op_npu.cc | 96 ++++++++++ .../tests/unittests/npu/test_where_op_npu.py | 165 ++++++++++++++++++ 2 files changed, 261 insertions(+) create mode 100755 paddle/fluid/operators/where_op_npu.cc create mode 100755 python/paddle/fluid/tests/unittests/npu/test_where_op_npu.py diff --git a/paddle/fluid/operators/where_op_npu.cc b/paddle/fluid/operators/where_op_npu.cc new file mode 100755 index 00000000000000..6b7f5b1dd5be85 --- /dev/null +++ b/paddle/fluid/operators/where_op_npu.cc @@ -0,0 +1,96 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
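+
+// Overview of the two kernels below, which lower paddle.where onto the CANN
+// "Select" and "ZerosLike" operators (element-wise, numpy.where semantics):
+//   forward : Out = Condition ? X : Y
+//   backward: dX  = Condition ? dOut : 0
+//             dY  = Condition ? 0    : dOut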
+ +#include "paddle/fluid/operators/where_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +template +class WhereNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* condition = ctx.Input("Condition"); + auto* X = ctx.Input("X"); + auto* Y = ctx.Input("Y"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + const auto& runner = + NpuOpRunner("Select", {*condition, *X, *Y}, {*out}, {}); + + auto stream = + ctx.template device_context() + .stream(); + runner.Run(stream); + } +}; + +template +class WhereGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* condition = ctx.Input("Condition"); + auto* dout_t = ctx.Input(framework::GradVarName("Out")); + auto* dx_t = ctx.Output(framework::GradVarName("X")); + auto* dy_t = ctx.Output(framework::GradVarName("Y")); + + if (dx_t != nullptr) { + dx_t->mutable_data(ctx.GetPlace()); + } + if (dy_t != nullptr) { + dy_t->mutable_data(ctx.GetPlace()); + } + + auto stream = + ctx.template device_context() + .stream(); + + framework::Tensor tensor_zeros(dout_t->type()); + tensor_zeros.mutable_data(dout_t->dims(), ctx.GetPlace()); + const auto& runner = + NpuOpRunner("ZerosLike", {*dout_t}, {tensor_zeros}, {}); + runner.Run(stream); + + if (dx_t != nullptr) { + const auto& runner = NpuOpRunner( + "Select", {*condition, *dout_t, tensor_zeros}, {*dx_t}, {}); + runner.Run(stream); + } + if (dy_t != nullptr) { + const auto& runner = NpuOpRunner( + "Select", {*condition, tensor_zeros, *dout_t}, {*dy_t}, {}); + runner.Run(stream); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL( + where, ops::WhereNPUKernel, + ops::WhereNPUKernel, + ops::WhereNPUKernel, + ops::WhereNPUKernel); + +REGISTER_OP_NPU_KERNEL( + where_grad, + ops::WhereGradNPUKernel, + ops::WhereGradNPUKernel, + ops::WhereGradNPUKernel, + ops::WhereGradNPUKernel); diff --git a/python/paddle/fluid/tests/unittests/npu/test_where_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_where_op_npu.py new file mode 100755 index 00000000000000..cf877ff2872afa --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_where_op_npu.py @@ -0,0 +1,165 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
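+
+# NumPy reference semantics these tests compare against (see the cases below):
+#   out = np.where(cond, x, y)
+# and, for the gradients, the masked pass-through of dout:
+#   dx = np.where(cond, dout, 0)
+#   dy = np.where(~cond, dout, 0)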
+ +from __future__ import print_function, division + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid +from paddle.fluid import Program +from paddle.fluid.backward import append_backward + +paddle.enable_static() + + +class TestNPUWhereOp(OpTest): + def setUp(self): + self.op_type = "where" + self.set_npu() + self.init_config() + self.inputs = {'Condition': self.cond, 'X': self.x, 'Y': self.y} + self.outputs = {'Out': np.where(self.cond, self.x, self.y)} + + def init_config(self): + self.x = np.random.uniform(-3, 5, (100)).astype("float64") + self.y = np.random.uniform(-3, 5, (100)).astype("float64") + self.cond = np.zeros((100)).astype("bool") + + def set_npu(self): + self.__class__.use_npu = True + self.place = paddle.NPUPlace(0) + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad_normal(self): + self.check_grad_with_place(self.place, ['X', 'Y'], 'Out') + + +class TestNPUWhereOp2(TestNPUWhereOp): + def init_config(self): + self.x = np.random.uniform(-5, 5, (60, 2)).astype("float64") + self.y = np.random.uniform(-5, 5, (60, 2)).astype("float64") + self.cond = np.ones((60, 2)).astype("bool") + + +class TestNPUWhereOp3(TestNPUWhereOp): + def init_config(self): + self.x = np.random.uniform(-3, 5, (20, 2, 4)).astype("float64") + self.y = np.random.uniform(-3, 5, (20, 2, 4)).astype("float64") + self.cond = np.array(np.random.randint(2, size=(20, 2, 4)), dtype=bool) + + +class TestNPUWhereAPI(unittest.TestCase): + def setUp(self): + self.__class__.use_npu = True + self.place = paddle.NPUPlace(0) + self.init_data() + + def init_data(self): + self.shape = [10, 15] + self.cond = np.array(np.random.randint(2, size=self.shape), dtype=bool) + self.x = np.random.uniform(-2, 3, self.shape).astype(np.float32) + self.y = np.random.uniform(-2, 3, self.shape).astype(np.float32) + self.out = np.where(self.cond, self.x, self.y) + + def ref_x_backward(self, dout): + return np.where(self.cond == True, dout, 0) + + def ref_y_backward(self, dout): + return np.where(self.cond == False, dout, 0) + + def test_api(self): + for x_stop_gradient in [False, True]: + for y_stop_gradient in [False, True]: + train_prog = fluid.Program() + startup = fluid.Program() + with fluid.program_guard(train_prog, startup): + cond = fluid.data( + name='cond', shape=self.shape, dtype='bool') + x = fluid.data(name='x', shape=self.shape, dtype='float32') + y = fluid.data(name='y', shape=self.shape, dtype='float32') + + x.stop_gradient = x_stop_gradient + y.stop_gradient = y_stop_gradient + + result = paddle.where(cond, x, y) + append_backward(fluid.layers.mean(result)) + + exe = fluid.Executor(self.place) + exe.run(startup) + + fetch_list = [result, result.grad_name] + if x_stop_gradient is False: + fetch_list.append(x.grad_name) + if y_stop_gradient is False: + fetch_list.append(y.grad_name) + out = exe.run( + train_prog, + feed={'cond': self.cond, + 'x': self.x, + 'y': self.y}, + fetch_list=fetch_list) + assert np.array_equal(out[0], self.out) + + if x_stop_gradient is False: + assert np.array_equal(out[2], + self.ref_x_backward(out[1])) + if y.stop_gradient is False: + assert np.array_equal(out[3], + self.ref_y_backward(out[1])) + elif y.stop_gradient is False: + assert np.array_equal(out[2], + self.ref_y_backward(out[1])) + + def test_api_broadcast(self, use_cuda=False): + train_prog = fluid.Program() + startup = fluid.Program() + with fluid.program_guard(train_prog, startup): + x = 
fluid.layers.data(name='x', shape=[4, 1], dtype='float32') + y = fluid.layers.data(name='y', shape=[4, 2], dtype='float32') + x_i = np.array([[0.9383, 0.1983, 3.2, 1.2]]).astype("float32") + y_i = np.array([[1.0, 1.0, 1.0, 1.0], + [1.0, 1.0, 1.0, 1.0]]).astype("float32") + result = paddle.where(x > 1, x=x, y=y) + + exe = fluid.Executor(self.place) + exe.run(startup) + + out = exe.run(train_prog, + feed={'x': x_i, + 'y': y_i}, + fetch_list=[result]) + assert np.array_equal(out[0], np.where(x_i > 1, x_i, y_i)) + + +class TestWhereDygraphAPI(unittest.TestCase): + def test_api(self): + with fluid.dygraph.guard(paddle.NPUPlace(0)): + x_i = np.array([0.9383, 0.1983, 3.2, 1.2]).astype("float64") + y_i = np.array([1.0, 1.0, 1.0, 1.0]).astype("float64") + cond_i = np.array([False, False, True, True]).astype("bool") + x = fluid.dygraph.to_variable(x_i) + y = fluid.dygraph.to_variable(y_i) + cond = fluid.dygraph.to_variable(cond_i) + out = paddle.where(cond, x, y) + assert np.array_equal(out.numpy(), np.where(cond_i, x_i, y_i)) + + +if __name__ == '__main__': + unittest.main() From 4c115a82feed0a44c840f73c736953d4ab93823d Mon Sep 17 00:00:00 2001 From: zhaoyingli <86812880+lili0826@users.noreply.github.com> Date: Fri, 20 Aug 2021 14:12:00 +0800 Subject: [PATCH 114/126] [NPU] Support npu op depthwise_conv2d (#34853) * add depthwise_conv2d npu * add some tests * Delete test_unique_op_npu.py * delete trans input --- paddle/fluid/operators/conv_op.cc | 9 +- paddle/fluid/operators/conv_op_npu.cc | 137 +++++++++ .../npu/test_conv2d_op_depthwise_conv_npu.py | 283 ++++++++++++++++++ 3 files changed, 426 insertions(+), 3 deletions(-) create mode 100644 paddle/fluid/operators/conv_op_npu.cc create mode 100755 python/paddle/fluid/tests/unittests/npu/test_conv2d_op_depthwise_conv_npu.py diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index 1266cfe6081acf..9defe3262ff4ce 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -194,11 +194,14 @@ framework::OpKernelType ConvOp::GetExpectedKernelType( paddle::framework::DataTypeToString(input_data_type), paddle::framework::DataTypeToString(filter_data_type))); } +#ifndef PADDLE_WITH_ASCEND_CL if (input_data_type == framework::proto::VarType::FP16) { - PADDLE_ENFORCE_EQ(library, framework::LibraryType::kCUDNN, - platform::errors::InvalidArgument( - "float16 can only be used when CUDNN is used")); + PADDLE_ENFORCE_EQ( + library, framework::LibraryType::kCUDNN, + platform::errors::InvalidArgument( + "float16 can only be used when CUDNN or NPU is used")); } +#endif #if PADDLE_WITH_CUDA if (input_data_type == framework::proto::VarType::BF16 && library == framework::LibraryType::kCUDNN) { diff --git a/paddle/fluid/operators/conv_op_npu.cc b/paddle/fluid/operators/conv_op_npu.cc new file mode 100644 index 00000000000000..4065394effa47b --- /dev/null +++ b/paddle/fluid/operators/conv_op_npu.cc @@ -0,0 +1,137 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/conv_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class DepthwiseConvNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + // input + const Tensor* input = context.Input("Input"); + const Tensor* filter = context.Input("Filter"); + // output + Tensor* output = context.Output("Output"); + output->mutable_data(context.GetPlace()); + // attr + const std::vector stride = context.Attr>("strides"); + std::vector padding = context.Attr>("paddings"); + std::vector dilation = context.Attr>("dilations"); + const std::string data_format = context.Attr("data_format"); + const std::string padding_algorithm = + context.Attr("padding_algorithm"); + + // npu stream + auto stream = + context.template device_context().stream(); + + // check dimension + const bool channel_last = data_format == "NHWC"; + if (channel_last) { + // NHWC + PADDLE_ENFORCE_EQ( + output->dims()[output->dims().size() - 1], + input->dims()[input->dims().size() - 1], + platform::errors::InvalidArgument( + "ShapeError: The output channels must be equal to the " + "input channels. But receivced output channel number is %d " + "and input channel number is %d", + output->dims()[output->dims().size() - 1], + input->dims()[input->dims().size() - 1])); + } else { + // NCHW + PADDLE_ENFORCE_EQ( + output->dims()[1], input->dims()[1], + platform::errors::InvalidArgument( + "ShapeError: The output channels must be equal to the " + "input channels. But receivced output channel number is %d " + "and input channel number is %d", + output->dims()[1], input->dims()[1])); + } + + // update padding and dilation + auto in_dims = input->dims(); + auto filter_dims = filter->dims(); + framework::DDim in_data_dims; + framework::DDim filter_data_dims; + + if (channel_last) { + in_data_dims = framework::slice_ddim(in_dims, 1, in_dims.size() - 1); + } else { + in_data_dims = framework::slice_ddim(in_dims, 2, in_dims.size()); + } + filter_data_dims = framework::slice_ddim(filter_dims, 2, in_dims.size()); + + std::vector ksize = framework::vectorize(filter_data_dims); + UpdatePaddingAndDilation(&padding, &dilation, padding_algorithm, + in_data_dims, stride, ksize); + + // Transform filter (n, 1, h, w) --> (1, n, h, w) + Tensor transformed_filter(filter->type()); + transformed_filter.mutable_data({filter->dims()[1], filter->dims()[0], + filter->dims()[2], filter->dims()[3]}, + context.device_context().GetPlace()); + std::vector perm = {1, 0, 2, 3}; + const auto& runner_trans = NpuOpRunner( + "TransposeD", {*filter}, {transformed_filter}, {{"perm", perm}}); + runner_trans.Run(stream); + + // construct NPU attr + std::vector strides(4, 1); + std::vector dilations(4, 1); + + Tensor input_tensor, output_tensor; + input_tensor.ShareDataWith(*input); + output_tensor.ShareDataWith(*output); + + if (channel_last) { + input_tensor.set_layout(DataLayout::kNHWC); + output_tensor.set_layout(DataLayout::kNHWC); + strides[1] = stride[0]; + strides[2] = stride[1]; + dilations[1] = dilation[0]; + dilations[2] = dilation[1]; + } else { + strides[2] = stride[0]; + strides[3] = stride[1]; + dilations[2] = dilation[0]; + dilations[3] = dilation[1]; + } + + // CANN OP + const auto& runner = + NpuOpRunner("DepthwiseConv2D", {input_tensor, transformed_filter}, + 
{output_tensor}, {{"strides", strides}, + {"dilations", dilations}, + {"pads", padding}, + {"data_format", data_format}}); + runner.Run(stream); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL( + depthwise_conv2d, + ops::DepthwiseConvNPUKernel); diff --git a/python/paddle/fluid/tests/unittests/npu/test_conv2d_op_depthwise_conv_npu.py b/python/paddle/fluid/tests/unittests/npu/test_conv2d_op_depthwise_conv_npu.py new file mode 100755 index 00000000000000..b62ad1b8b8e552 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_conv2d_op_depthwise_conv_npu.py @@ -0,0 +1,283 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import print_function + +import unittest +import numpy as np + +import paddle +import paddle.fluid as fluid +import sys +sys.path.append("..") +from op_test import OpTest, skip_check_grad_ci +from test_conv2d_op import conv2d_forward_naive + +paddle.enable_static() + + +def create_test_channel_last_class(parent): + class TestChannelLastCase(parent): + def init_data_format(self): + self.data_format = "NHWC" + + def init_test_case_2(self): + N, C, H, W = self.input_size + self.input_size = [N, H, W, C] + + cls_name = "{0}_{1}".format(parent.__name__, "ChannelLast") + TestChannelLastCase.__name__ = cls_name + globals()[cls_name] = TestChannelLastCase + + +def create_test_padding_SAME_class(parent): + class TestPaddingSMAECase(parent): + def init_paddings(self): + self.pad = [0, 0] + self.padding_algorithm = "SAME" + + cls_name = "{0}_{1}".format(parent.__name__, "PaddingSAMEOp") + TestPaddingSMAECase.__name__ = cls_name + globals()[cls_name] = TestPaddingSMAECase + + +def create_test_padding_VALID_class(parent): + class TestPaddingVALIDCase(parent): + def init_paddings(self): + self.pad = [1, 1] + self.padding_algorithm = "VALID" + + cls_name = "{0}_{1}".format(parent.__name__, "PaddingVALIDOp") + TestPaddingVALIDCase.__name__ = cls_name + globals()[cls_name] = TestPaddingVALIDCase + + +@skip_check_grad_ci( + reason='''Inference only, it doesn't need to call check_grad.''') +class TestDepthwiseConvNPU(OpTest): + def setUp(self): + self.op_type = "depthwise_conv2d" + self.dtype = np.float16 + self.set_npu() + self.init_data_format() + self.init_test_case() + self.init_test_case_2() + + conv2d_param = { + 'stride': self.stride, + 'pad': self.pad, + 'dilation': self.dilations + } + + input = np.random.random(self.input_size).astype(self.dtype) + filter = np.random.uniform(-1, 1, self.filter_size).astype(self.dtype) + + output, _, _, _, _ = conv2d_forward_naive(input, filter, self.groups, + conv2d_param, "EXPLICIT", + self.data_format) + + output = output.astype(self.dtype) + + self.inputs = { + 'Input': OpTest.np_dtype_to_fluid_dtype(input), + 'Filter': OpTest.np_dtype_to_fluid_dtype(filter) + } + + self.attrs = { + 'strides': self.stride, + 'paddings': self.pad, + 'groups': self.groups, + 'dilations': self.dilations, + 
'data_format': self.data_format, + } + self.outputs = {'Output': output} + + def set_npu(self): + self.__class__.use_npu = True + self.place = paddle.NPUPlace(0) + + def init_test_case(self): + self.pad = [1, 1] + self.dilations = [1, 1] + self.stride = [2, 2] + self.input_size = [2, 3, 5, 5] # NCHW + self.groups = 3 + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [3, f_c, 3, 3] + + def test_check_output(self): + self.check_output_with_place(self.place) + + def init_data_format(self): + self.data_format = "NCHW" + + def init_test_case_2(self): + pass + + +class TestDepthwiseConvNPU2(TestDepthwiseConvNPU): + def init_test_case(self): + self.pad = [1, 1] + self.dilations = [1, 1] + self.stride = [1, 1] + self.input_size = [2, 3, 5, 5] # NCHW + self.groups = 3 + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [3, f_c, 3, 3] + + +class TestDepthwiseConvNPU3(TestDepthwiseConvNPU): + def init_test_case(self): + self.pad = [1, 1] + self.dilations = [2, 2] + self.stride = [2, 2] + self.input_size = [2, 3, 5, 5] # NCHW + self.groups = 3 + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [3, f_c, 3, 3] + + +class TestDepthwiseConvNPU4(TestDepthwiseConvNPU): + def init_test_case(self): + self.pad = [1, 1] + self.dilations = [2, 2] + self.stride = [1, 1] + self.input_size = [2, 3, 5, 5] # NCHW + self.groups = 3 + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [3, f_c, 3, 3] + + +@skip_check_grad_ci( + reason='''Inference only, it doesn't need to call check_grad.''') +class TestDepthwiseConvNPU_Padding(OpTest): + def setUp(self): + self.op_type = "depthwise_conv2d" + self.dtype = np.float16 + self.set_npu() + self.init_data_format() + self.init_paddings() + self.init_test_case() + self.init_test_case_2() + + conv2d_param = { + 'stride': self.stride, + 'pad': self.pad, + 'dilation': self.dilations + } + + input = np.random.random(self.input_size).astype(self.dtype) + filter = np.random.uniform(-1, 1, self.filter_size).astype(self.dtype) + + output, _, _, _, _ = conv2d_forward_naive( + input, filter, self.groups, conv2d_param, self.padding_algorithm, + self.data_format) + output = output.astype(self.dtype) + + self.inputs = { + 'Input': OpTest.np_dtype_to_fluid_dtype(input), + 'Filter': OpTest.np_dtype_to_fluid_dtype(filter) + } + + self.attrs = { + 'strides': self.stride, + 'paddings': self.pad, + 'padding_algorithm': self.padding_algorithm, + 'groups': self.groups, + 'dilations': self.dilations, + 'data_format': self.data_format + } + self.outputs = {'Output': output} + + def set_npu(self): + self.__class__.use_npu = True + self.place = paddle.NPUPlace(0) + + def init_test_case(self): + self.pad = [1, 1, 0, 1] + self.dilations = [1, 1] + self.stride = [2, 2] + self.input_size = [2, 3, 5, 5] # NCHW + self.groups = 3 + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [3, f_c, 3, 3] + + def test_check_output(self): + self.check_output_with_place(self.place) + + def init_data_format(self): + self.data_format = "NCHW" + + def init_paddings(self): + self.pad = [1, 1, 0, 1] + self.padding_algorithm = "EXPLICIT" + + def init_test_case_2(self): + pass + + +class TestDepthwiseConvNPU2_Padding(TestDepthwiseConvNPU_Padding): + def init_test_case(self): + self.pad = [1, 1, 0, 1] + 
self.dilations = [1, 1] + self.stride = [1, 1] + self.input_size = [2, 3, 5, 5] # NCHW + self.groups = 3 + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [3, f_c, 3, 3] + + def init_paddings(self): + self.pad = [0, 1, 0, 2] + self.padding_algorithm = "EXPLICIT" + + +class TestDepthwiseConvNPU3_Padding(TestDepthwiseConvNPU_Padding): + def init_test_case(self): + self.pad = [1, 1, 0, 1] + self.dilations = [1, 1] + self.stride = [2, 2] + self.input_size = [2, 3, 5, 5] # NCHW + self.groups = 3 + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [3, f_c, 3, 3] + + def init_paddings(self): + self.pad = [2, 1, 2, 3] + self.padding_algorithm = "EXPLICIT" + + +# test channel last +create_test_channel_last_class(TestDepthwiseConvNPU) +create_test_channel_last_class(TestDepthwiseConvNPU2) +create_test_channel_last_class(TestDepthwiseConvNPU_Padding) +create_test_channel_last_class(TestDepthwiseConvNPU2_Padding) + +# test padding SAME +create_test_padding_SAME_class(TestDepthwiseConvNPU_Padding) +create_test_padding_SAME_class(TestDepthwiseConvNPU2_Padding) +create_test_padding_SAME_class(TestDepthwiseConvNPU3_Padding) + +# test padding VALID +create_test_padding_VALID_class(TestDepthwiseConvNPU_Padding) +create_test_padding_VALID_class(TestDepthwiseConvNPU2_Padding) +create_test_padding_VALID_class(TestDepthwiseConvNPU3_Padding) + +if __name__ == '__main__': + unittest.main() From 4416c793af53d199e055c44750bac4f34f43c1e6 Mon Sep 17 00:00:00 2001 From: wangguanqun Date: Fri, 20 Aug 2021 15:56:22 +0800 Subject: [PATCH 115/126] fix set_lod in data_feed (#35000) * add trainer desc config to distributed strategy * code style modified * data_feed set lod --- paddle/fluid/framework/data_feed.cc | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/framework/data_feed.cc b/paddle/fluid/framework/data_feed.cc index cc4609a740f474..87afda459624f9 100644 --- a/paddle/fluid/framework/data_feed.cc +++ b/paddle/fluid/framework/data_feed.cc @@ -847,8 +847,10 @@ void MultiSlotDataFeed::PutToFeedVec( total_instance * sizeof(int64_t)); } - LoD data_lod{offset}; - feed_vec_[i]->set_lod(data_lod); + if (!use_slots_is_dense_[i]) { + LoD data_lod{offset}; + feed_vec_[i]->set_lod(data_lod); + } if (use_slots_is_dense_[i]) { if (inductive_shape_index_[i] != -1) { use_slots_shape_[i][inductive_shape_index_[i]] = @@ -1206,8 +1208,10 @@ void MultiSlotInMemoryDataFeed::PutToFeedVec( } auto& slot_offset = offset_[i]; if (this->input_type_ == 0) { - LoD data_lod{slot_offset}; - feed_vec_[i]->set_lod(data_lod); + if (!use_slots_is_dense_[i]) { + LoD data_lod{slot_offset}; + feed_vec_[i]->set_lod(data_lod); + } } else if (this->input_type_ == 1) { if (!use_slots_is_dense_[i]) { std::vector tmp_offset; From 6bacfb0e002e458fe42334cff2ffa751fea301bf Mon Sep 17 00:00:00 2001 From: wanghuancoder Date: Fri, 20 Aug 2021 16:19:21 +0800 Subject: [PATCH 116/126] use spin lock in auto growth allocator (#34910) * use spin lock in auto growth allocator, test=develop * use pthread spin lock, test=develop * use lock guard, test=develop * use malloc spin lock, test=develop * use lock_guard, test=develop --- .../auto_growth_best_fit_allocator.cc | 5 +- .../auto_growth_best_fit_allocator.h | 3 +- paddle/fluid/memory/allocation/spin_lock.h | 36 +++ paddle/fluid/memory/allocation/spin_lock_c.h | 239 ++++++++++++++++++ 4 files changed, 280 insertions(+), 3 deletions(-) create mode 
100644 paddle/fluid/memory/allocation/spin_lock.h create mode 100644 paddle/fluid/memory/allocation/spin_lock_c.h diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc index d29a33b47018e2..9936db36be8988 100644 --- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc @@ -45,7 +45,7 @@ AutoGrowthBestFitAllocator::AutoGrowthBestFitAllocator( Allocation *AutoGrowthBestFitAllocator::AllocateImpl(size_t size) { size = AlignedSize(size, alignment_); - std::lock_guard guard(mtx_); + std::lock_guard guard(spinlock_); auto iter = free_blocks_.lower_bound(std::make_pair(size, nullptr)); BlockIt block_it; if (iter != free_blocks_.end()) { @@ -94,11 +94,12 @@ Allocation *AutoGrowthBestFitAllocator::AllocateImpl(size_t size) { VLOG(2) << "Not found and reallocate " << realloc_size << ", and remaining " << remaining_size; } + return new BlockAllocation(block_it); } void AutoGrowthBestFitAllocator::FreeImpl(Allocation *allocation) { - std::lock_guard guard(mtx_); + std::lock_guard guard(spinlock_); auto block_it = static_cast(allocation)->block_it_; auto &blocks = block_it->chunk_->blocks_; diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h index eb52cab2594df0..5ed6eb94f158fe 100644 --- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h +++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h @@ -21,6 +21,7 @@ #include #include "paddle/fluid/memory/allocation/allocator.h" +#include "paddle/fluid/memory/allocation/spin_lock.h" namespace paddle { namespace memory { @@ -86,7 +87,7 @@ class AutoGrowthBestFitAllocator : public Allocator { size_t alignment_; size_t chunk_size_; - mutable std::mutex mtx_; + SpinLock spinlock_; }; } // namespace allocation diff --git a/paddle/fluid/memory/allocation/spin_lock.h b/paddle/fluid/memory/allocation/spin_lock.h new file mode 100644 index 00000000000000..b26515ba3913a4 --- /dev/null +++ b/paddle/fluid/memory/allocation/spin_lock.h @@ -0,0 +1,36 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/memory/allocation/spin_lock_c.h" + +namespace paddle { +namespace memory { + +class SpinLock { + public: + SpinLock() { INITIAL_LOCK(&mlock_); } + + void lock() { ACQUIRE_LOCK(&mlock_); } + + void unlock() { RELEASE_LOCK(&mlock_); } + DISABLE_COPY_AND_ASSIGN(SpinLock); + + private: + MLOCK_T mlock_; +}; + +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/spin_lock_c.h b/paddle/fluid/memory/allocation/spin_lock_c.h new file mode 100644 index 00000000000000..397431899b4d3c --- /dev/null +++ b/paddle/fluid/memory/allocation/spin_lock_c.h @@ -0,0 +1,239 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/* + the spinlock implementation is borrowed from Doug Lea's malloc, released to the + public domain, as explained at + http://creativecommons.org/licenses/publicdomain. Send questions, + comments, complaints, performance data, etc to dl@cs.oswego.edu +*/ + +#pragma once + +#ifndef WIN32 +#include +#if defined(__SVR4) && defined(__sun) /* solaris */ +#include +#endif /* solaris */ +#else +#ifndef _M_AMD64 +/* These are already defined on AMD64 builds */ +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ +LONG __cdecl _InterlockedCompareExchange(LONG volatile* Dest, LONG Exchange, + LONG Comp); +LONG __cdecl _InterlockedExchange(LONG volatile* Target, LONG Value); +#ifdef __cplusplus +} +#endif /* __cplusplus */ +#endif /* _M_AMD64 */ +#pragma intrinsic(_InterlockedCompareExchange) +#pragma intrinsic(_InterlockedExchange) +#define interlockedcompareexchange _InterlockedCompareExchange +#define interlockedexchange _InterlockedExchange +#endif /* Win32 */ + +#ifndef FORCEINLINE +#if defined(__GNUC__) +#define FORCEINLINE __inline __attribute__((always_inline)) +#elif defined(_MSC_VER) +#define FORCEINLINE __forceinline +#endif +#endif +#ifndef NOINLINE +#if defined(__GNUC__) +#define NOINLINE __attribute__((noinline)) +#elif defined(_MSC_VER) +#define NOINLINE __declspec(noinline) +#else +#define NOINLINE +#endif +#endif + +#ifdef __cplusplus +extern "C" { +#ifndef FORCEINLINE +#define FORCEINLINE inline +#endif +#endif /* __cplusplus */ +#ifndef FORCEINLINE +#define FORCEINLINE +#endif +#ifdef __cplusplus +}; /* end of extern "C" */ +#endif /* __cplusplus */ + +#ifndef WIN32 + +/* Custom pthread-style spin locks on x86 and x64 for gcc */ +struct pthread_mlock_t { + volatile unsigned int l; + unsigned int c; + pthread_t threadid; +}; +#define MLOCK_T struct pthread_mlock_t +#define CURRENT_THREAD pthread_self() +#define INITIAL_LOCK(sl) ((sl)->threadid = 0, (sl)->l = (sl)->c = 0, 0) +#define ACQUIRE_LOCK(sl) pthread_acquire_lock(sl) +#define RELEASE_LOCK(sl) pthread_release_lock(sl) +#define TRY_LOCK(sl) pthread_try_lock(sl) +#define SPINS_PER_YIELD 63 + +// static MLOCK_T malloc_global_mutex = { 0, 0, 0}; + +static FORCEINLINE int pthread_acquire_lock(MLOCK_T* sl) { + int spins = 0; + volatile unsigned int* lp = &sl->l; + for (;;) { + if (*lp != 0) { + if (sl->threadid == CURRENT_THREAD) { + ++sl->c; + return 0; + } + } else { + /* place args to cmpxchgl in locals to evade oddities in some gccs */ + int cmp = 0; + int val = 1; + int ret; + __asm__ __volatile__("lock; cmpxchgl %1, %2" + : "=a"(ret) + : "r"(val), "m"(*(lp)), "0"(cmp) + : "memory", "cc"); + if (!ret) { + assert(!sl->threadid); + sl->threadid = CURRENT_THREAD; + sl->c = 1; + return 0; + } + } + if ((++spins & SPINS_PER_YIELD) == 0) { +#if defined(__SVR4) && defined(__sun) /* solaris */ + thr_yield(); +#else +#if defined(__linux__) || defined(__FreeBSD__) || defined(__APPLE__) + sched_yield(); +#else /* no-op yield on unknown systems */ + ; // NOLINT +#endif /* 
__linux__ || __FreeBSD__ || __APPLE__ */ +#endif /* solaris */ + } + } +} + +static FORCEINLINE void pthread_release_lock(MLOCK_T* sl) { + volatile unsigned int* lp = &sl->l; + assert(*lp != 0); + assert(sl->threadid == CURRENT_THREAD); + if (--sl->c == 0) { + sl->threadid = 0; + int prev = 0; + int ret; + __asm__ __volatile__("lock; xchgl %0, %1" + : "=r"(ret) + : "m"(*(lp)), "0"(prev) + : "memory"); + } +} + +static FORCEINLINE int pthread_try_lock(MLOCK_T* sl) { + volatile unsigned int* lp = &sl->l; + if (*lp != 0) { + if (sl->threadid == CURRENT_THREAD) { + ++sl->c; + return 1; + } + } else { + int cmp = 0; + int val = 1; + int ret; + __asm__ __volatile__("lock; cmpxchgl %1, %2" + : "=a"(ret) + : "r"(val), "m"(*(lp)), "0"(cmp) + : "memory", "cc"); + if (!ret) { + assert(!sl->threadid); + sl->threadid = CURRENT_THREAD; + sl->c = 1; + return 1; + } + } + return 0; +} + +#else /* WIN32 */ +/* Custom win32-style spin locks on x86 and x64 for MSC */ +struct win32_mlock_t { + volatile long l; // NOLINT + unsigned int c; + long threadid; // NOLINT +}; + +#define MLOCK_T struct win32_mlock_t +#define CURRENT_THREAD GetCurrentThreadId() +#define INITIAL_LOCK(sl) ((sl)->threadid = 0, (sl)->l = (sl)->c = 0, 0) +#define ACQUIRE_LOCK(sl) win32_acquire_lock(sl) +#define RELEASE_LOCK(sl) win32_release_lock(sl) +#define TRY_LOCK(sl) win32_try_lock(sl) +#define SPINS_PER_YIELD 63 + +// static MLOCK_T malloc_global_mutex = { 0, 0, 0}; + +static FORCEINLINE int win32_acquire_lock(MLOCK_T *sl) { + int spins = 0; + for (;;) { + if (sl->l != 0) { + if (sl->threadid == CURRENT_THREAD) { + ++sl->c; + return 0; + } + } else { + if (!interlockedexchange(&sl->l, 1)) { + assert(!sl->threadid); + sl->threadid = CURRENT_THREAD; + sl->c = 1; + return 0; + } + } + if ((++spins & SPINS_PER_YIELD) == 0) SleepEx(0, FALSE); + } +} + +static FORCEINLINE void win32_release_lock(MLOCK_T *sl) { + assert(sl->threadid == CURRENT_THREAD); + assert(sl->l != 0); + if (--sl->c == 0) { + sl->threadid = 0; + interlockedexchange(&sl->l, 0); + } +} + +static FORCEINLINE int win32_try_lock(MLOCK_T *sl) { + if (sl->l != 0) { + if (sl->threadid == CURRENT_THREAD) { + ++sl->c; + return 1; + } + } else { + if (!interlockedexchange(&sl->l, 1)) { + assert(!sl->threadid); + sl->threadid = CURRENT_THREAD; + sl->c = 1; + return 1; + } + } + return 0; +} + +#endif /* WIN32 */ From ef517a56d1d4d241b36e884e467e60bc5860ef24 Mon Sep 17 00:00:00 2001 From: Sing_chan <51314274+betterpig@users.noreply.github.com> Date: Fri, 20 Aug 2021 16:51:56 +0800 Subject: [PATCH 117/126] [NPU] Support npu kernel for pad3d op (#34815) * [NPU] Support npu kernel for pad3d op * fix for comment of zhouwei25 * fix some bugs according to qili93's comments * add support and test for paddings in input * delete VLOG used for debug --- paddle/fluid/operators/pad3d_op_npu.cc | 139 +++++ .../tests/unittests/npu/test_pad3d_op_npu.py | 535 ++++++++++++++++++ 2 files changed, 674 insertions(+) create mode 100644 paddle/fluid/operators/pad3d_op_npu.cc create mode 100644 python/paddle/fluid/tests/unittests/npu/test_pad3d_op_npu.py diff --git a/paddle/fluid/operators/pad3d_op_npu.cc b/paddle/fluid/operators/pad3d_op_npu.cc new file mode 100644 index 00000000000000..3a1fba94550032 --- /dev/null +++ b/paddle/fluid/operators/pad3d_op_npu.cc @@ -0,0 +1,139 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the Licnse. */ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +static inline std::vector GetPaddings( + const framework::ExecutionContext& context) { + std::vector paddings(6); + auto* paddings_t = context.Input("Paddings"); + if (paddings_t) { + TensorToVector(*paddings_t, context.device_context(), &paddings); + } else { + auto pads = context.Attr>("paddings"); + std::copy(pads.begin(), pads.end(), paddings.data()); + } + return paddings; +} + +template +class Pad3dNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* x = context.Input("X"); + auto in_dims = x->dims(); + + std::vector pads = GetPaddings(context); + auto mode = context.Attr("mode"); + float value = context.Attr("value"); + auto data_format = context.Attr("data_format"); + + auto* out = context.Output("Out"); + + PADDLE_ENFORCE_LT(abs(value), 1e-5, + platform::errors::Unimplemented( + "Ascend npu only support constant_values=0 right now," + "but received constant_value is %f .", + value)); + + PADDLE_ENFORCE_EQ(mode, "constant", + platform::errors::Unimplemented( + "Ascend npu only support mode=constant right now," + "but received mode is %s .", + mode)); + + std::vector paddings( + {0, 0, 0, 0, pads[4], pads[5], pads[2], pads[3], pads[0], pads[1]}); + if (data_format == "NCDHW") { + out->Resize({in_dims[0], in_dims[1], in_dims[2] + pads[4] + pads[5], + in_dims[3] + pads[2] + pads[3], + in_dims[4] + pads[0] + pads[1]}); + } else { + out->Resize({in_dims[0], in_dims[1] + pads[4] + pads[5], + in_dims[2] + pads[2] + pads[3], + in_dims[3] + pads[0] + pads[1], in_dims[4]}); + paddings = {0, 0, pads[4], pads[5], pads[2], + pads[3], pads[0], pads[1], 0, 0}; + } + out->mutable_data(context.GetPlace()); + + NpuOpRunner runner; + runner.SetType("PadV3") + .AddInput(*x) + .AddInput(std::move(paddings)) + .AddInput( + std::vector({0})) // npu only support constant_value=0 now + .AddOutput(*out) + .AddAttr("mode", mode); + + auto stream = + context.template device_context() + .stream(); + runner.Run(stream); + } +}; + +template +class Pad3dGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + std::vector pads = GetPaddings(context); + auto mode = context.Attr("mode"); + auto data_format = context.Attr("data_format"); + + auto* d_out = context.Input(framework::GradVarName("Out")); + auto* d_in = context.Output(framework::GradVarName("X")); + auto d_in_dims = d_in->dims(); + d_in->mutable_data(context.GetPlace()); + + const int pad_left = pads[0]; + const int pad_top = pads[2]; + const int pad_front = pads[4]; + + auto stream = + context.template device_context() + .stream(); + + std::vector size( + {d_in_dims[0], d_in_dims[1], d_in_dims[2], d_in_dims[3], d_in_dims[4]}); + if (mode == "constant") { // this method can be only used for constant mode + std::vector offsets({0, 0, pad_front, pad_top, pad_left}); + if 
(data_format == "NDHWC") { + offsets = {0, pad_front, pad_top, pad_left, 0}; + } + const auto& runner = NpuOpRunner("SliceD", {*d_out}, {*d_in}, + {{"offsets", offsets}, {"size", size}}); + runner.Run(stream); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(pad3d, ops::Pad3dNPUKernel, + ops::Pad3dNPUKernel, ops::Pad3dNPUKernel); + +REGISTER_OP_NPU_KERNEL(pad3d_grad, ops::Pad3dNPUKernel, + ops::Pad3dGradNPUKernel); diff --git a/python/paddle/fluid/tests/unittests/npu/test_pad3d_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_pad3d_op_npu.py new file mode 100644 index 00000000000000..234ceb2f0b7ec3 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_pad3d_op_npu.py @@ -0,0 +1,535 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np +import sys +sys.path.append("..") +import op_test +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +import paddle.fluid.core as core +from paddle.fluid import Program, program_guard, Executor, default_main_program +import paddle.fluid as fluid + + +class TestPad3dNPUOp(op_test.OpTest): + def setUp(self): + paddle.enable_static() + self.__class__.use_npu = True + self.op_type = "pad3d" + self.place = paddle.NPUPlace(0) + + self.x_type = "float32" + self.mode = "constant" + self.variable_paddings = False + self.initTestCase() + + self.value = 0 #Asend npu only support constant_values = 0 right now. 
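+        # `paddings` follows the paddle.nn.functional.pad order
+        # [left, right, top, bottom, front, back]; the np.pad reference
+        # below regroups it per spatial dim as (front, back), (top, bottom),
+        # (left, right) to build the expected output.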
+ self.inputs = {'X': np.random.random(self.shape).astype(self.x_type)} + self.attrs = {} + if self.variable_paddings: + self.attrs['paddings'] = [] + self.inputs['Paddings'] = np.array(self.paddings).flatten().astype( + "int32") + else: + self.attrs['paddings'] = np.array(self.paddings).flatten().astype( + "int32") + self.attrs['value'] = self.value + self.attrs['mode'] = self.mode + self.attrs['data_format'] = self.data_format + if self.data_format == "NCDHW": + paddings = [ + (0, 0), + (0, 0), + (self.paddings[4], self.paddings[5]), + (self.paddings[2], self.paddings[3]), + (self.paddings[0], self.paddings[1]), + ] + else: + paddings = [ + (0, 0), + (self.paddings[4], self.paddings[5]), + (self.paddings[2], self.paddings[3]), + (self.paddings[0], self.paddings[1]), + (0, 0), + ] + + out = np.pad(self.inputs['X'], + paddings, + mode=self.mode, + constant_values=self.value) + + self.outputs = {'Out': out} + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ['X'], 'Out') + + def initTestCase(self): + self.shape = (2, 3, 4, 5, 6) + self.paddings = [0, 0, 0, 0, 0, 0] + self.data_format = "NCDHW" + + +class TestCase1(TestPad3dNPUOp): + def initTestCase(self): + self.shape = (3, 4, 5, 6, 7) + self.paddings = [0, 1, 2, 3, 4, 5] + self.data_format = "NCDHW" + self.x_type = "float16" + + def test_check_grad(self): + self.__class__.no_need_check_grad = True + pass + + +class TestCase2(TestPad3dNPUOp): + def initTestCase(self): + self.shape = (4, 5, 6, 7, 8) + self.paddings = [1, 1, 1, 1, 1, 1] + self.data_format = "NDHWC" + self.variable_paddings = True + + +class TestPadAPI(unittest.TestCase): + def _get_numpy_out(self, + input_data, + pad, + mode, + value=0, + data_format="NCDHW"): + if mode == "constant" and len(pad) == len(input_data.shape) * 2: + pad = np.reshape(pad, (-1, 2)).tolist() + elif data_format == "NCDHW": + pad = [ + (0, 0), + (0, 0), + (pad[4], pad[5]), + (pad[2], pad[3]), + (pad[0], pad[1]), + ] + elif data_format == "NDHWC": + pad = [ + (0, 0), + (pad[4], pad[5]), + (pad[2], pad[3]), + (pad[0], pad[1]), + (0, 0), + ] + elif data_format == "NCHW": + pad = [ + (0, 0), + (0, 0), + (pad[2], pad[3]), + (pad[0], pad[1]), + ] + elif data_format == "NHWC": + pad = [ + (0, 0), + (pad[2], pad[3]), + (pad[0], pad[1]), + (0, 0), + ] + elif data_format == "NCL": + pad = [ + (0, 0), + (0, 0), + (pad[0], pad[1]), + ] + elif data_format == "NLC": + pad = [ + (0, 0), + (pad[0], pad[1]), + (0, 0), + ] + + out = np.pad(input_data, pad, mode=mode, constant_values=value) + return out + + def test_static(self): + paddle.enable_static() + self.place = fluid.NPUPlace(0) if fluid.core.is_compiled_with_npu( + ) else fluid.CPUPlace() + with program_guard(Program(), Program()): + input_shape = (1, 2, 3, 4, 5) + pad = [1, 2, 1, 1, 3, 4] + mode = "constant" + value = 0 + input_data = np.random.rand(*input_shape).astype(np.float32) + x = paddle.fluid.data(name="x", shape=input_shape) + result1 = F.pad(x=x, + pad=pad, + value=value, + mode=mode, + data_format="NCDHW") + result2 = F.pad(x=x, + pad=pad, + value=value, + mode=mode, + data_format="NDHWC") + exe = Executor(self.place) + fetches = exe.run(default_main_program(), + feed={"x": input_data}, + fetch_list=[result1, result2]) + + np_out1 = self._get_numpy_out( + input_data, pad, mode, value, data_format="NCDHW") + np_out2 = self._get_numpy_out( + input_data, pad, mode, value, data_format="NDHWC") + self.assertTrue(np.allclose(fetches[0], np_out1)) + 
self.assertTrue(np.allclose(fetches[1], np_out2)) + + def test_dygraph_1(self): + paddle.disable_static() + paddle.device.set_device("npu") + input_shape = (1, 2, 3, 4, 5) + pad = [1, 2, 1, 1, 3, 4] + + mode = "constant" + value = 0 + input_data = np.random.rand(*input_shape).astype(np.float32) + tensor_data = paddle.to_tensor(input_data) + + np_out1 = self._get_numpy_out( + input_data, pad, mode, value, data_format="NCDHW") + np_out2 = self._get_numpy_out( + input_data, pad, mode, value, data_format="NDHWC") + + y1 = F.pad(tensor_data, + pad=pad, + mode=mode, + value=value, + data_format="NCDHW") + y2 = F.pad(tensor_data, + pad=pad, + mode=mode, + value=value, + data_format="NDHWC") + + self.assertTrue(np.allclose(y1.numpy(), np_out1)) + self.assertTrue(np.allclose(y2.numpy(), np_out2)) + + def test_dygraph_2(self): + paddle.disable_static() + paddle.device.set_device("npu") + input_shape = (2, 3, 4, 5) + pad = [1, 1, 3, 4] + + mode = "constant" + value = 0 + input_data = np.random.rand(*input_shape).astype(np.float32) + tensor_data = paddle.to_tensor(input_data) + + np_out1 = self._get_numpy_out( + input_data, pad, mode, value, data_format="NCHW") + np_out2 = self._get_numpy_out( + input_data, pad, mode, value, data_format="NHWC") + + y1 = F.pad(tensor_data, + pad=pad, + mode=mode, + value=value, + data_format="NCHW") + y2 = F.pad(tensor_data, + pad=pad, + mode=mode, + value=value, + data_format="NHWC") + + self.assertTrue(np.allclose(y1.numpy(), np_out1)) + self.assertTrue(np.allclose(y2.numpy(), np_out2)) + + def test_dygraph_3(self): + paddle.disable_static() + paddle.device.set_device("npu") + input_shape = (3, 4, 5) + pad = [3, 4] + + mode = "constant" + value = 0 + input_data = np.random.rand(*input_shape).astype(np.float32) + tensor_data = paddle.to_tensor(input_data) + + np_out1 = self._get_numpy_out( + input_data, pad, mode, value, data_format="NCL") + np_out2 = self._get_numpy_out( + input_data, pad, mode, value, data_format="NLC") + + y1 = F.pad(tensor_data, + pad=pad, + mode=mode, + value=value, + data_format="NCL") + y2 = F.pad(tensor_data, + pad=pad, + mode=mode, + value=value, + data_format="NLC") + + self.assertTrue(np.allclose(y1.numpy(), np_out1)) + self.assertTrue(np.allclose(y2.numpy(), np_out2)) + + +class TestPad1dAPI(unittest.TestCase): + def _get_numpy_out(self, + input_data, + pad, + mode, + value=0.0, + data_format="NCL"): + if data_format == "NCL": + pad = [ + (0, 0), + (0, 0), + (pad[0], pad[1]), + ] + else: + pad = [ + (0, 0), + (pad[0], pad[1]), + (0, 0), + ] + + out = np.pad(input_data, pad, mode=mode, constant_values=value) + return out + + def test_class(self): + paddle.disable_static() + paddle.device.set_device("npu") + input_shape = (3, 4, 5) + pad = [1, 2] + pad_int = 1 + value = 0 + input_data = np.random.rand(*input_shape).astype(np.float32) + + pad_constant = nn.Pad1D(padding=pad, mode="constant", value=value) + pad_constant_int = nn.Pad1D( + padding=pad_int, mode="constant", value=value) + + data = paddle.to_tensor(input_data) + + output = pad_constant(data) + np_out = self._get_numpy_out( + input_data, pad, "constant", value=value, data_format="NCL") + self.assertTrue(np.allclose(output.numpy(), np_out)) + + output = pad_constant_int(data) + np_out = self._get_numpy_out( + input_data, [pad_int] * 2, + "constant", + value=value, + data_format="NCL") + self.assertTrue(np.allclose(output.numpy(), np_out)) + + +class TestPad2dAPI(unittest.TestCase): + def _get_numpy_out(self, + input_data, + pad, + mode, + value=0.0, + data_format="NCHW"): + if 
data_format == "NCHW": + pad = [ + (0, 0), + (0, 0), + (pad[2], pad[3]), + (pad[0], pad[1]), + ] + else: + pad = [ + (0, 0), + (pad[2], pad[3]), + (pad[0], pad[1]), + (0, 0), + ] + + out = np.pad(input_data, pad, mode=mode, constant_values=value) + return out + + def test_class(self): + paddle.disable_static() + paddle.device.set_device("npu") + input_shape = (3, 4, 5, 6) + pad = [1, 2, 2, 1] + pad_int = 1 + value = 0 + input_data = np.random.rand(*input_shape).astype(np.float32) + + pad_constant = nn.Pad2D(padding=pad, mode="constant", value=value) + pad_constant_int = nn.Pad2D( + padding=pad_int, mode="constant", value=value) + + data = paddle.to_tensor(input_data) + + output = pad_constant(data) + np_out = self._get_numpy_out( + input_data, pad, "constant", value=value, data_format="NCHW") + self.assertTrue(np.allclose(output.numpy(), np_out)) + + output = pad_constant_int(data) + np_out = self._get_numpy_out( + input_data, [pad_int] * 4, + "constant", + value=value, + data_format="NCHW") + self.assertTrue(np.allclose(output.numpy(), np_out)) + + +class TestPad3dAPI(unittest.TestCase): + def _get_numpy_out(self, + input_data, + pad, + mode, + value=0.0, + data_format="NCDHW"): + if data_format == "NCDHW": + pad = [ + (0, 0), + (0, 0), + (pad[4], pad[5]), + (pad[2], pad[3]), + (pad[0], pad[1]), + ] + else: + pad = [ + (0, 0), + (pad[4], pad[5]), + (pad[2], pad[3]), + (pad[0], pad[1]), + (0, 0), + ] + + out = np.pad(input_data, pad, mode=mode, constant_values=value) + return out + + def test_class(self): + paddle.disable_static() + paddle.device.set_device("npu") + input_shape = (3, 4, 5, 6, 7) + pad = [1, 2, 2, 1, 1, 0] + pad_int = 1 + value = 0 + input_data = np.random.rand(*input_shape).astype(np.float32) + + pad_constant = nn.Pad3D(padding=pad, mode="constant", value=value) + pad_constant_int = nn.Pad3D( + padding=pad_int, mode="constant", value=value) + + data = paddle.to_tensor(input_data) + + output = pad_constant(data) + np_out = self._get_numpy_out( + input_data, pad, "constant", value=value, data_format="NCDHW") + self.assertTrue(np.allclose(output.numpy(), np_out)) + + output = pad_constant_int(data) + np_out = self._get_numpy_out( + input_data, [pad_int] * 6, + "constant", + value=value, + data_format="NCDHW") + self.assertTrue(np.allclose(output.numpy(), np_out)) + + +class TestPad3dOpNpuError(unittest.TestCase): + def test_errors(self): + def test_value(): + input_shape = (1, 2, 3, 4, 5) + data = np.random.rand(*input_shape).astype(np.float32) + x = paddle.fluid.data(name="x", shape=input_shape) + y = F.pad(x, pad=[1, 1, 1, 1, 1, 1], value=1, mode='constant') + place = paddle.NPUPlace() + exe = Executor(place) + outputs = exe.run(feed={'x': data}, fetch_list=[y.name]) + + def test_mode_1(): + input_shape = (1, 2, 3, 4, 5) + data = np.random.rand(*input_shape).astype(np.float32) + x = paddle.fluid.data(name="x", shape=input_shape) + y = F.pad(x, pad=[1, 1, 1, 1, 1, 1], mode='reflect') + place = paddle.NPUPlace() + exe = Executor(place) + outputs = exe.run(feed={'x': data}, fetch_list=[y.name]) + + def test_mode_2(): + input_shape = (1, 2, 3, 4, 5) + data = np.random.rand(*input_shape).astype(np.float32) + x = paddle.fluid.data(name="x", shape=input_shape) + y = F.pad(x, pad=[1, 1, 1, 1, 1, 1], mode='replicate') + place = paddle.NPUPlace() + exe = Executor(place) + outputs = exe.run(feed={'x': data}, fetch_list=[y.name]) + + def test_mode_3(): + input_shape = (1, 2, 3, 4, 5) + data = np.random.rand(*input_shape).astype(np.float32) + x = paddle.fluid.data(name="x", 
shape=input_shape) + y = F.pad(x, pad=[1, 1, 1, 1, 1, 1], mode='circular') + place = paddle.CPUPlace() + exe = Executor(place) + outputs = exe.run(feed={'x': data}, fetch_list=[y.name]) + + self.assertRaises(Exception, test_value) + + self.assertRaises(Exception, test_mode_1) + + self.assertRaises(Exception, test_mode_2) + + self.assertRaises(Exception, test_mode_3) + + +class TestPadDataformatError(unittest.TestCase): + def test_errors(self): + def test_ncl(): + input_shape = (1, 2, 3, 4) + pad = paddle.to_tensor(np.array([2, 1, 2, 1]).astype('int32')) + data = np.arange( + np.prod(input_shape), dtype=np.float64).reshape(input_shape) + 1 + my_pad = nn.Pad1D(padding=pad, mode="replicate", data_format="NCL") + data = paddle.to_tensor(data) + result = my_pad(data) + + def test_nchw(): + input_shape = (1, 2, 4) + pad = paddle.to_tensor(np.array([2, 1, 2, 1]).astype('int32')) + data = np.arange( + np.prod(input_shape), dtype=np.float64).reshape(input_shape) + 1 + my_pad = nn.Pad1D(padding=pad, mode="replicate", data_format="NCHW") + data = paddle.to_tensor(data) + result = my_pad(data) + + def test_ncdhw(): + input_shape = (1, 2, 3, 4) + pad = paddle.to_tensor(np.array([2, 1, 2, 1]).astype('int32')) + data = np.arange( + np.prod(input_shape), dtype=np.float64).reshape(input_shape) + 1 + my_pad = nn.Pad1D( + padding=pad, mode="replicate", data_format="NCDHW") + data = paddle.to_tensor(data) + result = my_pad(data) + + self.assertRaises(AssertionError, test_ncl) + + self.assertRaises(AssertionError, test_nchw) + + self.assertRaises(AssertionError, test_ncdhw) + + +if __name__ == '__main__': + unittest.main() From 99ffeffef19393979be136d8122f7d395b03e60f Mon Sep 17 00:00:00 2001 From: lzzyzlbb <287246233@qq.com> Date: Fri, 20 Aug 2021 16:56:43 +0800 Subject: [PATCH 118/126] [npu]Add argsort op (#34865) * add rmsprop npu * add argsort npu * add argsort npu * modify according to review * modify sharedatawith according to review * modify reshape according to review * rm dygraph=false --- paddle/fluid/operators/argsort_op_npu.cc | 261 ++++++++++++++++++ .../unittests/npu/test_argsort_op_npu.py | 215 +++++++++++++++ 2 files changed, 476 insertions(+) create mode 100644 paddle/fluid/operators/argsort_op_npu.cc create mode 100644 python/paddle/fluid/tests/unittests/npu/test_argsort_op_npu.py diff --git a/paddle/fluid/operators/argsort_op_npu.cc b/paddle/fluid/operators/argsort_op_npu.cc new file mode 100644 index 00000000000000..e36dd322e0ea1d --- /dev/null +++ b/paddle/fluid/operators/argsort_op_npu.cc @@ -0,0 +1,261 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/argsort_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +template +class ArgsortNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("X"); + auto* output = ctx.Output("Out"); + output->mutable_data(ctx.GetPlace()); + auto* indices = ctx.Output("Indices"); + indices->mutable_data(ctx.GetPlace()); + + int32_t axis = ctx.Attr("axis"); + auto in_dims = indices->dims(); + axis = (axis < 0) ? (in_dims.size() + axis) : axis; + bool descending = ctx.Attr("descending"); + auto stream = + ctx.template device_context() + .stream(); + framework::NPUAttributeMap sort_attr_input = { + {"axis", static_cast(-1)}, {"descending", descending}}; + + if (axis == -1 || axis + 1 == in_dims.size()) { + const auto& sort_runner = + NpuOpRunner("Sort", {*input}, {*output, *indices}, sort_attr_input); + sort_runner.Run(stream); + } else { + // transpose + std::vector trans; + for (int i = 0; i < axis; i++) { + trans.push_back(i); + } + trans.push_back(in_dims.size() - 1); + for (int i = axis + 1; i < in_dims.size() - 1; i++) { + trans.push_back(i); + } + trans.push_back(axis); + framework::DDim trans_dims(in_dims); + for (size_t i = 0; i < trans.size(); i++) { + trans_dims[i] = in_dims[trans[i]]; + } + framework::NPUAttributeMap trans_attr_input = {{"perm", trans}}; + Tensor trans_input; + trans_input.mutable_data(trans_dims, ctx.GetPlace()); + const auto& trans_input_runner = + NpuOpRunner("TransposeD", {*input}, {trans_input}, trans_attr_input); + trans_input_runner.Run(stream); + Tensor trans_indices; + trans_indices.mutable_data(trans_dims, ctx.GetPlace()); + const auto& trans_indice_runner = NpuOpRunner( + "TransposeD", {*indices}, {trans_indices}, trans_attr_input); + trans_indice_runner.Run(stream); + Tensor trans_output; + trans_output.mutable_data(trans_dims, ctx.GetPlace()); + const auto& trans_output_runner = NpuOpRunner( + "TransposeD", {*output}, {trans_output}, trans_attr_input); + trans_output_runner.Run(stream); + const auto& sort_runner = + NpuOpRunner("Sort", {trans_input}, {trans_output, trans_indices}, + sort_attr_input); + sort_runner.Run(stream); + // transpose back + const auto& trans_indices_back_runner = NpuOpRunner( + "TransposeD", {trans_indices}, {*indices}, trans_attr_input); + trans_indices_back_runner.Run(stream); + const auto& trans_output_back_runner = NpuOpRunner( + "TransposeD", {trans_output}, {*output}, trans_attr_input); + trans_output_back_runner.Run(stream); + } + } +}; + +template +static void ReshapeNPU(const framework::Tensor* input, + const std::vector& input_shapes, + framework::Tensor* output) { + output->ShareDataWith(*input); + output->Resize(framework::make_ddim(std::move(input_shapes))); +} + +template +static void FullAssignNPU(const framework::ExecutionContext& ctx, + Type ind_lastdim, Type outer_dim, + const framework::DDim& trans_dims, + const framework::Tensor* input, + const framework::Tensor* indices, + framework::Tensor* t_out) { + // reshape input + Type input_shape = ind_lastdim * outer_dim; + std::vector input_shapes = {input_shape}; + Tensor input_reshape_tensor(input->type()); + ReshapeNPU(input, input_shapes, &input_reshape_tensor); + // reshape index + std::vector index_shapes = {outer_dim, ind_lastdim}; + framework::DDim ind_2d = framework::make_ddim({outer_dim, ind_lastdim}); + Tensor ind_2d_tensor(indices->type()); + ReshapeNPU(indices, index_shapes, 
&ind_2d_tensor); + // range_flatten_index + std::vector range_flatten_index; + for (Type i = 0; i < input_shape; i += ind_lastdim) { + range_flatten_index.push_back(static_cast(i)); + } + Tensor range_flatten_index_tensor(framework::proto::VarType::INT32); + range_flatten_index_tensor.Resize(framework::make_ddim({outer_dim})); + range_flatten_index_tensor.mutable_data( + {static_cast(range_flatten_index.size())}, ctx.GetPlace()); + TensorFromVector(range_flatten_index, ctx.device_context(), + &range_flatten_index_tensor); + Tensor range_flatten_index_expand_tensor(range_flatten_index_tensor.type()); + std::vector flatten_shape = {outer_dim, 1}; + ReshapeNPU(&range_flatten_index_tensor, flatten_shape, + &range_flatten_index_expand_tensor); + auto stream = + ctx.template device_context() + .stream(); + Tensor ind_2d_add_tensor; + ind_2d_add_tensor.mutable_data(ind_2d, ctx.GetPlace()); + const auto& runner_ind_2d_tensor = NpuOpRunner( + std::string("Add"), {ind_2d_tensor, range_flatten_index_expand_tensor}, + {ind_2d_add_tensor}, {}); + runner_ind_2d_tensor.Run(stream); + Tensor ind_reshape_tensor(ind_2d_add_tensor.type()); + ReshapeNPU(&ind_2d_add_tensor, input_shapes, &ind_reshape_tensor); + Tensor ind_reshape_expand_tensor(ind_reshape_tensor.type()); + std::vector ind_shape = {input_shape, 1}; + ReshapeNPU(&ind_reshape_tensor, ind_shape, &ind_reshape_expand_tensor); + // expand_index + Tensor input_scatter_tensor; + input_scatter_tensor.Resize({input_shape}); + input_scatter_tensor.mutable_data(ctx.GetPlace()); + Tensor input_scatter_tensor_ori; + input_scatter_tensor_ori.Resize({input_shape}); + input_scatter_tensor_ori.mutable_data(ctx.GetPlace()); + std::vector trans_shapes; + + for (int i = 0; i < trans_dims.size(); i++) { + trans_shapes.push_back(trans_dims[i]); + } + NpuOpRunner runner_scatter; + runner_scatter.SetType("TensorScatterUpdate") + .AddInput(input_scatter_tensor_ori) + .AddInput(ind_reshape_expand_tensor) + .AddInput(input_reshape_tensor) + .AddOutput(input_scatter_tensor); + runner_scatter.Run(stream); + framework::TensorCopy(input_scatter_tensor, ctx.GetPlace(), + ctx.template device_context(), + t_out); + t_out->Resize(framework::make_ddim(trans_shapes)); +} + +template +class ArgsortGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* indices = ctx.Input("Indices"); + auto* dX = ctx.Output(framework::GradVarName("X")); + auto* dO = ctx.Input(framework::GradVarName("Out")); + int axis = ctx.Attr("axis"); + auto in_dims = indices->dims(); + axis = (axis < 0) ? 
(in_dims.size() + axis) : axis; + auto place = ctx.GetPlace(); + + auto stream = + ctx.template device_context() + .stream(); + dX->mutable_data(ctx.GetPlace()); + Tensor dxt; + dxt.mutable_data(dX->dims(), place); + const auto& runner_flatten = + NpuOpRunner(std::string("Flatten"), {*dX}, {dxt}, {}); + runner_flatten.Run(stream); + FillNpuTensorWithConstant(&dxt, static_cast(0)); + if (dO->numel() == 0) return; + // Do full assig n + if (axis == -1 || axis + 1 == in_dims.size()) { + const int64_t outer_dim = framework::product( + framework::slice_ddim(in_dims, 0, in_dims.size() - 1)); + const int64_t ind_lastdim = in_dims[in_dims.size() - 1]; + FullAssignNPU(ctx, ind_lastdim, outer_dim, in_dims, dO, + indices, dX); + + } else { + // If not full assign do transpose + std::vector trans; + for (int i = 0; i < axis; i++) { + trans.push_back(i); + } + trans.push_back(in_dims.size() - 1); + for (int i = axis + 1; i < in_dims.size() - 1; i++) { + trans.push_back(i); + } + trans.push_back(axis); + framework::DDim trans_dims(in_dims); + for (size_t i = 0; i < trans.size(); i++) { + trans_dims[i] = in_dims[trans[i]]; + } + std::vector axis; + for (size_t i = 0; i < trans.size(); i++) { + axis.push_back(in_dims[trans[i]]); + } + framework::NPUAttributeMap attr_input = {{"perm", trans}}; + Tensor trans_dO; + trans_dO.mutable_data(trans_dims, ctx.GetPlace()); + Tensor trans_ind; + trans_ind.mutable_data(trans_dims, ctx.GetPlace()); + // Do transpose + const auto& runner_transpose_dx = NpuOpRunner( + std::string("TransposeD"), {*dO}, {trans_dO}, {attr_input}); + runner_transpose_dx.Run(stream); + const auto& runner_transpose_ind = NpuOpRunner( + std::string("TransposeD"), {*indices}, {trans_ind}, {attr_input}); + runner_transpose_ind.Run(stream); + + const int64_t outer_dim = framework::product( + framework::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); + const int64_t ind_lastdim = trans_dims[trans_dims.size() - 1]; + + Tensor tmp_out; + tmp_out.mutable_data(trans_dims, ctx.GetPlace()); + + FullAssignNPU(ctx, ind_lastdim, outer_dim, trans_dims, + &trans_dO, &trans_ind, &tmp_out); + + // transpose back + const auto& runner_transpose_out = NpuOpRunner( + std::string("TransposeD"), {tmp_out}, {*dX}, {attr_input}); + runner_transpose_out.Run(stream); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL( + argsort, ops::ArgsortNPUKernel, + ops::ArgsortNPUKernel); + +REGISTER_OP_NPU_KERNEL(argsort_grad, + ops::ArgsortGradNPUKernel, + ops::ArgsortGradNPUKernel); diff --git a/python/paddle/fluid/tests/unittests/npu/test_argsort_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_argsort_op_npu.py new file mode 100644 index 00000000000000..824266578b9e57 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_argsort_op_npu.py @@ -0,0 +1,215 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
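[Editor's note, not part of the patch] The argsort kernel above always invokes the NPU "Sort" op on the innermost axis; any other axis is handled by transposing it to the last position, sorting, and transposing back with the same permutation. The NumPy sketch below mirrors that behaviour for reading purposes only; the helper name and layout are illustrative assumptions, not code from this patch.

import numpy as np

def npu_style_argsort(x, axis=-1, descending=False):
    axis = axis if axis >= 0 else x.ndim + axis
    if axis == x.ndim - 1:
        perm = list(range(x.ndim))            # already innermost, sort directly
    else:
        # same permutation the kernel builds: swap `axis` with the last dimension
        perm = list(range(axis)) + [x.ndim - 1] + list(range(axis + 1, x.ndim - 1)) + [axis]
    xt = np.transpose(x, perm)
    idx = np.argsort(xt, axis=-1)             # NPU Sort operates on the innermost axis
    out = np.take_along_axis(xt, idx, axis=-1)
    if descending:
        idx, out = idx[..., ::-1], out[..., ::-1]
    # swapping two axes is its own inverse, so the same perm restores the original layout
    return np.transpose(out, perm), np.transpose(idx, perm)

For any axis this should agree with np.sort(x, axis=axis) and np.argsort(x, axis=axis) up to tie ordering, which is how the tests in the file below construct their expected outputs.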
+ +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest, _set_use_system_allocator +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core + +from paddle.fluid import ParamAttr +from paddle.fluid.framework import Program, grad_var_name +from paddle.fluid.executor import Executor +from paddle.fluid.backward import append_backward + +paddle.enable_static() + + +class TestArgsortOp(OpTest): + def setUp(self): + self.set_npu() + self.op_type = "argsort" + self.place = paddle.NPUPlace(0) + self.init_dtype() + self.init_inputshape() + self.init_axis() + self.init_direction() + + self.x = np.random.random(self.input_shape).astype(self.dtype) + self.inputs = {"X": self.x} + self.attrs = {"axis": self.axis, "descending": self.descending} + self.get_output() + self.outputs = {"Out": self.sorted_x, "Indices": self.indices} + + def get_output(self): + if self.descending: + self.indices = np.flip( + np.argsort( + self.x, kind='heapsort', axis=self.axis), self.axis) + self.sorted_x = np.flip( + np.sort( + self.x, kind='heapsort', axis=self.axis), self.axis) + else: + self.indices = np.argsort(self.x, kind='heapsort', axis=self.axis) + self.sorted_x = np.sort(self.x, kind='heapsort', axis=self.axis) + + def set_npu(self): + self.__class__.use_npu = True + self.__class__.no_need_check_grad = True + + def init_kernel_type(self): + self.use_mkldnn = False + + def init_inputshape(self): + self.input_shape = (2, 2, 2, 3, 3) + + def init_dtype(self): + self.dtype = np.float16 + + def init_axis(self): + self.axis = -1 + + def test_check_output(self): + self.check_output_with_place(self.place) + + def init_direction(self): + self.descending = False + + +class TestArgsortOpAxis0NPU(TestArgsortOp): + def init_axis(self): + self.axis = 0 + + +class TestArgsortOpAxis1NPU(TestArgsortOp): + def init_axis(self): + self.axis = 1 + + +class TestArgsortOpAxis2NPU(TestArgsortOp): + def init_axis(self): + self.axis = 2 + + +class TestArgsortOpAxisNeg1NPU(TestArgsortOp): + def init_axis(self): + self.axis = -1 + + +class TestArgsortOpAxisNeg2NPU(TestArgsortOp): + def init_axis(self): + self.axis = -2 + + +class TestArgsortOpDescendingAxisNPU(TestArgsortOp): + def init_direction(self): + self.descending = True + + +class TestArgsortOpDescendingAxis0NPU(TestArgsortOpAxis0NPU): + def init_direction(self): + self.descending = True + + +class TestArgsortOpDescendingAxis1NPU(TestArgsortOpAxis1NPU): + def init_direction(self): + self.descending = True + + +class TestArgsortOpDescendingAxis2NPU(TestArgsortOpAxis2NPU): + def init_direction(self): + self.descending = True + + +class TestArgsortOpDescendingAxisNeg1NPU(TestArgsortOpAxisNeg1NPU): + def init_direction(self): + self.descending = True + + +class TestArgsortOpDescendingAxisNeg2NPU(TestArgsortOpAxisNeg2NPU): + def init_direction(self): + self.descending = True + + +# liurui25: argsort of npu has bug with type fp32, +# it will change the type from fp32 to fp16, +# so the check_output_with_place add thw atol +# this test is only used to test the grad +# issue: https://gitee.com/ascend/modelzoo/issues/I44I7K + + +class TestArgsortOpAxis0NPUFP32(TestArgsortOp): + def init_axis(self): + self.axis = 0 + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place, atol=1e-2) + + def set_npu(self): + self.__class__.use_npu = True + + def test_check_grad(self): + self.check_grad_with_place(self.place, ["X"], 
"Out") + + +class TestArgsortOpAxis1NPUFP32(TestArgsortOpAxis0NPUFP32): + def init_axis(self): + self.axis = 1 + + +class TestArgsortOpAxis2NPUFP32(TestArgsortOpAxis0NPUFP32): + def init_axis(self): + self.axis = 2 + + +class TestArgsortOpAxisNeg1NPUFP32(TestArgsortOpAxis0NPUFP32): + def init_axis(self): + self.axis = -1 + + +class TestArgsortOpAxisNeg2NPUFP32(TestArgsortOpAxis0NPUFP32): + def init_axis(self): + self.axis = -2 + + +class TestArgsortOpDescendingAxisNPUFP32(TestArgsortOpAxis0NPUFP32): + def init_direction(self): + self.descending = True + + +class TestArgsortOpDescendingAxis0NPUFP32(TestArgsortOpAxis0NPUFP32): + def init_direction(self): + self.descending = True + + +class TestArgsortOpDescendingAxis1NPUFP32(TestArgsortOpAxis1NPUFP32): + def init_direction(self): + self.descending = True + + +class TestArgsortOpDescendingAxis2NPUFP32(TestArgsortOpAxis2NPUFP32): + def init_direction(self): + self.descending = True + + +class TestArgsortOpDescendingAxisNeg1NPUFP32(TestArgsortOpAxisNeg1NPUFP32): + def init_direction(self): + self.descending = True + + +class TestArgsortOpDescendingAxisNeg2NPUFP32(TestArgsortOpAxisNeg2NPUFP32): + def init_direction(self): + self.descending = True + + +if __name__ == '__main__': + unittest.main() From f6015d0de3fa266e6663b2fb4a9c097bd8448e0f Mon Sep 17 00:00:00 2001 From: tianshuo78520a <707759223@qq.com> Date: Fri, 20 Aug 2021 17:03:02 +0800 Subject: [PATCH 119/126] fix model-benchmark build error (#35041) --- tools/test_ci_model_benchmark.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/test_ci_model_benchmark.sh b/tools/test_ci_model_benchmark.sh index 33d76dea2c5cae..657e03944a06e6 100644 --- a/tools/test_ci_model_benchmark.sh +++ b/tools/test_ci_model_benchmark.sh @@ -31,6 +31,7 @@ function check_whl { [ $? -ne 0 ] && echo "install paddle failed." && exit 1 cd build make -j `nproc` + [ $? -ne 0 ] && echo "build paddle failed." 
&& exit 1 unzip -q python/dist/*.whl -d /tmp/develop sed -i '/version.py/d' /tmp/pr/*/RECORD From 4d9b2d6da262b5aa8ec413478a3ebd428bb0e199 Mon Sep 17 00:00:00 2001 From: Yuang Liu Date: Fri, 20 Aug 2021 17:11:19 +0800 Subject: [PATCH 120/126] [hybrid performance] Grad fuse for gradient merge under pipeline mode (#35004) --- .../framework/distributed_strategy.proto | 1 + paddle/fluid/operators/coalesce_tensor_op.cc | 65 ++++- .../fleet/base/distributed_strategy.py | 22 ++ .../sharding/offload_helper.py | 3 + .../fleet/meta_optimizers/sharding/utils.py | 6 +- .../meta_optimizers/sharding_optimizer.py | 4 +- python/paddle/fluid/optimizer.py | 255 +++++++++++++++++- .../npu/test_coalesce_tensor_op_npu.py | 3 +- .../unittests/test_coalesce_tensor_op.py | 3 +- .../test_fleet_sharding_meta_optimizer.py | 183 +++++++++++++ 10 files changed, 534 insertions(+), 11 deletions(-) diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto index 546b9d2601df57..58ae35f2689799 100644 --- a/paddle/fluid/framework/distributed_strategy.proto +++ b/paddle/fluid/framework/distributed_strategy.proto @@ -200,6 +200,7 @@ message DistributedStrategy { optional int32 fuse_grad_size_in_num = 31 [ default = 8 ]; optional bool calc_comm_same_stream = 32 [ default = false ]; optional bool asp = 33 [ default = false ]; + optional bool fuse_grad_merge = 34 [ default = false ]; optional RecomputeConfig recompute_configs = 101; optional AMPConfig amp_configs = 102; diff --git a/paddle/fluid/operators/coalesce_tensor_op.cc b/paddle/fluid/operators/coalesce_tensor_op.cc index 6ea8809dae13f2..4c5f3a2a47bd84 100644 --- a/paddle/fluid/operators/coalesce_tensor_op.cc +++ b/paddle/fluid/operators/coalesce_tensor_op.cc @@ -20,10 +20,49 @@ #include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/device_memory_aligment.h" +#ifdef PADDLE_WITH_ASCEND_CL +#include "paddle/fluid/operators/npu_op_runner.h" +#endif namespace paddle { namespace operators { +template +struct FillConstantVisitor { + FillConstantVisitor(const DeviceContext &dev_ctx, + framework::LoDTensor *tensor, const float value) + : dev_ctx_(dev_ctx), tensor_(tensor), value_(value) {} + + template + void apply(typename std::enable_if::value || + std::is_same::value>::type * = + nullptr) const { + PADDLE_THROW(platform::errors::InvalidArgument( + "Not support data type for set_constant attr")); + } + + template + void apply(typename std::enable_if::value || + std::is_same::value)>::type + * = nullptr) const { +#ifdef PADDLE_WITH_ASCEND_CL + if (platform::is_npu_place(dev_ctx_.GetPlace())) { + FillNpuTensorWithConstant(tensor_, static_cast(value_)); + } else { + math::SetConstant set_constant; + set_constant(dev_ctx_, tensor_, static_cast(value_)); + } +#else + math::SetConstant set_constant; + set_constant(dev_ctx_, tensor_, static_cast(value_)); +#endif + } + + const DeviceContext &dev_ctx_; + framework::LoDTensor *tensor_; + float value_; +}; + template class CoalesceTensorOpKernel : public framework::OpKernel { public: @@ -70,6 +109,7 @@ class CoalesceTensorOpKernel : public framework::OpKernel { auto in_tensors = context.MultiInput("Input"); bool use_align = context.Attr("use_align"); auto align_size = context.Attr("align_size"); + auto size_of_dtype = context.Attr("user_defined_size_of_dtype"); if (context.Attr("check_name")) { for (size_t i = 0; i < in_var_names.size(); ++i) { @@ -94,7 +134,9 @@ class CoalesceTensorOpKernel : public 
framework::OpKernel { size_t numel = 0; auto dtype = static_cast( context.Attr("dtype")); - size_t size_of_dtype = framework::SizeOfType(dtype); + if (size_of_dtype == -1) { + size_of_dtype = framework::SizeOfType(dtype); + } GetMemSizeAndDtype(in_tensors, in_var_names, &numel, size_of_dtype, context.GetPlace(), use_align, align_size); @@ -121,10 +163,9 @@ class CoalesceTensorOpKernel : public framework::OpKernel { : len; } } else if (context.Attr("set_constant")) { - // TODO(Liu yuang) ADD NPU SET_CONSTANT FUNCTION. - math::SetConstant set_constant; - set_constant(dev_ctx, fused_tensor, - static_cast(context.Attr("constant"))); + framework::VisitDataType( + dtype, FillConstantVisitor( + dev_ctx, fused_tensor, context.Attr("constant"))); } else if (context.Attr("persist_output")) { for (size_t i = 0; i < out_var_names.size(); ++i) { size_t len = static_cast(out_tensors[i]->numel()); @@ -227,10 +268,13 @@ class CoalesceTensorOp : public framework::OperatorWithKernel { } auto use_align = ctx->Attrs().Get("use_align"); auto align_size = ctx->Attrs().Get("align_size"); + auto size_of_dtype = ctx->Attrs().Get("user_defined_size_of_dtype"); auto dtype = static_cast( ctx->Attrs().Get("dtype")); - size_t size_of_dtype = framework::SizeOfType(dtype); + if (size_of_dtype == -1) { + size_of_dtype = framework::SizeOfType(dtype); + } auto alignment = [](size_t size, size_t align_size) { size_t remaining = size % align_size; @@ -308,6 +352,15 @@ class CoalesceTensorOpMaker : public framework::OpProtoAndCheckerMaker { .SetDefault(true); AddAttr("align_size", "The alignment size when use_align is True") .SetDefault(-1); + AddAttr("user_defined_size_of_dtype", + "The user defined size of dtype. This is used to coalesce " + "grad vars and merged_grad vars at the same time. For some " + "strategy, the dtype of fused_grad_vars and the dtype of " + "fused_grad_merged_vars are not identical, which will cause " + "the shape of these two coalesced vars are different. To " + "make sure the shape of these two vars are identical with " + "each other, this attr is added.") + .SetDefault(-1); AddComment(R"DOC( CoalesceTensor Operator. diff --git a/python/paddle/distributed/fleet/base/distributed_strategy.py b/python/paddle/distributed/fleet/base/distributed_strategy.py index d43292ddbd32e9..d19cfd21698021 100644 --- a/python/paddle/distributed/fleet/base/distributed_strategy.py +++ b/python/paddle/distributed/fleet/base/distributed_strategy.py @@ -967,6 +967,28 @@ def _calc_comm_same_stream(self, same): "WARNING: calc_comm_same_stream should have value of boolean type" ) + @property + def fuse_grad_merge(self): + """ + Set whether fuse the grad for gradient merge. + Note: this flag will only effect the gradient merge under pipeline mode + The default value for the fuse_grad_merge is False + Examples: + .. 
code-block:: python + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.fuse_param_grad = True + """ + return self.strategy.fuse_grad_merge + + @fuse_grad_merge.setter + @is_strict_auto + def fuse_grad_merge(self, fuse_grad_merge): + if isinstance(fuse_grad_merge, bool): + self.strategy.fuse_grad_merge = fuse_grad_merge + else: + print("WARNING: fuse_grad_merge should have value of boolean type") + @property def fuse_grad_size_in_num(self): """ diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding/offload_helper.py b/python/paddle/distributed/fleet/meta_optimizers/sharding/offload_helper.py index a96705b09e835e..8aee34960332ac 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/sharding/offload_helper.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding/offload_helper.py @@ -122,6 +122,9 @@ def remove_param(input_name): for idx, op in enumerate(block.ops): if is_optimizer_op(op): break + # TODO (Yuang Liu): tmp solution for fuse_grad_merge + optimize_cast + if not offload and op.type == 'coalesce_tensor': + continue for input_name in op.desc.input_arg_names(): if input_name not in param_to_idx: continue diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py b/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py index 52ef843aa0d751..16fbc7bea6c8b6 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py @@ -341,7 +341,11 @@ def insert_allreduce_ops(block, if len(allreduce_vars) == 0: return - if user_defined_strategy and user_defined_strategy.fuse_all_reduce_ops: + if user_defined_strategy and \ + user_defined_strategy.fuse_all_reduce_ops and \ + not user_defined_strategy.fuse_grad_merge: + # If fuse_grad_merge is enable, the grad vars have already been fused during + # gradient merge pass, therefore, those vars are not need to be fused here insert_fused_allreduce_ops(block, insert_idx, ring_id, allreduce_vars, op_role, use_calc_stream, user_defined_strategy.fuse_grad_size_in_MB) diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py index 5c2f24054f835c..c94bd572f05878 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py @@ -319,7 +319,9 @@ def _insert_allreduce_for_pp(self): main_block._remove_op(idx) accumulated_grad_names = self._pp_optimizer._accumulate_gradients( - main_block, fp16_allreduce=fp16_allreduce) + main_block, + fp16_allreduce=fp16_allreduce, + user_defined_strategy=strategy) len_of_ops = len(main_block.ops) first_optimize_op_index = get_first_optimize_op_idx(main_block) diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 9e87681c4bef30..58f61b77fd1fe0 100755 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -5037,11 +5037,18 @@ def _rename_gradient_var_name(self, block): def _accumulate_gradients(self, block, pp_allreduce_in_optimize=False, - fp16_allreduce=False): + fp16_allreduce=False, + user_defined_strategy=None): """ Create a new merged gradient for each parameter and accumulate the corresponding gradient to it. 
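+        When user_defined_strategy.fuse_grad_merge is set, the per-parameter grads are instead coalesced into fused buffers and accumulated segment by segment (see _accumulate_gradients_with_fuse below).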
""" + if user_defined_strategy and user_defined_strategy.fuse_grad_merge: + fused_gradient_names = self._accumulate_gradients_with_fuse( + block, fp16_allreduce, + user_defined_strategy.fuse_grad_size_in_MB) + return fused_gradient_names + merged_gradient_names = [] first_opt_op_idx = None @@ -5171,6 +5178,252 @@ def _accumulate_gradients(self, return merged_gradient_names + def _accumulate_gradients_with_fuse(self, main_block, fp16, fused_size): + first_opt_op_idx = None + grad_param_pairs = [] + # obtain all param/grad pairs that needed to be fused + for index, op in reversed(tuple(enumerate(list(main_block.ops)))): + # remove the cast op of fp16 grad to fp32 grad + if self._is_optimize_op(op) and op.type == 'cast': + in_name = op.input_arg_names[0] + out_name = op.output_arg_names[0] + if out_name.strip('@GRAD') in self._param_device_map: + assert in_name.replace('.cast_fp16', '') == out_name + main_block._remove_op(index) + continue + + if self._is_backward_op(op) and first_opt_op_idx is None: + first_opt_op_idx = index + 1 + # no optimize phase + if first_opt_op_idx == len(main_block.ops): + return + + if self._is_backward_op(op) and ( + self._op_role_var_key in op.attr_names): + op_role_var = op.attr(self._op_role_var_key) + if len(op_role_var) == 0: + continue + assert len(op_role_var) % 2 == 0 + for i in range(0, len(op_role_var), 2): + param_name = op_role_var[i] + if not main_block.has_var(param_name): + continue + if '@BroadCast' in param_name: + continue + grad_param_pairs.append( + (op_role_var[i + 1], op_role_var[i])) + + if len(grad_param_pairs) == 0: + return + + grad_param_segments = [] + merged_suffix = '@MERGED@FP16' if fp16 else '@MERGED' + dtype = paddle.float16 if fp16 else paddle.float32 + cur_size = 0. + last_dtype = None + # split the grad based on dtype and fused size + for grad, param in grad_param_pairs: + real_grad = main_block.var(grad) + # create the gradient merged var for each grad + merged_grad_var = main_block.create_var( + name=param + core.grad_var_suffix() + merged_suffix, + dtype=dtype, + shape=real_grad.shape, + persistable=True, + stop_gradient=False) + real_param = main_block.var(param) + tmp_size = self._get_var_size(real_grad) + # two strategies for splitting the grad + # 1. the current segment's size reach the user defined grad_size_in_MB + # 2. the upcoming grad holds different dtype compared with grads in current segment + if len(grad_param_segments) == 0 \ + or cur_size + tmp_size > fused_size \ + or real_grad.dtype != last_dtype: + grad_param_segments.append( + ([real_grad], [real_param], [merged_grad_var])) + last_dtype = real_grad.dtype + cur_size = 0. + else: + grad_param_segments[-1][0].append(real_grad) + grad_param_segments[-1][1].append(real_param) + grad_param_segments[-1][2].append(merged_grad_var) + cur_size += tmp_size + + fused_gradients = [] + fused_merged_gradients = [] + # create fused vars for grad and param + for grad_param_segment in grad_param_segments: + grad_segment = grad_param_segment[0] + merged_grad_segment = grad_param_segment[2] + fused_grad = main_block.create_var( + name='FusedGrad_{}'.format(grad_segment[0].name), + dtype=grad_segment[0].dtype, + persistable=False, + stop_gradient=False) + # keep the '.cast_fp16' info in the fuse var name + fused_merged_grad_name_prefix = 'FusedMergedGrad.cast_fp16.' 
if \ + merged_grad_segment[0].dtype == paddle.float16 else 'FusedMergedGrad' + fused_merged_grad_name = fused_merged_grad_name_prefix + '_{}'.format( + merged_grad_segment[0].name) + fused_merged_grad = main_block.create_var( + name=fused_merged_grad_name, + dtype=merged_grad_segment[0].dtype, + persistable=True, + stop_gradient=False) + fused_gradients.append(fused_grad) + fused_merged_gradients.append(fused_merged_grad) + + assert len(fused_gradients) == len(grad_param_segments) + assert len(fused_merged_gradients) == len(grad_param_segments) + + # insert coalesce op at the start of the backward pass + # use param as the coalesce input to make sure the two Fused vars are in same shape + first_back_op_idx = None + for index, op in enumerate(main_block.ops): + if self._is_backward_op(op) and first_back_op_idx is None: + first_back_op_idx = index + break + assert first_back_op_idx is not None + offset = 0 + for i in range(len(grad_param_segments)): + fused_grad = fused_gradients[i] + fused_merged_grad = fused_merged_gradients[i] + grads = grad_param_segments[i][0] + params = grad_param_segments[i][1] + merged_grads = grad_param_segments[i][2] + main_block._insert_op_without_sync( + first_back_op_idx + offset, + type="coalesce_tensor", + inputs={"Input": params}, + outputs={"Output": grads, + "FusedOutput": fused_grad}, + attrs={ + # Explanation of user_defined_size_of_dtype: + # In coalesce op, the align size is 256 bytes + # the float takes 4 bytes while fp16 takes 2 bytes. + # To meet the requirement, 128 fp16 or 64 float will be aligned + # Think the total shape of the input tensors if [64], + # if the dtype is float, then the shape of the fuse var is [64] + # however if the dytpe if fp16, the shape of the fuse var is [128], + # which will cause the fused vars' shape vary between each other. + # To make sure the shape of the fused vars are identical, + # we set the dtype of float and fp16 both to 2. + # Under this way, the fused vars' shape for float and fp16 are all [128] + "user_defined_size_of_dtype": 2, + "copy_data": False, + "use_align": True, + "dtype": grads[0].dtype, + self._op_role_key: self._op_role.Backward + }) + offset += 1 + # For the gradient_merged_fused_var, given a init value during the coalesce op + # this will remove a problematic fill_constant op. 
This op role of this coalesce + # is set to be LRSched to make this coalesce (with init) only run once + main_block._insert_op_without_sync( + first_back_op_idx + offset, + type="coalesce_tensor", + inputs={"Input": params}, + outputs={ + "Output": merged_grads, + "FusedOutput": fused_merged_grad + }, + attrs={ + "user_defined_size_of_dtype": 2, + "set_constant": True, + "constant": float(0.0), + "copy_data": False, + "use_align": True, + "dtype": merged_grads[0].dtype, + self._op_role_key: self._op_role.Optimize.LRSched + }) + offset += 1 + + # insert gradient merge relating ops + first_opt_op_idx += offset + offset = 0 + for i in range(len(fused_gradients)): + fused_grad = fused_gradients[i] + fused_merged_grad = fused_merged_gradients[i] + is_fp16_grad = 'cast_fp16' in fused_grad.name + need_cast = (is_fp16_grad is not fp16) + if need_cast: + # for fp16 allreduce, cast fp32 grad to fp16 + # for fp32 allreduce, cast fp16 grad to fp32 + cast_grad_var_name = fused_grad.name + '@TMP' + cast_grad_var = main_block.create_var( + name=cast_grad_var_name, + dtype=dtype, + persistable=False, + stop_gradient=False) + main_block._insert_op( + index=first_opt_op_idx + offset, + type='cast', + inputs={'X': fused_grad}, + outputs={'Out': cast_grad_var}, + attrs={ + 'in_dtype': fused_grad.dtype, + 'out_dtype': cast_grad_var.dtype, + self._op_role_key: self._op_role.Backward, + }) + offset += 1 + fused_grad = cast_grad_var + main_block._insert_op( + index=first_opt_op_idx + offset, + type='sum', + inputs={'X': [fused_merged_grad, fused_grad]}, + outputs={'Out': fused_merged_grad}, + attrs={self._op_role_key: self._op_role.Backward}) + offset += 1 + + if fp16: + # if using fp16 allreduce, the optimizer needs fp32 grads, cast them back to fp32 + for grad, param in grad_param_pairs: + real_grad = main_block.var(grad) + fp16_grad_name = param + core.grad_var_suffix() + '@MERGED@FP16' + assert main_block.has_var(fp16_grad_name) + fp16_grad = main_block.var(fp16_grad_name) + fp32_grad_name = param + core.grad_var_suffix() + '@MERGED' + fp32_grad = main_block.create_var( + name=fp32_grad_name, + dtype=paddle.float32, + shape=real_grad.shape, + persistable=False, + stop_gradient=False) + main_block._insert_op( + index=first_opt_op_idx + offset, + type='cast', + inputs={'X': fp16_grad}, + outputs={'Out': fp32_grad}, + attrs={ + 'in_dtype': paddle.float16, + 'out_dtype': paddle.float32, + self._op_role_key: self._op_role.Optimize, + }) + offset += 1 + + # replace the var with it's name, which will be used for inserting allreduce + for i in range(len(fused_merged_gradients)): + fused_merged_gradients[i] = fused_merged_gradients[i].name + + main_block._sync_with_cpp() + + return fused_merged_gradients + + def _get_var_size(self, var): + dtype_to_size = { + core.VarDesc.VarType.FP16: 2, + core.VarDesc.VarType.FP32: 4, + core.VarDesc.VarType.FP64: 8, + core.VarDesc.VarType.INT16: 2, + core.VarDesc.VarType.INT32: 4, + core.VarDesc.VarType.INT64: 8, + core.VarDesc.VarType.BOOL: 1, + core.VarDesc.VarType.UINT8: 1, + } + assert -1 not in var.shape + return reduce(lambda x, y: x * y, + var.shape) * dtype_to_size[var.dtype] / 1024.0 / 1024.0 + def _add_sub_blocks(self, main_block, program_list): main_program = main_block.program for prog in program_list: diff --git a/python/paddle/fluid/tests/unittests/npu/test_coalesce_tensor_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_coalesce_tensor_op_npu.py index f1bbf0becf1950..93a969bf10f030 100644 --- 
a/python/paddle/fluid/tests/unittests/npu/test_coalesce_tensor_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_coalesce_tensor_op_npu.py @@ -90,7 +90,8 @@ def init_attr(self): "set_constant": False, "constant": 0.5, "use_align": True, - "dtype": self.fluid_dtype + "dtype": self.fluid_dtype, + "user_defined_size_of_dtype": 2 } def test_check_output(self): diff --git a/python/paddle/fluid/tests/unittests/test_coalesce_tensor_op.py b/python/paddle/fluid/tests/unittests/test_coalesce_tensor_op.py index a5b30330448d29..868a72334247d0 100644 --- a/python/paddle/fluid/tests/unittests/test_coalesce_tensor_op.py +++ b/python/paddle/fluid/tests/unittests/test_coalesce_tensor_op.py @@ -92,7 +92,8 @@ def init_attr(self): "copy_data": False, "set_constant": True, "constant": 0.5, - "dtype": self.fluid_dtype + "dtype": self.fluid_dtype, + "user_defined_size_of_dtype": 2 } def test_check_output(self): diff --git a/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py index 5a981a470cb4ef..3b0df74d3e6b4b 100755 --- a/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py @@ -1050,6 +1050,189 @@ def test_hybrid_with_pp_dp_amp_fp16allreduce_optimize_offload(self): self.assertEqual(dp_group_waiting_ports, ['127.0.0.1:36002']) + def test_hybrid_with_pp_dp_amp_fp16allreduce_optimize_cast_with_gradient_fuse( + self): + train_prog, startup_prog = paddle.fluid.Program(), paddle.fluid.Program( + ) + avg_cost, strategy = self.pp_net(train_prog, startup_prog) + strategy.amp = True + strategy.amp_configs = {'custom_black_varnames': ['fc_6.b_0'], } + strategy.sharding = True + strategy.sharding_configs = { + "sharding_degree": 1, + "mp_degree": 1, + "pp_degree": 2, + "dp_degree": 2, + "optimize_cast": True, + } + strategy.pipeline = True + strategy.pipeline_configs = { + "schedule_mode": "1F1B", + "micro_batch_size": 2, + "accumulate_steps": 4, + } + strategy.fp16_allreduce = True + strategy.fuse_grad_merge = True + self.optimizer(avg_cost, strategy, train_prog, startup_prog) + train_prog = train_prog._pipeline_opt['section_program'] + startup_prog = startup_prog._pipeline_opt['startup_program'] + + startup_prog_ops = startup_prog.global_block().ops + main_prog_ops = train_prog.global_block().ops + + # check program + startup_prog_op_types = [op.type for op in startup_prog_ops] + main_prog_op_types = [op.type for op in main_prog_ops] + + # ring: mp, pp_group, pp_pair, pp_pair + self.assertEqual(startup_prog_op_types, [ + 'uniform_random', 'cast', 'fill_constant', 'cast', 'uniform_random', + 'cast', 'fill_constant', 'cast', 'uniform_random', 'cast', + 'fill_constant', 'cast', 'uniform_random', 'cast', 'fill_constant', + 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', + 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', + 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', + 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', + 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', + 'c_sync_comm_stream' + ]) + + self.assertEqual(main_prog_op_types, [ + 'recv_v2', 'mul', 'elementwise_add', 'tanh', 'mul', + 'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'tanh', 'mul', + 'cast', 'elementwise_add', 'softmax', 'cross_entropy2', 'mean', + 'elementwise_mul', 'coalesce_tensor', 'coalesce_tensor', + 'coalesce_tensor', 'coalesce_tensor', 
'fill_constant', 'scale', + 'scale', 'elementwise_mul_grad', 'mean_grad', 'cross_entropy_grad2', + 'softmax_grad', 'elementwise_add_grad', 'cast', 'mul_grad', + 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad', + 'elementwise_add_grad', 'mul_grad', 'tanh_grad', + 'elementwise_add_grad', 'mul_grad', 'c_sync_calc_stream', 'send_v2', + 'sum', 'cast', 'sum', 'c_allreduce_sum', 'c_allreduce_sum', 'cast', + 'cast', 'cast', 'cast', 'cast', 'cast', 'cast', 'cast', + 'c_sync_comm_stream', 'check_finite_and_unscale', 'cast', + 'c_allreduce_max', 'cast', 'update_loss_scaling', 'momentum', + 'cast', 'momentum', 'cast', 'momentum', 'cast', 'momentum', 'cast', + 'momentum', 'cast', 'momentum', 'cast', 'momentum', 'momentum', + 'cast' + ]) + + # amp check_finite_and_unscale, allreduce(pp) + self.assertEqual(main_prog_op_types.count('c_allreduce_max'), 1) + + # should has ring id for pp + created_ring_ids = [ + op.desc.attr("ring_id") for op in startup_prog_ops + if op.type == "c_comm_init" + ] + self.assertIn(self.pp_pair_ring_id, created_ring_ids) + self.assertIn(self.dp_ring_id, created_ring_ids) + + # check correctness of pp group + for op in startup_prog_ops: + if op.type == "c_gen_nccl_id" and op.desc.output_arg_names()[ + 0] == "comm_id_0": + pp_group_waiting_ports = op.desc.attr("other_endpoints") + + self.assertEqual(pp_group_waiting_ports, ['127.0.0.1:36003']) + + # check correctness of dp group + for op in startup_prog_ops: + if op.type == "c_gen_nccl_id" and op.desc.output_arg_names()[ + 0] == "comm_id_3": + dp_group_waiting_ports = op.desc.attr("other_endpoints") + + self.assertEqual(dp_group_waiting_ports, ['127.0.0.1:36002']) + + def test_hybrid_with_pp_dp_amp_with_gradient_fuse(self): + train_prog, startup_prog = paddle.fluid.Program(), paddle.fluid.Program( + ) + avg_cost, strategy = self.pp_net(train_prog, startup_prog) + strategy.amp = True + strategy.amp_configs = {'custom_black_varnames': ['fc_6.b_0'], } + strategy.sharding = True + strategy.sharding_configs = { + "sharding_degree": 1, + "mp_degree": 1, + "pp_degree": 2, + "dp_degree": 2, + } + strategy.pipeline = True + strategy.pipeline_configs = { + "schedule_mode": "1F1B", + "micro_batch_size": 2, + "accumulate_steps": 4, + } + strategy.fuse_grad_merge = True + self.optimizer(avg_cost, strategy, train_prog, startup_prog) + train_prog = train_prog._pipeline_opt['section_program'] + startup_prog = startup_prog._pipeline_opt['startup_program'] + + startup_prog_ops = startup_prog.global_block().ops + main_prog_ops = train_prog.global_block().ops + + # check program + startup_prog_op_types = [op.type for op in startup_prog_ops] + main_prog_op_types = [op.type for op in main_prog_ops] + + # ring: mp, pp_group, pp_pair, pp_pair + self.assertEqual(startup_prog_op_types, [ + 'uniform_random', 'fill_constant', 'uniform_random', + 'fill_constant', 'uniform_random', 'fill_constant', + 'uniform_random', 'fill_constant', 'fill_constant', 'fill_constant', + 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', + 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', + 'fill_constant', 'fill_constant', 'c_gen_nccl_id', 'c_comm_init', + 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', + 'c_gen_nccl_id', 'c_comm_init', 'c_sync_comm_stream' + ]) + + self.assertEqual(main_prog_op_types, [ + 'recv_v2', 'cast', 'mul', 'cast', 'elementwise_add', 'tanh', 'cast', + 'mul', 'cast', 'elementwise_add', 'tanh', 'cast', 'mul', 'cast', + 'elementwise_add', 'tanh', 'cast', 'mul', 'cast', 'elementwise_add', + 
'softmax', 'cross_entropy2', 'mean', 'elementwise_mul', + 'coalesce_tensor', 'coalesce_tensor', 'coalesce_tensor', + 'coalesce_tensor', 'fill_constant', 'scale', 'scale', + 'elementwise_mul_grad', 'mean_grad', 'cross_entropy_grad2', + 'softmax_grad', 'elementwise_add_grad', 'cast', 'mul_grad', + 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad', + 'elementwise_add_grad', 'mul_grad', 'tanh_grad', + 'elementwise_add_grad', 'mul_grad', 'c_sync_calc_stream', 'send_v2', + 'cast', 'sum', 'sum', 'c_allreduce_sum', 'c_allreduce_sum', + 'c_sync_comm_stream', 'check_finite_and_unscale', 'cast', + 'c_allreduce_max', 'cast', 'update_loss_scaling', 'momentum', + 'momentum', 'momentum', 'momentum', 'momentum', 'momentum', + 'momentum', 'momentum' + ]) + + # amp check_finite_and_unscale, allreduce(pp) + self.assertEqual(main_prog_op_types.count('c_allreduce_max'), 1) + + # should has ring id for pp + created_ring_ids = [ + op.desc.attr("ring_id") for op in startup_prog_ops + if op.type == "c_comm_init" + ] + self.assertIn(self.pp_pair_ring_id, created_ring_ids) + self.assertIn(self.dp_ring_id, created_ring_ids) + + # check correctness of pp group + for op in startup_prog_ops: + if op.type == "c_gen_nccl_id" and op.desc.output_arg_names()[ + 0] == "comm_id_0": + pp_group_waiting_ports = op.desc.attr("other_endpoints") + + self.assertEqual(pp_group_waiting_ports, ['127.0.0.1:36003']) + + # check correctness of dp group + for op in startup_prog_ops: + if op.type == "c_gen_nccl_id" and op.desc.output_arg_names()[ + 0] == "comm_id_3": + dp_group_waiting_ports = op.desc.attr("other_endpoints") + + self.assertEqual(dp_group_waiting_ports, ['127.0.0.1:36002']) + if __name__ == "__main__": unittest.main() From e2241a43aef2f4c4fada26e3a99efea584987c4b Mon Sep 17 00:00:00 2001 From: Hao Lin Date: Fri, 20 Aug 2021 17:58:59 +0800 Subject: [PATCH 121/126] Add paddle.linalg.matrix_power OP (#34667) --- paddle/fluid/operators/matrix_power_op.cc | 131 +++++++ paddle/fluid/operators/matrix_power_op.cu | 27 ++ paddle/fluid/operators/matrix_power_op.h | 277 ++++++++++++++ python/paddle/__init__.py | 2 + .../tests/unittests/test_matrix_power_op.py | 353 ++++++++++++++++++ .../white_list/op_threshold_white_list.py | 1 + python/paddle/linalg.py | 4 +- python/paddle/tensor/__init__.py | 2 + python/paddle/tensor/linalg.py | 70 ++++ 9 files changed, 866 insertions(+), 1 deletion(-) create mode 100644 paddle/fluid/operators/matrix_power_op.cc create mode 100644 paddle/fluid/operators/matrix_power_op.cu create mode 100644 paddle/fluid/operators/matrix_power_op.h create mode 100644 python/paddle/fluid/tests/unittests/test_matrix_power_op.py diff --git a/paddle/fluid/operators/matrix_power_op.cc b/paddle/fluid/operators/matrix_power_op.cc new file mode 100644 index 00000000000000..c65af3129f3646 --- /dev/null +++ b/paddle/fluid/operators/matrix_power_op.cc @@ -0,0 +1,131 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
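[Editor's note, not part of the patch] Before the operator definition, a compact reference for what matrix_power is meant to compute. The sketch below reproduces numpy.linalg.matrix_power semantics (identity for n == 0, powers of the inverse for n < 0) using the same exponentiation-by-squaring idea the kernel applies for large |n|; the function name is an illustrative assumption.

import numpy as np

def matrix_power_ref(x, n):
    m = x.shape[-1]
    if n == 0:
        # X^0 is the identity, broadcast over any batch dimensions
        return np.broadcast_to(np.eye(m, dtype=x.dtype), x.shape).copy()
    if n < 0:
        x, n = np.linalg.inv(x), -n           # negative powers act on the inverse
    result, z = None, None
    while n > 0:                              # exponentiation by squaring, O(log n) matmuls
        bit, n = n & 1, n >> 1
        z = x if z is None else z @ z
        if bit:
            result = z if result is None else result @ z
    return result

For a well-conditioned square a, np.allclose(matrix_power_ref(a, k), np.linalg.matrix_power(a, k)) is expected to hold for positive and negative k, which matches how the unit tests later in this patch build their expected outputs.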
+ +#include "paddle/fluid/operators/matrix_power_op.h" + +namespace paddle { +namespace operators { + +class MatrixPowerOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "matrix_power"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "matrix_power"); + auto dims = ctx->GetInputDim("X"); + auto n_dim = dims.size(); + PADDLE_ENFORCE_GE(n_dim, 2, + platform::errors::InvalidArgument( + "The Input(X) should have at least 2 dimensions. But " + "received a %d dimension tensor.", + n_dim)); + PADDLE_ENFORCE_EQ(dims[n_dim - 2], dims[n_dim - 1], + platform::errors::InvalidArgument( + "The inner-most 2 dimensions of Input(X) all should " + "be square matrices " + "But received X's shape[-2] = %d and shape[-1] = %d.", + dims[n_dim - 2], dims[n_dim - 1])); + ctx->SetOutputDim("Out", dims); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +class MatrixPowerOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput( + "X", + "(Tensor), The input tensor of matrix_power op. Its shape should be " + "[*, M, M] where * is zero or more batch dimensions, and matrices " + "on the inner-most 2 dimensions all should be square matrices."); + AddOutput("Out", + "(Tensor), The output tensor of matrix_power op. It has the same " + "shape as the input."); + AddAttr("n", "(int), The exponent used to calculate the power of X."); + AddComment(R"DOC( +Matrix Power Operator. + +Computes the n-th power of a square matrix or a batch of square matrices. + +)DOC"); + } +}; + +class MatrixPowerOpInferVarType + : public framework::PassInDtypeAndVarTypeToOutput { + protected: + std::unordered_map& GetInputOutputWithSameType() + const override { + static std::unordered_map u_map{ + {"X", /*->*/ "Out"}}; + return u_map; + } +}; + +class MatrixPowerGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* context) const override { + OP_INOUT_CHECK(context->HasInput("X"), "Input", "X", "matrix_power_grad"); + OP_INOUT_CHECK(context->HasInput("Out"), "Input", "Out", + "matrix_power_grad"); + OP_INOUT_CHECK(context->HasInput(framework::GradVarName("Out")), "Input", + "Out@GRAD", "matrix_power_grad"); + auto x_dims = context->GetInputDim("X"); + auto x_grad_name = framework::GradVarName("X"); + if (context->HasOutput(x_grad_name)) { + context->SetOutputDim(x_grad_name, x_dims); + } + } +}; + +template +class MatrixPowerGradOpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType(this->ForwardOpType() + "_grad"); + op->SetInput("X", this->Input("X")); + op->SetInput("Out", this->Output("Out")); + op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); + op->SetAttrMap(this->Attrs()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(matrix_power, ops::MatrixPowerOp, ops::MatrixPowerOpMaker, + ops::MatrixPowerOpInferVarType, + ops::MatrixPowerGradOpMaker, + ops::MatrixPowerGradOpMaker); + +REGISTER_OPERATOR(matrix_power_grad, ops::MatrixPowerGradOp); + +REGISTER_OP_CPU_KERNEL( + matrix_power, + ops::MatrixPowerKernel, 
+ ops::MatrixPowerKernel); + +REGISTER_OP_CPU_KERNEL( + matrix_power_grad, + ops::MatrixPowerGradKernel, + ops::MatrixPowerGradKernel); diff --git a/paddle/fluid/operators/matrix_power_op.cu b/paddle/fluid/operators/matrix_power_op.cu new file mode 100644 index 00000000000000..d972e9499dc884 --- /dev/null +++ b/paddle/fluid/operators/matrix_power_op.cu @@ -0,0 +1,27 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/matrix_power_op.h" + +namespace ops = paddle::operators; +namespace plf = paddle::platform; + +REGISTER_OP_CUDA_KERNEL(matrix_power, + ops::MatrixPowerKernel, + ops::MatrixPowerKernel); + +REGISTER_OP_CUDA_KERNEL( + matrix_power_grad, + ops::MatrixPowerGradKernel, + ops::MatrixPowerGradKernel); diff --git a/paddle/fluid/operators/matrix_power_op.h b/paddle/fluid/operators/matrix_power_op.h new file mode 100644 index 00000000000000..6c4b8860bf8c66 --- /dev/null +++ b/paddle/fluid/operators/matrix_power_op.h @@ -0,0 +1,277 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/math/matrix_inverse.h" +#include "paddle/fluid/platform/for_range.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +struct IdentityMatrixFunctor { + IdentityMatrixFunctor(const int m, T* output) : m_(m), output_(output) {} + + HOSTDEVICE void operator()(size_t index) const { + const int row = index / m_ % m_; + const int col = index % m_; + output_[index] = col == row ? 
static_cast(1) : static_cast(0); + } + + const int m_; + T* output_; +}; + +template +void MatrixPowerFunction(const Tensor* X, const int n, Tensor* Out, + const paddle::framework::ExecutionContext& ctx) { + const auto& x_dims = X->dims(); + const int x_ndim = x_dims.size(); + T* out_data = Out->mutable_data(ctx.GetPlace()); + + auto& dev_ctx = ctx.template device_context(); + platform::ForRange for_range(dev_ctx, X->numel()); + + if (n == 0) { + // Out = Identity Matrix + IdentityMatrixFunctor functor(x_dims[x_ndim - 1], out_data); + for_range(functor); + return; + } + + auto blas = math::GetBlas(dev_ctx); + + Tensor new_x = ctx.AllocateTmpTensor(X->dims(), dev_ctx); + int new_n = n; + if (n > 0) { + // newX = X + framework::TensorCopy(*X, ctx.GetPlace(), dev_ctx, &new_x); + } else { + // newX = X^{-1}, n = -n + math::MatrixInverseFunctor mat_inv; + mat_inv(dev_ctx, *X, &new_x); + new_n = -n; + } + + if (new_n == 1) { + framework::TensorCopy(new_x, ctx.GetPlace(), dev_ctx, Out); + return; + } + + auto no_trans_desc = math::CreateMatrixDescriptor(x_dims, 0, false); + + if (new_n == 2) { + // Out = newX * newX + Out->mutable_data(ctx.GetPlace()); + blas.MatMul(new_x, no_trans_desc, new_x, no_trans_desc, static_cast(1), + Out, static_cast(0)); + return; + } else if (new_n == 3) { + // Out = (newX * newX) * newX + // Note: C[i] matrices in MatMul must not overlap, i.e. the individual + // gemm operations must be computable independently; otherwise, + // undefined behavior is expected. + Tensor temp = ctx.AllocateTmpTensor(X->dims(), dev_ctx); + blas.MatMul(new_x, no_trans_desc, new_x, no_trans_desc, static_cast(1), + &temp, static_cast(0)); + blas.MatMul(temp, no_trans_desc, new_x, no_trans_desc, static_cast(1), + Out, static_cast(0)); + return; + } else if (new_n == 4) { + // Out = (newX * newX) * (newX * newX) + Tensor temp = ctx.AllocateTmpTensor(X->dims(), dev_ctx); + blas.MatMul(new_x, no_trans_desc, new_x, no_trans_desc, static_cast(1), + &temp, static_cast(0)); + blas.MatMul(temp, no_trans_desc, temp, no_trans_desc, static_cast(1), + Out, static_cast(0)); + return; + } + + // Calculate Out = newX^{n} for abs(n) > 4 with time complexity as O(logN) + int bit = 0; + Tensor z = Tensor(X->type()); + bool out_inited = false; + Tensor temp_out = ctx.AllocateTmpTensor(X->dims(), dev_ctx); + Tensor temp_z = ctx.AllocateTmpTensor(X->dims(), dev_ctx); + while (new_n > 0) { + bit = new_n & 0x1; + new_n >>= 1; + if (z.IsInitialized()) { + blas.MatMul(z, no_trans_desc, z, no_trans_desc, static_cast(1), + &temp_z, static_cast(0)); + framework::TensorCopy(temp_z, ctx.GetPlace(), dev_ctx, &z); + } else { + z = ctx.AllocateTmpTensor(X->dims(), dev_ctx); + framework::TensorCopy(new_x, ctx.GetPlace(), dev_ctx, &z); + } + if (bit == 1) { + if (out_inited == true) { + blas.MatMul(*Out, no_trans_desc, z, no_trans_desc, static_cast(1), + &temp_out, static_cast(0)); + framework::TensorCopy(temp_out, ctx.GetPlace(), dev_ctx, Out); + } else { + framework::TensorCopy(z, ctx.GetPlace(), dev_ctx, Out); + out_inited = true; + } + } + } + return; +} + +template +class MatrixPowerKernel : public framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const override { + const Tensor* X = ctx.Input("X"); + Tensor* Out = ctx.Output("Out"); + int n = ctx.Attr("n"); + + const auto& x_dims = X->dims(); + const int x_ndim = x_dims.size(); + PADDLE_ENFORCE_EQ( + x_dims[x_ndim - 2], x_dims[x_ndim - 1], + platform::errors::InvalidArgument( + "The inner-most 2 dimensions of Input(X) should 
be equal." + "X's shape[-2] = %d and shape[-1] = %d.", + x_dims[x_ndim - 2], x_dims[x_ndim - 1])); + + MatrixPowerFunction(X, n, Out, ctx); + } +}; + +template +void MatrixPowerGradFunction(const Tensor* X, const Tensor* Out, + const Tensor* dOut, const int n, Tensor* dX, + const paddle::framework::ExecutionContext& ctx) { + dX->mutable_data(ctx.GetPlace()); + const auto& x_dims = X->dims(); + + auto& dev_ctx = ctx.template device_context(); + auto blas = math::GetBlas(dev_ctx); + + if (n == 0) { + // \nabla X = O + math::SetConstant zero; + zero(dev_ctx, dX, static_cast(0)); + return; + } else if (n == 1) { + // \nabla X = \nabla Out + framework::TensorCopy(*dOut, ctx.GetPlace(), dev_ctx, dX); + return; + } + + auto trans_desc = math::CreateMatrixDescriptor(x_dims, 0, true); + auto no_trans_desc = math::CreateMatrixDescriptor(x_dims, 0, false); + + if (n == -1) { + // \nabla X = Out^{T} * \nabla Out * Out^{T} + Tensor temp_dx = + ctx.AllocateTmpTensor(X->dims(), dev_ctx); + blas.MatMul(*Out, trans_desc, *dOut, no_trans_desc, static_cast(-1), + &temp_dx, static_cast(0)); + blas.MatMul(temp_dx, no_trans_desc, *Out, trans_desc, static_cast(1), dX, + static_cast(0)); + return; + } + + Tensor new_x = ctx.AllocateTmpTensor(X->dims(), dev_ctx); + int new_n = n; + if (n > 0) { + // newX = X + framework::TensorCopy(*X, ctx.GetPlace(), dev_ctx, &new_x); + } else { + // newX = X^{-1}, n = -n + math::MatrixInverseFunctor mat_inv; + mat_inv(dev_ctx, *X, &new_x); + new_n = -n; + } + + // Use chain rule blow to compute \nabla newX^{n} + // First, Get newX^{0}, newX^{1}, ..., newX^{n - 1}, + // Note that newX^{0} can be omitted + std::vector> tensor_list(new_n - 1); + tensor_list[0] = std::make_shared(new_x); + int index = 1; + while (index < new_n - 1) { + tensor_list[index] = std::make_shared( + ctx.AllocateTmpTensor(X->dims(), dev_ctx)); + blas.MatMul(*tensor_list[index - 1], no_trans_desc, new_x, no_trans_desc, + static_cast(1), tensor_list[index].get(), static_cast(0)); + index++; + } + + // Second, \nabla newX = \sum_{i = 0}^{n - 1} (newX^{T}^{i} + // * \nabla Out + // * (newX^{T}^{n - i - 1}) + Tensor dx_new = ctx.AllocateTmpTensor(X->dims(), dev_ctx); + blas.MatMul(*tensor_list[new_n - 2], trans_desc, *dOut, no_trans_desc, + static_cast(1), &dx_new, static_cast(0)); + Tensor da_an_minus1 = + ctx.AllocateTmpTensor(X->dims(), dev_ctx); + blas.MatMul(*dOut, no_trans_desc, *tensor_list[new_n - 2], trans_desc, + static_cast(1), &da_an_minus1, static_cast(0)); + blas.AXPY(X->numel(), static_cast(1), da_an_minus1.data(), + dx_new.data()); + int start = 0; + while (start < new_n - 2) { + Tensor a_da = ctx.AllocateTmpTensor(X->dims(), dev_ctx); + Tensor a_da_a = ctx.AllocateTmpTensor(X->dims(), dev_ctx); + blas.MatMul(*tensor_list[start], trans_desc, *dOut, no_trans_desc, + static_cast(1), &a_da, static_cast(0)); + blas.MatMul(a_da, no_trans_desc, *tensor_list[new_n - 3 - start], + trans_desc, static_cast(1), &a_da_a, static_cast(0)); + blas.AXPY(X->numel(), static_cast(1), a_da_a.data(), + dx_new.data()); + start++; + } + + if (n > 0) { + // \nabla X = \nabla newX + framework::TensorCopy(dx_new, ctx.GetPlace(), dev_ctx, dX); + } else { + // \nabla X = newX^{T} * \nabla newX * newX^{T} + Tensor temp_dx = + ctx.AllocateTmpTensor(X->dims(), dev_ctx); + blas.MatMul(new_x, trans_desc, dx_new, no_trans_desc, static_cast(-1), + &temp_dx, static_cast(0)); + blas.MatMul(temp_dx, no_trans_desc, new_x, trans_desc, static_cast(1), + dX, static_cast(0)); + } + return; +} + +template +class MatrixPowerGradKernel : 
public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const Tensor* X = ctx.Input("X"); + const Tensor* Out = ctx.Input("Out"); + const Tensor* dOut = ctx.Input(framework::GradVarName("Out")); + const int n = ctx.Attr("n"); + Tensor* dX = ctx.Output(framework::GradVarName("X")); + + MatrixPowerGradFunction(X, Out, dOut, n, dX, ctx); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index 27a414e092802d..1c38d519798666 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -99,6 +99,7 @@ from .tensor.linalg import bmm # noqa: F401 from .tensor.linalg import histogram # noqa: F401 from .tensor.linalg import mv # noqa: F401 +from .tensor.linalg import matrix_power # noqa: F401 from .tensor.logic import equal # noqa: F401 from .tensor.logic import greater_equal # noqa: F401 from .tensor.logic import greater_than # noqa: F401 @@ -491,6 +492,7 @@ 'stack', 'sqrt', 'cholesky', + 'matrix_power', 'randperm', 'linspace', 'reshape', diff --git a/python/paddle/fluid/tests/unittests/test_matrix_power_op.py b/python/paddle/fluid/tests/unittests/test_matrix_power_op.py new file mode 100644 index 00000000000000..96823f49d2f08b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_matrix_power_op.py @@ -0,0 +1,353 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
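Note (illustrative aside, not part of the patch): the MatrixPowerFunction above computes Out = newX^n for |n| > 4 by exponentiation by squaring, so only O(log n) matrix products are needed, and the backward pass uses the chain-rule sum grad_newX = sum_{i=0}^{n-1} (newX^T)^i * grad_Out * (newX^T)^(n-1-i), mapping the result back through newX = X^{-1} when n < 0. A minimal NumPy sketch of the forward scheme follows; the helper name matrix_power_sketch and the use of numpy are assumptions of this note only.

    import numpy as np

    def matrix_power_sketch(x, n):
        # Square-and-multiply, mirroring the |n| > 4 branch of the kernel:
        # invert once for negative exponents, then repeatedly square the base
        # and multiply it into the result wherever the exponent has a set bit.
        if n == 0:
            return np.eye(x.shape[-1], dtype=x.dtype)
        base = np.linalg.inv(x) if n < 0 else x.copy()
        n = abs(n)
        result = None
        while n > 0:
            if n & 1:
                result = base if result is None else result @ base
            base = base @ base
            n >>= 1
        return result

    # Sanity check against NumPy's reference implementation.
    mat = np.random.rand(4, 4)
    assert np.allclose(matrix_power_sketch(mat, 5), np.linalg.matrix_power(mat, 5))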
+ +import unittest +import numpy as np +import paddle.fluid as fluid +import paddle.fluid.core as core +import paddle +from op_test import OpTest + +paddle.enable_static() + + +class TestMatrixPowerOp(OpTest): + def config(self): + self.matrix_shape = [10, 10] + self.dtype = "float64" + self.n = 0 + + def setUp(self): + self.op_type = "matrix_power" + self.config() + + np.random.seed(123) + mat = np.random.random(self.matrix_shape).astype(self.dtype) + powered_mat = np.linalg.matrix_power(mat, self.n) + + self.inputs = {"X": mat} + self.outputs = {"Out": powered_mat} + self.attrs = {"n": self.n} + + def test_check_output(self): + self.check_output() + + def test_grad(self): + self.check_grad( + ["X"], "Out", numeric_grad_delta=1e-5, max_relative_error=1e-7) + + +class TestMatrixPowerOpN1(TestMatrixPowerOp): + def config(self): + self.matrix_shape = [10, 10] + self.dtype = "float64" + self.n = 1 + + +class TestMatrixPowerOpN2(TestMatrixPowerOp): + def config(self): + self.matrix_shape = [10, 10] + self.dtype = "float64" + self.n = 2 + + +class TestMatrixPowerOpN3(TestMatrixPowerOp): + def config(self): + self.matrix_shape = [10, 10] + self.dtype = "float64" + self.n = 3 + + +class TestMatrixPowerOpN4(TestMatrixPowerOp): + def config(self): + self.matrix_shape = [10, 10] + self.dtype = "float64" + self.n = 4 + + +class TestMatrixPowerOpN5(TestMatrixPowerOp): + def config(self): + self.matrix_shape = [10, 10] + self.dtype = "float64" + self.n = 5 + + +class TestMatrixPowerOpN6(TestMatrixPowerOp): + def config(self): + self.matrix_shape = [10, 10] + self.dtype = "float64" + self.n = 6 + + +class TestMatrixPowerOpN10(TestMatrixPowerOp): + def config(self): + self.matrix_shape = [10, 10] + self.dtype = "float64" + self.n = 10 + + +class TestMatrixPowerOpNMinus(TestMatrixPowerOp): + def config(self): + self.matrix_shape = [10, 10] + self.dtype = "float64" + self.n = -1 + + def test_grad(self): + self.check_grad( + ["X"], "Out", numeric_grad_delta=1e-5, max_relative_error=1e-6) + + +class TestMatrixPowerOpNMinus2(TestMatrixPowerOpNMinus): + def config(self): + self.matrix_shape = [10, 10] + self.dtype = "float64" + self.n = -2 + + +class TestMatrixPowerOpNMinus3(TestMatrixPowerOpNMinus): + def config(self): + self.matrix_shape = [10, 10] + self.dtype = "float64" + self.n = -3 + + +class TestMatrixPowerOpNMinus4(TestMatrixPowerOpNMinus): + def config(self): + self.matrix_shape = [10, 10] + self.dtype = "float64" + self.n = -4 + + +class TestMatrixPowerOpNMinus5(TestMatrixPowerOpNMinus): + def config(self): + self.matrix_shape = [10, 10] + self.dtype = "float64" + self.n = -5 + + +class TestMatrixPowerOpNMinus6(TestMatrixPowerOpNMinus): + def config(self): + self.matrix_shape = [10, 10] + self.dtype = "float64" + self.n = -6 + + +class TestMatrixPowerOpNMinus10(TestMatrixPowerOp): + def config(self): + self.matrix_shape = [10, 10] + self.dtype = "float64" + self.n = -10 + + def test_grad(self): + self.check_grad( + ["X"], "Out", numeric_grad_delta=1e-5, max_relative_error=1e-6) + + +class TestMatrixPowerOpBatched1(TestMatrixPowerOp): + def config(self): + self.matrix_shape = [8, 4, 4] + self.dtype = "float64" + self.n = 5 + + +class TestMatrixPowerOpBatched2(TestMatrixPowerOp): + def config(self): + self.matrix_shape = [2, 6, 4, 4] + self.dtype = "float64" + self.n = 4 + + +class TestMatrixPowerOpBatched3(TestMatrixPowerOp): + def config(self): + self.matrix_shape = [2, 6, 4, 4] + self.dtype = "float64" + self.n = 0 + + +class TestMatrixPowerOpBatchedLong(TestMatrixPowerOp): + def config(self): + 
self.matrix_shape = [1, 2, 3, 4, 4, 3, 3] + self.dtype = "float64" + self.n = 3 + + +class TestMatrixPowerOpLarge1(TestMatrixPowerOp): + def config(self): + self.matrix_shape = [32, 32] + self.dtype = "float64" + self.n = 3 + + +class TestMatrixPowerOpLarge2(TestMatrixPowerOp): + def config(self): + self.matrix_shape = [10, 10] + self.dtype = "float64" + self.n = 32 + + +class TestMatrixPowerOpFP32(TestMatrixPowerOp): + def config(self): + self.matrix_shape = [10, 10] + self.dtype = "float32" + self.n = 2 + + def test_grad(self): + self.check_grad(["X"], "Out", max_relative_error=1e-2) + + +class TestMatrixPowerOpBatchedFP32(TestMatrixPowerOpFP32): + def config(self): + self.matrix_shape = [2, 8, 4, 4] + self.dtype = "float32" + self.n = 2 + + +class TestMatrixPowerOpLarge1FP32(TestMatrixPowerOpFP32): + def config(self): + self.matrix_shape = [32, 32] + self.dtype = "float32" + self.n = 2 + + +class TestMatrixPowerOpLarge2FP32(TestMatrixPowerOpFP32): + def config(self): + self.matrix_shape = [10, 10] + self.dtype = "float32" + self.n = 32 + + +class TestMatrixPowerOpFP32Minus(TestMatrixPowerOpFP32): + def config(self): + self.matrix_shape = [10, 10] + self.dtype = "float32" + self.n = -1 + + +class TestMatrixPowerAPI(unittest.TestCase): + def setUp(self): + np.random.seed(123) + self.places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + self.places.append(fluid.CUDAPlace(0)) + + def check_static_result(self, place): + with fluid.program_guard(fluid.Program(), fluid.Program()): + input_x = fluid.data(name="input_x", shape=[4, 4], dtype="float64") + result = paddle.linalg.matrix_power(x=input_x, n=-2) + input_np = np.random.random([4, 4]).astype("float64") + result_np = np.linalg.matrix_power(input_np, -2) + + exe = fluid.Executor(place) + fetches = exe.run(fluid.default_main_program(), + feed={"input_x": input_np}, + fetch_list=[result]) + self.assertTrue( + np.allclose(fetches[0], np.linalg.matrix_power(input_np, -2))) + + def test_static(self): + for place in self.places: + self.check_static_result(place=place) + + def test_dygraph(self): + for place in self.places: + with fluid.dygraph.guard(place): + input_np = np.random.random([4, 4]).astype("float64") + input = paddle.to_tensor(input_np) + result = paddle.linalg.matrix_power(input, -2) + self.assertTrue( + np.allclose(result.numpy(), + np.linalg.matrix_power(input_np, -2))) + + +class TestMatrixPowerAPIError(unittest.TestCase): + def test_errors(self): + input_np = np.random.random([4, 4]).astype("float64") + + # input must be Variable. + self.assertRaises(TypeError, paddle.linalg.matrix_power, input_np) + + # n must be int + for n in [2.0, '2', -2.0]: + input = fluid.data( + name="input_float32", shape=[4, 4], dtype='float32') + self.assertRaises(TypeError, paddle.linalg.matrix_power, input, n) + + # The data type of input must be float32 or float64. + for dtype in ["bool", "int32", "int64", "float16"]: + input = fluid.data(name="input_" + dtype, shape=[4, 4], dtype=dtype) + self.assertRaises(TypeError, paddle.linalg.matrix_power, input, 2) + + # When out is set, the data type must be the same as input. + input = fluid.data(name="input_1", shape=[4, 4], dtype="float32") + out = fluid.data(name="output", shape=[4, 4], dtype="float64") + self.assertRaises(TypeError, paddle.linalg.matrix_power, input, 2, out) + + # The number of dimensions of input must be >= 2. 
+ input = fluid.data(name="input_2", shape=[4], dtype="float32") + self.assertRaises(ValueError, paddle.linalg.matrix_power, input, 2) + + # The inner-most 2 dimensions of input should be equal to each other + input = fluid.data(name="input_3", shape=[4, 5], dtype="float32") + self.assertRaises(ValueError, paddle.linalg.matrix_power, input, 2) + + +class TestMatrixPowerSingularAPI(unittest.TestCase): + def setUp(self): + self.places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + self.places.append(fluid.CUDAPlace(0)) + + def check_static_result(self, place): + with fluid.program_guard(fluid.Program(), fluid.Program()): + input = fluid.data(name="input", shape=[4, 4], dtype="float64") + result = paddle.linalg.matrix_power(x=input, n=-2) + + input_np = np.zeros([4, 4]).astype("float64") + + exe = fluid.Executor(place) + try: + fetches = exe.run(fluid.default_main_program(), + feed={"input": input_np}, + fetch_list=[result]) + except RuntimeError as ex: + print("The mat is singular") + pass + except ValueError as ex: + print("The mat is singular") + pass + + def test_static(self): + paddle.enable_static() + for place in self.places: + self.check_static_result(place=place) + paddle.disable_static() + + def test_dygraph(self): + for place in self.places: + with fluid.dygraph.guard(place): + input_np = np.ones([4, 4]).astype("float64") + input = fluid.dygraph.to_variable(input_np) + try: + result = paddle.linalg.matrix_power(input, -2) + except RuntimeError as ex: + print("The mat is singular") + pass + except ValueError as ex: + print("The mat is singular") + pass + + +if __name__ == "__main__": + paddle.enable_static() + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/white_list/op_threshold_white_list.py b/python/paddle/fluid/tests/unittests/white_list/op_threshold_white_list.py index c771531b7b61be..929a9696d1c12d 100644 --- a/python/paddle/fluid/tests/unittests/white_list/op_threshold_white_list.py +++ b/python/paddle/fluid/tests/unittests/white_list/op_threshold_white_list.py @@ -46,6 +46,7 @@ 'cudnn_lstm', \ 'rnn', \ 'lgamma', \ + 'matrix_power', \ ] NEED_FIX_FP64_CHECK_OUTPUT_THRESHOLD_OP_LIST = ['bilinear_interp',\ diff --git a/python/paddle/linalg.py b/python/paddle/linalg.py index 5cef01d18aca48..ec6b7aa9e3d821 100644 --- a/python/paddle/linalg.py +++ b/python/paddle/linalg.py @@ -14,10 +14,12 @@ from .tensor.linalg import cholesky # noqa: F401 from .tensor.linalg import norm # noqa: F401 +from .tensor.linalg import matrix_power # noqa: F401 from .tensor import inverse as inv # noqa: F401 __all__ = [ 'cholesky', #noqa 'norm', - 'inv' + 'inv', + 'matrix_power' ] diff --git a/python/paddle/tensor/__init__.py b/python/paddle/tensor/__init__.py index bcb508d11922fc..cc20e98006fec4 100755 --- a/python/paddle/tensor/__init__.py +++ b/python/paddle/tensor/__init__.py @@ -44,6 +44,7 @@ from .linalg import bmm # noqa: F401 from .linalg import histogram # noqa: F401 from .linalg import mv # noqa: F401 +from .linalg import matrix_power # noqa: F401 from .logic import equal # noqa: F401 from .logic import greater_equal # noqa: F401 from .logic import greater_than # noqa: F401 @@ -220,6 +221,7 @@ 'bmm', 'histogram', 'mv', + 'matrix_power', 'abs', 'acos', 'all', diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py index a1610581b67c03..74d9876cddd5cb 100644 --- a/python/paddle/tensor/linalg.py +++ b/python/paddle/tensor/linalg.py @@ -941,3 +941,73 @@ def __check_input(x, vec): type='mv', inputs={'X': x, 'Vec': vec}, outputs={'Out': out}) return 
out + + +def matrix_power(x, n, name=None): + r""" + Computes the n-th power of a square matrix or a batch of square matrices. + + Let :math:`X` be a sqaure matrix or a batch of square matrices, :math:`n` be + an exponent, the equation should be: + + .. math:: + Out = X ^ {n} + + Specifically, + + - If `n > 0`, it returns the matrix or a batch of matrices raised to the power + of `n`. + + - If `n = 0`, it returns the identity matrix or a batch of identity matrices. + + - If `n < 0`, it returns the inverse of each matrix (if invertible) raised to + the power of `abs(n)`. + + Args: + x (Tensor): A square matrix or a batch of square matrices to be raised + to power `n`. Its shape should be `[*, M, M]`, where `*` is zero or + more batch dimensions. Its data type should be float32 or float64. + n (int): The exponent. It can be any positive, negative integer or zero. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + Tensor: The n-th power of the matrix (or the batch of matrices) `x`. Its + data type should be the same as that of `x`. + + Examples: + .. code-block:: python + + import paddle + + x = paddle.to_tensor([[1, 2, 3], + [1, 4, 9], + [1, 8, 27]], dtype='float64') + print(paddle.matrix_power(x, 2)) + # [[6. , 34. , 102.], + # [14. , 90. , 282.], + # [36. , 250., 804.]] + + print(paddle.matrix_power(x, 0)) + # [[1., 0., 0.], + # [0., 1., 0.], + # [0., 0., 1.]] + + print(paddle.matrix_power(x, -2)) + # [[ 12.91666667, -12.75000000, 2.83333333 ], + # [-7.66666667 , 8. , -1.83333333 ], + # [ 1.80555556 , -1.91666667 , 0.44444444 ]] + """ + if in_dygraph_mode(): + return core.ops.matrix_power(x, "n", n) + + check_variable_and_dtype(x, 'dtype', ['float32', 'float64'], 'matrix_power') + check_type(n, 'n', int, 'matrix_power') + helper = LayerHelper('matrix_power', **locals()) + out = helper.create_variable_for_type_inference(dtype=x.dtype) + helper.append_op( + type='matrix_power', + inputs={'X': x}, + outputs={'Out': out}, + attrs={'n': n}) + return out From 56c5e2107611690fb36513ed70c9c5c17d7f5c08 Mon Sep 17 00:00:00 2001 From: Zhang Zheng <32410583+ZzSean@users.noreply.github.com> Date: Sun, 22 Aug 2021 21:27:46 +0800 Subject: [PATCH 122/126] implementation of broadcast add backward by reduce (#34143) --- .../elementwise/elementwise_add_op.cu | 52 ++++ .../elementwise/elementwise_add_op.h | 282 ++---------------- .../elementwise/elementwise_op_function.h | 20 ++ 3 files changed, 90 insertions(+), 264 deletions(-) diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cu b/paddle/fluid/operators/elementwise/elementwise_add_op.cu index aff0cb281642ec..69bcd6d0d06ff6 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cu @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/elementwise/elementwise_add_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" +#include "paddle/fluid/operators/reduce_ops/reduce_functor_op.h" +#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" #include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/float16.h" @@ -83,6 +85,56 @@ static __global__ void SimpleElemwiseAddGradCUDAKernel( } } +template +typename std::enable_if< + std::is_same::value>::type +default_elementwise_add_grad(const framework::ExecutionContext& ctx, + const framework::Tensor* x, + const framework::Tensor* y, + const framework::Tensor* out, + const framework::Tensor* dout, + framework::Tensor* dx, framework::Tensor* dy) { + int axis = ctx.Attr("axis"); + auto* dout_data = dout->data(); + + // dx + if (dx != nullptr) { + auto* dx_data = dx->mutable_data(ctx.GetPlace()); + if (dx->dims() == dout->dims()) { + if (dx_data != dout_data) { + framework::TensorCopy( + *dout, ctx.GetPlace(), + ctx.template device_context(), dx); + } + } else { + // For inplace strategy, dx will be stored in addr of dout, which makes + // the result of dy wrong. + if (dx->IsSharedBufferWith(*dout)) { + dx->clear(); + dx->mutable_data(x->dims(), ctx.GetPlace()); + } + std::vector reduce_dims = GetReduceDim(x->dims(), out->dims(), axis); + gpuStream_t stream = ctx.cuda_device_context().stream(); + TensorReduceFunctorImpl(*dout, dx, reduce_dims, stream); + } + } + // dy + if (dy != nullptr) { + auto* dy_data = dy->mutable_data(ctx.GetPlace()); + if (dy->dims() == dout->dims()) { + if (dy_data != dout_data) { + framework::TensorCopy( + *dout, ctx.GetPlace(), + ctx.template device_context(), dy); + } + } else { + std::vector reduce_dims = GetReduceDim(y->dims(), out->dims(), axis); + gpuStream_t stream = ctx.cuda_device_context().stream(); + TensorReduceFunctorImpl(*dout, dy, reduce_dims, stream); + } + } +} + template typename std::enable_if< std::is_same::value>::type diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.h b/paddle/fluid/operators/elementwise/elementwise_add_op.h index ad9066540c23bf..6f7b2a2e30bcd4 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.h @@ -85,13 +85,14 @@ struct IdentityGrad { }; template -void default_elementwise_add_grad(const framework::ExecutionContext &ctx, - const framework::Tensor *x, - const framework::Tensor *y, - const framework::Tensor *out, - const framework::Tensor *dout, - framework::Tensor *dx, - framework::Tensor *dy) { +typename std::enable_if< + std::is_same::value>::type +default_elementwise_add_grad(const framework::ExecutionContext &ctx, + const framework::Tensor *x, + const framework::Tensor *y, + const framework::Tensor *out, + const framework::Tensor *dout, + framework::Tensor *dx, framework::Tensor *dy) { int axis = ctx.Attr("axis"); ElemwiseExplicitGradCompute, @@ -133,167 +134,6 @@ elementwise_add_grad(const framework::ExecutionContext &ctx, default_elementwise_add_grad(ctx, x, y, out, dout, dx, dy); } -#ifdef PADDLE_WITH_CUDA -#ifdef __NVCC__ - -template -struct alignas(sizeof(T) * Size) AlignedVector { - T val[Size]; -}; - -template -inline int VectorizedSize(const T *pointer) { - uint64_t address = reinterpret_cast(pointer); - constexpr int vec4 = std::alignment_of>::value; // NOLINT - if (address % vec4 == 0) { - return 4; - } - return 1; -} -template -__global__ void MatrixColReduce(const T *__restrict__ in, T *__restrict__ out, - size_t width, 
size_t height) { - __shared__ T sdata[BLOCK_H][BLOCK_W + 1]; - size_t idx = threadIdx.x + blockDim.x * blockIdx.x; - size_t width_stride = gridDim.x * blockDim.x; - size_t full_width = (width & (~((uint64_t)(BLOCK_W - 1)))) + - ((width & (BLOCK_W - 1)) ? BLOCK_W : 0); - size_t full_height = (height & (~((uint64_t)(BLOCK_H - 1)))) + - ((height & (BLOCK_H - 1)) ? BLOCK_H : 0); - -#pragma unroll - for (size_t w = idx; w < full_width; w += width_stride) { - sdata[threadIdx.y][threadIdx.x] = 0; - __syncthreads(); - size_t offset = w + threadIdx.y * width; -#pragma unroll - for (size_t h = threadIdx.y; h < full_height; - h += BLOCK_H) { // block-stride loop across matrix height - sdata[threadIdx.y][threadIdx.x] += - (w < width && h < height) ? in[offset] : (static_cast(0)); - offset += width * BLOCK_H; - } - __syncthreads(); - - T val = sdata[threadIdx.x][threadIdx.y]; - for (int i = warpSize >> 1; i > 0; i >>= 1) - val += platform::CudaShuffleXorSync(0xFFFFFFFF, val, i); - - __syncthreads(); - if (threadIdx.x == 0) sdata[0][threadIdx.y] = val; - __syncthreads(); - if ((threadIdx.y == 0) && ((w) < width)) out[w] = sdata[0][threadIdx.x]; - } -} - -#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 10000 -template -__global__ void VecFP16MatrixColReduce(const __half2 *__restrict__ in, - __half2 *__restrict__ out, size_t width, - size_t height) { -#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) - int idx = threadIdx.x + blockIdx.x * blockDim.x; - int by = blockIdx.y; - __half2 zero = __half2half2(static_cast<__half>(0)); - const int cols = width / 2; - for (; idx < cols; idx += blockDim.x * gridDim.x) { - __half2 sum = zero; - for (int row = 0; row < SIZE; row++) { - int index = idx + (row + by * SIZE) * cols; - sum = __hadd2(sum, in[index]); - } - - atomicAdd(&(out[idx]), sum); - } -#endif -} -#endif - -template -__global__ void MatrixReduceLongWidth(const T *__restrict__ in, T *out, - size_t width, size_t height) { - int idx = threadIdx.x + blockIdx.x * blockDim.x; - - for (; idx < width; idx += blockDim.x * gridDim.x) { - T sum = static_cast(0); - for (int row = 0; row < height; row++) { - sum += in[idx + row * width]; - } - - out[idx] = sum; - } -} - -template -__global__ void VecMatrixReduceLongWidth(const T *__restrict__ in, T *out, - size_t width, size_t height) { - using LoadT = AlignedVector; - int idx = threadIdx.x + blockIdx.x * blockDim.x; - int w = idx * VEC_SIZE; - int width_stride = blockDim.x * gridDim.x * VEC_SIZE; - for (; w < width; w += width_stride) { - T zero = static_cast(0); - T sum[VEC_SIZE] = {zero}; - T tmp_vec[VEC_SIZE] = {zero}; - LoadT *tmp_ptr = reinterpret_cast(&tmp_vec); - for (int row = 0; row < height; row++) { - int offset = width * row + w; - *tmp_ptr = *reinterpret_cast(&in[offset]); - for (int v = 0; v < VEC_SIZE; v++) { - sum[v] += tmp_vec[v]; - } - } - - for (int v = 0; v < VEC_SIZE; v++) out[w + v] = sum[v]; - } -} -#endif -#endif -bool static RunSpecialDims(const framework::DDim &dx_dims, - const framework::DDim &dy_dims, - const framework::DDim &dout_dims, int axis) { - auto smaller_dims = dx_dims; - auto bigger_dims = dy_dims; - auto smaller_dims_size = smaller_dims.size(); - auto bigger_dims_size = bigger_dims.size(); - int smaller_ignore_size = 0; - int bigger_ignore_size = 0; - for (int i = 0; i < smaller_dims_size; i++) { - if (smaller_dims[i] == 1) - smaller_ignore_size++; - else - break; - } - for (int i = 0; i < bigger_dims_size; i++) { - if (bigger_dims[i] == 1) - bigger_ignore_size++; - else - break; - } - - int smaller_real_size = smaller_dims.size() 
- smaller_ignore_size; - int bigger_real_size = bigger_dims.size() - bigger_ignore_size; - - if (smaller_real_size == bigger_real_size) return false; - - if (bigger_real_size < smaller_real_size) { - smaller_dims = dy_dims; - bigger_dims = dx_dims; - std::swap(smaller_real_size, bigger_real_size); - } - int big_size = bigger_dims.size(); - int small_size = smaller_dims.size(); - for (int i = 1; i <= smaller_real_size; i++) { - if (bigger_dims[big_size - i] != smaller_dims[small_size - i]) return false; - } - - if (axis != -1 && (axis != (bigger_real_size - smaller_real_size))) { - return false; - } - - return true; -} - #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) // cuda definition template @@ -304,6 +144,16 @@ elementwise_add_grad(const framework::ExecutionContext &ctx, const framework::Tensor *out, const framework::Tensor *dout, framework::Tensor *dx, framework::Tensor *dy); + +template +typename std::enable_if< + std::is_same::value>::type +default_elementwise_add_grad(const framework::ExecutionContext &ctx, + const framework::Tensor *x, + const framework::Tensor *y, + const framework::Tensor *out, + const framework::Tensor *dout, + framework::Tensor *dx, framework::Tensor *dy); #endif template @@ -322,102 +172,6 @@ class ElementwiseAddGradKernel : public ElemwiseGradKernel { // skip out auto *out = dout; -// TODO(@wangchaochaohu, zhouwei35): Fix conv_transpose2d API(dataformat NHWC) -// error in Windows -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) -#ifdef __NVCC__ - - int axis = ctx.Attr("axis"); - if (ctx.GetPlace() == platform::CUDAPlace() && dx != nullptr && - dy != nullptr && dout != nullptr && dx->numel() != dy->numel() && - RunSpecialDims(dx->dims(), dy->dims(), dout->dims(), axis)) { - auto *dx_data = dx->mutable_data(ctx.GetPlace()); - auto *dy_data = dy->mutable_data(ctx.GetPlace()); - auto *dout_data = dout->data(); - auto stream = ctx.cuda_device_context().stream(); - auto *out_data = dx_data; - int width = dx->numel(); - int height = dout->numel() / width; - if (dx->dims() == dout->dims()) { - width = dy->numel(); - height = dout->numel() / width; - out_data = dy_data; - framework::TensorCopy( - *dout, ctx.GetPlace(), - ctx.template device_context(), dx); - } else { - framework::TensorCopy( - *dout, ctx.GetPlace(), - ctx.template device_context(), dy); - } - // special optimization using cub - if (width == 1) { - int nums = height; - size_t temp_storage_bytes = 0; - auto err = cub::DeviceReduce::Sum(nullptr, temp_storage_bytes, - dout_data, out_data, nums, stream); - PADDLE_ENFORCE_CUDA_SUCCESS(err); - framework::Tensor tmp; - auto *temp_storage = tmp.mutable_data( - framework::make_ddim({static_cast(temp_storage_bytes)}), - ctx.GetPlace()); - err = cub::DeviceReduce::Sum(temp_storage, temp_storage_bytes, - dout_data, out_data, nums, stream); - PADDLE_ENFORCE_CUDA_SUCCESS(err); - return; - } - - constexpr int block_x = 32; - constexpr int block_y = 32; - dim3 blocks(block_x, block_y); - - int max_physical_threads = - ctx.cuda_device_context().GetMaxPhysicalThreadCount(); - int max_blocks = std::max(max_physical_threads / (block_x * block_y), 1); - int theory_block = (width + blocks.x - 1) / blocks.x; - dim3 grids(std::min(theory_block, max_blocks)); -#if CUDA_VERSION >= 10000 - if (std::is_same::value && width < 2048 && - width % 2 == 0 && height % 64 == 0) { - auto &dev_ctx = - ctx.template device_context(); - math::SetConstant functor; - if (dout->dims() == dx->dims()) - functor(dev_ctx, dy, static_cast(0)); - else - functor(dev_ctx, dx, 
static_cast(0)); - const __half2 *ptr1 = reinterpret_cast(dout_data); - __half2 *ptr2 = reinterpret_cast<__half2 *>(out_data); - const int threads = 128; - dim3 grid(1, (height + 64 - 1) / 64); - VecFP16MatrixColReduce<64><<>>(ptr1, ptr2, - width, height); - return; - } -#endif - - if (width / height < 32) { - MatrixColReduce<<>>( - dout_data, out_data, width, height); - } else { - size_t thread_nums = 1024; - size_t block_nums = (width + thread_nums - 1) / thread_nums; - int vec_size = VectorizedSize(dout_data); - if (vec_size == 4 && width % 4 == 0) { - block_nums = (width / vec_size + thread_nums - 1) / thread_nums; - VecMatrixReduceLongWidth<<>>( - dout_data, out_data, width, height); - } else { - MatrixReduceLongWidth<<>>( - dout_data, out_data, width, height); - } - } - return; - } - -#endif -#endif // Special case when dy is not needed and dx doesn't reduce if (dx != nullptr && dy == nullptr && dx->dims() == dout->dims()) { VLOG(4) << "Special case when dy is not needed and dx doesn't " diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h b/paddle/fluid/operators/elementwise/elementwise_op_function.h index cc291ae471386f..dd8e3d409c01e6 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h @@ -3038,5 +3038,25 @@ static inline void GetDoubleGradSafeTensor( } } +// for broadcast backwards +static inline std::vector GetReduceDim(const framework::DDim &in, + const framework::DDim &out, + int axis) { + axis = + (axis == -1 ? std::abs(static_cast(out.size() - in.size())) : axis); + std::vector dims; + for (int i = 0; i < axis; ++i) { + dims.push_back(i); + } + for (int i = 0; i < in.size(); ++i) { + if (out[i + axis] != in[i]) { + dims.push_back(i + axis); + } + } + for (int i = axis + in.size(); i < out.size(); ++i) { + dims.push_back(i); + } + return dims; +} } // namespace operators } // namespace paddle From cf99c0d5f8f03e95fd65aff1c65ddf74c4d061ef Mon Sep 17 00:00:00 2001 From: Linjie Chen <40840292+linjieccc@users.noreply.github.com> Date: Mon, 23 Aug 2021 11:00:30 +0800 Subject: [PATCH 123/126] Add cuda.device_count api (#34811) * Add cuda device count api * update coda format * fix unittest error * update code format * update comment --- python/paddle/device/cuda/__init__.py | 23 ++++++++++++++++ .../tests/unittests/test_cuda_device_count.py | 26 +++++++++++++++++++ 2 files changed, 49 insertions(+) create mode 100644 python/paddle/fluid/tests/unittests/test_cuda_device_count.py diff --git a/python/paddle/device/cuda/__init__.py b/python/paddle/device/cuda/__init__.py index 8b5879fe9a4ad3..834cda71fdc5f1 100644 --- a/python/paddle/device/cuda/__init__.py +++ b/python/paddle/device/cuda/__init__.py @@ -22,6 +22,7 @@ 'Event', 'current_stream', 'synchronize', + 'device_count', ] @@ -94,3 +95,25 @@ def synchronize(device=None): raise ValueError("device type must be int or paddle.CUDAPlace") return core._device_synchronize(device_id) + + +def device_count(): + ''' + Return the number of GPUs available. + + Returns: + int: the number of GPUs available. + + Examples: + .. 
code-block:: python + + import paddle + + paddle.device.cuda.device_count() + + ''' + + num_gpus = core.get_cuda_device_count() if hasattr( + core, 'get_cuda_device_count') else 0 + + return num_gpus diff --git a/python/paddle/fluid/tests/unittests/test_cuda_device_count.py b/python/paddle/fluid/tests/unittests/test_cuda_device_count.py new file mode 100644 index 00000000000000..f4114c9d451b39 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_cuda_device_count.py @@ -0,0 +1,26 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import unittest + + +class TestDeviceCount(unittest.TestCase): + def test_device_count(self): + s = paddle.device.cuda.device_count() + self.assertIsNotNone(s) + + +if __name__ == "__main__": + unittest.main() From 77a8a3944a01ac2cb3a62c99cd7de459872a01b8 Mon Sep 17 00:00:00 2001 From: zhaoyingli <86812880+lili0826@users.noreply.github.com> Date: Mon, 23 Aug 2021 12:04:01 +0800 Subject: [PATCH 124/126] add adamw cuda kernel (#35020) * adamw support cuda * adamw support cuda --- paddle/fluid/operators/optimizers/adamw_op.cu | 438 ++++++++++++++++++ paddle/fluid/operators/optimizers/adamw_op.h | 104 ++++- paddle/fluid/pybind/op_function_generator.cc | 2 + .../fluid/tests/unittests/test_adamw_op.py | 27 -- python/paddle/optimizer/adamw.py | 82 +--- 5 files changed, 551 insertions(+), 102 deletions(-) create mode 100644 paddle/fluid/operators/optimizers/adamw_op.cu diff --git a/paddle/fluid/operators/optimizers/adamw_op.cu b/paddle/fluid/operators/optimizers/adamw_op.cu new file mode 100644 index 00000000000000..af2bb93e06db9b --- /dev/null +++ b/paddle/fluid/operators/optimizers/adamw_op.cu @@ -0,0 +1,438 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
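Note (illustrative aside, not part of the patch): the GetReduceDim helper added to elementwise_op_function.h in the broadcast-backward patch above picks the axes a broadcast gradient has to be summed over: every leading axis before `axis`, every axis where the input dimension was broadcast (output dim differs from input dim), and every trailing axis past the end of the input. A rough Python equivalent is sketched below purely to illustrate that axis selection; the name get_reduce_dim is hypothetical and not part of the patch.

    def get_reduce_dim(in_shape, out_shape, axis):
        # Mirrors GetReduceDim: collect the axes of `out_shape` that do not
        # correspond one-to-one with `in_shape`, so dx/dy can be obtained by
        # reducing dout over exactly those axes.
        if axis == -1:
            axis = abs(len(out_shape) - len(in_shape))
        dims = list(range(axis))
        dims += [i + axis for i in range(len(in_shape))
                 if out_shape[i + axis] != in_shape[i]]
        dims += list(range(axis + len(in_shape), len(out_shape)))
        return dims

    # A bias of shape [3] broadcast over an output of shape [2, 3, 4] with
    # axis=1 must have its gradient reduced over axes 0 and 2.
    assert get_reduce_dim([3], [2, 3, 4], 1) == [0, 2]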
*/ +#include "paddle/fluid/operators/amp/fp16_type_traits.h" +#include "paddle/fluid/operators/optimizers/adamw_op.h" +#include "paddle/fluid/platform/float16.h" + +namespace paddle { +namespace operators { + +template +__global__ void AdamWKernelREG(MT beta1, MT beta2, MT epsilon, MT coeff, + MT beta1_pow_, MT beta2_pow_, const MT* moment1, + MT* moment1_out, const MT* moment2, + MT* moment2_out, const MT* lr_, const T* grad, + const T* param, T* param_out, + const MT* master_param, MT* master_param_out, + int ndim) { + MT lr = *lr_; + MT beta1_pow = beta1_pow_; + MT beta2_pow = beta2_pow_; + + MT wd = static_cast(1.0) - coeff * lr; + lr *= sqrt(static_cast(1.0) - beta2_pow) / + (static_cast(1.0) - beta1_pow); + + int id = blockIdx.x * blockDim.x + threadIdx.x; + + for (; id < ndim; id += gridDim.x * blockDim.x) { + MT p = master_param ? master_param[id] : static_cast(param[id]); + MT g = static_cast(grad[id]); + MT mom1 = moment1[id]; + MT mom2 = moment2[id]; + mom1 = beta1 * mom1 + (static_cast(1.0) - beta1) * g; + mom2 = beta2 * mom2 + (static_cast(1.0) - beta2) * g * g; + p = wd * p - + lr * (mom1 / + (sqrt(mom2) + epsilon * sqrt(static_cast(1.0) - beta2_pow))); + + moment1_out[id] = mom1; + moment2_out[id] = mom2; + param_out[id] = static_cast(p); + if (master_param_out) { + master_param_out[id] = p; + } + } +} + +template +__global__ void AdamWKernelMEM(MT beta1, MT beta2, MT epsilon, MT coeff, + const MT* beta1_pow_, const MT* beta2_pow_, + const MT* moment1, MT* moment1_out, + const MT* moment2, MT* moment2_out, + const MT* lr_, const T* grad, const T* param, + T* param_out, const MT* master_param, + MT* master_param_out, int ndim) { + MT lr = *lr_; + MT beta1_pow = *beta1_pow_; + MT beta2_pow = *beta2_pow_; + + MT wd = static_cast(1.0) - coeff * lr; + lr *= sqrt(static_cast(1.0) - beta2_pow) / + (static_cast(1.0) - beta1_pow); + + int id = blockIdx.x * blockDim.x + threadIdx.x; + + for (; id < ndim; id += gridDim.x * blockDim.x) { + MT p = master_param ? 
master_param[id] : static_cast(param[id]); + MT g = static_cast(grad[id]); + MT mom1 = static_cast(moment1[id]); + MT mom2 = static_cast(moment2[id]); + mom1 = beta1 * mom1 + (static_cast(1.0) - beta1) * g; + mom2 = beta2 * mom2 + (static_cast(1.0) - beta2) * g * g; + p = wd * p - + lr * (mom1 / + (sqrt(mom2) + epsilon * sqrt(static_cast(1.0) - beta2_pow))); + + moment1_out[id] = mom1; + moment2_out[id] = mom2; + param_out[id] = static_cast(p); + if (master_param_out) { + master_param_out[id] = p; + } + } +} +template +__global__ void UpdateAdamWBetaPow(T beta1, T beta2, const T* beta1_pow_, + const T* beta2_pow_, T* beta1_pow_out, + T* beta2_pow_out) { + *beta1_pow_out = beta1 * beta1_pow_[0]; + *beta2_pow_out = beta2 * beta2_pow_[0]; +} + +template +__global__ void SparseAdamWCUDAKernelREG( + MT beta1, MT beta2, MT epsilon, MT coeff, const MT beta1_pow, + const MT beta2_pow, const MT* mom1_, MT* mom1_out_, const MT* mom2_, + MT* mom2_out_, const MT* lr_, const T* grad_, const T* param_, + T* param_out_, const MT* master_param, MT* master_param_out, + const int64_t* rows_, int64_t row_numel, int64_t row_count, bool lazy_mode, + int ndim) { + int id = blockIdx.x * blockDim.x + threadIdx.x; + MT lr = *lr_; + + MT wd = static_cast(1.0) - coeff * lr; + lr *= sqrt(static_cast(1.0) - beta2_pow) / + (static_cast(1.0) - beta1_pow); + + for (; id < ndim; id += blockDim.x * gridDim.x) { + auto row_idx = + math::BinarySearch(rows_, row_count, id / row_numel); + if (lazy_mode && row_idx < 0) { + return; + } else { + MT mom1 = mom1_[id]; + MT mom2 = mom2_[id]; + MT p = master_param ? master_param[id] : static_cast(param_[id]); + MT g = row_idx >= 0 + ? static_cast(grad_[row_idx * row_numel + id % row_numel]) + : static_cast(0); + mom1 = beta1 * mom1 + (static_cast(1.0) - beta1) * g; + mom2 = beta2 * mom2 + (static_cast(1.0) - beta2) * g * g; + p = wd * p - + lr * (mom1 / (sqrt(mom2) + + epsilon * sqrt(static_cast(1.0) - beta2_pow))); + + // Write back to global memory + mom1_out_[id] = mom1; + mom2_out_[id] = mom2; + param_out_[id] = static_cast(p); + if (master_param_out) { + master_param_out[id] = p; + } + } + } +} + +template +class AdamWOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const auto* param_var = ctx.InputVar("Param"); + PADDLE_ENFORCE_EQ(param_var->IsType(), true, + platform::errors::InvalidArgument( + "The Var(%s)'s type should be LoDTensor, " + "but the received is %s", + ctx.InputNames("Param").front(), + framework::ToTypeName(param_var->Type()))); + + using paddle::framework::LoDTensor; + using MPDType = typename details::MPTypeTrait::Type; + + int64_t min_row_size_to_use_multithread = + ctx.Attr("min_row_size_to_use_multithread"); + bool lazy_mode = ctx.Attr("lazy_mode"); + bool use_global_beta_pow = ctx.Attr("use_global_beta_pow"); + VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow; + float coeff = ctx.Attr("coeff"); + + auto* param = ctx.Input("Param"); + auto* grad_var = ctx.InputVar("Grad"); + auto* mom1 = ctx.Input("Moment1"); + auto* mom2 = ctx.Input("Moment2"); + auto* lr = ctx.Input("LearningRate"); + + auto* beta1_pow = ctx.Input("Beta1Pow"); + auto* beta2_pow = ctx.Input("Beta2Pow"); + + auto* param_out = ctx.Output("ParamOut"); + auto* mom1_out = ctx.Output("Moment1Out"); + auto* mom2_out = ctx.Output("Moment2Out"); + auto* beta1_pow_out = ctx.Output("Beta1PowOut"); + auto* beta2_pow_out = ctx.Output("Beta2PowOut"); + + bool skip_update = false; + if (ctx.HasInput("SkipUpdate")) { + 
auto* skip_update_tensor = ctx.Input("SkipUpdate"); + PADDLE_ENFORCE_EQ(skip_update_tensor->numel(), 1, + platform::errors::InvalidArgument( + "Input(SkipUpdate) size must be 1, but get %d", + skip_update_tensor->numel())); + std::vector skip_update_vec; + TensorToVector(*skip_update_tensor, ctx.device_context(), + &skip_update_vec); + skip_update = skip_update_vec[0]; + } + + // skip_update=true, just copy input to output, and TensorCopy will call + // mutable_data + if (skip_update) { + VLOG(4) << "Adamw skip update"; + framework::TensorCopy( + *param, ctx.GetPlace(), + ctx.template device_context(), param_out); + framework::TensorCopy( + *mom1, ctx.GetPlace(), + ctx.template device_context(), mom1_out); + framework::TensorCopy( + *mom2, ctx.GetPlace(), + ctx.template device_context(), mom2_out); + framework::TensorCopy( + *beta1_pow, ctx.GetPlace(), + ctx.template device_context(), + beta1_pow_out); + framework::TensorCopy( + *beta2_pow, ctx.GetPlace(), + ctx.template device_context(), + beta2_pow_out); + return; + } + + // if with_decay = false, coeff = 0 + bool with_decay = ctx.Attr("with_decay"); + if (!with_decay) { + coeff = static_cast(0.0); + } + + MPDType beta1 = static_cast(ctx.Attr("beta1")); + if (ctx.HasInput("Beta1Tensor")) { + auto* beta1_tensor = ctx.Input("Beta1Tensor"); + PADDLE_ENFORCE_EQ(beta1_tensor->numel(), 1, + platform::errors::InvalidArgument( + "Input(Beta1Tensor) size must be 1, but get %d", + beta1_tensor->numel())); + beta1 = static_cast(GetAttrFromTensor(beta1_tensor)); + } + MPDType beta2 = static_cast(ctx.Attr("beta2")); + if (ctx.HasInput("Beta2Tensor")) { + auto* beta2_tensor = ctx.Input("Beta2Tensor"); + PADDLE_ENFORCE_EQ(beta2_tensor->numel(), 1, + platform::errors::InvalidArgument( + "Input(Beta2Tensor) size must be 1, but get %d", + beta2_tensor->numel())); + beta2 = static_cast(GetAttrFromTensor(beta2_tensor)); + } + MPDType epsilon = static_cast(ctx.Attr("epsilon")); + if (ctx.HasInput("EpsilonTensor")) { + auto* epsilon_tensor = ctx.Input("EpsilonTensor"); + PADDLE_ENFORCE_EQ(epsilon_tensor->numel(), 1, + platform::errors::InvalidArgument( + "Input(EpsilonTensor) size must be 1, but get %d", + epsilon_tensor->numel())); + epsilon = static_cast(GetAttrFromTensor(epsilon_tensor)); + } + VLOG(3) << "beta1_pow.numel() : " << beta1_pow->numel() + << "beta2_pow.numel() : " << beta2_pow->numel(); + VLOG(3) << "param.numel(): " << param->numel(); + PADDLE_ENFORCE_EQ(beta1_pow_out->numel(), 1, + platform::errors::InvalidArgument( + "beta1 pow output size should be 1, but received " + "value is:%d.", + beta1_pow_out->numel())); + + PADDLE_ENFORCE_EQ(beta2_pow_out->numel(), 1, + platform::errors::InvalidArgument( + "beta2 pow output size should be 1, but received " + "value is:%d.", + beta2_pow_out->numel())); + + const bool multi_precision = ctx.Attr("multi_precision"); + const LoDTensor* master_param = nullptr; + LoDTensor* master_param_out = nullptr; + if (multi_precision) { + bool has_master = + ctx.HasInput("MasterParam") && ctx.HasOutput("MasterParamOut"); + PADDLE_ENFORCE_EQ(has_master, true, + platform::errors::InvalidArgument( + "The Input(MasterParam) and Output(MasterParamOut) " + "should not be null when " + "the attr `multi_precision` is true")); + master_param = ctx.Input("MasterParam"); + master_param_out = ctx.Output("MasterParamOut"); + } + const MPDType* master_in_data = + multi_precision ? master_param->data() : nullptr; + MPDType* master_out_data = + multi_precision + ? 
master_param_out->mutable_data(ctx.GetPlace()) + : nullptr; + + auto& dev_ctx = ctx.template device_context(); + + if (grad_var->IsType()) { + auto* grad = ctx.Input("Grad"); + + // update param and moment + int threads = 512; + int blocks = (param->numel() + threads - 1) / threads; + + if (beta1_pow->place() == platform::CPUPlace() && + beta2_pow->place() == platform::CPUPlace()) { + // Compute with betapow in REG + AdamWKernelREG<<>>( + beta1, beta2, epsilon, coeff, *beta1_pow->data(), + *beta2_pow->data(), mom1->data(), + mom1_out->mutable_data(ctx.GetPlace()), + mom2->data(), + mom2_out->mutable_data(ctx.GetPlace()), + lr->data(), grad->data(), param->data(), + param_out->mutable_data(ctx.GetPlace()), master_in_data, + master_out_data, param->numel()); + if (!use_global_beta_pow) { + // Cpu update + beta1_pow_out->mutable_data(platform::CPUPlace())[0] = + beta1 * beta1_pow->data()[0]; + beta2_pow_out->mutable_data(platform::CPUPlace())[0] = + beta2 * beta2_pow->data()[0]; + } + } else { + AdamWKernelMEM<<>>( + beta1, beta2, epsilon, coeff, beta1_pow->data(), + beta2_pow->data(), mom1->data(), + mom1_out->mutable_data(ctx.GetPlace()), + mom2->data(), + mom2_out->mutable_data(ctx.GetPlace()), + lr->data(), grad->data(), param->data(), + param_out->mutable_data(ctx.GetPlace()), master_in_data, + master_out_data, param->numel()); + if (!use_global_beta_pow) { + // Update with gpu + UpdateAdamWBetaPow<<<1, 32, 0, dev_ctx.stream()>>>( + beta1, beta2, beta1_pow->data(), + beta2_pow->data(), + beta1_pow_out->mutable_data(ctx.GetPlace()), + beta2_pow_out->mutable_data(ctx.GetPlace())); + } + } + } else if (grad_var->IsType()) { + auto* grad = ctx.Input("Grad"); + if (grad->rows().size() == 0) { + VLOG(3) << "grad row size is 0!!"; + return; + } + + std::vector cpu_rows(grad->rows().begin(), grad->rows().end()); + bool is_strict_sorted = true; + for (size_t i = 1; i < cpu_rows.size(); ++i) { + if (cpu_rows[i - 1] >= cpu_rows[i]) { + is_strict_sorted = false; + break; + } + } + + framework::SelectedRows tmp_grad_merge; + const framework::SelectedRows* grad_merge_ptr; + if (is_strict_sorted) { + grad_merge_ptr = grad; + } else { + // merge duplicated rows if any. 
+ // The rows of grad_merge have been sorted inside MergeAdd functor + scatter::MergeAdd merge_func; + merge_func(ctx.template device_context(), + *grad, &tmp_grad_merge, true); + grad_merge_ptr = &tmp_grad_merge; + } + auto& grad_merge = *grad_merge_ptr; + auto& grad_tensor = grad_merge.value(); + const T* grad_data = grad_tensor.template data(); + const int64_t* rows = grad_merge.rows().Data(ctx.GetPlace()); + auto row_numel = grad_tensor.numel() / grad_merge.rows().size(); + + if (beta1_pow->place() == platform::CPUPlace() && + beta2_pow->place() == platform::CPUPlace()) { + int threads = 512; + int ndim = param->numel(); + int blocks = (ndim + threads - 1) / threads; + + SparseAdamWCUDAKernelREG< + T, MPDType><<>>( + beta1, beta2, epsilon, coeff, *beta1_pow->data(), + *beta2_pow->data(), mom1->data(), + mom1_out->mutable_data(ctx.GetPlace()), + mom2->data(), + mom2_out->mutable_data(ctx.GetPlace()), + lr->data(), grad_data, param->data(), + param_out->mutable_data(ctx.GetPlace()), master_in_data, + master_out_data, rows, row_numel, grad_merge.rows().size(), + lazy_mode, ndim); + if (!use_global_beta_pow) { + // Update with cpu + beta1_pow_out->mutable_data(platform::CPUPlace())[0] = + beta1 * beta1_pow->data()[0]; + beta2_pow_out->mutable_data(platform::CPUPlace())[0] = + beta2 * beta2_pow->data()[0]; + } + } else { + SparseAdamWFunctor functor( + beta1, beta2, epsilon, coeff, beta1_pow->data(), + beta2_pow->data(), mom1->data(), + mom1_out->mutable_data(ctx.GetPlace()), + mom2->data(), + mom2_out->mutable_data(ctx.GetPlace()), + lr->data(), grad_data, param->data(), + param_out->mutable_data(ctx.GetPlace()), master_in_data, + master_out_data, rows, row_numel, grad_merge.rows().size(), + lazy_mode); + + // FIXME(minqiyang): remove BinarySearch in GPU later + platform::ForRange for_range( + static_cast( + ctx.device_context()), + param->numel()); + for_range(functor); + if (!use_global_beta_pow) { + // update beta1 and beta2 + UpdateAdamWBetaPow<<<1, 32, 0, dev_ctx.stream()>>>( + beta1, beta2, beta1_pow->data(), + beta2_pow->data(), + beta1_pow_out->mutable_data(ctx.GetPlace()), + beta2_pow_out->mutable_data(ctx.GetPlace())); + } + } + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Variable type not supported by adamw_op")); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL(adamw, ops::AdamWOpCUDAKernel, + ops::AdamWOpCUDAKernel, + ops::AdamWOpCUDAKernel); diff --git a/paddle/fluid/operators/optimizers/adamw_op.h b/paddle/fluid/operators/optimizers/adamw_op.h index 3301bc4808e3a8..d87f827bbf22bb 100644 --- a/paddle/fluid/operators/optimizers/adamw_op.h +++ b/paddle/fluid/operators/optimizers/adamw_op.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
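Note (illustrative aside, not part of the patch): the dense and sparse kernels above apply the same decoupled AdamW update: the parameter is first scaled by (1 - lr * coeff) for the weight decay, and the Adam step then uses a learning rate rescaled by sqrt(1 - beta2^t) / (1 - beta1^t), with epsilon likewise scaled by sqrt(1 - beta2^t). A scalar NumPy sketch of one step follows, assuming beta1_pow = beta1**t and beta2_pow = beta2**t at step t; the helper name adamw_step and the use of numpy are assumptions of this note only.

    import numpy as np

    def adamw_step(p, g, m1, m2, lr, beta1, beta2, eps, coeff, t):
        # One update as performed by AdamWKernelREG/AdamWKernelMEM above:
        # decoupled weight decay followed by a bias-corrected Adam step.
        beta1_pow, beta2_pow = beta1 ** t, beta2 ** t
        wd = 1.0 - coeff * lr
        lr_t = lr * np.sqrt(1.0 - beta2_pow) / (1.0 - beta1_pow)
        m1 = beta1 * m1 + (1.0 - beta1) * g
        m2 = beta2 * m2 + (1.0 - beta2) * g * g
        p = wd * p - lr_t * m1 / (np.sqrt(m2) + eps * np.sqrt(1.0 - beta2_pow))
        return p, m1, m2

    # Example: a single step on a small parameter vector.
    p, m1, m2 = np.ones(3), np.zeros(3), np.zeros(3)
    p, m1, m2 = adamw_step(p, np.full(3, 0.1), m1, m2,
                           lr=1e-3, beta1=0.9, beta2=0.999,
                           eps=1e-8, coeff=0.01, t=1)

On the Python side, the same patch makes paddle.optimizer.AdamW dispatch to this fused adamw op (see the adamw.py changes below) instead of emitting a separate parameter-scaling step for the decay.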
@@ -22,6 +22,7 @@ class AdamWOp : public AdamOp { using AdamOp::AdamOp; }; +struct GPUAdamW; struct CPUAdamW; template @@ -46,6 +47,107 @@ class AdamWFunctor { } }; +template +class SparseAdamWFunctor; + +template +class SparseAdamWFunctor { + private: + MT beta1_; + MT beta2_; + MT epsilon_; + MT coeff_; + + const MT* beta1_pow_; + const MT* beta2_pow_; + const MT* moment1_; + MT* moment1_out_; + const MT* moment2_; + MT* moment2_out_; + const MT* lr_; + const T* grad_; + const T* param_; + T* param_out_; + const MT* master_param_; + MT* master_param_out_; + + const int64_t* rows_; + int64_t row_numel_; + int64_t row_count_; + bool lazy_mode_; + + public: + SparseAdamWFunctor(MT beta1, MT beta2, MT epsilon, MT coeff, + const MT* beta1_pow, const MT* beta2_pow, const MT* mom1, + MT* mom1_out, const MT* mom2, MT* mom2_out, const MT* lr, + const T* grad, const T* param, T* param_out, + const MT* master_param, MT* master_param_out, + const int64_t* rows, int64_t row_numel, int64_t row_count, + bool lazy_mode) + : beta1_(beta1), + beta2_(beta2), + epsilon_(epsilon), + coeff_(coeff), + beta1_pow_(beta1_pow), + beta2_pow_(beta2_pow), + moment1_(mom1), + moment1_out_(mom1_out), + moment2_(mom2), + moment2_out_(mom2_out), + lr_(lr), + grad_(grad), + param_(param), + param_out_(param_out), + master_param_(master_param), + master_param_out_(master_param_out), + rows_(rows), + row_numel_(row_numel), + row_count_(row_count), + lazy_mode_(lazy_mode) {} + + inline HOSTDEVICE void adamw_update(size_t i, MT g) const { + // The following code is the same as dense + MT mom1 = moment1_[i]; + MT mom2 = moment2_[i]; + MT lr = *lr_; + MT beta1_pow = *beta1_pow_; + MT beta2_pow = *beta2_pow_; + MT p = master_param_ ? master_param_[i] : static_cast(param_[i]); + + // Calculation + MT wd = static_cast(1.0) - coeff_ * lr; + lr *= sqrt(static_cast(1.0) - beta2_pow) / + (static_cast(1.0) - beta1_pow); + + mom1 = beta1_ * mom1 + (static_cast(1.0) - beta1_) * g; + mom2 = beta2_ * mom2 + (static_cast(1.0) - beta2_) * g * g; + p = wd * p - + lr * (mom1 / + (sqrt(mom2) + epsilon_ * sqrt(static_cast(1.0) - beta2_pow))); + + // Write back to global memory + moment1_out_[i] = mom1; + moment2_out_[i] = mom2; + param_out_[i] = static_cast(p); + if (master_param_out_) { + master_param_out_[i] = p; + } + } + + inline HOSTDEVICE void operator()(size_t i) const { + auto row_idx = + math::BinarySearch(rows_, row_count_, i / row_numel_); + if (lazy_mode_ && row_idx < 0) { + return; + } else { + MT g = row_idx >= 0 + ? 
static_cast(grad_[row_idx * row_numel_ + i % row_numel_]) + : static_cast(0); + adamw_update(i, g); + } + } +}; + template class AdamWOpKernel : public AdamOpKernel { public: diff --git a/paddle/fluid/pybind/op_function_generator.cc b/paddle/fluid/pybind/op_function_generator.cc index dc27befd26cda8..4b610f3bccba0f 100644 --- a/paddle/fluid/pybind/op_function_generator.cc +++ b/paddle/fluid/pybind/op_function_generator.cc @@ -118,6 +118,8 @@ std::map> op_passing_outs_map = { {"sgd", {"ParamOut"}}, {"adam", {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut"}}, + {"adamw", + {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut"}}, {"average_accumulates", {"out_sum_1", "out_sum_2", "out_sum_3", "out_num_accumulates", "out_old_num_accumulates", "out_num_updates"}}, diff --git a/python/paddle/fluid/tests/unittests/test_adamw_op.py b/python/paddle/fluid/tests/unittests/test_adamw_op.py index fdc3682f4b1bfb..d99e15b2128ce3 100644 --- a/python/paddle/fluid/tests/unittests/test_adamw_op.py +++ b/python/paddle/fluid/tests/unittests/test_adamw_op.py @@ -93,33 +93,6 @@ def test_adamw_op_invalid_input(self): adam = paddle.optimizer.AdamW( 0.1, epsilon=-1, parameters=linear.parameters()) - def test_adamw_lr_decay(self): - paddle.disable_static() - value = np.arange(26).reshape(2, 13).astype("float32") - a = paddle.to_tensor(value) - linear = paddle.nn.Linear(13, 5) - - lr = paddle.optimizer.lr.NoamDecay(d_model=0.01, warmup_steps=10) - wd = 0.1 - adam = paddle.optimizer.AdamW( - learning_rate=lr, - parameters=linear.parameters(), - apply_decay_param_fun=lambda name: True, - weight_decay=wd) - - for _ in range(2): - out = linear(a) - out.backward() - lr_to_coeff = adam._lr_to_coeff - adam.step() - - for i, value in enumerate(lr_to_coeff.values()): - self.assertAlmostEqual(value.numpy()[0], 1.0 - lr() * wd) - self.assertEqual(len(adam._lr_to_coeff), 0) - - lr.step() - adam.clear_gradients() - class TestAdamWOpGroup(TestAdamWOp): def test_adamw_op_dygraph(self): diff --git a/python/paddle/optimizer/adamw.py b/python/paddle/optimizer/adamw.py index 965785908979bb..e6ec91dc415898 100644 --- a/python/paddle/optimizer/adamw.py +++ b/python/paddle/optimizer/adamw.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -48,8 +48,8 @@ class AdamW(Adam): Args: learning_rate (float|LRScheduler, optional): The learning rate used to update ``Parameter``. It can be a float value or a LRScheduler. The default value is 0.001. - parameters (list|tuple, optional): List/Tuple of ``Tensor`` names to update to minimize ``loss``. \ - This parameter is required in dygraph mode. And you can specify different options for \ + parameters (list|tuple, optional): List/Tuple of ``Tensor`` names to update to minimize ``loss``. \ + This parameter is required in dygraph mode. And you can specify different options for \ different parameter groups such as the learning rate, weight decay, etc, \ then the parameters are list of dict. Note that the learning_rate in paramter groups \ represents the scale of base learning_rate. 
\ @@ -162,7 +162,6 @@ def __init__(self, self._params_name = set() self._apply_decay_param_fun = apply_decay_param_fun self._coeff = coeff - self._lr_to_coeff = dict() super(AdamW, self).__init__( learning_rate=learning_rate, @@ -178,9 +177,6 @@ def __init__(self, self.type = "adamw" - # now the adamw op doesn't support cuda - if core.is_compiled_with_cuda(): - self.type = "adam" # Use _auxiliary_vars together with _set_auxiliary_var/_get_auxiliary_var to achieve that. self._auxiliary_vars = dict() @@ -193,64 +189,7 @@ def _get_auxiliary_var(self, key): else: return None - def _append_decoupled_weight_decay(self, block, param_and_grad): - """ - Add decoupled weight decay op. - parameter = parameter - parameter * coeff * lr - - Args: - block: block in which variable is to be created - param_and_grad: (parameters, gradients) pairs, - the parameters need to decay. - Raises: - Exception: The type of coeff and parameter is not consistent. - """ - if isinstance(param_and_grad, dict): - param_and_grad = self._update_param_group(param_and_grad) - param, grad = param_and_grad - - if self._apply_decay_param_fun is not None \ - and not self._apply_decay_param_fun(param.name): - return - - if isinstance(self._learning_rate, float): - learning_rate = self._learning_rate - else: - # NOTE. We add this function to the _append_optimize_op(), - # for we must make sure _create_param_lr() be called after - # optimizer._create_global_learning_rate(). - learning_rate = self._create_param_lr(param_and_grad) - - with block.program._optimized_guard( - [param, grad]), framework.name_scope('weight decay'): - self._params_name.add(param.name) - - # If it has been calculated, the result will be reused. - # NOTE(wangxi): In dygraph mode, apply_gradient will be executed - # every step, so need clear _lr_to_coeff every step, - # we do this in _create_optimization_pass - decay_coeff = self._lr_to_coeff.get(learning_rate, None) - if decay_coeff is None: - # NOTE(wangxi): for pipeline to set device:all - with paddle.static.device_guard(None): - decay_coeff = 1.0 - learning_rate * self._coeff - self._lr_to_coeff[learning_rate] = decay_coeff - - find_master = (self._multi_precision and - param.dtype == core.VarDesc.VarType.FP16) - if find_master: - master_weight = self._master_weights[param.name] - scaled_param = master_weight * decay_coeff - paddle.fluid.layers.assign( - input=scaled_param, output=master_weight) - else: - scaled_param = param * decay_coeff - paddle.fluid.layers.assign(input=scaled_param, output=param) - def _append_optimize_op(self, block, param_and_grad): - if not core.is_compiled_with_npu(): - self._append_decoupled_weight_decay(block, param_and_grad) - return super(AdamW, self)._append_optimize_op(block, param_and_grad) assert isinstance(block, framework.Block) if isinstance(param_and_grad, dict): @@ -262,6 +201,8 @@ def _append_optimize_op(self, block, param_and_grad): if self._apply_decay_param_fun is not None \ and not self._apply_decay_param_fun(param.name): with_decay = False + else: + self._params_name.add(param.name) moment1 = self._get_accumulator(self._moment1_acc_str, param_and_grad[0]) @@ -277,19 +218,19 @@ def _append_optimize_op(self, block, param_and_grad): if find_master else None) lr = self._create_param_lr(param_and_grad) - # create the adam optimize op + # create the adamw optimize op if framework.in_dygraph_mode(): _beta1 = self._beta1 if not isinstance( self._beta1, Variable) else self._beta1.numpy().item(0) _beta2 = self._beta2 if not isinstance( self._beta2, Variable) else 
self._beta2.numpy().item(0) - _, _, _, _, _ = _C_ops.adam( + _, _, _, _, _ = _C_ops.adamw( param_and_grad[0], param_and_grad[1], lr, moment1, moment2, beta1_pow_acc, beta2_pow_acc, param_and_grad[0], moment1, moment2, beta1_pow_acc, beta2_pow_acc, 'epsilon', self._epsilon, 'lazy_mode', self._lazy_mode, 'min_row_size_to_use_multithread', - 1000, 'beta1', _beta1, 'beta2', _beta2) + 1000, 'beta1', _beta1, 'beta2', _beta2, 'coeff', self._coeff) return None @@ -350,13 +291,6 @@ def _append_optimize_op(self, block, param_and_grad): return adamw_op - def _create_optimization_pass(self, parameters_and_grads): - optimize_ops = super( - AdamW, self)._create_optimization_pass(parameters_and_grads) - # In dygraph mode, clear _lr_to_coeff after applied gradient - self._lr_to_coeff = dict() - return optimize_ops - def __str__(self): return " ".join(["Weight Decay, params:", ",".join(self._params_name)]) From c3efabeb8d024fb2133c4f0fd6794d8dee67c2b4 Mon Sep 17 00:00:00 2001 From: seemingwang Date: Mon, 23 Aug 2021 13:20:58 +0800 Subject: [PATCH 125/126] set node feature (#34994) --- .../distributed/service/graph_brpc_client.cc | 96 +++++++++++++++++++ .../distributed/service/graph_brpc_client.h | 5 + .../distributed/service/graph_brpc_server.cc | 42 ++++++++ .../distributed/service/graph_brpc_server.h | 4 + .../distributed/service/graph_py_service.cc | 13 +++ .../distributed/service/graph_py_service.h | 3 + .../fluid/distributed/service/sendrecv.proto | 1 + .../distributed/table/common_graph_table.cc | 28 ++++++ .../distributed/table/common_graph_table.h | 6 ++ .../fluid/distributed/test/graph_node_test.cc | 11 +++ paddle/fluid/pybind/fleet_py.cc | 15 +++ 11 files changed, 224 insertions(+) diff --git a/paddle/fluid/distributed/service/graph_brpc_client.cc b/paddle/fluid/distributed/service/graph_brpc_client.cc index 70f2da6d7252ce..68d9c9669b6972 100644 --- a/paddle/fluid/distributed/service/graph_brpc_client.cc +++ b/paddle/fluid/distributed/service/graph_brpc_client.cc @@ -479,6 +479,102 @@ std::future GraphBrpcClient::pull_graph_list( closure); return fut; } + +std::future GraphBrpcClient::set_node_feat( + const uint32_t &table_id, const std::vector &node_ids, + const std::vector &feature_names, + const std::vector> &features) { + std::vector request2server; + std::vector server2request(server_size, -1); + for (int query_idx = 0; query_idx < node_ids.size(); ++query_idx) { + int server_index = get_server_index_by_id(node_ids[query_idx]); + if (server2request[server_index] == -1) { + server2request[server_index] = request2server.size(); + request2server.push_back(server_index); + } + } + size_t request_call_num = request2server.size(); + std::vector> node_id_buckets(request_call_num); + std::vector> query_idx_buckets(request_call_num); + std::vector>> features_idx_buckets( + request_call_num); + for (int query_idx = 0; query_idx < node_ids.size(); ++query_idx) { + int server_index = get_server_index_by_id(node_ids[query_idx]); + int request_idx = server2request[server_index]; + node_id_buckets[request_idx].push_back(node_ids[query_idx]); + query_idx_buckets[request_idx].push_back(query_idx); + if (features_idx_buckets[request_idx].size() == 0) { + features_idx_buckets[request_idx].resize(feature_names.size()); + } + for (int feat_idx = 0; feat_idx < feature_names.size(); ++feat_idx) { + features_idx_buckets[request_idx][feat_idx].push_back( + features[feat_idx][query_idx]); + } + } + + DownpourBrpcClosure *closure = new DownpourBrpcClosure( + request_call_num, + [&, node_id_buckets, query_idx_buckets, 
request_call_num](void *done) { + int ret = 0; + auto *closure = (DownpourBrpcClosure *)done; + size_t fail_num = 0; + for (int request_idx = 0; request_idx < request_call_num; + ++request_idx) { + if (closure->check_response(request_idx, PS_GRAPH_SET_NODE_FEAT) != + 0) { + ++fail_num; + } + if (fail_num == request_call_num) { + ret = -1; + } + } + closure->set_promise_value(ret); + }); + + auto promise = std::make_shared>(); + closure->add_promise(promise); + std::future fut = promise->get_future(); + + for (int request_idx = 0; request_idx < request_call_num; ++request_idx) { + int server_index = request2server[request_idx]; + closure->request(request_idx)->set_cmd_id(PS_GRAPH_SET_NODE_FEAT); + closure->request(request_idx)->set_table_id(table_id); + closure->request(request_idx)->set_client_id(_client_id); + size_t node_num = node_id_buckets[request_idx].size(); + + closure->request(request_idx) + ->add_params((char *)node_id_buckets[request_idx].data(), + sizeof(uint64_t) * node_num); + std::string joint_feature_name = + paddle::string::join_strings(feature_names, '\t'); + closure->request(request_idx) + ->add_params(joint_feature_name.c_str(), joint_feature_name.size()); + + // set features + std::string set_feature = ""; + for (size_t feat_idx = 0; feat_idx < feature_names.size(); ++feat_idx) { + for (size_t node_idx = 0; node_idx < node_num; ++node_idx) { + size_t feat_len = + features_idx_buckets[request_idx][feat_idx][node_idx].size(); + set_feature.append((char *)&feat_len, sizeof(size_t)); + set_feature.append( + features_idx_buckets[request_idx][feat_idx][node_idx].data(), + feat_len); + } + } + closure->request(request_idx) + ->add_params(set_feature.c_str(), set_feature.size()); + + GraphPsService_Stub rpc_stub = + getServiceStub(get_cmd_channel(server_index)); + closure->cntl(request_idx)->set_log_id(butil::gettimeofday_ms()); + rpc_stub.service(closure->cntl(request_idx), closure->request(request_idx), + closure->response(request_idx), closure); + } + + return fut; +} + int32_t GraphBrpcClient::initialize() { // set_shard_num(_config.shard_num()); BrpcPsClient::initialize(); diff --git a/paddle/fluid/distributed/service/graph_brpc_client.h b/paddle/fluid/distributed/service/graph_brpc_client.h index 5696e8b08037b7..8acb2047b8e972 100644 --- a/paddle/fluid/distributed/service/graph_brpc_client.h +++ b/paddle/fluid/distributed/service/graph_brpc_client.h @@ -79,6 +79,11 @@ class GraphBrpcClient : public BrpcPsClient { const std::vector& feature_names, std::vector>& res); + virtual std::future set_node_feat( + const uint32_t& table_id, const std::vector& node_ids, + const std::vector& feature_names, + const std::vector>& features); + virtual std::future clear_nodes(uint32_t table_id); virtual std::future add_graph_node( uint32_t table_id, std::vector& node_id_list, diff --git a/paddle/fluid/distributed/service/graph_brpc_server.cc b/paddle/fluid/distributed/service/graph_brpc_server.cc index 52ac8c5d688a4a..110d4406fc5569 100644 --- a/paddle/fluid/distributed/service/graph_brpc_server.cc +++ b/paddle/fluid/distributed/service/graph_brpc_server.cc @@ -16,6 +16,7 @@ #include "paddle/fluid/distributed/service/brpc_ps_server.h" #include // NOLINT +#include #include "butil/endpoint.h" #include "iomanip" #include "paddle/fluid/distributed/service/brpc_ps_client.h" @@ -157,6 +158,8 @@ int32_t GraphBrpcService::initialize() { &GraphBrpcService::add_graph_node; _service_handler_map[PS_GRAPH_REMOVE_GRAPH_NODE] = &GraphBrpcService::remove_graph_node; + 
_service_handler_map[PS_GRAPH_SET_NODE_FEAT] = + &GraphBrpcService::graph_set_node_feat; // shard初始化,server启动后才可从env获取到server_list的shard信息 initialize_shard_info(); @@ -400,5 +403,44 @@ int32_t GraphBrpcService::graph_get_node_feat(Table *table, return 0; } + +int32_t GraphBrpcService::graph_set_node_feat(Table *table, + const PsRequestMessage &request, + PsResponseMessage &response, + brpc::Controller *cntl) { + CHECK_TABLE_EXIST(table, request, response) + if (request.params_size() < 3) { + set_response_code( + response, -1, + "graph_set_node_feat request requires at least 2 arguments"); + return 0; + } + size_t node_num = request.params(0).size() / sizeof(uint64_t); + uint64_t *node_data = (uint64_t *)(request.params(0).c_str()); + std::vector node_ids(node_data, node_data + node_num); + + std::vector feature_names = + paddle::string::split_string(request.params(1), "\t"); + + std::vector> features( + feature_names.size(), std::vector(node_num)); + + const char *buffer = request.params(2).c_str(); + + for (size_t feat_idx = 0; feat_idx < feature_names.size(); ++feat_idx) { + for (size_t node_idx = 0; node_idx < node_num; ++node_idx) { + size_t feat_len = *(size_t *)(buffer); + buffer += sizeof(size_t); + auto feat = std::string(buffer, feat_len); + features[feat_idx][node_idx] = feat; + buffer += feat_len; + } + } + + ((GraphTable *)table)->set_node_feat(node_ids, feature_names, features); + + return 0; +} + } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/distributed/service/graph_brpc_server.h b/paddle/fluid/distributed/service/graph_brpc_server.h index 47c370572826ac..6b4853fa679923 100644 --- a/paddle/fluid/distributed/service/graph_brpc_server.h +++ b/paddle/fluid/distributed/service/graph_brpc_server.h @@ -83,9 +83,13 @@ class GraphBrpcService : public PsBaseService { const PsRequestMessage &request, PsResponseMessage &response, brpc::Controller *cntl); + int32_t graph_get_node_feat(Table *table, const PsRequestMessage &request, PsResponseMessage &response, brpc::Controller *cntl); + int32_t graph_set_node_feat(Table *table, const PsRequestMessage &request, + PsResponseMessage &response, + brpc::Controller *cntl); int32_t clear_nodes(Table *table, const PsRequestMessage &request, PsResponseMessage &response, brpc::Controller *cntl); int32_t add_graph_node(Table *table, const PsRequestMessage &request, diff --git a/paddle/fluid/distributed/service/graph_py_service.cc b/paddle/fluid/distributed/service/graph_py_service.cc index 39befb1a112c85..b4159627013174 100644 --- a/paddle/fluid/distributed/service/graph_py_service.cc +++ b/paddle/fluid/distributed/service/graph_py_service.cc @@ -330,6 +330,19 @@ std::vector> GraphPyClient::get_node_feat( return v; } +void GraphPyClient::set_node_feat( + std::string node_type, std::vector node_ids, + std::vector feature_names, + const std::vector> features) { + if (this->table_id_map.count(node_type)) { + uint32_t table_id = this->table_id_map[node_type]; + auto status = + worker_ptr->set_node_feat(table_id, node_ids, feature_names, features); + status.wait(); + } + return; +} + std::vector GraphPyClient::pull_graph_list(std::string name, int server_index, int start, int size, diff --git a/paddle/fluid/distributed/service/graph_py_service.h b/paddle/fluid/distributed/service/graph_py_service.h index da027fbae3e6f0..8e03938801ce99 100644 --- a/paddle/fluid/distributed/service/graph_py_service.h +++ b/paddle/fluid/distributed/service/graph_py_service.h @@ -155,6 +155,9 @@ class GraphPyClient : public GraphPyService { 
std::vector> get_node_feat( std::string node_type, std::vector node_ids, std::vector feature_names); + void set_node_feat(std::string node_type, std::vector node_ids, + std::vector feature_names, + const std::vector> features); std::vector pull_graph_list(std::string name, int server_index, int start, int size, int step = 1); ::paddle::distributed::PSParameter GetWorkerProto(); diff --git a/paddle/fluid/distributed/service/sendrecv.proto b/paddle/fluid/distributed/service/sendrecv.proto index a4b811e950a3b5..696c950d9b33ba 100644 --- a/paddle/fluid/distributed/service/sendrecv.proto +++ b/paddle/fluid/distributed/service/sendrecv.proto @@ -55,6 +55,7 @@ enum PsCmdID { PS_GRAPH_CLEAR = 34; PS_GRAPH_ADD_GRAPH_NODE = 35; PS_GRAPH_REMOVE_GRAPH_NODE = 36; + PS_GRAPH_SET_NODE_FEAT = 37; } message PsRequestMessage { diff --git a/paddle/fluid/distributed/table/common_graph_table.cc b/paddle/fluid/distributed/table/common_graph_table.cc index 29bcc04d9c1dfb..41f4b0dac4d96e 100644 --- a/paddle/fluid/distributed/table/common_graph_table.cc +++ b/paddle/fluid/distributed/table/common_graph_table.cc @@ -469,6 +469,34 @@ int32_t GraphTable::get_node_feat(const std::vector &node_ids, return 0; } +int32_t GraphTable::set_node_feat( + const std::vector &node_ids, + const std::vector &feature_names, + const std::vector> &res) { + size_t node_num = node_ids.size(); + std::vector> tasks; + for (size_t idx = 0; idx < node_num; ++idx) { + uint64_t node_id = node_ids[idx]; + tasks.push_back(_shards_task_pool[get_thread_pool_index(node_id)]->enqueue( + [&, idx, node_id]() -> int { + size_t index = node_id % this->shard_num - this->shard_start; + auto node = shards[index].add_feature_node(node_id); + node->set_feature_size(this->feat_name.size()); + for (int feat_idx = 0; feat_idx < feature_names.size(); ++feat_idx) { + const std::string &feature_name = feature_names[feat_idx]; + if (feat_id_map.find(feature_name) != feat_id_map.end()) { + node->set_feature(feat_id_map[feature_name], res[feat_idx][idx]); + } + } + return 0; + })); + } + for (size_t idx = 0; idx < node_num; ++idx) { + tasks[idx].get(); + } + return 0; +} + std::pair GraphTable::parse_feature( std::string feat_str) { // Return (feat_id, btyes) if name are in this->feat_name, else return (-1, diff --git a/paddle/fluid/distributed/table/common_graph_table.h b/paddle/fluid/distributed/table/common_graph_table.h index 6ccce44c7ead69..f643337a80f7c2 100644 --- a/paddle/fluid/distributed/table/common_graph_table.h +++ b/paddle/fluid/distributed/table/common_graph_table.h @@ -46,6 +46,7 @@ class GraphShard { } return res; } + GraphNode *add_graph_node(uint64_t id); FeatureNode *add_feature_node(uint64_t id); Node *find_node(uint64_t id); @@ -122,6 +123,11 @@ class GraphTable : public SparseTable { const std::vector &feature_names, std::vector> &res); + virtual int32_t set_node_feat( + const std::vector &node_ids, + const std::vector &feature_names, + const std::vector> &res); + protected: std::vector shards; size_t shard_start, shard_end, server_num, shard_num_per_table, shard_num; diff --git a/paddle/fluid/distributed/test/graph_node_test.cc b/paddle/fluid/distributed/test/graph_node_test.cc index b8630aed02ffe6..810530cdbec94d 100644 --- a/paddle/fluid/distributed/test/graph_node_test.cc +++ b/paddle/fluid/distributed/test/graph_node_test.cc @@ -558,6 +558,17 @@ void RunBrpcPushSparse() { VLOG(0) << "get_node_feat: " << node_feat[1][0]; VLOG(0) << "get_node_feat: " << node_feat[1][1]; + node_feat[1][0] = "helloworld"; + + 
client1.set_node_feat(std::string("user"), node_ids, feature_names,
+                        node_feat);
+
+  // sleep(5);
+  node_feat =
+      client1.get_node_feat(std::string("user"), node_ids, feature_names);
+  VLOG(0) << "get_node_feat: " << node_feat[1][0];
+  ASSERT_TRUE(node_feat[1][0] == "helloworld");
+
   // Test string
   node_ids.clear();
   node_ids.push_back(37);
diff --git a/paddle/fluid/pybind/fleet_py.cc b/paddle/fluid/pybind/fleet_py.cc
index a6b542f53ae178..ea9faf57ac52b6 100644
--- a/paddle/fluid/pybind/fleet_py.cc
+++ b/paddle/fluid/pybind/fleet_py.cc
@@ -205,6 +205,7 @@ void BindGraphPyClient(py::module* m) {
       .def("pull_graph_list", &GraphPyClient::pull_graph_list)
       .def("start_client", &GraphPyClient::start_client)
       .def("batch_sample_neighboors", &GraphPyClient::batch_sample_neighboors)
+      .def("remove_graph_node", &GraphPyClient::remove_graph_node)
       .def("random_sample_nodes", &GraphPyClient::random_sample_nodes)
       .def("stop_server", &GraphPyClient::stop_server)
       .def("get_node_feat",
@@ -221,6 +222,20 @@ void BindGraphPyClient(py::module* m) {
             }
             return bytes_feats;
           })
+      .def("set_node_feat",
+           [](GraphPyClient& self, std::string node_type,
+              std::vector<uint64_t> node_ids,
+              std::vector<std::string> feature_names,
+              std::vector<std::vector<py::bytes>> bytes_feats) {
+             std::vector<std::vector<std::string>> feats(bytes_feats.size());
+             for (int i = 0; i < bytes_feats.size(); ++i) {
+               for (int j = 0; j < bytes_feats[i].size(); ++j) {
+                 feats[i].push_back(std::string(bytes_feats[i][j]));
+               }
+             }
+             self.set_node_feat(node_type, node_ids, feature_names, feats);
+             return;
+           })
       .def("bind_local_server", &GraphPyClient::bind_local_server);
 }

From aefec228628ddf613d2006c055c57c7aad004adb Mon Sep 17 00:00:00 2001
From: TeslaZhao 
Date: Mon, 23 Aug 2021 14:08:27 +0800
Subject: [PATCH 126/126] Fix a bug in the strided_slice op where the axes
 parameter could access memory out of bounds (#35062)

---
 paddle/fluid/operators/strided_slice_op.cc | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/paddle/fluid/operators/strided_slice_op.cc b/paddle/fluid/operators/strided_slice_op.cc
index d53ab914db4d72..a1b5ca0f6a6eb5 100644
--- a/paddle/fluid/operators/strided_slice_op.cc
+++ b/paddle/fluid/operators/strided_slice_op.cc
@@ -62,6 +62,20 @@ class StridedSliceOp : public framework::OperatorWithKernel {
     auto ends_size = ends.size();
     auto strides_size = strides.size();
 
+    for (size_t i = 0; i < axes.size(); ++i) {
+      PADDLE_ENFORCE_GE(axes[i], 0,
+                        platform::errors::InvalidArgument(
+                            "The axis should be greater than or equal to 0. "
+                            "But received %d of axes[%d]",
+                            axes[i], i));
+      PADDLE_ENFORCE_LT(
+          axes[i], in_dims.size(),
+          platform::errors::InvalidArgument(
+              "The axis should be less than the rank of the input tensor. "
+              "But received %d of axes[%d], input tensor shape [%d]",
+              axes[i], i, in_dims.size()));
+    }
+
     if (ctx->HasInputs("StartsTensorList")) {
       auto StartsTensorList = ctx->Inputs("StartsTensorList");
       PADDLE_ENFORCE_GT(
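
A minimal sketch of the call pattern the new axes validation guards against, assuming
the public paddle.strided_slice Python API reaches this InferShape check; the exact
Python exception type surfaced by the failed enforce is an assumption here, and the
shapes and values are purely illustrative.

    import paddle

    x = paddle.rand([3, 4, 5])  # rank-3 input

    # In-range axes: slice dim 0 over [0, 2) and dim 2 over [1, 4).
    y = paddle.strided_slice(x, axes=[0, 2], starts=[0, 1], ends=[2, 4],
                             strides=[1, 1])
    print(y.shape)  # [2, 4, 3]

    # An axes entry equal to the input rank (3) previously indexed past the
    # input dims in InferShape; with this patch it fails fast with an
    # InvalidArgument error instead of reading out-of-bounds memory.
    try:
        paddle.strided_slice(x, axes=[3], starts=[0], ends=[1], strides=[1])
    except Exception as err:  # exact exception class is an assumption
        print(type(err).__name__, err)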