From b23b17c0a9405d246df91a71764a19b92546cace Mon Sep 17 00:00:00 2001 From: liutiexing <74819124+liutiexing@users.noreply.github.com> Date: Sun, 26 Sep 2021 14:42:26 +0800 Subject: [PATCH 001/298] Alignment of WorkQueue (#35930) * add align for WorkQueue * WorkQueue update * Revert "WorkQueue update" This reverts commit 14ce793dbb204f8ddec63c34b3b72a73c7cdb93a. From 49c8253fc0fc360e8f93ee7f3567824beaa941b4 Mon Sep 17 00:00:00 2001 From: zhangbo9674 <82555433+zhangbo9674@users.noreply.github.com> Date: Sun, 26 Sep 2021 15:30:50 +0800 Subject: [PATCH 002/298] modify adam to adamw in AdamW (#36028) * adam to adamw in AdamW * add lr_ratio in adamw * refine logic bug in cpu adamw * delete fix bug for cpu adamw * delete fix bug for cpu adamw --- paddle/fluid/pybind/op_function_generator.cc | 9 ++++++++- python/paddle/optimizer/adamw.py | 4 ++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/pybind/op_function_generator.cc b/paddle/fluid/pybind/op_function_generator.cc index f9d11e8154f43f..32e14dafb644bf 100644 --- a/paddle/fluid/pybind/op_function_generator.cc +++ b/paddle/fluid/pybind/op_function_generator.cc @@ -71,6 +71,9 @@ std::map> op_ins_map = { {"adam", {"Param", "Grad", "LearningRate", "Moment1", "Moment2", "Beta1Pow", "Beta2Pow", "MasterParam"}}, + {"adamw", + {"Param", "Grad", "LearningRate", "Moment1", "Moment2", "Beta1Pow", + "Beta2Pow", "MasterParam"}}, }; // NOTE(zhiqiu): Like op_ins_map. @@ -110,6 +113,9 @@ std::map> op_outs_map = { {"adam", {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut", "MasterParamOut"}}, + {"adamw", + {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut", + "MasterParamOut"}}, }; // NOTE(zhiqiu): Commonly, the outputs in auto-generated OP function are @@ -129,7 +135,8 @@ std::map> op_passing_outs_map = { {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut", "MasterParamOut"}}, {"adamw", - {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut"}}, + {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut", + "MasterParamOut"}}, {"average_accumulates", {"out_sum_1", "out_sum_2", "out_sum_3", "out_num_accumulates", "out_old_num_accumulates", "out_num_updates"}}, diff --git a/python/paddle/optimizer/adamw.py b/python/paddle/optimizer/adamw.py index 10d6af651777e2..34fb201d8ccaf7 100644 --- a/python/paddle/optimizer/adamw.py +++ b/python/paddle/optimizer/adamw.py @@ -298,14 +298,14 @@ def _append_optimize_op(self, block, param_and_grad): _beta2 = self._beta2 if not isinstance( self._beta2, Variable) else self._beta2.numpy().item(0) - _, _, _, _, _, _ = _C_ops.adam( + _, _, _, _, _, _ = _C_ops.adamw( param_and_grad[0], param_and_grad[1], lr, moment1, moment2, beta1_pow_acc, beta2_pow_acc, master_weight, param_and_grad[0], moment1, moment2, beta1_pow_acc, beta2_pow_acc, master_weight, 'epsilon', self._epsilon, 'lazy_mode', self._lazy_mode, 'min_row_size_to_use_multithread', 1000, 'beta1', _beta1, 'beta2', _beta2, 'coeff', self._coeff, 'multi_precision', - find_master) + find_master, "lr_ratio", lr_ratio_) return None From 991dc67df6fd68c63f0816231d33e011401d2a3a Mon Sep 17 00:00:00 2001 From: Thunderbrook <52529258+Thunderbrook@users.noreply.github.com> Date: Sun, 26 Sep 2021 15:34:07 +0800 Subject: [PATCH 003/298] set file_num in one shard (#35835) * set file_num in one shard * format --- paddle/fluid/framework/fleet/fleet_wrapper.cc | 14 ++++++++++++++ paddle/fluid/framework/fleet/fleet_wrapper.h | 1 + paddle/fluid/pybind/fleet_wrapper_py.cc | 2 ++ 
.../fleet/parameter_server/pslib/__init__.py | 15 +++++++++++++++ 4 files changed, 32 insertions(+) diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.cc b/paddle/fluid/framework/fleet/fleet_wrapper.cc index dc5e24ef5de42f..4346c144fab7f2 100644 --- a/paddle/fluid/framework/fleet/fleet_wrapper.cc +++ b/paddle/fluid/framework/fleet/fleet_wrapper.cc @@ -1347,6 +1347,20 @@ void FleetWrapper::PrintTableStat(const uint64_t table_id) { #endif } +void FleetWrapper::SetFileNumOneShard(const uint64_t table_id, int file_num) { +#ifdef PADDLE_WITH_PSLIB + auto ret = + pslib_ptr_->_worker_ptr->set_file_num_one_shard(table_id, file_num); + ret.wait(); + int32_t err_code = ret.get(); + if (err_code == -1) { + LOG(ERROR) << "set_file_num_one_shard failed"; + } +#else + VLOG(0) << "FleetWrapper::SetFileNumOneShard does nothing when no pslib"; +#endif +} + double FleetWrapper::GetCacheThreshold(int table_id) { #ifdef PADDLE_WITH_PSLIB double cache_threshold = 0.0; diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.h b/paddle/fluid/framework/fleet/fleet_wrapper.h index c1db06a298c861..d368b421ff2a05 100644 --- a/paddle/fluid/framework/fleet/fleet_wrapper.h +++ b/paddle/fluid/framework/fleet/fleet_wrapper.h @@ -266,6 +266,7 @@ class FleetWrapper { bool load_combine); void PrintTableStat(const uint64_t table_id); + void SetFileNumOneShard(const uint64_t table_id, int file_num); // mode = 0, load all feature // mode = 1, load delta feature, which means load diff void LoadModel(const std::string& path, const int mode); diff --git a/paddle/fluid/pybind/fleet_wrapper_py.cc b/paddle/fluid/pybind/fleet_wrapper_py.cc index 873476629cb78f..d8142f717baed8 100644 --- a/paddle/fluid/pybind/fleet_wrapper_py.cc +++ b/paddle/fluid/pybind/fleet_wrapper_py.cc @@ -76,6 +76,8 @@ void BindFleetWrapper(py::module* m) { .def("shrink_sparse_table", &framework::FleetWrapper::ShrinkSparseTable) .def("shrink_dense_table", &framework::FleetWrapper::ShrinkDenseTable) .def("print_table_stat", &framework::FleetWrapper::PrintTableStat) + .def("set_file_num_one_shard", + &framework::FleetWrapper::SetFileNumOneShard) .def("client_flush", &framework::FleetWrapper::ClientFlush) .def("load_from_paddle_model", &framework::FleetWrapper::LoadFromPaddleModel) diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py index 39cf3ebeb32a95..e8d9cc3b77b6a8 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py @@ -327,6 +327,21 @@ def print_table_stat(self, table_id): self._fleet_ptr.print_table_stat(table_id) self._role_maker._barrier_worker() + def set_file_num_one_shard(self, table_id, file_num): + """ + set file_num in one shard + Args: + table_id(int): the id of table + file_num(int): file num in one shard + Example: + .. 
code-block:: python + fleet.set_file_num_one_shard(0, 5) + """ + self._role_maker._barrier_worker() + if self._role_maker.is_first_worker(): + self._fleet_ptr.set_file_num_one_shard(table_id, file_num) + self._role_maker._barrier_worker() + def save_persistables(self, executor, dirname, main_program=None, **kwargs): """ save presistable parameters, From c330c3d9c82043695531153cdbc724990d8c434c Mon Sep 17 00:00:00 2001 From: andyjpaddle <87074272+andyjpaddle@users.noreply.github.com> Date: Sun, 26 Sep 2021 16:26:16 +0800 Subject: [PATCH 004/298] fix pinv api explosure rule (#36093) --- python/paddle/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index e4f0860e3be198..2efecf9ce4a84a 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -106,7 +106,6 @@ from .tensor.linalg import multi_dot # noqa: F401 from .tensor.linalg import matrix_power # noqa: F401 from .tensor.linalg import svd # noqa: F401 -from .tensor.linalg import pinv # noqa: F401 from .tensor.linalg import solve # noqa: F401 from .tensor.logic import equal # noqa: F401 from .tensor.logic import greater_equal # noqa: F401 From 52b450072429a91af31fae743156ed7154cf749a Mon Sep 17 00:00:00 2001 From: zhangkaihuo Date: Sun, 26 Sep 2021 17:36:14 +0800 Subject: [PATCH 005/298] update multi_dot exposure rules (#36018) --- python/paddle/__init__.py | 1 - .../tests/unittests/test_multi_dot_op.py | 18 ++- python/paddle/tensor/__init__.py | 1 + python/paddle/tensor/linalg.py | 138 +++++++++--------- 4 files changed, 80 insertions(+), 78 deletions(-) diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index 2efecf9ce4a84a..024415664d8a66 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -103,7 +103,6 @@ from .tensor.linalg import mv # noqa: F401 from .tensor.linalg import det # noqa: F401 from .tensor.linalg import slogdet # noqa: F401 -from .tensor.linalg import multi_dot # noqa: F401 from .tensor.linalg import matrix_power # noqa: F401 from .tensor.linalg import svd # noqa: F401 from .tensor.linalg import solve # noqa: F401 diff --git a/python/paddle/fluid/tests/unittests/test_multi_dot_op.py b/python/paddle/fluid/tests/unittests/test_multi_dot_op.py index 97047b1ae0e5e0..8856624b4efc72 100644 --- a/python/paddle/fluid/tests/unittests/test_multi_dot_op.py +++ b/python/paddle/fluid/tests/unittests/test_multi_dot_op.py @@ -198,32 +198,34 @@ def test_errors(self): paddle.static.Program()): # The inputs type of multi_dot must be list matrix. input1 = 12 - self.assertRaises(TypeError, paddle.multi_dot, [input1, input1]) + self.assertRaises(TypeError, paddle.linalg.multi_dot, + [input1, input1]) # The inputs dtype of multi_dot must be float64, float64 or float16. 
input2 = paddle.static.data( name='input2', shape=[10, 10], dtype="int32") - self.assertRaises(TypeError, paddle.multi_dot, [input2, input2]) + self.assertRaises(TypeError, paddle.linalg.multi_dot, + [input2, input2]) # the number of tensor must be larger than 1 x0 = paddle.static.data(name='x0', shape=[3, 2], dtype="float64") - self.assertRaises(ValueError, paddle.multi_dot, [x0]) + self.assertRaises(ValueError, paddle.linalg.multi_dot, [x0]) #the first tensor must be 1D or 2D x1 = paddle.static.data(name='x1', shape=[3, 2, 3], dtype="float64") x2 = paddle.static.data(name='x2', shape=[3, 2], dtype="float64") - self.assertRaises(ValueError, paddle.multi_dot, [x1, x2]) + self.assertRaises(ValueError, paddle.linalg.multi_dot, [x1, x2]) #the last tensor must be 1D or 2D x3 = paddle.static.data(name='x3', shape=[3, 2], dtype="float64") x4 = paddle.static.data(name='x4', shape=[3, 2, 2], dtype="float64") - self.assertRaises(ValueError, paddle.multi_dot, [x3, x4]) + self.assertRaises(ValueError, paddle.linalg.multi_dot, [x3, x4]) #the tensor must be 2D, except first and last tensor x5 = paddle.static.data(name='x5', shape=[3, 2], dtype="float64") x6 = paddle.static.data(name='x6', shape=[2], dtype="float64") x7 = paddle.static.data(name='x7', shape=[2, 2], dtype="float64") - self.assertRaises(ValueError, paddle.multi_dot, [x5, x6, x7]) + self.assertRaises(ValueError, paddle.linalg.multi_dot, [x5, x6, x7]) class APITestMultiDot(unittest.TestCase): @@ -232,7 +234,7 @@ def test_out(self): with paddle.static.program_guard(paddle.static.Program()): x0 = paddle.static.data(name='x0', shape=[3, 2], dtype="float64") x1 = paddle.static.data(name='x1', shape=[2, 3], dtype='float64') - result = paddle.multi_dot([x0, x1]) + result = paddle.linalg.multi_dot([x0, x1]) exe = paddle.static.Executor(paddle.CPUPlace()) data1 = np.random.rand(3, 2).astype("float64") data2 = np.random.rand(2, 3).astype("float64") @@ -254,7 +256,7 @@ def test_dygraph_without_out(self): input_array2 = np.random.rand(4, 3).astype("float64") data1 = paddle.to_tensor(input_array1) data2 = paddle.to_tensor(input_array2) - out = paddle.multi_dot([data1, data2]) + out = paddle.linalg.multi_dot([data1, data2]) expected_result = np.linalg.multi_dot([input_array1, input_array2]) self.assertTrue(np.allclose(expected_result, out.numpy())) diff --git a/python/paddle/tensor/__init__.py b/python/paddle/tensor/__init__.py index 02b34bb21a7920..080a06455a681a 100755 --- a/python/paddle/tensor/__init__.py +++ b/python/paddle/tensor/__init__.py @@ -387,6 +387,7 @@ 'bitwise_not', 'broadcast_tensors', 'uniform_', + 'multi_dot', 'solve', ] diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py index 6c898f2d607c9f..9f2c4316d542db 100644 --- a/python/paddle/tensor/linalg.py +++ b/python/paddle/tensor/linalg.py @@ -551,8 +551,8 @@ def cond(x, p=None, name=None): Computes the condition number of a matrix or batches of matrices with respect to a matrix norm ``p``. Args: - x (Tensor): The input tensor could be tensor of shape ``(*, m, n)`` where ``*`` is zero or more batch dimensions - for ``p`` in ``(2, -2)``, or of shape ``(*, n, n)`` where every matrix is invertible for any supported ``p``. + x (Tensor): The input tensor could be tensor of shape ``(*, m, n)`` where ``*`` is zero or more batch dimensions + for ``p`` in ``(2, -2)``, or of shape ``(*, n, n)`` where every matrix is invertible for any supported ``p``. And the input data type could be ``float32`` or ``float64``. p (float|string, optional): Order of the norm. 
Supported values are `fro`, `nuc`, `1`, `-1`, `2`, `-2`, `inf`, `-inf`. Default value is `None`, meaning that the order of the norm is `2`. @@ -607,7 +607,7 @@ def cond(x, p=None, name=None): # out_minus_inf.numpy() [1.] a = paddle.to_tensor(np.random.randn(2, 4, 4).astype('float32')) - # a.numpy() + # a.numpy() # [[[ 0.14063153 -0.996288 0.7996131 -0.02571543] # [-0.16303636 1.5534962 -0.49919784 -0.04402903] # [-1.1341571 -0.6022629 0.5445269 0.29154757] @@ -975,8 +975,8 @@ def t(input, name=None): return out check_variable_and_dtype( - input, 'input', ['float16', 'float32', 'float64', 'int32', 'int64'], - 'transpose') + input, 'input', ['float16', 'float32', 'float64', 'int32', + 'int64'], 'transpose') helper = LayerHelper('t', **locals()) out = helper.create_variable_for_type_inference(input.dtype) @@ -1108,17 +1108,17 @@ def matrix_rank(x, tol=None, hermitian=False, name=None): r""" Computes the rank of a matrix. - The rank of a matrix is the number of singular values that are greater than the specified `tol` threshold when hermitian=False, + The rank of a matrix is the number of singular values that are greater than the specified `tol` threshold when hermitian=False, or the number of eigenvalues in absolute value that are greater than the specified `tol` threshold when hermitian=True. Args: - x (Tensor): The input tensor. Its shape should be `[..., m, n]`, where `...` is zero or more batch dimensions. If `x` is a batch - of matrices then the output has the same batch dimensions. The data type of `x` should be float32 or float64. - tol (float,Tensor,optional): the tolerance value. Default: None. If `tol` is not specified, and `sigma` is the largest - singular value (or eigenvalues in absolute value), and `eps` is the epsilon value for the dtype of `x`, then `tol` is computed + x (Tensor): The input tensor. Its shape should be `[..., m, n]`, where `...` is zero or more batch dimensions. If `x` is a batch + of matrices then the output has the same batch dimensions. The data type of `x` should be float32 or float64. + tol (float,Tensor,optional): the tolerance value. Default: None. If `tol` is not specified, and `sigma` is the largest + singular value (or eigenvalues in absolute value), and `eps` is the epsilon value for the dtype of `x`, then `tol` is computed with formula `tol=sigma * max(m,n) * eps`. Note that if `x` is a batch of matrices, `tol` is computed this way for every batch. - hermitian (bool,optional): indicates whether `x` is Hermitian. Default: False. When hermitian=True, `x` is assumed to be Hermitian, - enabling a more efficient method for finding eigenvalues, but `x` is not checked inside the function. Instead, We just use + hermitian (bool,optional): indicates whether `x` is Hermitian. Default: False. When hermitian=True, `x` is assumed to be Hermitian, + enabling a more efficient method for finding eigenvalues, but `x` is not checked inside the function. Instead, We just use the lower triangular of the matrix to compute. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. @@ -1225,7 +1225,7 @@ def bmm(x, y, name=None): #output value: #[[[6.0, 6.0],[12.0, 12.0]],[[45.0, 45.0],[60.0, 60.0]]] out_np = out.numpy() - + """ x_shape = x.shape y_shape = y.shape @@ -1360,7 +1360,7 @@ def det(x): Returns: y (Tensor):the determinant value of a square matrix or batches of square matrices. - Example: + Examples: .. 
code-block:: python import paddle @@ -1370,10 +1370,10 @@ def det(x): A = paddle.det(x) print(A) - + # [ 0.02547996, 2.52317095, -6.15900707]) - + """ if in_dygraph_mode(): return core.ops.determinant(x) @@ -1403,7 +1403,7 @@ def slogdet(x): """ Calculates the sign and natural logarithm of the absolute value of a square matrix's or batches square matrices' determinant. The determinant can be computed with ``sign * exp(logabsdet) - + Supports input of float, double Note that for matrices that have zero determinant, this returns ``(0, -inf)`` @@ -1415,7 +1415,7 @@ def slogdet(x): y (Tensor): A tensor containing the sign of the determinant and the natural logarithm of the absolute value of determinant, respectively. - Example: + Examples: .. code-block:: python import paddle @@ -1425,7 +1425,7 @@ def slogdet(x): A = paddle.slogdet(x) print(A) - + # [[ 1. , 1. , -1. ], # [-0.98610914, -0.43010661, -0.10872950]]) @@ -1461,19 +1461,19 @@ def svd(x, full_matrices=False, name=None): Let :math:`X` be the input matrix or a batch of input matrices, the output should satisfies: .. math:: - X = U * diag(S) * VT - + X = U * diag(S) * VT + Args: x (Tensor): The input tensor. Its shape should be `[..., N, M]`, where `...` is zero or more batch dimensions. N and M can be arbitraty - positive number. Note that if x is sigular matrices, the grad is numerical - instable. The data type of x should be float32 or float64. - full_matrices (bool): A flag to control the behavor of svd. - If full_matrices = True, svd op will compute full U and V matrics, + positive number. Note that if x is sigular matrices, the grad is numerical + instable. The data type of x should be float32 or float64. + full_matrices (bool): A flag to control the behavor of svd. + If full_matrices = True, svd op will compute full U and V matrics, which means shape of U is `[..., N, N]`, shape of V is `[..., M, M]`. K = min(M, N). - If full_matrices = False, svd op will use a economic method to store U and V. + If full_matrices = False, svd op will use a economic method to store U and V. which means shape of U is `[..., N, K]`, shape of V is `[..., M, K]`. K = min(M, N). - name (str, optional): Name for the operation (optional, default is None). + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: @@ -1497,9 +1497,9 @@ def svd(x, full_matrices=False, name=None): print (vh) #VT= [[ 0.51411221, 0.85772294], # [ 0.85772294, -0.51411221]] - + # one can verify : U * S * VT == X - # U * UH == I + # U * UH == I # V * VH == I """ @@ -1526,7 +1526,7 @@ def svd(x, full_matrices=False, name=None): def matrix_power(x, n, name=None): r""" Computes the n-th power of a square matrix or a batch of square matrices. - + Let :math:`X` be a sqaure matrix or a batch of square matrices, :math:`n` be an exponent, the equation should be: @@ -1596,27 +1596,27 @@ def matrix_power(x, n, name=None): def eigvals(x, name=None): """ Compute the eigenvalues of one or more general matrices. - - Warning: - The gradient kernel of this operator does not yet developed. + + Warning: + The gradient kernel of this operator does not yet developed. If you need back propagation through this operator, please replace it with paddle.linalg.eig. Args: x (Tensor): A square matrix or a batch of square matrices whose eigenvalues will be computed. - Its shape should be `[*, M, M]`, where `*` is zero or more batch dimensions. + Its shape should be `[*, M, M]`, where `*` is zero or more batch dimensions. 
Its data type should be float32, float64, complex64, or complex128. - name (str, optional): Name for the operation (optional, default is None). + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: - Tensor: A tensor containing the unsorted eigenvalues which has the same batch dimensions with `x`. + Tensor: A tensor containing the unsorted eigenvalues which has the same batch dimensions with `x`. The eigenvalues are complex-valued even when `x` is real. Examples: .. code-block:: python import paddle - + paddle.set_device("cpu") paddle.seed(1234) @@ -1630,8 +1630,8 @@ def eigvals(x, name=None): """ check_variable_and_dtype(x, 'dtype', - ['float32', 'float64', 'complex64', 'complex128'], - 'eigvals') + ['float32', 'float64', 'complex64', + 'complex128'], 'eigvals') x_shape = list(x.shape) if len(x_shape) < 2: @@ -1657,7 +1657,7 @@ def multi_dot(x, name=None): """ Multi_dot is an operator that calculates multiple matrix multiplications. - Supports inputs of float, double and float16 dtypes. This function does not + Supports inputs of float16(only GPU support), float32 and float64 dtypes. This function does not support batched inputs. The input tensor in [x] must be 2-D except for the first and last can be 1-D. @@ -1699,7 +1699,7 @@ def multi_dot(x, name=None): B_data = np.random.random([4, 5]).astype(np.float32) A = paddle.to_tensor(A_data) B = paddle.to_tensor(B_data) - out = paddle.multi_dot([A, B]) + out = paddle.linalg.multi_dot([A, B]) print(out.numpy().shape) # [3, 5] @@ -1710,7 +1710,7 @@ def multi_dot(x, name=None): A = paddle.to_tensor(A_data) B = paddle.to_tensor(B_data) C = paddle.to_tensor(C_data) - out = paddle.multi_dot([A, B, C]) + out = paddle.linalg.multi_dot([A, B, C]) print(out.numpy().shape) # [10, 7] @@ -1735,7 +1735,7 @@ def multi_dot(x, name=None): def eigh(x, UPLO='L', name=None): """ - Compute the eigenvalues and eigenvectors of a + Compute the eigenvalues and eigenvectors of a complex Hermitian (conjugate symmetric) or a real symmetric matrix. Args: @@ -1804,7 +1804,7 @@ def __check_input(x, UPLO): def pinv(x, rcond=1e-15, hermitian=False, name=None): r""" - Calculate pseudo inverse via SVD(singular value decomposition) + Calculate pseudo inverse via SVD(singular value decomposition) of one matrix or batches of regular matrix. .. math:: @@ -1815,30 +1815,30 @@ def pinv(x, rcond=1e-15, hermitian=False, name=None): else: x = u * s * ut (eigh) out = u * 1/s * u.conj().transpose(-2,-1) - + If x is hermitian or symmetric matrix, svd will be replaced with eigh. Args: - x(Tensor): The input tensor. Its shape should be (*, m, n) - where * is zero or more batch dimensions. m and n can be - arbitraty positive number. The data type of x should be + x(Tensor): The input tensor. Its shape should be (*, m, n) + where * is zero or more batch dimensions. m and n can be + arbitraty positive number. The data type of x should be float32 or float64 or complex64 or complex128. When data type is complex64 or cpmplex128, hermitian should be set True. - rcond(Tensor, optional): the tolerance value to determine - when is a singular value zero. Defalut:1e-15. - - hermitian(bool, optional): indicates whether x is Hermitian + rcond(Tensor, optional): the tolerance value to determine + when is a singular value zero. Defalut:1e-15. + + hermitian(bool, optional): indicates whether x is Hermitian if complex or symmetric if real. Default: False. - - name(str|None): A name for this layer(optional). 
If set None, + + name(str|None): A name for this layer(optional). If set None, the layer will be named automatically. - + Returns: - Tensor: The tensor with same data type with x. it represents + Tensor: The tensor with same data type with x. it represents pseudo inverse of x. Its shape should be (*, n, m). - + Examples: .. code-block:: python @@ -1998,8 +1998,8 @@ def pinv(x, rcond=1e-15, hermitian=False, name=None): helper = LayerHelper('pinv', **locals()) dtype = x.dtype check_variable_and_dtype( - x, 'dtype', ['float32', 'float64', 'complex64', 'complex128'], - 'pinv') + x, 'dtype', ['float32', 'float64', 'complex64', + 'complex128'], 'pinv') if dtype == paddle.complex128: s_type = 'float64' @@ -2079,40 +2079,40 @@ def solve(x, y, name=None): Computes the solution of a square system of linear equations with a unique solution for input 'X' and 'Y'. Let :math: `X` be a sqaure matrix or a batch of square matrices, :math:`Y` be a vector/matrix or a batch of vectors/matrices, the equation should be: - + .. math:: Out = X^-1 * Y Specifically, - This system of linear equations has one solution if and only if input 'X' is invertible. - + Args: x (Tensor): A square matrix or a batch of square matrices. Its shape should be `[*, M, M]`, where `*` is zero or more batch dimensions. Its data type should be float32 or float64. y (Tensor): A vector/matrix or a batch of vectors/matrices. Its shape should be `[*, M, K]`, where `*` is zero or more batch dimensions. Its data type should be float32 or float64. - name(str, optional): Name for the operation (optional, default is None). + name(str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. - + Returns: - Tensor: The solution of a square system of linear equations with a unique solution for input 'x' and 'y'. + Tensor: The solution of a square system of linear equations with a unique solution for input 'x' and 'y'. Its data type should be the same as that of `x`. - + Examples: .. 
code-block:: python - + # a square system of linear equations: # 2*X0 + X1 = 9 # X0 + 2*X1 = 8 - + import paddle import numpy as np - + np_x = np.array([[3, 1],[1, 2]]) np_y = np.array([9, 8]) x = paddle.to_tensor(np_x, dtype="float64") y = paddle.to_tensor(np_y, dtype="float64") out = paddle.linalg.solve(x, y) - + print(out) # [2., 3.]) """ From 991ae3b6d3e19e8c4f011a78bc1a5c08078e161f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?LJQ=E2=9D=A4=EF=B8=8F?= <33169170+lijiaqi0612@users.noreply.github.com> Date: Sun, 26 Sep 2021 17:58:36 +0800 Subject: [PATCH 006/298] Correct the misspelled part of the unit test (#36044) --- .../fluid/tests/unittests/fft/test_fft.py | 242 ++++++++++++------ 1 file changed, 166 insertions(+), 76 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/fft/test_fft.py b/python/paddle/fluid/tests/unittests/fft/test_fft.py index 26355e0411fa3f..c83c943217d4e6 100644 --- a/python/paddle/fluid/tests/unittests/fft/test_fft.py +++ b/python/paddle/fluid/tests/unittests/fft/test_fft.py @@ -108,6 +108,8 @@ def decorate(cls): ('test_norm_ortho', rand_x(5), None, 3, 'ortho')]) class TestFft(unittest.TestCase): def test_fft(self): + """Test fft with norm condition + """ with paddle.fluid.dygraph.guard(self.place): self.assertTrue( np.allclose( @@ -127,7 +129,14 @@ def test_fft(self): ('test_norm_not_in_enum_value', rand_x(2), None, -1, 'random', ValueError) ]) class TestFftException(unittest.TestCase): - def test_Fft(self): + def test_fft(self): + """Test fft with buoudary condition + Test case include: + - n out of range + - axis out of range + - axis type error + - norm out of range + """ with self.assertRaises(self.expect_exception): paddle.fft.fft( paddle.to_tensor(self.x), self.n, self.axis, self.norm) @@ -149,7 +158,9 @@ def test_Fft(self): ('test_norm_ortho', rand_x(5), None, (0, 1), 'ortho'), ]) class TestFft2(unittest.TestCase): - def test_Fft2(self): + def test_fft2(self): + """Test fft2 with norm condition + """ with paddle.fluid.dygraph.guard(self.place): self.assertTrue( np.allclose( @@ -178,6 +189,15 @@ def test_Fft2(self): ('test_norm_not_enum', rand_x(2), None, -1, 'random', ValueError)]) class TestFft2Exception(unittest.TestCase): def test_fft2(self): + """Test fft2 with buoudary condition + Test case include: + - input type error + - input dim error + - n out of range + - axis out of range + - axis type error + - norm out of range + """ with paddle.fluid.dygraph.guard(self.place): with self.assertRaises(self.expect_exception): paddle.fft.fft2( @@ -198,7 +218,9 @@ def test_fft2(self): 'backward'), ('test_norm_forward', rand_x(5), None, None, 'forward'), ('test_norm_ortho', rand_x(5), None, None, 'ortho')]) class TestFftn(unittest.TestCase): - def test_Fftn(self): + def test_fftn(self): + """Test fftn with norm condition + """ with paddle.fluid.dygraph.guard(self.place): np.testing.assert_allclose( scipy.fft.fftn(self.x, self.n, self.axis, self.norm), @@ -230,10 +252,9 @@ def test_Fftn(self): "ortho"), ]) class TestHfft(unittest.TestCase): - """Test hfft with norm condition - """ - def test_hfft(self): + """Test hfft with norm condition + """ with paddle.fluid.dygraph.guard(self.place): np.testing.assert_allclose( scipy.fft.hfft(self.x, self.n, self.axis, self.norm), @@ -265,10 +286,9 @@ def test_hfft(self): "ortho"), ]) class TestIrfft(unittest.TestCase): - """Test irfft with norm condition - """ - def test_irfft(self): + """Test irfft with norm condition + """ with paddle.fluid.dygraph.guard(self.place): np.testing.assert_allclose( scipy.fft.irfft(self.x, 
self.n, self.axis, self.norm), @@ -299,11 +319,10 @@ def test_irfft(self): np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), None, None, "ortho"), ]) -class Testirfftn(unittest.TestCase): - """Test irfftn with norm condition - """ - +class TestIrfftn(unittest.TestCase): def test_irfftn(self): + """Test irfftn with norm condition + """ with paddle.fluid.dygraph.guard(self.place): np.testing.assert_allclose( scipy.fft.irfftn(self.x, self.n, self.axis, self.norm), @@ -334,11 +353,10 @@ def test_irfftn(self): np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), None, None, "ortho"), ]) -class Testhfftn(unittest.TestCase): - """Test hfftn with norm condition - """ - +class TestHfftn(unittest.TestCase): def test_hfftn(self): + """Test hfftn with norm condition + """ with paddle.fluid.dygraph.guard(self.place): np.testing.assert_allclose( scipy.fft.hfftn(self.x, self.n, self.axis, self.norm), @@ -365,11 +383,10 @@ def test_hfftn(self): np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), None, (-2, -1), "ortho"), ]) -class Testhfft2(unittest.TestCase): - """Test hfft2 with norm condition - """ - +class TestHfft2(unittest.TestCase): def test_hfft2(self): + """Test hfft2 with norm condition + """ with paddle.fluid.dygraph.guard(self.place): np.testing.assert_allclose( scipy.fft.hfft2(self.x, self.s, self.axis, self.norm), @@ -398,10 +415,9 @@ def test_hfft2(self): "ortho"), ]) class TestIrfft2(unittest.TestCase): - """Test irfft2 with norm condition - """ - def test_irfft2(self): + """Test irfft2 with norm condition + """ with paddle.fluid.dygraph.guard(self.place): np.testing.assert_allclose( scipy.fft.irfft2(self.x, self.s, self.axis, self.norm), @@ -434,14 +450,16 @@ def test_irfft2(self): np.random.randn(4, 4) + 1j * np.random.randn(4, 4), None, -1, 'random', ValueError)]) class TestHfftException(unittest.TestCase): - '''Test hfft with buoudary condition - Test case include: - - n out of range - - axis out of range - - norm out of range - ''' - def test_hfft(self): + """Test hfft with buoudary condition + Test case include: + Test case include: + - n out of range + - n type error + - axis out of range + - axis type error + - norm out of range + """ with paddle.fluid.dygraph.guard(self.place): with self.assertRaises(self.expect_exception): paddle.fft.hfft( @@ -466,15 +484,16 @@ def test_hfft(self): np.random.randn(4, 4) + 1j * np.random.randn(4, 4), None, None, 'random', ValueError)]) class TestIrfftException(unittest.TestCase): - '''Test Irfft with buoudary condition - Test case include: - - n out of range - - axis out of range - - norm out of range - - the dimensions of n and axis are different - ''' - def test_irfft(self): + """ + Test irfft with buoudary condition + Test case include: + - n out of range + - n type error + - axis type error + - axis out of range + - norm out of range + """ with paddle.fluid.dygraph.guard(self.place): with self.assertRaises(self.expect_exception): paddle.fft.irfft( @@ -505,15 +524,17 @@ def test_irfft(self): np.random.randn(4, 4) + 1j * np.random.randn(4, 4), None, None, 'random', ValueError)]) class TestHfft2Exception(unittest.TestCase): - '''Test hfft2 with buoudary condition - Test case include: - - n out of range - - axis out of range - - the dimensions of n and axis are different - - norm out of range - ''' - def test_hfft2(self): + """ + Test hfft2 with buoudary condition + Test case include: + - input type error + - n type error + - n out of range + - axis out of range + - the dimensions of n and axis are different + - norm out of 
range + """ with paddle.fluid.dygraph.guard(self.place): with self.assertRaises(self.expect_exception): paddle.fft.hfft2( @@ -544,15 +565,17 @@ def test_hfft2(self): np.random.randn(4, 4) + 1j * np.random.randn(4, 4), None, None, 'random', ValueError)]) class TestIrfft2Exception(unittest.TestCase): - '''Test irfft2 with buoudary condition - Test case include: - - n out of range - - axis out of range - - norm out of range - - the dimensions of n and axis are different - ''' - def test_irfft2(self): + """ + Test irfft2 with buoudary condition + Test case include: + - input type error + - n type error + - n out of range + - axis out of range + - the dimensions of n and axis are different + - norm out of range + """ with paddle.fluid.dygraph.guard(self.place): with self.assertRaises(self.expect_exception): paddle.fft.irfft2( @@ -584,15 +607,16 @@ def test_irfft2(self): np.random.randn(4, 4) + 1j * np.random.randn(4, 4), None, None, 'random', ValueError)]) class TestHfftnException(unittest.TestCase): - '''Test hfftn with buoudary condition - Test case include: - - n out of range - - axis out of range - - norm out of range - - the dimensions of n and axis are different - ''' - def test_hfftn(self): + """Test hfftn with buoudary condition + Test case include: + - input type error + - n type error + - n out of range + - axis out of range + - the dimensions of n and axis are different + - norm out of range + """ with paddle.fluid.dygraph.guard(self.place): with self.assertRaises(self.expect_exception): paddle.fft.hfftn( @@ -620,15 +644,15 @@ def test_hfftn(self): np.random.randn(4, 4) + 1j * np.random.randn(4, 4), None, None, 'random', ValueError)]) class TestIrfftnException(unittest.TestCase): - '''Test irfftn with buoudary condition - Test case include: - - n out of range - - axis out of range - - norm out of range - - the dimensions of n and axis are different - ''' - def test_irfftn(self): + """Test irfftn with buoudary condition + Test case include: + - n out of range + - n type error + - axis out of range + - norm out of range + - the dimensions of n and axis are different + """ with paddle.fluid.dygraph.guard(self.place): with self.assertRaises(self.expect_exception): paddle.fft.irfftn( @@ -648,6 +672,8 @@ def test_irfftn(self): ('test_norm_ortho', rand_x(5), None, 3, 'ortho')]) class TestRfft(unittest.TestCase): def test_rfft(self): + """Test rfft with norm condition + """ with paddle.fluid.dygraph.guard(self.place): self.assertTrue( np.allclose( @@ -668,6 +694,14 @@ def test_rfft(self): ]) class TestRfftException(unittest.TestCase): def test_rfft(self): + """Test rfft with buoudary condition + Test case include: + - n out of range + - axis out of range + - axis type error + - norm out of range + - the dimensions of n and axis are different + """ with self.assertRaises(self.expect_exception): paddle.fft.rfft( paddle.to_tensor(self.x), self.n, self.axis, self.norm) @@ -688,6 +722,8 @@ def test_rfft(self): ]) class TestRfft2(unittest.TestCase): def test_rfft2(self): + """Test rfft2 with norm condition + """ with paddle.fluid.dygraph.guard(self.place): self.assertTrue( np.allclose( @@ -715,7 +751,16 @@ def test_rfft2(self): ('test_norm_not_enum', rand_x(2), None, -1, 'random', ValueError), ]) class TestRfft2Exception(unittest.TestCase): - def test_rfft(self): + def test_rfft2(self): + """Test rfft2 with buoudary condition + Test case include: + - input type error + - input dim error + - n out of range + - axis out of range + - norm out of range + - the dimensions of n and axis are different + 
""" with paddle.fluid.dygraph.guard(self.place): with self.assertRaises(self.expect_exception): paddle.fft.rfft2( @@ -736,6 +781,8 @@ def test_rfft(self): ]) class TestRfftn(unittest.TestCase): def test_rfftn(self): + """Test rfftn with norm condition + """ with paddle.fluid.dygraph.guard(self.place): self.assertTrue( np.allclose( @@ -759,7 +806,14 @@ def test_rfftn(self): ValueError), ('test_norm_not_in_enum', rand_x(2), None, -1, 'random', ValueError)]) class TestRfftnException(unittest.TestCase): - def test_rfft(self): + def test_rfftn(self): + """Test rfftn with buoudary condition + Test case include: + - n out of range + - axis out of range + - norm out of range + - the dimensions of n and axis are different + """ with paddle.fluid.dygraph.guard(self.place): with self.assertRaises(self.expect_exception): paddle.fft.rfftn( @@ -779,6 +833,8 @@ def test_rfft(self): ('test_norm_ortho', rand_x(5), None, 3, 'ortho')]) class TestIhfft(unittest.TestCase): def test_ihfft(self): + """Test ihfft with norm condition + """ with paddle.fluid.dygraph.guard(self.place): np.testing.assert_allclose( scipy.fft.ihfft(self.x, self.n, self.axis, self.norm), @@ -798,6 +854,12 @@ def test_ihfft(self): ]) class TestIhfftException(unittest.TestCase): def test_ihfft(self): + """Test ihfft with buoudary condition + Test case include: + - axis type error + - axis out of range + - norm out of range + """ with paddle.fluid.dygraph.guard(self.place): with self.assertRaises(self.expect_exception): paddle.fft.ihfft( @@ -819,6 +881,8 @@ def test_ihfft(self): ]) class TestIhfft2(unittest.TestCase): def test_ihfft2(self): + """Test ihfft2 with norm condition + """ with paddle.fluid.dygraph.guard(self.place): np.testing.assert_allclose( scipy.fft.ihfft2(self.x, self.n, self.axis, self.norm), @@ -844,7 +908,16 @@ def test_ihfft2(self): -10, 'backward', ValueError), ('test_norm_not_enum', rand_x(2), None, -1, 'random', ValueError)]) class TestIhfft2Exception(unittest.TestCase): - def test_rfft(self): + def test_ihfft2(self): + """Test ihfft2 with buoudary condition + Test case include: + - input type error + - input dim error + - n out of range + - axis type error + - axis out of range + - norm out of range + """ with paddle.fluid.dygraph.guard(self.place): with self.assertRaises(self.expect_exception): paddle.fft.ihfft2( @@ -863,7 +936,9 @@ def test_rfft(self): 'backward'), ('test_norm_forward', rand_x(5), None, None, 'forward'), ('test_norm_ortho', rand_x(5), None, None, 'ortho')]) class TestIhfftn(unittest.TestCase): - def test_rfftn(self): + def test_ihfftn(self): + """Test ihfftn with norm condition + """ with paddle.fluid.dygraph.guard(self.place): self.assertTrue( np.allclose( @@ -885,7 +960,14 @@ def test_rfftn(self): ValueError), ('test_norm_not_in_enum', rand_x(2), None, -1, 'random', ValueError)]) class TestIhfftnException(unittest.TestCase): - def test_rfft(self): + def test_ihfftn(self): + """Test ihfftn with buoudary condition + Test case include: + - input type error + - n out of range + - axis out of range + - norm out of range + """ with paddle.fluid.dygraph.guard(self.place): with self.assertRaises(self.expect_exception): paddle.fft.ihfftn( @@ -899,6 +981,8 @@ def test_rfft(self): ]) class TestFftFreq(unittest.TestCase): def test_fftfreq(self): + """Test fftfreq with norm condition + """ with paddle.fluid.dygraph.guard(self.place): np.testing.assert_allclose( scipy.fft.fftfreq(self.n, self.d).astype(self.dtype), @@ -914,6 +998,8 @@ def test_fftfreq(self): ]) class TestRfftFreq(unittest.TestCase): def 
test_rfftfreq(self): + """Test rfftfreq with norm condition + """ with paddle.fluid.dygraph.guard(self.place): np.testing.assert_allclose( scipy.fft.rfftfreq(self.n, self.d).astype(self.dtype), @@ -929,6 +1015,8 @@ def test_rfftfreq(self): ]) class TestFftShift(unittest.TestCase): def test_fftshift(self): + """Test fftshift with norm condition + """ with paddle.fluid.dygraph.guard(self.place): np.testing.assert_allclose( scipy.fft.fftshift(self.x, self.axes), @@ -945,6 +1033,8 @@ def test_fftshift(self): ]) class TestIfftShift(unittest.TestCase): def test_ifftshift(self): + """Test ifftshift with norm condition + """ with paddle.fluid.dygraph.guard(self.place): np.testing.assert_allclose( scipy.fft.ifftshift(self.x, self.axes), From e45d64ec7b456640d8778d1c176799edb8f6b6fc Mon Sep 17 00:00:00 2001 From: JYChen Date: Sun, 26 Sep 2021 19:20:52 +0800 Subject: [PATCH 007/298] [new api] add func/class API psroi_pool and UT (#35352) * add func/class API psroi_pool and UT * add UT in static mode * Remove redundant type checks in static mode * More detailed description for test_psroi_pool_op * fix code format of UT * fix en-doc --- paddle/fluid/operators/psroi_pool_op.cc | 22 +- paddle/fluid/operators/psroi_pool_op.cu | 105 ++++-- paddle/fluid/operators/psroi_pool_op.h | 103 ++++-- paddle/fluid/pybind/op_function_generator.cc | 1 + .../tests/unittests/test_psroi_pool_op.py | 300 ++++++++++++++---- python/paddle/vision/ops.py | 115 ++++++- 6 files changed, 526 insertions(+), 120 deletions(-) diff --git a/paddle/fluid/operators/psroi_pool_op.cc b/paddle/fluid/operators/psroi_pool_op.cc index d3faa2c8460f21..da637dfeb237dd 100644 --- a/paddle/fluid/operators/psroi_pool_op.cc +++ b/paddle/fluid/operators/psroi_pool_op.cc @@ -25,22 +25,26 @@ class PSROIPoolOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { AddInput("X", - "Tensor, " + "(Tensor), " "the input of PSROIPoolOp. " "The format of input tensor is NCHW. Where N is the batch size, " "C is the number of input channels, " "H is the height of the input feature map, and " "W is the width. The data type can be float32 or float64"); AddInput("ROIs", - "LoDTensor, " + "(LoDTensor), " "ROIs (Regions of Interest) to pool over. " "should be a 2-D LoDTensor of shape (num_rois, 4) " "given as [(x1, y1, x2, y2), ...]. " "where (x1, y1) is the top left coordinates, and " "(x2, y2) is the bottom right coordinates. " "The roi batch index can be calculated from LoD."); + AddInput("RoisNum", + "(Tensor), " + "The number of RoIs in each image.") + .AsDispensable(); AddOutput("Out", - "Tensor, " + "(Tensor), " "the output of PSROIPoolOp is a 4-D Tensor with shape " "(num_rois, output_channels, pooled_h, pooled_w). " "The data type is the same as `x` "); @@ -65,8 +69,6 @@ class PSROIPoolOpMaker : public framework::OpProtoAndCheckerMaker { "the pooled output width.") .SetDefault(1); AddComment(R"Doc( -**PSROIPool Operator,** `rois` **of this op should be a LoDTensor** - Position sensitive region of interest pooling (also known as PSROIPooling) is to perform position-sensitive average pooling on regions of interest specified by input, takes as input N position-sensitive score maps and a list of num_rois regions of interest. 
@@ -106,7 +108,14 @@ class PSROIPoolOp : public framework::OperatorWithKernel { platform::errors::InvalidArgument( "ROIs should be a 2-D LoDTensor of shape (num_rois, 4) " "given as [(x1, y1, x2, y2), ...]")); - + if (ctx->HasInput("RoisNum")) { + auto rois_num_dims = ctx->GetInputDim("RoisNum"); + PADDLE_ENFORCE_EQ(rois_num_dims.size(), 1, + platform::errors::InvalidArgument( + "The second dimension of RoisNum should " + "be 1, but received dimension is %d", + rois_num_dims.size())); + } int pooled_height = ctx->Attrs().Get("pooled_height"); int pooled_width = ctx->Attrs().Get("pooled_width"); int output_channels = ctx->Attrs().Get("output_channels"); @@ -184,6 +193,7 @@ class PSROIPoolGradMaker : public framework::SingleGradOpMaker { op->SetType("psroi_pool_grad"); op->SetInput("X", this->Input("X")); op->SetInput("ROIs", this->Input("ROIs")); + op->SetInput("RoisNum", this->Input("RoisNum")); op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); op->SetAttrMap(this->Attrs()); diff --git a/paddle/fluid/operators/psroi_pool_op.cu b/paddle/fluid/operators/psroi_pool_op.cu index 748b6036008f13..f69edfc1fcfec9 100644 --- a/paddle/fluid/operators/psroi_pool_op.cu +++ b/paddle/fluid/operators/psroi_pool_op.cu @@ -185,34 +185,67 @@ class GPUPSROIPoolOpKernel : public framework::OpKernel { int rois_num = rois->dims()[0]; if (rois_num == 0) return; - - auto rois_lod = rois->lod().back(); - int rois_batch_size = rois_lod.size() - 1; - PADDLE_ENFORCE_EQ(rois_batch_size, batch_size, - platform::errors::InvalidArgument( - "The batch size of input(ROIs) and input(X) must be " - "the same but received batch size of input(ROIs) and " - "input(X) is %d and %d respectively.", - rois_batch_size, batch_size)); - int rois_num_with_lod = rois_lod[rois_batch_size]; - PADDLE_ENFORCE_EQ(rois_num, rois_num_with_lod, - platform::errors::InvalidArgument( - "The number of rois from input(ROIs) and its LOD " - "must be the same. 
Received rois %d of input(ROIs) " - "but the number of rois %d from its LOD is %d", - rois_num, rois_num_with_lod)); - - // set rois batch id + int rois_batch_size; framework::Tensor rois_batch_id_list; rois_batch_id_list.Resize({rois_num}); int* rois_batch_id_data = rois_batch_id_list.mutable_data(platform::CPUPlace()); - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - rois_batch_id_data[i] = n; + + if (ctx.HasInput("RoisNum")) { + auto* rois_num_t = ctx.Input("RoisNum"); + rois_batch_size = rois_num_t->numel(); + auto* rois_num_data = rois_num_t->data(); + PADDLE_ENFORCE_EQ( + rois_batch_size, batch_size, + platform::errors::InvalidArgument( + "The batch size of input(ROIs) and input(X) must be " + "the same but received batch size of input(ROIs) and " + "input(X) is %d and %d respectively.", + rois_batch_size, batch_size)); + std::vector rois_num_list(rois_batch_size); + memory::Copy(platform::CPUPlace(), rois_num_list.data(), + BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()), + rois_num_data, sizeof(int) * rois_batch_size, 0); + int rois_num_count = 0; + for (int i = 0; i < rois_batch_size; ++i) { + rois_num_count += rois_num_list[i]; + } + PADDLE_ENFORCE_EQ( + rois_num_count, rois_num, + platform::errors::InvalidArgument( + "the rois_num from input and RoisNum must be the same")); + int start = 0; + for (int n = 0; n < rois_batch_size; ++n) { + for (int i = start; i < start + rois_num_list[n]; ++i) { + rois_batch_id_data[i] = n; + } + start += rois_num_list[n]; + } + } else { + auto rois_lod = rois->lod().back(); + rois_batch_size = rois_lod.size() - 1; + PADDLE_ENFORCE_EQ( + rois_batch_size, batch_size, + platform::errors::InvalidArgument( + "The batch size of input(ROIs) and input(X) must be " + "the same but received batch size of input(ROIs) and " + "input(X) is %d and %d respectively.", + rois_batch_size, batch_size)); + int rois_num_with_lod = rois_lod[rois_batch_size]; + PADDLE_ENFORCE_EQ(rois_num, rois_num_with_lod, + platform::errors::InvalidArgument( + "The number of rois from input(ROIs) and its LOD " + "must be the same. 
Received rois %d of input(ROIs) " + "but the number of rois %d from its LOD is %d", + rois_num, rois_num_with_lod)); + + // set rois batch id + for (int n = 0; n < rois_batch_size; ++n) { + for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { + rois_batch_id_data[i] = n; + } } } - framework::Tensor rois_batch_id_list_gpu; framework::TensorCopy(rois_batch_id_list, ctx.GetPlace(), ctx.device_context(), &rois_batch_id_list_gpu); @@ -257,14 +290,30 @@ class GPUPSROIPoolGradOpKernel : public framework::OpKernel { rois_batch_id_list.Resize({rois_num}); int* rois_batch_id_data = rois_batch_id_list.mutable_data(platform::CPUPlace()); - auto rois_lod = rois->lod().back(); - int rois_batch_size = rois_lod.size() - 1; - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - rois_batch_id_data[i] = n; + int rois_batch_size; + if (ctx.HasInput("RoisNum")) { + auto* rois_num_t = ctx.Input("RoisNum"); + rois_batch_size = rois_num_t->numel(); + std::vector rois_num_list(rois_batch_size); + memory::Copy(platform::CPUPlace(), rois_num_list.data(), + BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()), + rois_num_t->data(), sizeof(int) * rois_batch_size, 0); + int start = 0; + for (int n = 0; n < rois_batch_size; ++n) { + for (int i = start; i < start + rois_num_list[n]; ++i) { + rois_batch_id_data[i] = n; + } + start += rois_num_list[n]; + } + } else { + auto rois_lod = rois->lod().back(); + rois_batch_size = rois_lod.size() - 1; + for (int n = 0; n < rois_batch_size; ++n) { + for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { + rois_batch_id_data[i] = n; + } } } - framework::Tensor rois_batch_id_list_gpu; framework::TensorCopy(rois_batch_id_list, ctx.GetPlace(), ctx.device_context(), &rois_batch_id_list_gpu); diff --git a/paddle/fluid/operators/psroi_pool_op.h b/paddle/fluid/operators/psroi_pool_op.h index 4f4cb24844b8c2..4d7e9ce295fc86 100644 --- a/paddle/fluid/operators/psroi_pool_op.h +++ b/paddle/fluid/operators/psroi_pool_op.h @@ -40,6 +40,13 @@ class CPUPSROIPoolOpKernel : public framework::OpKernel { int width = in_dims[3]; int rois_num = rois->dims()[0]; + PADDLE_ENFORCE_EQ(input_channels, + output_channels * pooled_height * pooled_width, + platform::errors::InvalidArgument( + "the channels of input " + "X should equal the product of " + "output_channels x pooled_height x pooled_width")); + auto in_stride = framework::stride(in_dims); auto out_stride = framework::stride(out->dims()); @@ -49,32 +56,52 @@ class CPUPSROIPoolOpKernel : public framework::OpKernel { rois_batch_id_list.Resize({rois_num}); int* rois_batch_id_data = rois_batch_id_list.mutable_data(ctx.GetPlace()); - - auto rois_lod = rois->lod().back(); - int rois_batch_size = rois_lod.size() - 1; - PADDLE_ENFORCE_EQ( - rois_batch_size, batch_size, - platform::errors::InvalidArgument("the rois_batch_size and input(X) " - "batch_size should be the same.")); - int rois_num_with_lod = rois_lod[rois_batch_size]; - PADDLE_ENFORCE_EQ(rois_num_with_lod, rois_num, - platform::errors::InvalidArgument( - "the rois_num from input and lod must be the same")); - - PADDLE_ENFORCE_EQ(input_channels, - output_channels * pooled_height * pooled_width, - platform::errors::InvalidArgument( - "the channels of input " - "X should equal the product of " - "output_channels x pooled_height x pooled_width")); - - // calculate batch id index for each roi according to LoD - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - rois_batch_id_data[i] = n; + 
int rois_batch_size; + if (ctx.HasInput("RoisNum")) { + auto* rois_num_t = ctx.Input("RoisNum"); + rois_batch_size = rois_num_t->numel(); + auto* rois_num_data = rois_num_t->data(); + PADDLE_ENFORCE_EQ( + rois_batch_size, batch_size, + platform::errors::InvalidArgument( + "The batch size of rois and the batch size of images " + " must be the same. But received the batch size of rois is %d, " + "and the batch size of images is %d", + rois_batch_size, batch_size)); + int rois_num_count = 0; + for (int i = 0; i < rois_batch_size; ++i) { + rois_num_count += rois_num_data[i]; + } + PADDLE_ENFORCE_EQ( + rois_num_count, rois_num, + platform::errors::InvalidArgument( + "the rois_num from input and RoisNum must be the same")); + int start = 0; + for (int n = 0; n < rois_batch_size; ++n) { + for (int i = start; i < start + rois_num_data[n]; ++i) { + rois_batch_id_data[i] = n; + } + start += rois_num_data[n]; + } + } else { + auto rois_lod = rois->lod().back(); + rois_batch_size = rois_lod.size() - 1; + PADDLE_ENFORCE_EQ( + rois_batch_size, batch_size, + platform::errors::InvalidArgument("the rois_batch_size and input(X) " + "batch_size should be the same.")); + int rois_num_with_lod = rois_lod[rois_batch_size]; + PADDLE_ENFORCE_EQ( + rois_num_with_lod, rois_num, + platform::errors::InvalidArgument( + "the rois_num from input and lod must be the same")); + // calculate batch id index for each roi according to LoD + for (int n = 0; n < rois_batch_size; ++n) { + for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { + rois_batch_id_data[i] = n; + } } } - T* output_data = out->mutable_data(ctx.GetPlace()); const T* input_rois = rois->data(); @@ -93,7 +120,6 @@ class CPUPSROIPoolOpKernel : public framework::OpKernel { static_cast(round(offset_input_rois[2]) + 1.) * spatial_scale; T roi_end_h = static_cast(round(offset_input_rois[3]) + 1.) 
* spatial_scale; - // Force too small rois to be 1 x 1 T roi_height = std::max(roi_end_h - roi_start_h, (T)0.1); // avoid 0 T roi_width = std::max(roi_end_w - roi_start_w, (T)0.1); @@ -172,15 +198,28 @@ class CPUPSROIPoolGradOpKernel : public framework::OpKernel { rois_batch_id_list.Resize({rois_num}); int* rois_batch_id_data = rois_batch_id_list.mutable_data(ctx.GetPlace()); - auto rois_lod = rois->lod().back(); - int rois_batch_size = rois_lod.size() - 1; - // calculate batch id index for each roi according to LoD - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - rois_batch_id_data[i] = n; + int rois_batch_size; + if (ctx.HasInput("RoisNum")) { + auto* rois_num_t = ctx.Input("RoisNum"); + rois_batch_size = rois_num_t->numel(); + auto* rois_num_data = rois_num_t->data(); + int start = 0; + for (int n = 0; n < rois_batch_size; ++n) { + for (int i = start; i < start + rois_num_data[n]; ++i) { + rois_batch_id_data[i] = n; + } + start += rois_num_data[n]; + } + } else { + auto rois_lod = rois->lod().back(); + rois_batch_size = rois_lod.size() - 1; + // calculate batch id index for each roi according to LoD + for (int n = 0; n < rois_batch_size; ++n) { + for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { + rois_batch_id_data[i] = n; + } } } - const T* input_rois = rois->data(); const T* output_grad_data = output_grad->data(); T* input_grad_data = input_grad->mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/pybind/op_function_generator.cc b/paddle/fluid/pybind/op_function_generator.cc index 32e14dafb644bf..01d101909b549b 100644 --- a/paddle/fluid/pybind/op_function_generator.cc +++ b/paddle/fluid/pybind/op_function_generator.cc @@ -54,6 +54,7 @@ std::map> op_ins_map = { {"gather", {"X", "Index", "Axis"}}, {"roi_pool", {"X", "ROIs", "RoisNum"}}, {"roi_align", {"X", "ROIs", "RoisNum"}}, + {"psroi_pool", {"X", "ROIs", "RoisNum"}}, {"collect_fpn_proposals", {"MultiLevelRois", "MultiLevelScores", "MultiLevelRoIsNum"}}, {"distribute_fpn_proposals", {"FpnRois", "RoisNum"}}, diff --git a/python/paddle/fluid/tests/unittests/test_psroi_pool_op.py b/python/paddle/fluid/tests/unittests/test_psroi_pool_op.py index 066bcf48612c59..95b8c5c3c0a941 100644 --- a/python/paddle/fluid/tests/unittests/test_psroi_pool_op.py +++ b/python/paddle/fluid/tests/unittests/test_psroi_pool_op.py @@ -14,18 +14,89 @@ from __future__ import print_function +import paddle import math import numpy as np import unittest from op_test import OpTest +def calc_psroi_pool(x, rois, rois_num_per_img, output_channels, spatial_scale, + pooled_height, pooled_width): + """ + Psroi_pool implemented by Numpy. + x: 4-D as (N, C, H, W), + rois: 2-D as [[x1, y1, x2, y2], ...], + rois_num_per_img: 1-D as [nums_of_batch_0, nums_of_batch_1, ...] + """ + output_shape = (len(rois), output_channels, pooled_height, pooled_width) + out_data = np.zeros(output_shape) + batch_id = 0 + rois_num_id = 0 + rois_num_left = rois_num_per_img[rois_num_id] + for i in range(len(rois)): + roi = rois[i] + roi_batch_id = batch_id + rois_num_left -= 1 + if rois_num_left == 0: + rois_num_id += 1 + if rois_num_id < len(rois_num_per_img): + rois_num_left = rois_num_per_img[rois_num_id] + batch_id += 1 + roi_start_w = round(roi[0]) * spatial_scale + roi_start_h = round(roi[1]) * spatial_scale + roi_end_w = (round(roi[2]) + 1.) * spatial_scale + roi_end_h = (round(roi[3]) + 1.) 
* spatial_scale + + roi_height = max(roi_end_h - roi_start_h, 0.1) + roi_width = max(roi_end_w - roi_start_w, 0.1) + + bin_size_h = roi_height / float(pooled_height) + bin_size_w = roi_width / float(pooled_width) + + x_i = x[roi_batch_id] + + for c in range(output_channels): + for ph in range(pooled_height): + for pw in range(pooled_width): + hstart = int( + math.floor(float(ph) * bin_size_h + roi_start_h)) + wstart = int( + math.floor(float(pw) * bin_size_w + roi_start_w)) + hend = int( + math.ceil(float(ph + 1) * bin_size_h + roi_start_h)) + wend = int( + math.ceil(float(pw + 1) * bin_size_w + roi_start_w)) + hstart = min(max(hstart, 0), x.shape[2]) + hend = min(max(hend, 0), x.shape[2]) + wstart = min(max(wstart, 0), x.shape[3]) + wend = min(max(wend, 0), x.shape[3]) + + c_in = (c * pooled_height + ph) * pooled_width + pw + is_empty = (hend <= hstart) or (wend <= wstart) + out_sum = 0. + for ih in range(hstart, hend): + for iw in range(wstart, wend): + out_sum += x_i[c_in, ih, iw] + bin_area = (hend - hstart) * (wend - wstart) + out_data[i, c, ph, pw] = 0. if is_empty else ( + out_sum / float(bin_area)) + return out_data + + class TestPSROIPoolOp(OpTest): def set_data(self): + paddle.enable_static() self.init_test_case() self.make_rois() - self.calc_psroi_pool() - self.inputs = {'X': self.x, 'ROIs': (self.rois[:, 1:5], self.rois_lod)} + self.outs = calc_psroi_pool(self.x, self.boxes, self.boxes_num, + self.output_channels, self.spatial_scale, + self.pooled_height, + self.pooled_width).astype('float64') + self.inputs = { + 'X': self.x, + 'ROIs': (self.rois_with_batch_id[:, 1:5], self.rois_lod) + } self.attrs = { 'output_channels': self.output_channels, 'spatial_scale': self.spatial_scale, @@ -67,57 +138,10 @@ def make_rois(self): roi = [bno, x1, y1, x2, y2] rois.append(roi) self.rois_num = len(rois) - self.rois = np.array(rois).astype('float64') - - def calc_psroi_pool(self): - output_shape = (self.rois_num, self.output_channels, self.pooled_height, - self.pooled_width) - out_data = np.zeros(output_shape) - for i in range(self.rois_num): - roi = self.rois[i] - roi_batch_id = int(roi[0]) - roi_start_w = round(roi[1]) * self.spatial_scale - roi_start_h = round(roi[2]) * self.spatial_scale - roi_end_w = (round(roi[3]) + 1.) * self.spatial_scale - roi_end_h = (round(roi[4]) + 1.) * self.spatial_scale - - roi_height = max(roi_end_h - roi_start_h, 0.1) - roi_width = max(roi_end_w - roi_start_w, 0.1) - - bin_size_h = roi_height / float(self.pooled_height) - bin_size_w = roi_width / float(self.pooled_width) - - x_i = self.x[roi_batch_id] - - for c in range(self.output_channels): - for ph in range(self.pooled_height): - for pw in range(self.pooled_width): - hstart = int( - math.floor(float(ph) * bin_size_h + roi_start_h)) - wstart = int( - math.floor(float(pw) * bin_size_w + roi_start_w)) - hend = int( - math.ceil( - float(ph + 1) * bin_size_h + roi_start_h)) - wend = int( - math.ceil( - float(pw + 1) * bin_size_w + roi_start_w)) - hstart = min(max(hstart, 0), self.height) - hend = min(max(hend, 0), self.height) - wstart = min(max(wstart, 0), self.width) - wend = min(max(wend, 0), self.width) - - c_in = (c * self.pooled_height + ph - ) * self.pooled_width + pw - is_empty = (hend <= hstart) or (wend <= wstart) - out_sum = 0. - for ih in range(hstart, hend): - for iw in range(wstart, wend): - out_sum += x_i[c_in, ih, iw] - bin_area = (hend - hstart) * (wend - wstart) - out_data[i, c, ph, pw] = 0. 
if is_empty else ( - out_sum / float(bin_area)) - self.outs = out_data.astype('float64') + self.rois_with_batch_id = np.array(rois).astype('float64') + self.boxes = self.rois_with_batch_id[:, 1:] + self.boxes_num = np.array( + [bno + 1 for bno in range(self.batch_size)]).astype('int32') def setUp(self): self.op_type = 'psroi_pool' @@ -130,5 +154,175 @@ def test_check_grad(self): self.check_grad(['X'], 'Out') +class TestPSROIPoolDynamicFunctionAPI(unittest.TestCase): + def setUp(self): + self.x = np.random.random([2, 490, 28, 28]).astype(np.float32) + self.boxes = np.array( + [[1, 5, 8, 10], [4, 2, 6, 7], [12, 12, 19, 21]]).astype(np.float32) + self.boxes_num = np.array([1, 2]).astype(np.int32) + + def test_output_size(self): + def test_output_size_is_int(): + output_size = 7 + out = paddle.vision.ops.psroi_pool( + paddle.to_tensor(self.x), + paddle.to_tensor(self.boxes), + paddle.to_tensor(self.boxes_num), output_size).numpy() + expect_out = calc_psroi_pool(self.x, self.boxes, self.boxes_num, 10, + 1.0, 7, 7) + self.assertTrue(np.allclose(out, expect_out)) + + def test_output_size_is_tuple(): + output_size = (7, 7) + out = paddle.vision.ops.psroi_pool( + paddle.to_tensor(self.x), + paddle.to_tensor(self.boxes), + paddle.to_tensor(self.boxes_num), output_size).numpy() + expect_out = calc_psroi_pool(self.x, self.boxes, self.boxes_num, 10, + 1.0, 7, 7) + self.assertTrue(np.allclose(out, expect_out)) + + def test_dytype_is_float64(): + output_size = (7, 7) + out = paddle.vision.ops.psroi_pool( + paddle.to_tensor(self.x, 'float64'), + paddle.to_tensor(self.boxes, 'float64'), + paddle.to_tensor(self.boxes_num, 'int32'), output_size).numpy() + expect_out = calc_psroi_pool(self.x, self.boxes, self.boxes_num, 10, + 1.0, 7, 7) + self.assertTrue(np.allclose(out, expect_out)) + + places = ['cpu'] + if paddle.fluid.core.is_compiled_with_cuda(): + places.append('gpu') + for place in places: + paddle.set_device(place) + test_output_size_is_int() + test_output_size_is_tuple() + test_dytype_is_float64() + + +class TestPSROIPoolDynamicClassAPI(unittest.TestCase): + def setUp(self): + self.x = np.random.random([2, 128, 32, 32]).astype(np.float32) + self.boxes = np.array([[3, 5, 6, 13], [7, 4, 22, 18], [4, 5, 7, 10], + [5, 3, 25, 21]]).astype(np.float32) + self.boxes_num = np.array([2, 2]).astype(np.int32) + + def test_output_size(self): + def test_output_size_is_int(): + psroi_module = paddle.vision.ops.PSRoIPool(8, 1.1) + out = psroi_module( + paddle.to_tensor(self.x), + paddle.to_tensor(self.boxes), + paddle.to_tensor(self.boxes_num)).numpy() + expect_out = calc_psroi_pool(self.x, self.boxes, self.boxes_num, 2, + 1.1, 8, 8) + self.assertTrue(np.allclose(out, expect_out)) + + def test_output_size_is_tuple(): + psroi_pool_module = paddle.vision.ops.PSRoIPool(8, 1.1) + out = psroi_pool_module( + paddle.to_tensor(self.x), + paddle.to_tensor(self.boxes), + paddle.to_tensor(self.boxes_num)).numpy() + expect_out = calc_psroi_pool(self.x, self.boxes, self.boxes_num, 2, + 1.1, 8, 8) + self.assertTrue(np.allclose(out, expect_out)) + + def test_dytype_is_float64(): + psroi_pool_module = paddle.vision.ops.PSRoIPool(8, 1.1) + out = psroi_pool_module( + paddle.to_tensor(self.x, 'float64'), + paddle.to_tensor(self.boxes, 'float64'), + paddle.to_tensor(self.boxes_num, 'int32')).numpy() + expect_out = calc_psroi_pool(self.x, self.boxes, self.boxes_num, 2, + 1.1, 8, 8) + self.assertTrue(np.allclose(out, expect_out)) + + paddle.disable_static() + places = ['cpu'] + if paddle.fluid.core.is_compiled_with_cuda(): + 
places.append('gpu') + for place in places: + paddle.set_device(place) + test_output_size_is_int() + test_output_size_is_tuple() + test_dytype_is_float64() + + +class TestPSROIPoolBoxesNumError(unittest.TestCase): + def setUp(self): + paddle.disable_static() + self.x = paddle.uniform([2, 490, 28, 28], dtype='float32') + self.boxes = paddle.to_tensor( + [[1, 5, 8, 10], [4, 2, 6, 7], [12, 12, 19, 21]], 'float32') + + def test_errors(self): + def test_boxes_num_nums_error(): + boxes_num = paddle.to_tensor([1, 5], 'int32') + out = paddle.vision.ops.psroi_pool( + self.x, self.boxes, boxes_num, output_size=7) + + self.assertRaises(ValueError, test_boxes_num_nums_error) + + def test_boxes_num_length_error(): + boxes_num = paddle.to_tensor([1, 1, 1], 'int32') + out = paddle.vision.ops.psroi_pool( + self.x, self.boxes, boxes_num, output_size=7) + + self.assertRaises(ValueError, test_boxes_num_length_error) + + +class TestPSROIPoolChannelError(unittest.TestCase): + def setUp(self): + paddle.disable_static() + self.x = paddle.uniform([2, 490, 28, 28], dtype='float32') + self.boxes = paddle.to_tensor( + [[1, 5, 8, 10], [4, 2, 6, 7], [12, 12, 19, 21]], 'float32') + self.output_size = 4 + + def test_errors(self): + def test_channel_error(): + boxes_num = paddle.to_tensor([2, 1], 'int32') + out = paddle.vision.ops.psroi_pool(self.x, self.boxes, boxes_num, + self.output_size) + + self.assertRaises(ValueError, test_channel_error) + + +class TestPSROIPoolStaticAPI(unittest.TestCase): + def setUp(self): + paddle.enable_static() + self.x_placeholder = paddle.static.data( + name='x', shape=[2, 490, 28, 28]) + self.x = np.random.random([2, 490, 28, 28]).astype(np.float32) + self.boxes_placeholder = paddle.static.data( + name='boxes', shape=[3, 4], lod_level=1) + self.boxes = np.array( + [[1, 5, 8, 10], [4, 2, 6, 7], [12, 12, 19, 21]]).astype(np.float32) + self.boxes_num = np.array([1, 2]).astype(np.int32) + + def test_function_in_static(self): + output_size = 7 + out = paddle.vision.ops.psroi_pool(self.x_placeholder, + self.boxes_placeholder, + self.boxes_num, output_size) + expect_out = calc_psroi_pool(self.x, self.boxes, self.boxes_num, 10, + 1.0, 7, 7) + places = [paddle.CPUPlace()] + if paddle.fluid.core.is_compiled_with_cuda(): + places.append(paddle.CUDAPlace(0)) + for place in places: + exe = paddle.static.Executor(place) + boxes_lod_data = paddle.fluid.create_lod_tensor(self.boxes, + [[1, 2]], place) + out_res = exe.run(paddle.static.default_main_program(), + feed={'x': self.x, + 'boxes': boxes_lod_data}, + fetch_list=[out.name]) + self.assertTrue(np.allclose(out_res, expect_out)) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/vision/ops.py b/python/paddle/vision/ops.py index d5e73f977b5634..5f02b805a3ed31 100644 --- a/python/paddle/vision/ops.py +++ b/python/paddle/vision/ops.py @@ -29,7 +29,9 @@ 'deform_conv2d', 'DeformConv2D', 'read_file', - 'decode_jpeg' + 'decode_jpeg', + 'psroi_pool', + 'PSRoIPool', ] @@ -900,3 +902,114 @@ def decode_jpeg(x, mode='unchanged', name=None): type="decode_jpeg", inputs=inputs, attrs=attrs, outputs={"Out": out}) return out + + +def psroi_pool(x, boxes, boxes_num, output_size, spatial_scale=1.0, name=None): + """ + Position sensitive region of interest pooling (also known as PSROIPooling) is to perform + position-sensitive average pooling on regions of interest specified by input. It performs + on inputs of nonuniform sizes to obtain fixed-size feature maps. + + PSROIPooling is proposed by R-FCN. 
Please refer to https://arxiv.org/abs/1605.06409 for more details. + + Args: + x (Tensor): Input features with shape (N, C, H, W). The data type can be float32 or float64. + boxes (Tensor): Box coordinates of ROIs (Regions of Interest) to pool over. It should be + a 2-D Tensor with shape (num_rois, 4). Given as [[x1, y1, x2, y2], ...], + (x1, y1) is the top left coordinates, and (x2, y2) is the bottom + right coordinates. + boxes_num (Tensor): The number of boxes contained in each picture in the batch. + output_size (int|Tuple(int, int)) The pooled output size(H, W), data type + is int32. If int, H and W are both equal to output_size. + spatial_scale (float): Multiplicative spatial scale factor to translate ROI coords from their + input scale to the scale used when pooling. Default: 1.0 + name(str, optional): The default value is None. + Normally there is no need for user to set this property. + For more information, please refer to :ref:`api_guide_Name` + + Returns: + 4-D Tensor. The pooled ROIs with shape (num_rois, output_channels, pooled_h, pooled_w). + The output_channels equal to C / (pooled_h * pooled_w), where C is the channels of input. + + Examples: + .. code-block:: python + + import paddle + x = paddle.uniform([2, 490, 28, 28], dtype='float32') + boxes = paddle.to_tensor([[1, 5, 8, 10], [4, 2, 6, 7], [12, 12, 19, 21]], dtype='float32') + boxes_num = paddle.to_tensor([1, 2], dtype='int32') + pool_out = paddle.vision.ops.psroi_pool(x, boxes, boxes_num, 7, 1.0) + """ + + check_type(output_size, 'output_size', (int, tuple, list), 'psroi_pool') + if isinstance(output_size, int): + output_size = (output_size, output_size) + pooled_height, pooled_width = output_size + assert (len(x.shape) == 4, + "Input features with shape should be (N, C, H, W)") + output_channels = int(x.shape[1] / (pooled_height * pooled_width)) + if in_dygraph_mode(): + return core.ops.psroi_pool(x, boxes, boxes_num, "output_channels", + output_channels, "spatial_scale", + spatial_scale, "pooled_height", + pooled_height, "pooled_width", pooled_width) + + helper = LayerHelper('psroi_pool', **locals()) + dtype = helper.input_dtype() + out = helper.create_variable_for_type_inference(dtype) + helper.append_op( + type='psroi_pool', + inputs={'X': x, + 'ROIs': boxes}, + outputs={'Out': out}, + attrs={ + 'output_channels': output_channels, + 'spatial_scale': spatial_scale, + 'pooled_height': pooled_height, + 'pooled_width': pooled_width + }) + return out + + +class PSRoIPool(Layer): + """ + This interface is used to construct a callable object of the ``PSRoIPool`` class. Please + refer to :ref:`api_paddle_vision_ops_psroi_pool`. + + Args: + output_size (int|Tuple(int, int)) The pooled output size(H, W), data type + is int32. If int, H and W are both equal to output_size. + spatial_scale (float): Multiplicative spatial scale factor to translate ROI coords from their + input scale to the scale used when pooling. Default: 1.0. + + Shape: + - x: 4-D Tensor with shape (N, C, H, W). + - boxes: 2-D Tensor with shape (num_rois, 4). + - boxes_num: 1-D Tensor. + - output: 4-D tensor with shape (num_rois, output_channels, pooled_h, pooled_w). + The output_channels equal to C / (pooled_h * pooled_w), where C is the channels of input. + + Returns: + None + + Examples: + .. 
code-block:: python + + import paddle + + psroi_module = paddle.vision.ops.PSRoIPool(7, 1.0) + x = paddle.uniform([2, 490, 28, 28], dtype='float32') + boxes = paddle.to_tensor([[1, 5, 8, 10], [4, 2, 6, 7], [12, 12, 19, 21]], dtype='float32') + boxes_num = paddle.to_tensor([1, 2], dtype='int32') + pool_out = psroi_module(x, boxes, boxes_num) + + """ + + def __init__(self, output_size, spatial_scale=1.0): + super(PSRoIPool, self).__init__() + self.output_size = output_size + self.spatial_scale = spatial_scale + + def forward(self, x, boxes, boxes_num): + return psroi_pool(x, boxes, boxes_num, self.output_size, + self.spatial_scale) From 2fe9ae71f7fc6e0e1d80be4121c7bb50208e983e Mon Sep 17 00:00:00 2001 From: JZ-LIANG <38102074+JZ-LIANG@users.noreply.github.com> Date: Sun, 26 Sep 2021 19:30:21 +0800 Subject: [PATCH 008/298] bugfix reshape -1 (#36087) --- paddle/fluid/operators/reshape_op.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index 51ff8f189b1513..c74f0f0e499b44 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -229,7 +229,7 @@ class ReshapeOp : public framework::OperatorWithKernel { // by now we require that if the input tensor is zero shape, the target // shape of output must be zero if (in_size == 0) { - PADDLE_ENFORCE_EQ( + PADDLE_ENFORCE_LE( capacity, in_size, platform::errors::InvalidArgument( "The 'shape' in ReshapeOp is invalid. " From 7803f403b97c6b390d0f81bc271da5777f48a235 Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Mon, 27 Sep 2021 10:16:08 +0800 Subject: [PATCH 009/298] Revert "auto read all public envs from flags_map in paddle_gtest_main (#36057)" (#36117) This reverts commit 3fabc808857d543831579afa133da48eac94ce48. --- paddle/testing/paddle_gtest_main.cc | 53 ++++++++++++++++++++++++----- 1 file changed, 45 insertions(+), 8 deletions(-) diff --git a/paddle/testing/paddle_gtest_main.cc b/paddle/testing/paddle_gtest_main.cc index e94805be5a1474..6feef11a366d97 100644 --- a/paddle/testing/paddle_gtest_main.cc +++ b/paddle/testing/paddle_gtest_main.cc @@ -15,7 +15,6 @@ limitations under the License. 
*/ #include "gflags/gflags.h" #include "gtest/gtest.h" #include "paddle/fluid/memory/allocation/allocator_strategy.h" -#include "paddle/fluid/platform/flags.h" #include "paddle/fluid/platform/init.h" #include "paddle/fluid/platform/npu_info.h" @@ -23,11 +22,13 @@ int main(int argc, char** argv) { paddle::memory::allocation::UseAllocatorStrategyGFlag(); testing::InitGoogleTest(&argc, argv); std::vector new_argv; + std::string gflags_env; for (int i = 0; i < argc; ++i) { new_argv.push_back(argv[i]); } std::vector envs; + std::vector undefok; #if defined(PADDLE_WITH_DISTRIBUTE) && !defined(PADDLE_WITH_PSLIB) std::string str_max_body_size; if (::GFLAGS_NAMESPACE::GetCommandLineOption("max_body_size", @@ -37,13 +38,35 @@ int main(int argc, char** argv) { } #endif - const auto& flag_map = paddle::platform::GetExportedFlagInfoMap(); - for (const auto& pair : flag_map) { - const std::string& name = pair.second.name; - if (pair.second.is_writable) { // means public - envs.push_back(name); - } - } +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_ASCEND_CL) + envs.push_back("fraction_of_gpu_memory_to_use"); + envs.push_back("initial_gpu_memory_in_mb"); + envs.push_back("reallocate_gpu_memory_in_mb"); + envs.push_back("allocator_strategy"); + envs.push_back("selected_gpus"); +#elif __clang__ + envs.push_back("use_mkldnn"); + envs.push_back("initial_cpu_memory_in_mb"); + envs.push_back("allocator_strategy"); + + undefok.push_back("use_mkldnn"); + undefok.push_back("initial_cpu_memory_in_mb"); +#else + envs.push_back("use_pinned_memory"); + envs.push_back("use_mkldnn"); + envs.push_back("initial_cpu_memory_in_mb"); + envs.push_back("allocator_strategy"); + + undefok.push_back("use_pinned_memory"); + undefok.push_back("use_mkldnn"); + undefok.push_back("initial_cpu_memory_in_mb"); +#endif + +#if defined(PADDLE_WITH_ASCEND_CL) + envs.push_back("selected_npus"); + envs.push_back("npu_config_path"); +#endif char* env_str = nullptr; if (envs.size() > 0) { @@ -57,6 +80,18 @@ int main(int argc, char** argv) { VLOG(1) << "gtest env_string:" << env_string; } + char* undefok_str = nullptr; + if (undefok.size() > 0) { + std::string undefok_string = "--undefok="; + for (auto t : undefok) { + undefok_string += t + ","; + } + undefok_string = undefok_string.substr(0, undefok_string.length() - 1); + undefok_str = strdup(undefok_string.c_str()); + new_argv.push_back(undefok_str); + VLOG(1) << "gtest undefok_string:" << undefok_string; + } + int new_argc = static_cast(new_argv.size()); char** new_argv_address = new_argv.data(); ::GFLAGS_NAMESPACE::ParseCommandLineFlags( @@ -70,5 +105,7 @@ int main(int argc, char** argv) { #endif if (env_str) free(env_str); + if (undefok_str) free(undefok_str); + return ret; } From 23ccbcb15413a17bbb22a5806bd33c6687baf54e Mon Sep 17 00:00:00 2001 From: Xiaoxu Chen Date: Mon, 27 Sep 2021 10:19:06 +0800 Subject: [PATCH 010/298] update externalErrorMsg.tar.gz md5 value (#36126) --- cmake/third_party.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake index 44463f29923b2e..892ae270267a79 100644 --- a/cmake/third_party.cmake +++ b/cmake/third_party.cmake @@ -252,7 +252,7 @@ if(WITH_GPU) list(APPEND third_party_deps extern_cub) endif() set(URL "https://paddlepaddledeps.bj.bcebos.com/externalErrorMsg.tar.gz" CACHE STRING "" FORCE) - file_download_and_uncompress(${URL} "externalError" MD5 c0749523ebb536eb7382487d645d9cd4) # download file externalErrorMsg.tar.gz + 
file_download_and_uncompress(${URL} "externalError" MD5 061f3b7895aadcbe2c3ed592590f8b10) # download file externalErrorMsg.tar.gz if(WITH_TESTING) # copy externalErrorMsg.pb, just for unittest can get error message correctly. set(SRC_DIR ${THIRD_PARTY_PATH}/externalError/data) From 8db6d221772d95fe96181d199b1458b3707e0cfd Mon Sep 17 00:00:00 2001 From: Haipeng Wang Date: Mon, 27 Sep 2021 12:45:31 +0800 Subject: [PATCH 011/298] support saving model defined parameters without add scale_op (#36119) * add scale_op in model save step is not necessary, just fix the prune method to support static graph and inplace op * fix jit.save, no need to add scale_op to each outputvar anymore. fix prune_with_input, now it supports inplace op * temporarily disable test_trt_dynamic_shape.TRTDynamicShapeOutOfBound2Test * allow user to export parameters defined in model --- python/paddle/fluid/framework.py | 6 +----- python/paddle/fluid/io.py | 3 ++- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 11e7e7c2f7c08c..b6241f6e5299df 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -5074,11 +5074,7 @@ def _prune_with_input(self, feeded_var_names, targets): else: target_op = op - if target_op is None: - raise ValueError( - "The target variable used for pruning should have an " - "associated operator that generates it.") - else: + if target_op is not None: targets_idx.append([target_op.block.idx, target_op.idx]) else: targets_idx.append([t.block.idx, t.idx]) diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index f050b3995be96c..e110c47d790f1e 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -1426,7 +1426,8 @@ def save_inference_model(dirname, main_program.global_block().create_var( name=target_v.name, shape=target_v.shape, - dtype=target_v.dtype) + dtype=target_v.dtype, + persistable=target_v.persistable) prepend_feed_ops(main_program, feeded_var_names) append_fetch_ops(main_program, fetch_var_names) From 6c4a741aceeae92acd3d7f1be44ceba91b5ffa03 Mon Sep 17 00:00:00 2001 From: JingZhuangzhuang <75348594+JZZ-NOTE@users.noreply.github.com> Date: Sun, 26 Sep 2021 23:49:14 -0500 Subject: [PATCH 012/298] [Docker Images] Add cuda11.2 + cudnn8.2.1 + trt8.0.3.4 images (#35982) --- .../dockerfile/build_scripts/install_cudnn.sh | 8 +++++++ tools/dockerfile/build_scripts/install_trt.sh | 5 +++++ tools/dockerfile/centos7_manylinux.sh | 22 +++++++++++++++++++ 3 files changed, 35 insertions(+) diff --git a/tools/dockerfile/build_scripts/install_cudnn.sh b/tools/dockerfile/build_scripts/install_cudnn.sh index e90a0789a34bd4..0817634fa91afb 100644 --- a/tools/dockerfile/build_scripts/install_cudnn.sh +++ b/tools/dockerfile/build_scripts/install_cudnn.sh @@ -37,4 +37,12 @@ elif [[ "$1" == "cudnn811" && "$VERSION" == "10.2" ]]; then cp -r lib64 /usr && cd ../ && \ rm -f cudnn-10.2-linux-x64-v8.1.1.33.tgz && \ rm -rf cuda +elif [[ "$1" == "cudnn821" && "$VERSION" == "11.2" ]]; then + wget -q https://paddle-ci.gz.bcebos.com/cudnn/cudnn-11.3-linux-x64-v8.2.1.32.tgz --no-check-certificate + tar -xzf cudnn-11.3-linux-x64-v8.2.1.32.tgz && \ + cd cuda && \ + cp -r include /usr && \ + cp -r lib64 /usr && cd ../ && \ + rm -f cudnn-11.3-linux-x64-v8.2.1.32.tgz && \ + rm -rf cuda fi diff --git a/tools/dockerfile/build_scripts/install_trt.sh b/tools/dockerfile/build_scripts/install_trt.sh index 2e7917448f2e2e..9e028625de1c3c 100644 --- a/tools/dockerfile/build_scripts/install_trt.sh 
+++ b/tools/dockerfile/build_scripts/install_trt.sh @@ -31,6 +31,11 @@ if [[ "$VERSION" == "10.1" ]];then tar -zxf TensorRT6-cuda10.1-cudnn7.tar.gz -C /usr/local cp -rf /usr/local/TensorRT6-cuda10.1-cudnn7/include/* /usr/include/ && cp -rf /usr/local/TensorRT6-cuda10.1-cudnn7/lib/* /usr/lib/ rm TensorRT6-cuda10.1-cudnn7.tar.gz +elif [[ "$1" == "trt8034" && "$VERSION" == "11.2" ]];then + wget -q https://paddle-ci.gz.bcebos.com/TRT/TensorRT-8.0.3.4.Linux.x86_64-gnu.cuda-11.3.cudnn8.2.tar.gz --no-check-certificate + tar -zxf TensorRT-8.0.3.4.Linux.x86_64-gnu.cuda-11.3.cudnn8.2.tar.gz -C /usr/local + cp -rf /usr/local/TensorRT-8.0.3.4/include/* /usr/include/ && cp -rf /usr/local/TensorRT-8.0.3.4/lib/* /usr/lib/ + rm TensorRT-8.0.3.4.Linux.x86_64-gnu.cuda-11.3.cudnn8.2.tar.gz elif [[ "$VERSION" == "11.2" ]];then wget -q https://paddle-ci.gz.bcebos.com/TRT/TensorRT7-cuda11.1-cudnn8.1.tar.gz --no-check-certificate tar -zxf TensorRT7-cuda11.1-cudnn8.1.tar.gz -C /usr/local diff --git a/tools/dockerfile/centos7_manylinux.sh b/tools/dockerfile/centos7_manylinux.sh index 2435c57d541b03..6038e464097cd4 100755 --- a/tools/dockerfile/centos7_manylinux.sh +++ b/tools/dockerfile/centos7_manylinux.sh @@ -84,6 +84,22 @@ function make_cuda112cudnn8() { sed -i "s#RUN bash build_scripts/build.sh#RUN bash build_scripts/install_gcc.sh gcc82 \nRUN mv /usr/bin/cc /usr/bin/cc.bak \&\& ln -s /usr/local/gcc-8.2/bin/gcc /usr/bin/cc \nENV PATH=/usr/local/gcc-8.2/bin:\$PATH \nRUN bash build_scripts/build.sh#g" Dockerfile.tmp } +function make_cuda112cudnn821trt8034gcc82() { + sed 's//11.2.2-cudnn8-devel-centos7/g' Dockerfile.centos >Dockerfile.tmp + sed -i "s#RUN bash build_scripts/build.sh#RUN bash build_scripts/install_gcc.sh gcc82 \nRUN mv /usr/bin/cc /usr/bin/cc.bak \&\& ln -s /usr/local/gcc-8.2/bin/gcc /usr/bin/cc \nENV PATH=/usr/local/gcc-8.2/bin:\$PATH \nRUN yum remove -y libcudnn8-devel.x86_64 libcudnn8.x86_64 \nRun bash build_scripts/install_cudnn.sh cudnn821 \nENV CUDNN_VERSION=8.2.1 \nRUN bash build_scripts/build.sh#g" Dockerfile.tmp + sed -i "/install_trt.sh/d" Dockerfile.tmp + sed -i "s#RUN bash build_scripts/build.sh#RUN bash build_scripts/install_trt.sh trt8034 \nRUN bash build_scripts/build.sh#g" Dockerfile.tmp + sed -i '/CMD/iRUN ldconfig' Dockerfile.tmp +} + +function make_cuda112cudnn821trt8034gcc54() { + sed 's//11.2.2-cudnn8-devel-centos7/g' Dockerfile.centos >Dockerfile.tmp + sed -i "s#RUN bash build_scripts/build.sh#RUN bash build_scripts/install_gcc.sh gcc54 \nRUN mv /usr/bin/cc /usr/bin/cc.bak \&\& ln -s /usr/local/gcc-5.4/bin/gcc /usr/bin/cc \nENV PATH=/usr/local/gcc-5.4/bin:\$PATH \nRUN yum remove -y libcudnn8-devel.x86_64 libcudnn8.x86_64 \nRun bash build_scripts/install_cudnn.sh cudnn821 \nENV CUDNN_VERSION=8.2.1 \nRUN bash build_scripts/build.sh#g" Dockerfile.tmp + sed -i "/install_trt.sh/d" Dockerfile.tmp + sed -i "s#RUN bash build_scripts/build.sh#RUN bash build_scripts/install_trt.sh trt8034 \nRUN bash build_scripts/build.sh#g" Dockerfile.tmp + sed -i '/CMD/iRUN ldconfig' Dockerfile.tmp +} + function main() { local CMD=$1 case $CMD in @@ -123,6 +139,12 @@ function main() { cuda112cudnn8) make_cuda112cudnn8 ;; + cuda112cudnn821trt8034gcc82) + make_cuda112cudnn821trt8034gcc82 + ;; + cuda112cudnn821trt8034gcc54) + make_cuda112cudnn821trt8034gcc54 + ;; *) echo "Make dockerfile error, Without this paramet." 
exit 1 From 0e5d81c76bf4e5080c4b48715d2f1eda2aa04b7c Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Mon, 27 Sep 2021 13:02:15 +0800 Subject: [PATCH 013/298] Polish multi-thread schedule strategy and Keep one task in current thread (#35928) * Polish multi-thread schedule strategy * fix atomic_deps * modify into lambda function * add and run --- .../framework/new_executor/interpretercore.cc | 102 ++++++++++++------ .../framework/new_executor/interpretercore.h | 10 +- .../new_executor/interpretercore_util.cc | 29 +++-- .../new_executor/interpretercore_util.h | 9 +- .../new_executor/new_executor_defs.h | 9 +- 5 files changed, 102 insertions(+), 57 deletions(-) diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index 7d9d3d5fef14a8..083d989cb52672 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -189,8 +189,6 @@ void InterpreterCore::Convert() { for (auto inst_id : filter_next) { dependecy_count_[inst_id]++; } - vec_instruction_[i].next_instruction_.all_next_ops_ = - std::move(filter_next); } for (size_t i = 0; i < vec_instruction_.size(); ++i) { @@ -356,31 +354,81 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) { void InterpreterCore::ExecuteInstructionList( const std::vector& vec_instr) { - auto atomic_deps = async_work_queue_.PrepareAtomicDeps(dependecy_count_); - auto atomic_var_ref = async_work_queue_.PrepareAtomicVarRef(vec_meta_info_); - std::atomic op_run_number{0}; + async_work_queue_.PrepareAtomicDeps(dependecy_count_); + async_work_queue_.PrepareAtomicVarRef(vec_meta_info_); + op_run_number_ = 0; for (size_t i = 0; i < dependecy_count_.size(); ++i) { if (dependecy_count_[i] == 0) { - async_work_queue_.AddTask(vec_instr[i].type_, [&, i]() { - RunInstructionAsync(i, &atomic_deps, &atomic_var_ref, &op_run_number); - }); + async_work_queue_.AddTask(vec_instr[i].type_, + [&, i] { RunInstructionAsync(i); }); } } async_work_queue_.WaitEmpty(); PADDLE_ENFORCE_EQ( - op_run_number.load(), vec_instr.size(), + op_run_number_.load(), vec_instr.size(), platform::errors::Fatal( "Required op_run_number == %d, but received op_run_number = %d.", - vec_instr.size(), op_run_number.load())); + vec_instr.size(), op_run_number_.load())); } -void InterpreterCore::RunInstructionAsync(size_t instr_id, - AtomicVectorSizeT* atomic_deps, - AtomicVectorSizeT* atomic_var_ref, - std::atomic* op_run_number) { +void InterpreterCore::RunNextInstruction(const Instruction& instr) { + auto& next_instr = instr.next_instruction_; + auto& atomic_deps = async_work_queue_.AtomicDeps(); + auto IsReady = [&](size_t next_id) { + return atomic_deps[next_id]->fetch_sub(1, std::memory_order_relaxed) == 1; + }; + + if (instr.type_ == OpFuncType::kQueueAsync) { + // move all sync_ops into other threads + for (auto next_id : next_instr.synchronize_run_) { + if (IsReady(next_id)) { + async_work_queue_.AddTask( + vec_instruction_[next_id].type_, + [&, next_id] { RunInstructionAsync(next_id); }); + } + } + // keep all async_ops running in current thread + for (auto next_id : next_instr.direct_run_) { + if (IsReady(next_id)) { + RunInstructionAsync(next_id); + } + } + for (auto next_id : next_instr.event_wait_run_) { + if (IsReady(next_id)) { + RunInstructionAsync(next_id); + } + } + } else { + // move async_ops into async_thread + for (auto next_id : next_instr.event_wait_run_) { + if (IsReady(next_id)) { + async_work_queue_.AddTask( + 
vec_instruction_[next_id].type_, + [&, next_id] { RunInstructionAsync(next_id); }); + } + } + + for (size_t i = 0; i < next_instr.direct_run_.size(); ++i) { + auto next_id = next_instr.direct_run_[i]; + if (IsReady(next_id)) { + // only keep one op running in current thread + if (i == 0) { + RunInstructionAsync(next_id); + continue; + } + // move rest ops into other threads + async_work_queue_.AddTask( + vec_instruction_[next_id].type_, + [&, next_id] { RunInstructionAsync(next_id); }); + } + } + } +} + +void InterpreterCore::RunInstructionAsync(size_t instr_id) { auto& instr_node = vec_instruction_[instr_id]; platform::RecordEvent instruction_event( instr_node.kernel_func_.operator_base_->Type()); @@ -389,32 +437,22 @@ void InterpreterCore::RunInstructionAsync(size_t instr_id, RunInstruction(instr_node); event_manager_.RecordEvent(instr_node, place_); - op_run_number->fetch_add(1, std::memory_order_relaxed); + op_run_number_.fetch_add(1, std::memory_order_relaxed); - auto& next_instr = instr_node.next_instruction_.all_next_ops_; - - for (auto next_i : next_instr) { - // fetch_sub return value before applying sub - bool is_ready = - atomic_deps->at(next_i)->fetch_sub(1, std::memory_order_relaxed) == 1; - if (is_ready) { - async_work_queue_.AddTask(vec_instruction_[next_i].type_, [=]() { - RunInstructionAsync(next_i, atomic_deps, atomic_var_ref, op_run_number); - }); - } - } // GC infomation - CheckGC(instr_id, instr_node.gc_check_var_list, atomic_var_ref); + CheckGC(instr_id, instr_node.gc_check_var_list); + + RunNextInstruction(instr_node); } void InterpreterCore::CheckGC(size_t instr_id, - const std::vector& gc_check_list, - AtomicVectorSizeT* atomic_var_ref) { + const std::vector& gc_check_list) { auto& var_scope = *global_scope_; + auto& atomic_var_ref = async_work_queue_.AtomicVarRef(); for (auto var_id : gc_check_list) { - bool is_ready = atomic_var_ref->at(var_id)->fetch_sub( - 1, std::memory_order_relaxed) == 1; + bool is_ready = + atomic_var_ref[var_id]->fetch_sub(1, std::memory_order_relaxed) == 1; if (is_ready && var_scope.vec_meta_info_[var_id].vardesc_ && !var_scope.vec_meta_info_[var_id].vardesc_->Persistable()) { gc_.Add(var_scope.var_list[var_id], gc_event_[instr_id], diff --git a/paddle/fluid/framework/new_executor/interpretercore.h b/paddle/fluid/framework/new_executor/interpretercore.h index e594f9ca8b54b5..47f23aff4f00e7 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.h +++ b/paddle/fluid/framework/new_executor/interpretercore.h @@ -65,13 +65,10 @@ class InterpreterCore { void DryRunPrepare(const std::vector& feed_tensors); - void CheckGC(size_t instr_id, const std::vector& gc_check_list, - AtomicVectorSizeT* working_var_ref); + void CheckGC(size_t instr_id, const std::vector& gc_check_list); - void RunInstructionAsync(size_t instr_id, - AtomicVectorSizeT* working_dependecy_count, - AtomicVectorSizeT* working_var_ref, - std::atomic* op_run_number); + void RunInstructionAsync(size_t instr_id); + void RunNextInstruction(const Instruction& instr_id); void AddFetch(const std::vector& fetch_names); void BuildSkipShareLoDInfo(); @@ -101,6 +98,7 @@ class InterpreterCore { InterpreterCoreGarbageCollector gc_; std::vector gc_event_; + std::atomic op_run_number_{0}; }; } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/new_executor/interpretercore_util.cc b/paddle/fluid/framework/new_executor/interpretercore_util.cc index 16df5d794f4d44..3438fc3bd4dcd1 100644 --- a/paddle/fluid/framework/new_executor/interpretercore_util.cc +++ 
b/paddle/fluid/framework/new_executor/interpretercore_util.cc @@ -12,31 +12,40 @@ // See the License for the specific language governing permissions and // limitations under the License. #include "paddle/fluid/framework/new_executor/interpretercore_util.h" +#include + #include "paddle/fluid/framework/executor_gc_helper.h" namespace paddle { namespace framework { namespace interpretercore { -AtomicVectorSizeT AsyncWorkQueue::PrepareAtomicDeps( +AtomicVectorSizeT& AsyncWorkQueue::PrepareAtomicDeps( const std::vector& dependecy_count) { - AtomicVectorSizeT working_dependecy_count(dependecy_count.size()); + if (atomic_deps_.size() != dependecy_count.size()) { + atomic_deps_.clear(); + std::generate_n(std::back_inserter(atomic_deps_), dependecy_count.size(), + [] { return std::make_unique>(0); }); + } + for (size_t i = 0; i < dependecy_count.size(); ++i) { - working_dependecy_count[i] = - std::make_unique>(dependecy_count[i]); + atomic_deps_[i]->store(dependecy_count[i]); } - return working_dependecy_count; + return atomic_deps_; } -AtomicVectorSizeT AsyncWorkQueue::PrepareAtomicVarRef( +AtomicVectorSizeT& AsyncWorkQueue::PrepareAtomicVarRef( const std::vector& vec_meta_info) { - AtomicVectorSizeT working_var_ref(vec_meta_info.size()); + if (atomic_var_ref_.size() != vec_meta_info.size()) { + atomic_var_ref_.clear(); + std::generate_n(std::back_inserter(atomic_var_ref_), vec_meta_info.size(), + [] { return std::make_unique>(0); }); + } for (size_t i = 0; i < vec_meta_info.size(); ++i) { - working_var_ref[i] = - std::make_unique>(vec_meta_info[i].var_ref_count_); + atomic_var_ref_[i]->store(vec_meta_info[i].var_ref_count_); } - return working_var_ref; + return atomic_var_ref_; } bool var_can_be_deleted(const std::string& name, const BlockDesc& block) { diff --git a/paddle/fluid/framework/new_executor/interpretercore_util.h b/paddle/fluid/framework/new_executor/interpretercore_util.h index 259f1c615533d9..2a5942c7123651 100644 --- a/paddle/fluid/framework/new_executor/interpretercore_util.h +++ b/paddle/fluid/framework/new_executor/interpretercore_util.h @@ -66,9 +66,9 @@ class AsyncWorkQueue { queue_group_ = CreateWorkQueueGroup(group_options); } - AtomicVectorSizeT PrepareAtomicDeps( + AtomicVectorSizeT& PrepareAtomicDeps( const std::vector& dependecy_count); - AtomicVectorSizeT PrepareAtomicVarRef( + AtomicVectorSizeT& PrepareAtomicVarRef( const std::vector& vec_meta_info); void WaitEmpty() { queue_group_->WaitQueueGroupEmpty(); } @@ -77,9 +77,14 @@ class AsyncWorkQueue { queue_group_->AddTask(static_cast(op_func_type), std::move(fn)); } + AtomicVectorSizeT& AtomicDeps() { return atomic_deps_; } + AtomicVectorSizeT& AtomicVarRef() { return atomic_var_ref_; } + private: size_t host_num_thread_; std::unique_ptr queue_group_; + AtomicVectorSizeT atomic_deps_; + AtomicVectorSizeT atomic_var_ref_; }; std::string get_memcpy_type(const platform::Place& src_place, diff --git a/paddle/fluid/framework/new_executor/new_executor_defs.h b/paddle/fluid/framework/new_executor/new_executor_defs.h index 9c0444b3157cb1..19b7b6d5dc299f 100644 --- a/paddle/fluid/framework/new_executor/new_executor_defs.h +++ b/paddle/fluid/framework/new_executor/new_executor_defs.h @@ -477,15 +477,10 @@ struct VariableScope { std::vector vec_meta_info_; }; -struct EventRun { - explicit EventRun(size_t op_id) : op_id_(op_id) {} - size_t op_id_; -}; struct NextInstruction { std::vector direct_run_; - std::vector event_wait_run_; - std::vector synchronize_run_; - std::vector all_next_ops_; + std::vector event_wait_run_; + 
std::vector synchronize_run_; }; struct EventInter { From 6841d4d4a954eb85f30c411b23e4c40d2d4f10f5 Mon Sep 17 00:00:00 2001 From: zhangchunle Date: Mon, 27 Sep 2021 13:19:47 +0800 Subject: [PATCH 014/298] test=document_fix;paddle/testing nend run all cases (#36138) --- tools/get_pr_ut.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/get_pr_ut.py b/tools/get_pr_ut.py index bd67d68c131118..0ba60265353073 100644 --- a/tools/get_pr_ut.py +++ b/tools/get_pr_ut.py @@ -139,6 +139,7 @@ def get_is_white_file(self, filename): """ judge is white file in pr's files. """ isWhiteFile = False not_white_files = (PADDLE_ROOT + 'cmake/', PADDLE_ROOT + 'patches/', + PADDLE_ROOT + 'paddle/testing/', PADDLE_ROOT + 'tools/dockerfile/', PADDLE_ROOT + 'tools/windows/', PADDLE_ROOT + 'tools/test_runner.py', From 6d62769ad4b7bd78d08df479f16b74028c51ed05 Mon Sep 17 00:00:00 2001 From: Wenyu Date: Mon, 27 Sep 2021 13:37:53 +0800 Subject: [PATCH 015/298] Add roi pool (#35084) * add roi pool * rename input as x --- python/paddle/tests/test_ops_roi_pool.py | 109 ++++++++++++++++++++ python/paddle/vision/ops.py | 125 +++++++++++++++++++++++ 2 files changed, 234 insertions(+) create mode 100644 python/paddle/tests/test_ops_roi_pool.py diff --git a/python/paddle/tests/test_ops_roi_pool.py b/python/paddle/tests/test_ops_roi_pool.py new file mode 100644 index 00000000000000..3c84a55da1ea69 --- /dev/null +++ b/python/paddle/tests/test_ops_roi_pool.py @@ -0,0 +1,109 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import numpy as np + +import paddle +from paddle.vision.ops import roi_pool, RoIPool + + +class TestRoIPool(unittest.TestCase): + def setUp(self): + self.data = np.random.rand(1, 256, 32, 32).astype('float32') + boxes = np.random.rand(3, 4) + boxes[:, 2] += boxes[:, 0] + 3 + boxes[:, 3] += boxes[:, 1] + 4 + self.boxes = boxes.astype('float32') + self.boxes_num = np.array([3], dtype=np.int32) + + def roi_pool_functional(self, output_size): + + if isinstance(output_size, int): + output_shape = (3, 256, output_size, output_size) + else: + output_shape = (3, 256, output_size[0], output_size[1]) + + if paddle.in_dynamic_mode(): + data = paddle.to_tensor(self.data) + boxes = paddle.to_tensor(self.boxes) + boxes_num = paddle.to_tensor(self.boxes_num) + + pool_out = roi_pool( + data, boxes, boxes_num=boxes_num, output_size=output_size) + np.testing.assert_equal(pool_out.shape, output_shape) + + else: + data = paddle.static.data( + shape=self.data.shape, dtype=self.data.dtype, name='data') + boxes = paddle.static.data( + shape=self.boxes.shape, dtype=self.boxes.dtype, name='boxes') + boxes_num = paddle.static.data( + shape=self.boxes_num.shape, + dtype=self.boxes_num.dtype, + name='boxes_num') + + pool_out = roi_pool( + data, boxes, boxes_num=boxes_num, output_size=output_size) + + place = paddle.CPUPlace() + exe = paddle.static.Executor(place) + + pool_out = exe.run(paddle.static.default_main_program(), + feed={ + 'data': self.data, + 'boxes': self.boxes, + 'boxes_num': self.boxes_num + }, + fetch_list=[pool_out]) + + np.testing.assert_equal(pool_out[0].shape, output_shape) + + def test_roi_pool_functional_dynamic(self): + self.roi_pool_functional(3) + self.roi_pool_functional(output_size=(3, 4)) + + def test_roi_pool_functional_static(self): + paddle.enable_static() + self.roi_pool_functional(3) + paddle.disable_static() + + def test_RoIPool(self): + roi_pool_c = RoIPool(output_size=(4, 3)) + data = paddle.to_tensor(self.data) + boxes = paddle.to_tensor(self.boxes) + boxes_num = paddle.to_tensor(self.boxes_num) + + pool_out = roi_pool_c(data, boxes, boxes_num) + np.testing.assert_equal(pool_out.shape, (3, 256, 4, 3)) + + def test_value(self, ): + data = np.array([i for i in range(1, 17)]).reshape(1, 1, 4, + 4).astype(np.float32) + boxes = np.array( + [[1., 1., 2., 2.], [1.5, 1.5, 3., 3.]]).astype(np.float32) + boxes_num = np.array([2]).astype(np.int32) + output = np.array([[[[11.]]], [[[16.]]]], dtype=np.float32) + + data = paddle.to_tensor(data) + boxes = paddle.to_tensor(boxes) + boxes_num = paddle.to_tensor(boxes_num) + + roi_pool_c = RoIPool(output_size=1) + pool_out = roi_pool_c(data, boxes, boxes_num) + np.testing.assert_almost_equal(pool_out.numpy(), output) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/vision/ops.py b/python/paddle/vision/ops.py index 5f02b805a3ed31..84dcdfa4cfcc4f 100644 --- a/python/paddle/vision/ops.py +++ b/python/paddle/vision/ops.py @@ -30,6 +30,8 @@ 'DeformConv2D', 'read_file', 'decode_jpeg', + 'roi_pool', + 'RoIPool', 'psroi_pool', 'PSRoIPool', ] @@ -1013,3 +1015,126 @@ def __init__(self, output_size, spatial_scale=1.0): def forward(self, x, boxes, boxes_num): return psroi_pool(x, boxes, boxes_num, self.output_size, self.spatial_scale) + + +def roi_pool(x, boxes, boxes_num, output_size, spatial_scale=1.0, name=None): + """ + This operator implements the roi_pooling layer. 
+ Region of interest pooling (also known as RoI pooling) is to perform max pooling on inputs of nonuniform sizes to obtain fixed-size feature maps (e.g. 7*7). + The operator has three steps: 1. Dividing each region proposal into equal-sized sections with output_size(h, w) 2. Finding the largest value in each section 3. Copying these max values to the output buffer + For more information, please refer to https://stackoverflow.com/questions/43430056/what-is-roi-layer-in-fast-rcnn. + + Args: + x (Tensor): input feature, 4D-Tensor with the shape of [N,C,H,W], + where N is the batch size, C is the input channel, H is Height, W is weight. + The data type is float32 or float64. + boxes (Tensor): boxes (Regions of Interest) to pool over. + 2D-Tensor with the shape of [num_boxes,4]. + Given as [[x1, y1, x2, y2], ...], (x1, y1) is the top left coordinates, + and (x2, y2) is the bottom right coordinates. + boxes_num (Tensor): the number of RoIs in each image, data type is int32. Default: None + output_size (int or tuple[int, int]): the pooled output size(h, w), data type is int32. If int, h and w are both equal to output_size. + spatial_scale (float, optional): multiplicative spatial scale factor to translate ROI coords from their input scale to the scale used when pooling. Default: 1.0 + name(str, optional): for detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default. + + Returns: + pool_out (Tensor): the pooled feature, 4D-Tensor with the shape of [num_boxes, C, output_size[0], output_size[1]]. + + Examples: + .. code-block:: python + + import paddle + from paddle.vision.ops import roi_pool + + data = paddle.rand([1, 256, 32, 32]) + boxes = paddle.rand([3, 4]) + boxes[:, 2] += boxes[:, 0] + 3 + boxes[:, 3] += boxes[:, 1] + 4 + boxes_num = paddle.to_tensor([3]).astype('int32') + pool_out = roi_pool(data, boxes, boxes_num=boxes_num, output_size=3) + assert pool_out.shape == [3, 256, 3, 3], '' + """ + + check_type(output_size, 'output_size', (int, tuple), 'roi_pool') + if isinstance(output_size, int): + output_size = (output_size, output_size) + + pooled_height, pooled_width = output_size + if in_dygraph_mode(): + assert boxes_num is not None, "boxes_num should not be None in dygraph mode." + pool_out, argmaxes = core.ops.roi_pool( + x, boxes, boxes_num, "pooled_height", pooled_height, "pooled_width", + pooled_width, "spatial_scale", spatial_scale) + return pool_out + + else: + check_variable_and_dtype(x, 'x', ['float32'], 'roi_pool') + check_variable_and_dtype(boxes, 'boxes', ['float32'], 'roi_pool') + helper = LayerHelper('roi_pool', **locals()) + dtype = helper.input_dtype() + pool_out = helper.create_variable_for_type_inference(dtype) + argmaxes = helper.create_variable_for_type_inference(dtype='int32') + + inputs = { + "X": x, + "ROIs": boxes, + } + if boxes_num is not None: + inputs['RoisNum'] = boxes_num + helper.append_op( + type="roi_pool", + inputs=inputs, + outputs={"Out": pool_out, + "Argmax": argmaxes}, + attrs={ + "pooled_height": pooled_height, + "pooled_width": pooled_width, + "spatial_scale": spatial_scale + }) + return pool_out + + +class RoIPool(Layer): + """ + This interface is used to construct a callable object of the `RoIPool` class. Please + refer to :ref:`api_paddle_vision_ops_roi_pool`. + + Args: + output_size (int or tuple[int, int]): the pooled output size(h, w), data type is int32. If int, h and w are both equal to output_size. 
+ spatial_scale (float, optional): multiplicative spatial scale factor to translate ROI coords from their input scale to the scale used when pooling. Default: 1.0. + + Returns: + pool_out (Tensor): the pooled feature, 4D-Tensor with the shape of [num_boxes, C, output_size[0], output_size[1]]. + + Examples: + .. code-block:: python + + import paddle + from paddle.vision.ops import RoIPool + + data = paddle.rand([1, 256, 32, 32]) + boxes = paddle.rand([3, 4]) + boxes[:, 2] += boxes[:, 0] + 3 + boxes[:, 3] += boxes[:, 1] + 4 + boxes_num = paddle.to_tensor([3]).astype('int32') + roi_pool = RoIPool(output_size=(4, 3)) + pool_out = roi_pool(data, boxes, boxes_num) + assert pool_out.shape == [3, 256, 4, 3], '' + """ + + def __init__(self, output_size, spatial_scale=1.0): + super(RoIPool, self).__init__() + self._output_size = output_size + self._spatial_scale = spatial_scale + + def forward(self, x, boxes, boxes_num): + return roi_pool( + x=x, + boxes=boxes, + boxes_num=boxes_num, + output_size=self._output_size, + spatial_scale=self._spatial_scale) + + def extra_repr(self): + main_str = 'output_size={_output_size}, spatial_scale={_spatial_scale}' + return main_str.format(**self.__dict__) From ec2f68e85d413655d5774d03fb81c5ba13db54cd Mon Sep 17 00:00:00 2001 From: levi131 <83750468+levi131@users.noreply.github.com> Date: Mon, 27 Sep 2021 14:04:34 +0800 Subject: [PATCH 016/298] Add functional autograd API: jacobian (#35917) * init functional jacobian api * finish test with dtype float32 * add float64 test case * polish code * use atol=1e-5 with dtype float64 * fix for ci * set timeout for test_jacobian * polish API docstring * modify docstring --- python/paddle/autograd/__init__.py | 1 + python/paddle/autograd/functional.py | 185 +++++++++++++++ python/paddle/fluid/dygraph/base.py | 2 +- .../fluid/tests/unittests/CMakeLists.txt | 1 + .../tests/unittests/autograd/CMakeLists.txt | 9 + .../tests/unittests/autograd/test_jacobian.py | 224 ++++++++++++++++++ 6 files changed, 421 insertions(+), 1 deletion(-) create mode 100644 python/paddle/autograd/functional.py create mode 100644 python/paddle/fluid/tests/unittests/autograd/CMakeLists.txt create mode 100644 python/paddle/fluid/tests/unittests/autograd/test_jacobian.py diff --git a/python/paddle/autograd/__init__.py b/python/paddle/autograd/__init__.py index 89094357b35050..dfbb3cfb45f2be 100644 --- a/python/paddle/autograd/__init__.py +++ b/python/paddle/autograd/__init__.py @@ -18,5 +18,6 @@ from .py_layer import PyLayer, PyLayerContext # noqa: F401 from ..framework import set_grad_enabled # noqa: F401 from ..fluid.dygraph.base import no_grad_ as no_grad # noqa: F401 +from .functional import jacobian # noqa: F401 __all__ = ['backward', 'PyLayer', 'PyLayerContext'] diff --git a/python/paddle/autograd/functional.py b/python/paddle/autograd/functional.py new file mode 100644 index 00000000000000..c1b4dd9e3a2db8 --- /dev/null +++ b/python/paddle/autograd/functional.py @@ -0,0 +1,185 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.fluid import framework +import paddle + + +def _check_tensors(in_out_list, name): + assert in_out_list is not None, "{} should not be None".format(name) + + if isinstance(in_out_list, (list, tuple)): + assert len(in_out_list) > 0, "{} connot be empyt".format(name) + for each_var in in_out_list: + assert isinstance( + each_var, + paddle.Tensor), "Elements of {} must be paddle.Tensor".format( + name) + return in_out_list + else: + assert isinstance( + in_out_list, + paddle.Tensor), "{} must be Tensor or list of Tensor".format(name) + return [in_out_list] + + +def _stack_tensor_or_return_none(origin_list): + assert len(origin_list) > 0, "Can't not stack an empty list" + return paddle.stack( + origin_list, axis=0) if isinstance(origin_list[0], + paddle.Tensor) else None + + +@framework.dygraph_only +def jacobian(func, inputs, create_graph=False, allow_unused=False): + ''' + .. note:: + **This API is ONLY available in imperative mode.** + + This API computes the Jacobian matrix of `func` with respect to `inputs`. + + Parameters: + func (function): a Python function that takes a Tensor or a Tensor + list/tuple as inputs and returns a Tensor or a Tensor tuple. + inputs (Tensor|list(Tensor)|tuple(Tensor)): the input Tensor or + Tensor list/tuple of the function ``func``. + create_graph (bool, optional): whether to create the gradient graphs + of the computing process. When it is True, higher order derivatives + are supported to compute; when it is False, the gradient graphs of + the computing process would be discarded. Defaults to ``False``. + allow_unused (bool, optional): whether to raise error or return None if + some Tensors of `inputs` are unreachable in the graph. Error would + be raised if allow_unused=False, and None would be returned as + their gradients if allow_unused=True. Default False. + Returns: + Jacobian (Tensor or nested tuple of Tensors): if function ``func`` + takes a Tensor as inputs and returns a Tensor as outputs, Jacobian + will be a single Tensor containing the Jacobian matrix for the + linearized inputs and outputs. If one of the inputs and outputs is + a Tensor, and another is a Tensor list/tuple, then the Jacobian will + be a tuple of Tensors. If both of inputs and outputs are Tensor + list/tuple, then the Jacobian will be a tuple of tuple of Tensors + where ``Jacobian[i][j]`` will contain the Jacobian matrix of the + linearized ``i``th output and ``j``th input and will have same + dtype and device as the corresponding input. ``Jacobian[i][j]`` will + have as size ``m * n``, where ``m`` and ``n`` denote the numbers of + elements of ``i``th output and ``j``th input respectively. + + + Examples 1: + .. code-block:: python + + import paddle + + def func(x): + return paddle.matmul(x, x) + + x = paddle.ones(shape=[2, 2], dtype='float32') + x.stop_gradient = False + jacobian = paddle.autograd.jacobian(func, x) + print(jacobian) + # Tensor(shape=[4, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # [[2., 1., 1., 0.], + # [1., 2., 0., 1.], + # [1., 0., 2., 1.], + # [0., 1., 1., 2.]]) + + Examples 2: + .. 
code-block:: python + + import paddle + + def func(x, y): + return paddle.matmul(x, y) + + x = paddle.ones(shape=[2, 2], dtype='float32') + y = paddle.ones(shape=[2, 2], dtype='float32') * 2 + x.stop_gradient = False + y.stop_gradient = False + jacobian = paddle.autograd.jacobian(func, [x, y], create_graph=True) + print(jacobian) + # (Tensor(shape=[4, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=False, + # [[2., 2., 0., 0.], + # [2., 2., 0., 0.], + # [0., 0., 2., 2.], + # [0., 0., 2., 2.]]), + # Tensor(shape=[4, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=False, + # [[1., 0., 1., 0.], + # [0., 1., 0., 1.], + # [1., 0., 1., 0.], + # [0., 1., 0., 1.]])) + + Examples 3: + .. code-block:: python + + import paddle + + def func(x, y): + return paddle.matmul(x, y), x * x + + x = paddle.ones(shape=[2, 2], dtype='float32') + y = paddle.ones(shape=[2, 2], dtype='float32') * 2 + x.stop_gradient = False + y.stop_gradient = False + jacobian = paddle.autograd.jacobian(func, [x, y], allow_unused=True) + print(jacobian) + # ((Tensor(shape=[4, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # [[2., 2., 0., 0.], + # [2., 2., 0., 0.], + # [0., 0., 2., 2.], + # [0., 0., 2., 2.]]), + # Tensor(shape=[4, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # [[1., 0., 1., 0.], + # [0., 1., 0., 1.], + # [1., 0., 1., 0.], + # [0., 1., 0., 1.]])), + # (Tensor(shape=[4, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # [[2., 0., 0., 0.], + # [0., 2., 0., 0.], + # [0., 0., 2., 0.], + # [0., 0., 0., 2.]]), None)) + + ''' + inputs = _check_tensors(inputs, "inputs") + outputs = _check_tensors(func(*inputs), "outputs") + fin_size = len(inputs) + fout_size = len(outputs) + flat_outputs = tuple( + paddle.reshape( + output, shape=[-1]) for output in outputs) + jacobian = tuple() + for i, flat_output in enumerate(flat_outputs): + jac_i = list([] for _ in range(fin_size)) + for k in range(len(flat_output)): + row_k = paddle.grad( + flat_output[k], + inputs, + create_graph=create_graph, + retain_graph=True, + allow_unused=allow_unused) + for j in range(fin_size): + jac_i[j].append( + paddle.reshape( + row_k[j], shape=[-1]) + if isinstance(row_k[j], paddle.Tensor) else None) + jacobian += (tuple( + _stack_tensor_or_return_none(jac_i_j) for jac_i_j in jac_i), ) + if fin_size == 1 and fout_size == 1: + return jacobian[0][0] + elif fin_size == 1 and fout_size != 1: + return tuple(jacobian[i][0] for i in range(fout_size)) + elif fin_size != 1 and fout_size == 1: + return jacobian[0] + else: + return jacobian diff --git a/python/paddle/fluid/dygraph/base.py b/python/paddle/fluid/dygraph/base.py index c8e1370e44772f..18052fa7d4da85 100644 --- a/python/paddle/fluid/dygraph/base.py +++ b/python/paddle/fluid/dygraph/base.py @@ -414,7 +414,7 @@ def grad(outputs, no_grad_vars=None): ''' .. note:: - **This API is ONLY available in Dygraph mode.** + **This API is ONLY available in imperative mode.** This API computes the sum of gradients of `outputs` with respect to each `inputs` . 
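The unit test added later in this patch checks `paddle.autograd.jacobian` against a central-difference approximation (its `_compute_numerical_jacobian` helper). As a rough, NumPy-only sanity check of the same idea — `numerical_jacobian` below is a hypothetical helper name for illustration, not part of this patch — the sketch reproduces the 4x4 matrix shown in the docstring example for `func(x) = paddle.matmul(x, x)` at `x = paddle.ones([2, 2])`:

    import numpy as np

    def numerical_jacobian(f, x, delta=1e-6):
        # J[p, q] ~= d f(x).flat[p] / d x.flat[q], estimated with central differences.
        y0 = np.asarray(f(x))
        jac = np.zeros((y0.size, x.size))
        for q in range(x.size):
            orig = x.flat[q]
            x.flat[q] = orig + delta
            y_pos = np.asarray(f(x)).ravel()
            x.flat[q] = orig - delta
            y_neg = np.asarray(f(x)).ravel()
            x.flat[q] = orig
            jac[:, q] = (y_pos - y_neg) / (2.0 * delta)
        return jac

    x = np.ones((2, 2), dtype=np.float64)
    print(np.round(numerical_jacobian(lambda a: a @ a, x), 3))
    # approximately:
    # [[2. 1. 1. 0.]
    #  [1. 2. 0. 1.]
    #  [1. 0. 2. 1.]
    #  [0. 1. 1. 2.]]

Rows are indexed by the flattened output and columns by the flattened input, which is the same convention the `jacobian` API and its docstring example use.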
diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 4b887da8382576..129fbb9ac3328d 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -702,6 +702,7 @@ endif() add_subdirectory(sequence) add_subdirectory(dygraph_to_static) add_subdirectory(rnn) +add_subdirectory(autograd) if (NOT WIN32 OR NOT WITH_GPU) add_subdirectory(fft) diff --git a/python/paddle/fluid/tests/unittests/autograd/CMakeLists.txt b/python/paddle/fluid/tests/unittests/autograd/CMakeLists.txt new file mode 100644 index 00000000000000..7f7a232fcefa64 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/autograd/CMakeLists.txt @@ -0,0 +1,9 @@ +file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") +string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") +set(GC_ENVS FLAGS_eager_delete_tensor_gb=0.0) + +foreach(TEST_OP ${TEST_OPS}) + py_test_modules(${TEST_OP} MODULES ${TEST_OP} ENVS ${GC_ENVS}) +endforeach(TEST_OP) + +set_tests_properties(test_jacobian PROPERTIES TIMEOUT 20) diff --git a/python/paddle/fluid/tests/unittests/autograd/test_jacobian.py b/python/paddle/fluid/tests/unittests/autograd/test_jacobian.py new file mode 100644 index 00000000000000..640292a47114a1 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/autograd/test_jacobian.py @@ -0,0 +1,224 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np +import paddle +import paddle.compat as cpt +from paddle.autograd.functional import _check_tensors + + +def _product(t): + if isinstance(t, int): + return t + else: + return np.product(t) + + +def _get_item(t, idx): + assert isinstance(t, paddle.Tensor), "The first argument t must be Tensor." + assert isinstance(idx, + int), "The second argument idx must be an int number." + flat_t = paddle.reshape(t, [-1]) + return flat_t.__getitem__(idx) + + +def _set_item(t, idx, value): + assert isinstance(t, paddle.Tensor), "The first argument t must be Tensor." + assert isinstance(idx, + int), "The second argument idx must be an int number." 
+ flat_t = paddle.reshape(t, [-1]) + flat_t.__setitem__(idx, value) + return paddle.reshape(flat_t, t.shape) + + +def _compute_numerical_jacobian(func, xs, delta, np_dtype): + xs = _check_tensors(xs, "xs") + ys = _check_tensors(func(*xs), "ys") + fin_size = len(xs) + fout_size = len(ys) + jacobian = list([] for _ in range(fout_size)) + for i in range(fout_size): + jac_i = list([] for _ in range(fin_size)) + for j in range(fin_size): + jac_i[j] = np.zeros( + (_product(ys[i].shape), _product(xs[j].shape)), dtype=np_dtype) + jacobian[i] = jac_i + + for j in range(fin_size): + for q in range(_product(xs[j].shape)): + orig = _get_item(xs[j], q) + x_pos = orig + delta + xs[j] = _set_item(xs[j], q, x_pos) + ys_pos = _check_tensors(func(*xs), "ys_pos") + + x_neg = orig - delta + xs[j] = _set_item(xs[j], q, x_neg) + ys_neg = _check_tensors(func(*xs), "ys_neg") + + xs[j] = _set_item(xs[j], q, orig) + + for i in range(fout_size): + for p in range(_product(ys[i].shape)): + y_pos = _get_item(ys_pos[i], p) + y_neg = _get_item(ys_neg[i], p) + jacobian[i][j][p][q] = (y_pos - y_neg) / delta / 2. + return jacobian + + +class TestJacobian(unittest.TestCase): + @classmethod + def setUpClass(self): + self.shape = (4, 4) + self.dtype = 'float32' + self.np_dtype = np.float32 + self.numerical_delta = 1e-4 + self.rtol = 1e-3 + self.atol = 1e-3 + self.x = paddle.rand(shape=self.shape, dtype=self.dtype) + self.y = paddle.rand(shape=self.shape, dtype=self.dtype) + + def test_single_input_and_single_output(self): + def func(x): + return paddle.matmul(x, x) + + numerical_jacobian = _compute_numerical_jacobian( + func, self.x, self.numerical_delta, self.np_dtype) + self.x.stop_gradient = False + jacobian = paddle.autograd.jacobian(func, self.x) + assert np.allclose(jacobian.numpy(), numerical_jacobian[0][0], + self.rtol, self.atol) + + def test_single_input_and_multi_output(self): + def func(x): + return paddle.matmul(x, x), x * x + + numerical_jacobian = _compute_numerical_jacobian( + func, self.x, self.numerical_delta, self.np_dtype) + self.x.stop_gradient = False + jacobian = paddle.autograd.jacobian(func, self.x) + for i in range(len(jacobian)): + assert np.allclose(jacobian[i].numpy(), numerical_jacobian[i][0], + self.rtol, self.atol) + + def test_multi_input_and_single_output(self): + def func(x, y): + return paddle.matmul(x, y) + + numerical_jacobian = _compute_numerical_jacobian( + func, [self.x, self.y], self.numerical_delta, self.np_dtype) + self.x.stop_gradient = False + self.y.stop_gradient = False + jacobian = paddle.autograd.jacobian(func, [self.x, self.y]) + for j in range(len(jacobian)): + assert np.allclose(jacobian[j].numpy(), numerical_jacobian[0][j], + self.rtol, self.atol) + + def test_multi_input_and_multi_output(self): + def func(x, y): + return paddle.matmul(x, y), x * y + + numerical_jacobian = _compute_numerical_jacobian( + func, [self.x, self.y], self.numerical_delta, self.np_dtype) + self.x.stop_gradient = False + self.y.stop_gradient = False + jacobian = paddle.autograd.jacobian(func, [self.x, self.y]) + for i in range(len(jacobian)): + for j in range(len(jacobian[0])): + assert np.allclose(jacobian[i][j].numpy(), + numerical_jacobian[i][j], self.rtol, + self.atol) + + def test_allow_unused_false(self): + def func(x, y): + return paddle.matmul(x, x) + + try: + self.x.stop_gradient = False + self.y.stop_gradient = False + jacobian = paddle.autograd.jacobian(func, [self.x, self.y]) + except ValueError as e: + error_msg = cpt.get_exception_message(e) + assert error_msg.find("allow_unused") > 0 
+ + def test_allow_unused_true(self): + def func(x, y): + return paddle.matmul(x, x) + + numerical_jacobian = _compute_numerical_jacobian( + func, [self.x, self.y], self.numerical_delta, self.np_dtype) + self.x.stop_gradient = False + self.y.stop_gradient = False + jacobian = paddle.autograd.jacobian( + func, [self.x, self.y], allow_unused=True) + assert np.allclose(jacobian[0].numpy(), numerical_jacobian[0][0], + self.rtol, self.atol) + assert jacobian[1] is None + + def test_create_graph_false(self): + def func(x, y): + return paddle.matmul(x, y) + + numerical_jacobian = _compute_numerical_jacobian( + func, [self.x, self.y], self.numerical_delta, self.np_dtype) + self.x.stop_gradient = False + self.y.stop_gradient = False + jacobian = paddle.autograd.jacobian(func, [self.x, self.y]) + for j in range(len(jacobian)): + assert jacobian[j].stop_gradient == True + assert np.allclose(jacobian[j].numpy(), numerical_jacobian[0][j], + self.rtol, self.atol) + try: + paddle.grad(jacobian[0], [self.x, self.y]) + except RuntimeError as e: + error_msg = cpt.get_exception_message(e) + assert error_msg.find("has no gradient") > 0 + + def test_create_graph_true(self): + def func(x, y): + return paddle.matmul(x, y) + + numerical_jacobian = _compute_numerical_jacobian( + func, [self.x, self.y], self.numerical_delta, self.np_dtype) + self.x.stop_gradient = False + self.y.stop_gradient = False + jacobian = paddle.autograd.jacobian( + func, [self.x, self.y], create_graph=True) + for j in range(len(jacobian)): + assert jacobian[j].stop_gradient == False + assert np.allclose(jacobian[j].numpy(), numerical_jacobian[0][j], + self.rtol, self.atol) + double_grad = paddle.grad(jacobian[0], [self.x, self.y]) + assert double_grad is not None + + +class TestJacobianFloat64(TestJacobian): + @classmethod + def setUpClass(self): + self.shape = (4, 4) + self.dtype = 'float64' + self.np_dtype = np.float64 + self.numerical_delta = 1e-7 + self.rtol = 1e-7 + self.atol = 1e-7 + self.x = paddle.rand(shape=self.shape, dtype=self.dtype) + self.y = paddle.rand(shape=self.shape, dtype=self.dtype) + + # NOTE(levi): skip this test case temporaryly. 
+ def test_create_graph_true(self): + pass + + +if __name__ == "__main__": + unittest.main() From e427a0f1c1e1f815b42fc3d43b697ae868b8b23f Mon Sep 17 00:00:00 2001 From: jakpiase <62569058+jakpiase@users.noreply.github.com> Date: Mon, 27 Sep 2021 12:09:05 +0200 Subject: [PATCH 017/298] Added flatten and flatten2 BF16/FP32 FWD/BWD kernels (#35892) * refactored reshape multiop kernel and added flatten1/2 kernels * added formatting for flatten tests * CI fix * disabled reshape_kernel ops after succesful CI run * minor fix --- paddle/fluid/operators/flatten_op.cc | 65 +++- .../operators/mkldnn/reshape_mkldnn_op.cc | 311 +++++++++++++----- paddle/fluid/operators/reshape_op.cc | 42 +-- paddle/fluid/operators/squeeze_op.cc | 56 ++-- .../mkldnn/test_flatten_mkldnn_op.py | 151 +++++++++ 5 files changed, 491 insertions(+), 134 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/mkldnn/test_flatten_mkldnn_op.py diff --git a/paddle/fluid/operators/flatten_op.cc b/paddle/fluid/operators/flatten_op.cc index 0858a43838b964..14f2e9061b742f 100644 --- a/paddle/fluid/operators/flatten_op.cc +++ b/paddle/fluid/operators/flatten_op.cc @@ -77,9 +77,17 @@ class FlattenOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - return framework::OpKernelType( - OperatorWithKernel::IndicateVarDataType(ctx, "X"), - ctx.device_context()); + auto input_data_type = + framework::OperatorWithKernel::IndicateVarDataType(ctx, "X"); + + //#ifdef PADDLE_WITH_MKLDNN + // if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { + // return framework::OpKernelType(input_data_type, ctx.GetPlace(), + // framework::DataLayout::kMKLDNN, + // framework::LibraryType::kMKLDNN); + // } + //#endif + return framework::OpKernelType(input_data_type, ctx.GetPlace()); } }; @@ -101,6 +109,14 @@ class FlattenOpMaker : public framework::OpProtoAndCheckerMaker { "tensor is (1, (d_0 X d_1 ... d_n), where the shape of the" "input tensor is (d_0, d_1, ... d_n).") .SetDefault(1); + AddAttr("use_mkldnn", + "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); + AddAttr( + "mkldnn_data_type", + "(string, default \"float32\"). 
Data type of mkldnn kernel") + .SetDefault("float32") + .InEnum({"float32", "bfloat16"}); AddComment(R"DOC( Flatten Operator @@ -139,9 +155,17 @@ class FlattenGradOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType( - ctx, framework::GradVarName("Out")), - ctx.device_context()); + auto input_data_type = framework::OperatorWithKernel::IndicateVarDataType( + ctx, framework::GradVarName("Out")); + + //#ifdef PADDLE_WITH_MKLDNN + // if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { + // return framework::OpKernelType(input_data_type, ctx.GetPlace(), + // framework::DataLayout::kMKLDNN, + // framework::LibraryType::kMKLDNN); + // } + //#endif + return framework::OpKernelType(input_data_type, ctx.GetPlace()); } }; @@ -198,6 +222,21 @@ class Flatten2Op : public framework::OperatorWithKernel { ctx->SetOutputDim("XShape", framework::make_ddim(xshape_dims)); ctx->ShareLoD("X", "XShape"); } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + auto input_data_type = + framework::OperatorWithKernel::IndicateVarDataType(ctx, "X"); + + //#ifdef PADDLE_WITH_MKLDNN + // if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { + // return framework::OpKernelType(input_data_type, ctx.GetPlace(), + // framework::DataLayout::kMKLDNN, + // framework::LibraryType::kMKLDNN); + // } + //#endif + return framework::OpKernelType(input_data_type, ctx.GetPlace()); + } }; class Flatten2OpMaker : public FlattenOpMaker { @@ -244,9 +283,17 @@ class Flatten2GradOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType( - ctx, framework::GradVarName("Out")), - ctx.device_context()); + auto input_data_type = framework::OperatorWithKernel::IndicateVarDataType( + ctx, framework::GradVarName("Out")); + + //#ifdef PADDLE_WITH_MKLDNN + // if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { + // return framework::OpKernelType(input_data_type, ctx.GetPlace(), + // framework::DataLayout::kMKLDNN, + // framework::LibraryType::kMKLDNN); + // } + //#endif + return framework::OpKernelType(input_data_type, ctx.GetPlace()); } }; diff --git a/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc index e6a7f3e74fcc7a..6c3f4ec06201a1 100644 --- a/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc @@ -12,9 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include "paddle/fluid/operators/flatten_op.h" #include "paddle/fluid/operators/squeeze_op.h" #include "paddle/fluid/platform/mkldnn_reuse.h" +namespace { +enum class ReshapeKernelOpName { + reshape, + reshape2, + squeeze, + squeeze2, + flatten, + flatten2, +}; +} // anonymous namespace + namespace paddle { namespace operators { @@ -41,7 +53,7 @@ static std::vector extract_shape( return vec_new_shape; } -template +template class ReshapeMKLDNNKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -55,43 +67,13 @@ class ReshapeMKLDNNKernel : public framework::OpKernel { const auto& onednn_engine = dev_ctx.GetEngine(); auto* x = ctx.Input("X"); - auto* xshape = ctx.Output("XShape"); auto* out = ctx.Output("Out"); - framework::DDim x_dims; - // if reshape or squeeze - if (ctx.Type().find("2") == std::string::npos) { - x_dims = x->dims(); - } else { - auto xshape_dims = xshape->dims(); - x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size()); - } + framework::DDim x_dims, out_dims; + InferInOutShape(ctx, x_dims, out_dims); auto x_vec_dims = framework::vectorize(x_dims); - framework::DDim out_dims; - if (ctx.Type() == "squeeze") { - auto& axes = ctx.Attr>("axes"); - out_dims = GetOutputShape(axes, x_dims, true); - } else { - out_dims = out->dims(); - } - - if (ctx.Type().find("reshape") != std::string::npos) { - auto list_new_shape_tensor = ctx.MultiInput("ShapeTensor"); - if (list_new_shape_tensor.size() > 0) { - auto new_shape = extract_shape(list_new_shape_tensor); - out_dims = ValidateShape(new_shape, x_dims); - } else if (ctx.HasInput("Shape")) { - auto* shape_tensor = ctx.Input("Shape"); - auto* shape_data = shape_tensor->data(); - - auto shape = - std::vector(shape_data, shape_data + shape_tensor->numel()); - out_dims = ValidateShape(shape, x_dims); - } - } - mkldnn::memory::data_type x_type = framework::ToMKLDNNDataType(x->type()); platform::ReorderMKLDNNHandler reorder_handler(x_vec_dims, x->type(), x_type, onednn_engine); @@ -116,6 +98,104 @@ class ReshapeMKLDNNKernel : public framework::OpKernel { framework::vectorize(out_dims)))); } + void InferInOutShape(const framework::ExecutionContext& ctx, + framework::DDim& x_dims, + framework::DDim& out_dims) const { + switch (op_name) { + case ReshapeKernelOpName::reshape: + InferShapeReshapeOp(ctx, x_dims, out_dims); + break; + case ReshapeKernelOpName::reshape2: + InferShapeReshape2Op(ctx, x_dims, out_dims); + break; + case ReshapeKernelOpName::squeeze: + InferShapeSqueezeOp(ctx, x_dims, out_dims); + break; + case ReshapeKernelOpName::squeeze2: + InferShapeSqueeze2Op(ctx, x_dims, out_dims); + break; + case ReshapeKernelOpName::flatten: + InferShapeFlattenOp(ctx, x_dims, out_dims); + break; + case ReshapeKernelOpName::flatten2: + InferShapeFlattenOp(ctx, x_dims, out_dims); + break; + default: + PADDLE_THROW(paddle::platform::errors::OutOfRange( + "Reshape kernel doesn not support that operator name")); + } + } + + void InferShapeReshapeOp(const framework::ExecutionContext& ctx, + framework::DDim& x_dims, + framework::DDim& out_dims) const { + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); + x_dims = x->dims(); + out_dims = out->dims(); + ChangeReshapeOutDimsIfNeeded(ctx, x_dims, out_dims); + } + + void InferShapeReshape2Op(const framework::ExecutionContext& ctx, + framework::DDim& x_dims, + framework::DDim& out_dims) const { + auto* out = ctx.Output("Out"); + auto* xshape = ctx.Output("XShape"); + auto xshape_dims = xshape->dims(); + x_dims = 
framework::slice_ddim(xshape_dims, 1, xshape_dims.size()); + out_dims = out->dims(); + ChangeReshapeOutDimsIfNeeded(ctx, x_dims, out_dims); + } + + // in reshape1/2 ops "ShapeTensor" has highest priority and "Shape" has + // second highest priority + void ChangeReshapeOutDimsIfNeeded(const framework::ExecutionContext& ctx, + framework::DDim& x_dims, + framework::DDim& out_dims) const { + auto list_new_shape_tensor = ctx.MultiInput("ShapeTensor"); + if (list_new_shape_tensor.size() > 0) { + auto new_shape = extract_shape(list_new_shape_tensor); + out_dims = ValidateShape(new_shape, x_dims); + } else if (ctx.HasInput("Shape")) { + auto* shape_tensor = ctx.Input("Shape"); + auto* shape_data = shape_tensor->data(); + + auto shape = + std::vector(shape_data, shape_data + shape_tensor->numel()); + out_dims = ValidateShape(shape, x_dims); + } + } + + void InferShapeSqueezeOp(const framework::ExecutionContext& ctx, + framework::DDim& x_dims, + framework::DDim& out_dims) const { + auto* x = ctx.Input("X"); + x_dims = x->dims(); + const auto& axes = ctx.Attr>("axes"); + out_dims = GetOutputShape(axes, x_dims, true); + } + + void InferShapeSqueeze2Op(const framework::ExecutionContext& ctx, + framework::DDim& x_dims, + framework::DDim& out_dims) const { + auto* out = ctx.Output("Out"); + auto* xshape = ctx.Output("XShape"); + auto xshape_dims = xshape->dims(); + x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size()); + out_dims = out->dims(); + } + + void InferShapeFlattenOp(const framework::ExecutionContext& ctx, + framework::DDim& x_dims, + framework::DDim& out_dims) const { + auto x = ctx.Input("X"); + x_dims = x->dims(); + auto axes = ctx.Attr("axis"); + out_dims = framework::make_ddim( + FlattenKernel::GetOutputShape( + axes, x_dims)); + } + protected: static mkldnn::memory::format_tag getPlainFormatTag(const Tensor* tensor) { auto tensor_dims_size = tensor->dims().size(); @@ -223,8 +303,8 @@ class ReshapeMKLDNNKernel : public framework::OpKernel { } }; -template -class ReshapeGradMKLDNNKernel : public ReshapeMKLDNNKernel { +template +class ReshapeGradMKLDNNKernel : public ReshapeMKLDNNKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { RunKernel(ctx); @@ -239,14 +319,9 @@ class ReshapeGradMKLDNNKernel : public ReshapeMKLDNNKernel { auto* dout = ctx.Input(framework::GradVarName("Out")); auto* dx = ctx.Output(framework::GradVarName("X")); - framework::DDim x_dims; - // if reshape or squeeze - if (ctx.Type().find("2") == std::string::npos) { - x_dims = dx->dims(); - } else { - auto xshape_dims = ctx.Input("XShape")->dims(); - x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size()); - } + framework::DDim dx_dims; + InferOutputShapeInGrad(ctx, dx_dims); + auto dout_vec_dims = framework::vectorize(dout->dims()); mkldnn::memory::data_type dout_type = @@ -265,44 +340,128 @@ class ReshapeGradMKLDNNKernel : public ReshapeMKLDNNKernel { reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p); astream.wait(); - dx->Resize(x_dims); + dx->Resize(dx_dims); dx->set_layout(framework::DataLayout::kMKLDNN); dx->set_format(GetMKLDNNFormat(reorder_dst_memory_p->get_desc().reshape( - framework::vectorize(x_dims)))); + framework::vectorize(dx_dims)))); } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_KERNEL(squeeze, MKLDNN, paddle::platform::CPUPlace, - ops::ReshapeMKLDNNKernel, - ops::ReshapeMKLDNNKernel); - -REGISTER_OP_KERNEL(squeeze_grad, MKLDNN, paddle::platform::CPUPlace, - 
ops::ReshapeGradMKLDNNKernel, - ops::ReshapeGradMKLDNNKernel); -REGISTER_OP_KERNEL(squeeze2, MKLDNN, paddle::platform::CPUPlace, - ops::ReshapeMKLDNNKernel, - ops::ReshapeMKLDNNKernel); - -REGISTER_OP_KERNEL(squeeze2_grad, MKLDNN, paddle::platform::CPUPlace, - ops::ReshapeGradMKLDNNKernel, - ops::ReshapeGradMKLDNNKernel); + void InferOutputShapeInGrad(const framework::ExecutionContext& ctx, + framework::DDim& x_dims) const { + switch (op_name) { + case ReshapeKernelOpName::reshape: + InferShapeReshapeSqueezeGradOp(ctx, x_dims); + break; + case ReshapeKernelOpName::reshape2: + InferShapeReshape2Squeeze2Flatten2GradOp(ctx, x_dims); + break; + case ReshapeKernelOpName::squeeze: + InferShapeReshapeSqueezeGradOp(ctx, x_dims); + break; + case ReshapeKernelOpName::squeeze2: + InferShapeReshape2Squeeze2Flatten2GradOp(ctx, x_dims); + break; + case ReshapeKernelOpName::flatten: + InferShapeFlattenGradOp(ctx, x_dims); + break; + case ReshapeKernelOpName::flatten2: + InferShapeReshape2Squeeze2Flatten2GradOp(ctx, x_dims); + break; + default: + PADDLE_THROW(paddle::platform::errors::OutOfRange( + "Reshape grad kernel doesn not support that operator name")); + } + } -REGISTER_OP_KERNEL(reshape, MKLDNN, paddle::platform::CPUPlace, - ops::ReshapeMKLDNNKernel, - ops::ReshapeMKLDNNKernel); + void InferShapeReshapeSqueezeGradOp(const framework::ExecutionContext& ctx, + framework::DDim& dx_dims) const { + auto* dx = ctx.Output(framework::GradVarName("X")); + dx_dims = dx->dims(); + } -REGISTER_OP_KERNEL(reshape_grad, MKLDNN, paddle::platform::CPUPlace, - ops::ReshapeGradMKLDNNKernel, - ops::ReshapeGradMKLDNNKernel); + void InferShapeReshape2Squeeze2Flatten2GradOp( + const framework::ExecutionContext& ctx, framework::DDim& dx_dims) const { + auto xshape_dims = ctx.Input("XShape")->dims(); + dx_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size()); + } -REGISTER_OP_KERNEL(reshape2, MKLDNN, paddle::platform::CPUPlace, - ops::ReshapeMKLDNNKernel, - ops::ReshapeMKLDNNKernel); + void InferShapeFlattenGradOp(const framework::ExecutionContext& ctx, + framework::DDim& dx_dims) const { + dx_dims = ctx.Input("X")->dims(); + } +}; +} // namespace operators +} // namespace paddle -REGISTER_OP_KERNEL(reshape2_grad, MKLDNN, paddle::platform::CPUPlace, - ops::ReshapeGradMKLDNNKernel, - ops::ReshapeGradMKLDNNKernel); +namespace ops = paddle::operators; +REGISTER_OP_KERNEL( + squeeze, MKLDNN, paddle::platform::CPUPlace, + ops::ReshapeMKLDNNKernel, + ops::ReshapeMKLDNNKernel); + +REGISTER_OP_KERNEL( + squeeze_grad, MKLDNN, paddle::platform::CPUPlace, + ops::ReshapeGradMKLDNNKernel, + ops::ReshapeGradMKLDNNKernel); + +REGISTER_OP_KERNEL( + squeeze2, MKLDNN, paddle::platform::CPUPlace, + ops::ReshapeMKLDNNKernel, + ops::ReshapeMKLDNNKernel); + +REGISTER_OP_KERNEL( + squeeze2_grad, MKLDNN, paddle::platform::CPUPlace, + ops::ReshapeGradMKLDNNKernel, + ops::ReshapeGradMKLDNNKernel); + +REGISTER_OP_KERNEL( + reshape, MKLDNN, paddle::platform::CPUPlace, + ops::ReshapeMKLDNNKernel, + ops::ReshapeMKLDNNKernel); + +REGISTER_OP_KERNEL( + reshape_grad, MKLDNN, paddle::platform::CPUPlace, + ops::ReshapeGradMKLDNNKernel, + ops::ReshapeGradMKLDNNKernel); + +REGISTER_OP_KERNEL( + reshape2, MKLDNN, paddle::platform::CPUPlace, + ops::ReshapeMKLDNNKernel, + ops::ReshapeMKLDNNKernel); + +REGISTER_OP_KERNEL( + reshape2_grad, MKLDNN, paddle::platform::CPUPlace, + ops::ReshapeGradMKLDNNKernel, + ops::ReshapeGradMKLDNNKernel); + +REGISTER_OP_KERNEL( + flatten, MKLDNN, paddle::platform::CPUPlace, + ops::ReshapeMKLDNNKernel, + 
ops::ReshapeMKLDNNKernel); + +REGISTER_OP_KERNEL( + flatten_grad, MKLDNN, paddle::platform::CPUPlace, + ops::ReshapeGradMKLDNNKernel, + ops::ReshapeGradMKLDNNKernel); + +REGISTER_OP_KERNEL( + flatten2, MKLDNN, paddle::platform::CPUPlace, + ops::ReshapeMKLDNNKernel, + ops::ReshapeMKLDNNKernel); + +REGISTER_OP_KERNEL( + flatten2_grad, MKLDNN, paddle::platform::CPUPlace, + ops::ReshapeGradMKLDNNKernel, + ops::ReshapeGradMKLDNNKernel); diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index c74f0f0e499b44..6f244b1a4cb8fe 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -248,13 +248,13 @@ class ReshapeOp : public framework::OperatorWithKernel { auto input_data_type = framework::OperatorWithKernel::IndicateVarDataType(ctx, "X"); -#ifdef PADDLE_WITH_MKLDNN -// if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { -// return framework::OpKernelType(input_data_type, ctx.GetPlace(), -// framework::DataLayout::kMKLDNN, -// framework::LibraryType::kMKLDNN); -// } -#endif + //#ifdef PADDLE_WITH_MKLDNN + // if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { + // return framework::OpKernelType(input_data_type, ctx.GetPlace(), + // framework::DataLayout::kMKLDNN, + // framework::LibraryType::kMKLDNN); + // } + //#endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } @@ -366,13 +366,13 @@ class ReshapeGradOp : public framework::OperatorWithKernel { auto input_data_type = framework::OperatorWithKernel::IndicateVarDataType(ctx, "X"); -#ifdef PADDLE_WITH_MKLDNN -// if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { -// return framework::OpKernelType(input_data_type, ctx.GetPlace(), -// framework::DataLayout::kMKLDNN, -// framework::LibraryType::kMKLDNN); -// } -#endif + //#ifdef PADDLE_WITH_MKLDNN + // if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { + // return framework::OpKernelType(input_data_type, ctx.GetPlace(), + // framework::DataLayout::kMKLDNN, + // framework::LibraryType::kMKLDNN); + // } + //#endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } }; @@ -557,13 +557,13 @@ class Reshape2GradOp : public framework::OperatorWithKernel { auto input_data_type = framework::OperatorWithKernel::IndicateVarDataType( ctx, framework::GradVarName("Out")); -#ifdef PADDLE_WITH_MKLDNN -// if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { -// return framework::OpKernelType(input_data_type, ctx.GetPlace(), -// framework::DataLayout::kMKLDNN, -// framework::LibraryType::kMKLDNN); -// } -#endif + //#ifdef PADDLE_WITH_MKLDNN + // if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { + // return framework::OpKernelType(input_data_type, ctx.GetPlace(), + // framework::DataLayout::kMKLDNN, + // framework::LibraryType::kMKLDNN); + // } + //#endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } diff --git a/paddle/fluid/operators/squeeze_op.cc b/paddle/fluid/operators/squeeze_op.cc index 8894ca650de034..de30eab25f3cf2 100644 --- a/paddle/fluid/operators/squeeze_op.cc +++ b/paddle/fluid/operators/squeeze_op.cc @@ -113,13 +113,13 @@ class SqueezeOp : public framework::OperatorWithKernel { auto input_data_type = framework::OperatorWithKernel::IndicateVarDataType(ctx, "X"); -#ifdef PADDLE_WITH_MKLDNN -// if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { -// return framework::OpKernelType(input_data_type, ctx.GetPlace(), -// framework::DataLayout::kMKLDNN, -// framework::LibraryType::kMKLDNN); -// } -#endif + //#ifdef PADDLE_WITH_MKLDNN + // if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { 
+ // return framework::OpKernelType(input_data_type, ctx.GetPlace(), + // framework::DataLayout::kMKLDNN, + // framework::LibraryType::kMKLDNN); + // } + //#endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } }; @@ -140,13 +140,13 @@ class SqueezeGradOp : public framework::OperatorWithKernel { auto input_data_type = framework::OperatorWithKernel::IndicateVarDataType( ctx, framework::GradVarName("Out")); -#ifdef PADDLE_WITH_MKLDNN -// if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { -// return framework::OpKernelType(input_data_type, ctx.GetPlace(), -// framework::DataLayout::kMKLDNN, -// framework::LibraryType::kMKLDNN); -// } -#endif + //#ifdef PADDLE_WITH_MKLDNN + // if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { + // return framework::OpKernelType(input_data_type, ctx.GetPlace(), + // framework::DataLayout::kMKLDNN, + // framework::LibraryType::kMKLDNN); + // } + //#endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } }; @@ -241,13 +241,13 @@ class Squeeze2Op : public framework::OperatorWithKernel { auto input_data_type = framework::OperatorWithKernel::IndicateVarDataType(ctx, "X"); -#ifdef PADDLE_WITH_MKLDNN -// if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { -// return framework::OpKernelType(input_data_type, ctx.GetPlace(), -// framework::DataLayout::kMKLDNN, -// framework::LibraryType::kMKLDNN); -// } -#endif + //#ifdef PADDLE_WITH_MKLDNN + // if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { + // return framework::OpKernelType(input_data_type, ctx.GetPlace(), + // framework::DataLayout::kMKLDNN, + // framework::LibraryType::kMKLDNN); + // } + //#endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } }; @@ -287,13 +287,13 @@ class Squeeze2GradOp : public framework::OperatorWithKernel { auto input_data_type = framework::OperatorWithKernel::IndicateVarDataType( ctx, framework::GradVarName("Out")); -#ifdef PADDLE_WITH_MKLDNN -// if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { -// return framework::OpKernelType(input_data_type, ctx.GetPlace(), -// framework::DataLayout::kMKLDNN, -// framework::LibraryType::kMKLDNN); -// } -#endif + //#ifdef PADDLE_WITH_MKLDNN + // if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { + // return framework::OpKernelType(input_data_type, ctx.GetPlace(), + // framework::DataLayout::kMKLDNN, + // framework::LibraryType::kMKLDNN); + // } + //#endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } }; diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_flatten_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_flatten_mkldnn_op.py new file mode 100644 index 00000000000000..c01f244004effb --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_flatten_mkldnn_op.py @@ -0,0 +1,151 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import numpy as np +import paddle +import paddle.fluid.core as core + +from paddle.fluid.tests.unittests.op_test import OpTest, OpTestTool, convert_float_to_uint16 + + +@OpTestTool.skip_if_not_cpu_bf16() +class TestFlattenOneDNNOp(OpTest): + def setUp(self): + self.set_op_type() + self.init_test_case() + self.set_inputs() + self.attrs = {"axis": self.axis, 'use_mkldnn': True} + self.ori_shape = self.inputs['X'].shape + self.outputs = {"Out": self.inputs["X"].copy().reshape(self.new_shape)} + + def set_inputs(self): + self.inputs = {"X": np.random.random(self.in_shape).astype("float32")} + + def set_op_type(self): + self.op_type = "flatten" + + def test_check_output(self): + self.check_output_with_place(core.CPUPlace()) + + def test_check_grad(self): + self.check_grad_with_place(core.CPUPlace(), ["X"], "Out") + + def init_test_case(self): + self.in_shape = (3, 2, 2, 10) + self.axis = 1 + self.new_shape = (3, 40) + + +class TestFlattenOneDNNOp1(TestFlattenOneDNNOp): + def init_test_case(self): + self.in_shape = (3, 2, 2, 10) + self.axis = 0 + self.new_shape = (1, 120) + + +class TestFlattenOneDNNOpSixDims(TestFlattenOneDNNOp): + def init_test_case(self): + self.in_shape = (3, 2, 3, 2, 4, 4) + self.axis = 4 + self.new_shape = (36, 16) + + +class TestFlatten2OneDNNOp(TestFlattenOneDNNOp): + def set_op_type(self): + self.op_type = "flatten2" + + +class TestFlatten2OneDNNOp1(TestFlattenOneDNNOp1): + def set_op_type(self): + self.op_type = "flatten2" + + +class TestFlatten2OneDNNOpSixDims(TestFlattenOneDNNOpSixDims): + def set_op_type(self): + self.op_type = "flatten2" + + +# BF16 TESTS +def create_flatten_bf16_test_classes(parent): + class TestFlatten2BF16OneDNNOp(parent): + def set_inputs(self): + self.dtype = np.uint16 + self.inputs = { + "X": np.random.random(self.in_shape).astype("uint16") + } + + def calculate_grads(self): + self.dout = self.outputs['Out'] + self.dx = np.reshape(self.dout, self.ori_shape) + + def test_check_output(self): + self.check_output_with_place( + core.CPUPlace(), no_check_set=["XShape"]) + + def test_check_grad(self): + self.calculate_grads() + self.check_grad_with_place( + core.CPUPlace(), ["X"], + "Out", + user_defined_grads=[self.dx], + user_defined_grad_outputs=[self.dout]) + + cls_name = "{0}_{1}".format(parent.__name__, "Flatten2_BF16") + TestFlatten2BF16OneDNNOp.__name__ = cls_name + globals()[cls_name] = TestFlatten2BF16OneDNNOp + + class TestFlattenBF16OneDNNOp(parent): + def set_op_type(self): + self.dtype = np.uint16 + self.op_type = "flatten" + + def set_inputs(self): + self.dtype = np.uint16 + self.inputs = { + "X": np.random.random(self.in_shape).astype("uint16") + } + + def set_outputs(self): + self.outputs = {"Out": self.x.reshape(self.new_shape)} + + def calculate_grads(self): + self.dout = self.outputs['Out'] + self.dx = np.reshape(self.dout, self.ori_shape) + + def test_check_output(self): + self.check_output_with_place(core.CPUPlace()) + + def test_check_grad(self): + self.calculate_grads() + self.check_grad_with_place( + core.CPUPlace(), ["X"], + "Out", + user_defined_grads=[self.dx], + user_defined_grad_outputs=[convert_float_to_uint16(self.dout)]) + + cls_name = "{0}_{1}".format(parent.__name__, "Flatten_BF16") + TestFlattenBF16OneDNNOp.__name__ = cls_name + globals()[cls_name] = TestFlattenBF16OneDNNOp + + +create_flatten_bf16_test_classes(TestFlatten2OneDNNOp) +create_flatten_bf16_test_classes(TestFlatten2OneDNNOp1) +create_flatten_bf16_test_classes(TestFlatten2OneDNNOpSixDims) + 
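
The expected new_shape values in these tests follow the flatten rule that the oneDNN kernel reuses through FlattenKernel::GetOutputShape: every dimension in front of axis is folded into the first output dimension and the remaining dimensions into the second, with an empty leading product treated as 1. A small NumPy check of that rule, with an illustrative helper name:

import numpy as np

def flatten_out_shape(shape, axis):
    # Dims [0, axis) collapse into the first output dim, dims [axis, n) into the second.
    return (int(np.prod(shape[:axis], dtype=np.int64)),
            int(np.prod(shape[axis:], dtype=np.int64)))

assert flatten_out_shape((3, 2, 2, 10), axis=1) == (3, 40)
assert flatten_out_shape((3, 2, 2, 10), axis=0) == (1, 120)
assert flatten_out_shape((3, 2, 3, 2, 4, 4), axis=4) == (36, 16)
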
+if __name__ == "__main__": + paddle.enable_static() + unittest.main() From a112ce4260b51966beef01ee8ca43210ce280095 Mon Sep 17 00:00:00 2001 From: limingshu <61349199+JamesLim-sy@users.noreply.github.com> Date: Mon, 27 Sep 2021 18:43:46 +0800 Subject: [PATCH 018/298] Lars op optimiztion with cudaLaunchCooperativeKernel method (#35652) * A leap of try for cudaLaunchCooperativeKernel * fix bugs * Totally replace the lar cuda kernel * Fix bugs * fix code according to comments * fix codes according to review comments * adding some function overload * relocate the power operation. --- .../operators/optimizers/lars_momentum_op.cu | 391 ++++++++++++++---- 1 file changed, 314 insertions(+), 77 deletions(-) diff --git a/paddle/fluid/operators/optimizers/lars_momentum_op.cu b/paddle/fluid/operators/optimizers/lars_momentum_op.cu index 42477232e7ca1b..3e7023bd1260f5 100644 --- a/paddle/fluid/operators/optimizers/lars_momentum_op.cu +++ b/paddle/fluid/operators/optimizers/lars_momentum_op.cu @@ -14,7 +14,29 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/amp/fp16_type_traits.h" +#include "paddle/fluid/operators/math/math_cuda_utils.h" #include "paddle/fluid/operators/optimizers/lars_momentum_op.h" +#include "paddle/fluid/platform/fast_divmod.h" + +#if defined(__NVCC__) && CUDA_VERSION >= 11000 +/* Once CUDA_VERSION is beyond 11.0, cooperative_groups can be involved in + without adding --rdc=true compile flag, then L2_norm cuda kernel can be + set as a __device__ kernel rather than global kernel. On the contrary, + the compile flag shall be set in old version, which may affect the cuda + kernel performance in paddle, consequently, L2_norm kernel shall be set + as a __global__ kernel. +*/ +#include +#define LARS_FUNCTION_FLAG __device__ +#else +#define LARS_FUNCTION_FLAG __global__ +#endif + +#ifdef __HIPCC__ +#define LARS_BLOCK_SIZE 256 +#else +#define LARS_BLOCK_SIZE 512 +#endif namespace paddle { namespace operators { @@ -22,55 +44,207 @@ namespace operators { template using MultiPrecisionType = typename details::MPTypeTrait::Type; +__device__ __forceinline__ float Sqrt(float x) { return sqrtf(x); } +__device__ __forceinline__ double Sqrt(double x) { return sqrt(x); } +__device__ __forceinline__ float Fma(float x, float y, float z) { + return fmaf(x, y, z); +} +__device__ __forceinline__ double Fma(double x, double y, double z) { + return fma(x, y, z); +} + +template +__device__ inline void VectorizeLarsUpdate( + const T* __restrict__ grad, const MT* __restrict__ param, + const MT* __restrict__ velocity, T* __restrict__ param_out, + MT* __restrict__ velocity_out, const MT mu, MT local_lr, + const MT lars_weight_decay, const MT rescale_grad, const int tid, + const int grid_stride, const int numel, + MT* __restrict__ master_param_out = nullptr) { + using VecType = paddle::platform::AlignedVector; + using VecMType = paddle::platform::AlignedVector; + int main = numel >> (VecSize >> 1); + int tail_offset = main * VecSize; + + const VecType* __restrict__ grad_vec = reinterpret_cast(grad); + const VecMType* __restrict__ param_vec = + reinterpret_cast(param); + const VecMType* __restrict__ velocity_vec = + reinterpret_cast(velocity); + VecType* param_out_vec = reinterpret_cast(param_out); + VecMType* velocity_out_vec = reinterpret_cast(velocity_out); + + VecMType* master_param_out_vec; + if (IsAmp) { + master_param_out_vec = reinterpret_cast(master_param_out); + } + + for (int i = tid; i < main; i += grid_stride) { + VecType 
param_out_tmp; + VecMType velocity_tmp, param_tmp; + VecType grad_data = grad_vec[i]; + VecMType param_data = param_vec[i]; + VecMType velocity_data = velocity_vec[i]; + +#pragma unroll + for (int j = 0; j < VecSize; ++j) { + MT grad_val = static_cast(grad_data[j]) * rescale_grad; + velocity_tmp[j] = + Fma(velocity_data[j], mu, + local_lr * Fma(lars_weight_decay, param_data[j], grad_val)); + param_tmp[j] = param_data[j] - velocity_tmp[j]; + param_out_tmp[j] = static_cast(param_tmp[j]); + } + param_out_vec[i] = param_out_tmp; + velocity_out_vec[i] = velocity_tmp; + if (IsAmp) { + master_param_out_vec[i] = param_tmp; + } + } + + for (int i = tid + tail_offset; i < numel; i += grid_stride) { + MT grad_val = static_cast(grad[i]) * rescale_grad; + MT param_val = param[i]; + MT velocity_tmp = Fma(velocity[i], mu, local_lr * Fma(lars_weight_decay, + param_val, grad_val)); + MT param_tmp = param_val - velocity_tmp; + param_out[i] = static_cast(param_tmp); + velocity_out[i] = velocity_tmp; + if (IsAmp) { + master_param_out[i] = param_tmp; + } + } +} + template -__global__ void MomentumLarsKernel( - const T* p, const T* g, const MT* v, - const MultiPrecisionType* learning_rate, const MT mu, const int64_t num, - const MT lars_coeff, const MT lars_weight_decay, - const MultiPrecisionType* p_norm, const MultiPrecisionType* g_norm, - T* p_out, MT* v_out, const MT epsilon, const MT* master_p, MT* master_p_out, - const MultiPrecisionType rescale_grad) { - const MT lr = static_cast(learning_rate[0]); - MT local_lr = lr; - const MT p_n = static_cast(p_norm[0]); - const MT g_n = static_cast(g_norm[0]); +LARS_FUNCTION_FLAG void L2NormKernel( + const T* __restrict__ p_data, const T* __restrict__ g_data, + MT* __restrict__ p_buffer, MT* __restrict__ g_buffer, + const int repeat_times, const int64_t numel, const MT rescale_grad, + MT* __restrict__ p_n = nullptr, MT* __restrict__ g_n = nullptr) { + int tid = threadIdx.x + blockDim.x * blockIdx.x; + int grid_stride = LARS_BLOCK_SIZE * gridDim.x; + const MT rescale_grad_pow = rescale_grad * rescale_grad; + __shared__ MT s_buffer[2]; + s_buffer[0] = static_cast(0); + s_buffer[1] = static_cast(0); + MT p_tmp_val = static_cast(0); + MT g_tmp_val = static_cast(0); - if (lars_weight_decay > static_cast(0) && p_n > static_cast(0) && - g_n > static_cast(0)) { - local_lr = - lr * lars_coeff * p_n / (g_n + lars_weight_decay * p_n + epsilon); + if (repeat_times == 0) { + if (tid < numel) { + p_tmp_val = static_cast(p_data[tid]); + g_tmp_val = static_cast(g_data[tid]); + } + s_buffer[0] += math::blockReduceSum(p_tmp_val * p_tmp_val, FINAL_MASK); + s_buffer[1] += math::blockReduceSum(g_tmp_val * g_tmp_val, FINAL_MASK); + } else { + /* To avoid occupy too much temp buffer. Hence, slice the whole data into 2 + parts, the front of them whose quantity is excatly multiple of grid-thread + number, and this part of data is delt in for loop, the rest of data is delt + with another step to avoid visiting data address beyond bound. 
*/ + for (int i = 0; i < repeat_times; ++i) { + p_tmp_val = static_cast(p_data[tid]); + g_tmp_val = static_cast(g_data[tid]); + tid += grid_stride; + s_buffer[0] += + math::blockReduceSum(p_tmp_val * p_tmp_val, FINAL_MASK); + s_buffer[1] += + math::blockReduceSum(g_tmp_val * g_tmp_val, FINAL_MASK); + __syncthreads(); + } + MT p_val = 0; + MT g_val = 0; + if (tid < numel) { + p_val = static_cast(p_data[tid]); + g_val = static_cast(g_data[tid]); + } + s_buffer[0] += math::blockReduceSum(p_val * p_val, FINAL_MASK); + s_buffer[1] += math::blockReduceSum(g_val * g_val, FINAL_MASK); } - CUDA_KERNEL_LOOP(i, num) { - MT grad = static_cast(g[i]) * static_cast(rescale_grad); - MT param = master_p ? master_p[i] : static_cast(p[i]); + __syncthreads(); + + if (threadIdx.x == 0) { + p_buffer[blockIdx.x] = s_buffer[0]; + g_buffer[blockIdx.x] = s_buffer[1]; + } + +#if CUDA_VERSION >= 11000 + // Grid sync for completely writring partial result back to gloabl memory + const cooperative_groups::grid_group cg = cooperative_groups::this_grid(); + cg.sync(); + MT p_partial_sum = threadIdx.x < gridDim.x ? p_buffer[threadIdx.x] : 0; + MT g_partial_sum = threadIdx.x < gridDim.x ? g_buffer[threadIdx.x] : 0; + *p_n = Sqrt(math::blockReduceSum(p_partial_sum, FINAL_MASK)); + *g_n = Sqrt(rescale_grad_pow * + math::blockReduceSum(g_partial_sum, FINAL_MASK)); +#endif +} - MT v_new = v[i] * mu + local_lr * (grad + lars_weight_decay * param); - MT p_new = param - v_new; +template +__global__ void MomentumLarsKernel( + const T* __restrict__ param, const T* __restrict__ grad, + const MT* __restrict__ velocity, T* param_out, MT* velocity_out, + const MT* __restrict__ master_param, MT* __restrict__ master_param_out, + const MT* __restrict__ learning_rate, MT* __restrict__ p_buffer, + MT* __restrict__ g_buffer, const MT mu, const MT lars_coeff, + const MT lars_weight_decay, const MT epsilon, const MT rescale_grad, + const int repeat_times, const int thresh, const int64_t numel) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + int grid_stride = gridDim.x * LARS_BLOCK_SIZE; +#if CUDA_VERSION >= 11000 + MT param_norm = static_cast(0); + MT grad_norm = static_cast(0); + L2NormKernel(param, grad, p_buffer, g_buffer, repeat_times, numel, + rescale_grad, ¶m_norm, &grad_norm); +#else + const MT rescale_grad_pow = rescale_grad * rescale_grad; + MT param_parital_norm = threadIdx.x < thresh ? p_buffer[threadIdx.x] : 0; + MT grad_parital_norm = threadIdx.x < thresh ? g_buffer[threadIdx.x] : 0; + __syncthreads(); + MT param_norm = + Sqrt(math::blockReduceSum(param_parital_norm, FINAL_MASK)); + MT grad_norm = Sqrt(rescale_grad_pow * + math::blockReduceSum(grad_parital_norm, FINAL_MASK)); +#endif - v_out[i] = v_new; - p_out[i] = static_cast(p_new); - if (master_p_out) master_p_out[i] = p_new; + const MT lr = learning_rate[0]; + MT local_lr = lr; + if (lars_weight_decay > static_cast(0)) { + local_lr = lr * lars_coeff * param_norm / + (Fma(lars_weight_decay, param_norm, grad_norm) + epsilon); + } + + if (master_param_out) { + VectorizeLarsUpdate(grad, master_param, velocity, param_out, + velocity_out, mu, local_lr, + lars_weight_decay, rescale_grad, tid, + grid_stride, numel, master_param_out); + } else { + if (std::is_same::value || + std::is_same::value) { + // As for multiple-precision, type T and MT cannot be more than fp16 or + // fp32, Then, the maximum data IO size could be set to 4. 
+ VectorizeLarsUpdate( + grad, reinterpret_cast(param), velocity, param_out, + velocity_out, mu, local_lr, lars_weight_decay, rescale_grad, tid, + grid_stride, numel); + } else { + VectorizeLarsUpdate( + grad, reinterpret_cast(param), velocity, param_out, + velocity_out, mu, local_lr, lars_weight_decay, rescale_grad, tid, + grid_stride, numel); + } } } template class LarsMomentumOpCUDAKernel : public framework::OpKernel { - using MPDType = MultiPrecisionType; + using MT = MultiPrecisionType; public: void Compute(const framework::ExecutionContext& ctx) const override { const bool multi_precision = ctx.Attr("multi_precision"); - if (multi_precision) { - InnerCompute(ctx, multi_precision); - } else { - InnerCompute(ctx, multi_precision); - } - } - - private: - template - void InnerCompute(const framework::ExecutionContext& ctx, - const bool multi_precision) const { auto param_out = ctx.Output("ParamOut"); auto velocity_out = ctx.Output("VelocityOut"); auto param = ctx.Input("Param"); @@ -78,8 +252,13 @@ class LarsMomentumOpCUDAKernel : public framework::OpKernel { auto grad = ctx.Input("Grad"); auto learning_rate = ctx.Input("LearningRate"); + int64_t numel = param->numel(); + int grid = (numel + LARS_BLOCK_SIZE - 1) / LARS_BLOCK_SIZE; const framework::Tensor* master_param = nullptr; framework::Tensor* master_param_out = nullptr; + const MT* master_param_data = nullptr; + MT* master_param_out_data = nullptr; + if (multi_precision) { bool has_master = ctx.HasInput("MasterParam") && ctx.HasOutput("MasterParamOut"); @@ -90,56 +269,114 @@ class LarsMomentumOpCUDAKernel : public framework::OpKernel { "the attr `multi_precision` is true")); master_param = ctx.Input("MasterParam"); master_param_out = ctx.Output("MasterParamOut"); + master_param_data = master_param->data(); + master_param_out_data = + master_param_out->mutable_data(ctx.GetPlace()); } - - const MT* master_p = multi_precision ? master_param->data() : nullptr; - MT* master_p_out = multi_precision - ? master_param_out->mutable_data(ctx.GetPlace()) - : nullptr; - - T* p_out = param_out->mutable_data(ctx.GetPlace()); - MT* v_out = velocity_out->mutable_data(ctx.GetPlace()); - MT mu = static_cast(ctx.Attr("mu")); MT lars_coeff = static_cast(ctx.Attr("lars_coeff")); MT lars_weight_decay = static_cast(ctx.Attr("lars_weight_decay")); MT epsilon = static_cast(ctx.Attr("epsilon")); - MPDType rescale_grad = - static_cast(ctx.Attr("rescale_grad")); - - auto* p = param->data(); - auto* g = grad->data(); - auto* v = velocity->data(); - auto* lr = learning_rate->data(); - - int block = 512; - int grid = (param->numel() + block - 1) / block; - - auto eigen_p = framework::EigenVector::Flatten(*param); - auto eigen_g = framework::EigenVector::Flatten(*grad); - // calculate norms using eigein and launch the kernel. 
- framework::Tensor p_norm_t, g_norm_t; - p_norm_t.Resize({1}); - g_norm_t.Resize({1}); - auto* p_norm_data = p_norm_t.mutable_data(ctx.GetPlace()); - auto* g_norm_data = g_norm_t.mutable_data(ctx.GetPlace()); - auto ep_norm = framework::EigenScalar::From(p_norm_t); - auto eg_norm = framework::EigenScalar::From(g_norm_t); - - auto* place = ctx.template device_context().eigen_device(); - - // eigen unsupport fp16 l2-norm - ep_norm.device(*place) = - eigen_p.template cast().square().sum().sqrt(); - eg_norm.device(*place) = - (eigen_g.template cast() * rescale_grad).square().sum().sqrt(); + MT rescale_grad = static_cast(ctx.Attr("rescale_grad")); - MomentumLarsKernel< - T, MT><<>>( - p, g, v, lr, mu, param->numel(), lars_coeff, lars_weight_decay, - p_norm_data, g_norm_data, p_out, v_out, epsilon, master_p, master_p_out, + auto* param_data = param->data(); + auto* grad_data = grad->data(); + auto* velocity_data = velocity->data(); + auto* lr = learning_rate->data(); + auto& cuda_ctx = ctx.template device_context(); + T* param_out_data = param_out->mutable_data(ctx.GetPlace()); + MT* velocity_out_data = velocity_out->mutable_data(ctx.GetPlace()); + +#if CUDA_VERSION >= 11000 + /* + Once model trainning with lars optimizer, whose principal implementation + is achieved by following two steps: + 1. Figure out the L2 norm statistic result of grad data and param data. + 2. Update param and velocity data with usage of L2 norm statistic result. + + Orignally, these two steps were fulfilled by respective eigen function and + cuda kernel, however the overhead of eigen function occupied much ratio in + total, consequently affect the performance of lars op, make it necessary + to combine 2 steps into one cuda kernel. + Since the step1 is l2 norm statistic, grid level reduce is needed. To + achieve this and continuous calculation of step 2 in only one global + lanuch, essential basis is to control all grid-threads while running. Apart + from normal lanuch form, cuda9.0 provides `cudaLaunchCooperativeKernel` + api : + - The thread quantity shall less than pyhsical SM limited threads + - Launches a device function where thread blocks can cooperate and + synchronize as they execute. + */ + // Figure out how many blocks can be active in each sm. 
+ int num_blocks_per_sm = 0; + cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_blocks_per_sm, + MomentumLarsKernel, + LARS_BLOCK_SIZE, sizeof(MT)); + int sm_num = cuda_ctx.GetSMCount(); + int grid_real = + std::min(std::min(sm_num * num_blocks_per_sm, grid), LARS_BLOCK_SIZE); + framework::Tensor tmp_buffer_t = + ctx.AllocateTmpTensor( + {LARS_BLOCK_SIZE << 1}, cuda_ctx); + auto* p_buffer = tmp_buffer_t.mutable_data(ctx.GetPlace()); + auto* g_buffer = p_buffer + LARS_BLOCK_SIZE; + int grid_stride = LARS_BLOCK_SIZE * grid; + int repeat_times = (numel + grid_stride - 1) / grid_stride - 1; + int thresh = 0; + + // Uniform kernel parameter for cudaLaunchCooperativeKernel + void* cuda_param[] = { + reinterpret_cast(¶m_data), + reinterpret_cast(&grad_data), + reinterpret_cast(&velocity_data), + reinterpret_cast(¶m_out_data), + reinterpret_cast(&velocity_out_data), + reinterpret_cast(&master_param_data), + reinterpret_cast(&master_param_out_data), + reinterpret_cast(&lr), + reinterpret_cast(&p_buffer), + reinterpret_cast(&g_buffer), + reinterpret_cast(&mu), + reinterpret_cast(&lars_coeff), + reinterpret_cast(&lars_weight_decay), + reinterpret_cast(&epsilon), + reinterpret_cast(&rescale_grad), + reinterpret_cast(&repeat_times), + reinterpret_cast(&thresh), // Just a placeholder + reinterpret_cast(&numel)}; + // Lanuch all sm theads. + cudaLaunchCooperativeKernel( + reinterpret_cast(MomentumLarsKernel), grid_real, + LARS_BLOCK_SIZE, cuda_param, 0, cuda_ctx.stream()); +#else + // Determine to read 4 fp16 or float data once, but 2 double data once. + int grid_lars = + sizeof(T) < sizeof(double) + ? (numel + (LARS_BLOCK_SIZE << 2) - 1) / (LARS_BLOCK_SIZE << 2) + : (numel + (LARS_BLOCK_SIZE << 1) - 1) / (LARS_BLOCK_SIZE << 1); + + int grid_norm = std::min(grid, LARS_BLOCK_SIZE); + framework::Tensor p_buffer_t = + ctx.AllocateTmpTensor( + {LARS_BLOCK_SIZE << 1}, cuda_ctx); + auto* p_buffer = p_buffer_t.mutable_data(ctx.GetPlace()); + auto* g_buffer = p_buffer + LARS_BLOCK_SIZE; + + const int grid_stride = LARS_BLOCK_SIZE * grid_norm; + const int repeat_times = (numel + grid_stride - 1) / grid_stride - 1; + + L2NormKernel<<>>( + param_data, grad_data, p_buffer, g_buffer, repeat_times, numel, rescale_grad); + + MomentumLarsKernel< + T, MT><<>>( + param_data, grad_data, velocity_data, param_out_data, velocity_out_data, + master_param_data, master_param_out_data, lr, p_buffer, g_buffer, mu, + lars_coeff, lars_weight_decay, epsilon, rescale_grad, 0, grid_norm, + numel); // 0 is just a placeholder. 
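
Whichever launch path is taken, the per-element arithmetic is the standard LARS rule: a trust ratio scales the learning rate by ||param|| / (weight_decay * ||param|| + ||grad|| + epsilon), and the result feeds a momentum update on the rescaled gradient. A NumPy sketch of one step on a single tensor follows; the function name and hyper-parameter values are illustrative, and the trust ratio is only applied when weight decay is positive, mirroring the kernel above.

import numpy as np

def lars_momentum_step(param, grad, velocity, lr, mu=0.9, lars_coeff=0.001,
                       lars_weight_decay=0.0005, epsilon=0.0, rescale_grad=1.0):
    # Norms that L2NormKernel accumulates with block/grid reductions.
    p_norm = np.sqrt(np.sum(param * param))
    g_norm = np.sqrt(np.sum((grad * rescale_grad) ** 2))
    local_lr = lr
    if lars_weight_decay > 0.0:
        local_lr = lr * lars_coeff * p_norm / (lars_weight_decay * p_norm + g_norm + epsilon)
    new_velocity = mu * velocity + local_lr * (grad * rescale_grad + lars_weight_decay * param)
    new_param = param - new_velocity
    return new_param, new_velocity

param = np.random.rand(1024).astype(np.float32)
grad = np.random.rand(1024).astype(np.float32)
velocity = np.zeros_like(param)
param, velocity = lars_momentum_step(param, grad, velocity, lr=0.1)
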
+#endif } }; From efd35384db04356d511b5f6fae50f3dd091ea224 Mon Sep 17 00:00:00 2001 From: Jiawei Wang Date: Mon, 27 Sep 2021 20:40:01 +0800 Subject: [PATCH 019/298] fix zero tensor for unique, unstack (#36021) * fix extra op for expand, expand_as, tile, unstack * fix unique unstack dim 0 * Update expand_v2_op.cc * fix unique_op format --- paddle/fluid/operators/unique_op.h | 5 ++++- paddle/fluid/operators/unstack_op.h | 2 +- python/paddle/fluid/layers/nn.py | 2 ++ 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/unique_op.h b/paddle/fluid/operators/unique_op.h index 99793ecd244cf2..66b0543771f4d3 100644 --- a/paddle/fluid/operators/unique_op.h +++ b/paddle/fluid/operators/unique_op.h @@ -403,7 +403,10 @@ class UniqueKernel : public framework::OpKernel { bool return_index = context.Attr("return_index"); bool return_inverse = context.Attr("return_inverse"); bool return_counts = context.Attr("return_counts"); - + if (x->numel() == 0) { + out->mutable_data(context.GetPlace()); + return; + } if (axis_vec.empty()) { framework::VisitDataTypeTiny( data_type, diff --git a/paddle/fluid/operators/unstack_op.h b/paddle/fluid/operators/unstack_op.h index 82118b692707fb..cfd4d6bce83643 100644 --- a/paddle/fluid/operators/unstack_op.h +++ b/paddle/fluid/operators/unstack_op.h @@ -149,7 +149,7 @@ class UnStackKernel : public framework::OpKernel { dx_datas[i] = dx[i]->mutable_data(ctx.GetPlace()); } auto dy_data = dy->data(); - + if (dy->numel() == 0) return; int pre = 1; for (int i = 0; i < axis; ++i) pre *= dy->dims()[i]; int total_num = dy->numel(); diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 515d4a5c0ef7cd..75b0392ab6ae47 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -10315,6 +10315,8 @@ def unstack(x, axis=0, num=None): if in_dygraph_mode(): if num == None: num = x.shape[axis] + if num == 0: + return [] return _C_ops.unstack(x, num, 'axis', int(axis), 'num', num) helper = LayerHelper('unstack', **locals()) From ae382d1fea6c55ff44f1439f1ca8df08048aa3d0 Mon Sep 17 00:00:00 2001 From: xiaoxiao-luomu <73728031+xiaoxiao-luomu@users.noreply.github.com> Date: Mon, 27 Sep 2021 22:45:55 +0800 Subject: [PATCH 020/298] gloo hdfs set check & gloo connect retry (#35750) * gloo hdfs set check & gloo connect retry * add vlog * print gloo connect addr & add vlog * . 
* modify vlof * modify vlog * modify vlog --- paddle/fluid/framework/fleet/gloo_wrapper.cc | 45 +++++++++++++++++++- paddle/fluid/framework/fleet/gloo_wrapper.h | 20 +++++++++ 2 files changed, 64 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/framework/fleet/gloo_wrapper.cc b/paddle/fluid/framework/fleet/gloo_wrapper.cc index 489cef9f04654a..14e5f2f51924ba 100644 --- a/paddle/fluid/framework/fleet/gloo_wrapper.cc +++ b/paddle/fluid/framework/fleet/gloo_wrapper.cc @@ -71,6 +71,18 @@ void HdfsStore::set(const std::string& key, const std::vector& data) { } } paddle::framework::fs_mv(tmp, path); + auto start = std::chrono::steady_clock::now(); + while (paddle::framework::fs_exists(path) == false) { + VLOG(0) << "HdfsStore::set fs_mv retrying..."; + paddle::framework::fs_mv(tmp, path); + auto elapsed = std::chrono::duration_cast( + std::chrono::steady_clock::now() - start); + if (wait_timeout_ != gloo::kNoTimeout && elapsed > wait_timeout_) { + PADDLE_THROW(paddle::platform::errors::ExecutionTimeout( + "fs_mv failed, tmp: %s, path: %s", tmp, path)); + } + std::this_thread::sleep_for(std::chrono::milliseconds(wait_sleep_ms_)); + } #endif } @@ -140,6 +152,7 @@ void HdfsStore::wait(const std::vector& keys, auto start = std::chrono::steady_clock::now(); std::vector check_key_status(keys.size(), false); while (!Check(keys, &check_key_status)) { + VLOG(0) << "HdfsStore::wait checking repeatedly..."; auto elapsed = std::chrono::duration_cast( std::chrono::steady_clock::now() - start); if (wait_timeout_ != gloo::kNoTimeout && elapsed > wait_timeout_) { @@ -209,6 +222,8 @@ void ParallelConnectContext::connectFullMesh( // Create pairs auto transportContext = dev->createContext(rank, size); transportContext->setTimeout(getTimeout()); + VLOG(0) << "transportContext timeout: " << getTimeout().count() + << ", curr rank: " << rank; for (int i = 0; i < size; i++) { if (i == rank) { continue; @@ -225,6 +240,7 @@ void ParallelConnectContext::connectFullMesh( std::vector> connect_threads(thread_num_); // Connect every pair + VLOG(0) << "connect_thread_num: " << thread_num_ << ", size: " << size; for (uint32_t i = 0; i < connect_threads.size(); ++i) { connect_threads[i].reset(new std::thread( [&store, &transportContext, total_add_size, this]( @@ -252,10 +268,36 @@ void ParallelConnectContext::connectFullMesh( sleep(5); --max_retry_times; } - auto addr = extractAddress(allAddrs, i); + if (addr.empty()) { + VLOG(0) << "peer address is null"; + } + Impl impl_; + memcpy(&impl_, addr.data(), sizeof(impl_)); + struct sockaddr_in* sa = (struct sockaddr_in*)&(impl_.ss); + std::string ip = getCharIpAddr(sa->sin_addr.s_addr); + VLOG(0) << "peer " << i << " ip addr: " << ip + << ", port: " << sa->sin_port; + + auto start = std::chrono::steady_clock::now(); + std::chrono::seconds connect_wait_timeout_ = + std::chrono::seconds(600); + while (true) { + auto elapsed = std::chrono::duration_cast( + std::chrono::steady_clock::now() - start); + if (elapsed > connect_wait_timeout_) { + break; + } + try { + transportContext->getPair(i)->connect(addr); + break; + } catch (...) 
{ + VLOG(0) << "gloo connect failed, retrying..."; + } + } transportContext->getPair(i)->connect(addr); } + VLOG(0) << "peer connected success"; }, i, connect_threads.size())); } @@ -264,6 +306,7 @@ void ParallelConnectContext::connectFullMesh( } device_ = dev; transportContext_ = std::move(transportContext); + VLOG(0) << "ParallelConnectContext::connectFullMesh() is over"; } #endif } // namespace rendezvous diff --git a/paddle/fluid/framework/fleet/gloo_wrapper.h b/paddle/fluid/framework/fleet/gloo_wrapper.h index 4eb40da1bfd39b..eafc991fbca0ae 100644 --- a/paddle/fluid/framework/fleet/gloo_wrapper.h +++ b/paddle/fluid/framework/fleet/gloo_wrapper.h @@ -97,6 +97,26 @@ class ParallelConnectContext : public gloo::rendezvous::Context { // slowly in case big size, especialy in HdfsStore void connectFullMesh(Store& store, // NOLINT std::shared_ptr& dev); // NOLINT + struct Impl { + // IP address of the listening socket. + struct sockaddr_storage ss; + // Sequence number of this address. + // If this is equal to -1, the address is assumed to + // represent the listening socket of a device. The sequence number + // must be set before it can be used by a pair. + ssize_t seq{-1}; + }; + std::string getCharIpAddr(uint32_t ipAddress) { + const int NBYTES = 4; + uint8_t octet[NBYTES]; + char ipAddressFinal[16]; + for (int i = 0; i < NBYTES; i++) { + octet[i] = ipAddress >> (i * 8); + } + snprintf(ipAddressFinal, sizeof(ipAddressFinal), "%d.%d.%d.%d", octet[0], + octet[1], octet[2], octet[3]); + return std::string(ipAddressFinal); + } protected: int thread_num_ = 6; From 74ff59cfae77fead43640151f10fa27f1c02f1f3 Mon Sep 17 00:00:00 2001 From: Siming Dai <908660116@qq.com> Date: Tue, 28 Sep 2021 09:02:29 +0800 Subject: [PATCH 021/298] dlpack fix (#35817) --- cmake/external/dlpack.cmake | 2 +- paddle/fluid/framework/dlpack_tensor.cc | 80 +++++++++----------- paddle/fluid/framework/dlpack_tensor.h | 2 +- paddle/fluid/framework/dlpack_tensor_test.cc | 29 +++---- paddle/fluid/framework/tensor_util.cc | 21 ++++- paddle/fluid/pybind/pybind.cc | 7 +- python/paddle/tests/test_dlpack.py | 41 ++++++++++ python/paddle/utils/dlpack.py | 18 ++--- 8 files changed, 120 insertions(+), 80 deletions(-) diff --git a/cmake/external/dlpack.cmake b/cmake/external/dlpack.cmake index 87db181d953afb..43ffde75992266 100644 --- a/cmake/external/dlpack.cmake +++ b/cmake/external/dlpack.cmake @@ -18,7 +18,7 @@ set(DLPACK_PREFIX_DIR ${THIRD_PARTY_PATH}/dlpack) set(DLPACK_SOURCE_DIR ${THIRD_PARTY_PATH}/dlpack/src/extern_dlpack) set(DLPACK_REPOSITORY ${GIT_URL}/dmlc/dlpack.git) -set(DLPACK_TAG v0.2) +set(DLPACK_TAG v0.4) cache_third_party(extern_dlpack REPOSITORY ${DLPACK_REPOSITORY} diff --git a/paddle/fluid/framework/dlpack_tensor.cc b/paddle/fluid/framework/dlpack_tensor.cc index f1f5ba7789ea61..71b53b8a51882f 100644 --- a/paddle/fluid/framework/dlpack_tensor.cc +++ b/paddle/fluid/framework/dlpack_tensor.cc @@ -30,14 +30,10 @@ static ::DLDataType GetDLDataTypeCode() { ::DLDataType dtype; if (std::is_same>::value || std::is_same>::value) { - // The current dlpack library version is v0.2, and does not define - // kDLComplex value. But kDLComplex is defined by 5U in v0.4, so we set - // dtype.code to 5U directly here. After the dlpack library version being - // upgraded to v0.4, it should be written as follow. 
- // dtype.code = kDLComplex; - dtype.code = 5U; + dtype.code = kDLComplex; + } else if (std::is_same::value) { + dtype.code = kDLBfloat; } else if (std::is_same::value || - std::is_same::value || std::is_floating_point::value) { dtype.code = kDLFloat; } else if (std::is_unsigned::value) { @@ -77,47 +73,47 @@ static DLDataType GetDLDataTypeFromTypeIndex(proto::VarType::Type type) { #undef REG_DL_DATA_TYPE } -struct DLContextVisitor : public boost::static_visitor<::DLContext> { - inline ::DLContext operator()(const platform::CPUPlace &place) const { - ::DLContext ctx; - ctx.device_type = kDLCPU; - ctx.device_id = 0; - return ctx; +struct DLDeviceVisitor : public boost::static_visitor<::DLDevice> { + inline ::DLDevice operator()(const platform::CPUPlace &place) const { + ::DLDevice device; + device.device_type = kDLCPU; + device.device_id = 0; + return device; } - inline ::DLContext operator()(const platform::XPUPlace &place) const { + inline ::DLDevice operator()(const platform::XPUPlace &place) const { PADDLE_THROW( platform::errors::Unimplemented("platform::XPUPlace is not supported")); } - inline ::DLContext operator()(const platform::NPUPlace &place) const { + inline ::DLDevice operator()(const platform::NPUPlace &place) const { PADDLE_THROW( platform::errors::Unimplemented("platform::NPUPlace is not supported")); } - inline ::DLContext operator()(const platform::NPUPinnedPlace &place) const { + inline ::DLDevice operator()(const platform::NPUPinnedPlace &place) const { PADDLE_THROW(platform::errors::Unimplemented( "platform::NPUPinnedPlace is not supported")); } - inline ::DLContext operator()(const platform::CUDAPlace &place) const { + inline ::DLDevice operator()(const platform::CUDAPlace &place) const { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - ::DLContext ctx; - ctx.device_type = kDLGPU; - ctx.device_id = place.device; - return ctx; + ::DLDevice device; + device.device_type = kDLGPU; + device.device_id = place.device; + return device; #else PADDLE_THROW(platform::errors::Unavailable( "platform::CUDAPlace is not supported in CPU only version.")); #endif } - inline ::DLContext operator()(const platform::CUDAPinnedPlace &place) const { + inline ::DLDevice operator()(const platform::CUDAPinnedPlace &place) const { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - ::DLContext ctx; - ctx.device_type = kDLCPUPinned; - ctx.device_id = 0; - return ctx; + ::DLDevice device; + device.device_type = kDLCPUPinned; + device.device_id = 0; + return device; #else PADDLE_THROW(platform::errors::Unavailable( "platform::CUDAPinnedPlace is not supported in CPU only version.")); @@ -130,9 +126,9 @@ DLPackTensor::DLPackTensor(const Tensor &tensor, LaneType lanes) { // init data, data buffer t_.data = const_cast(tensor.data()); - // init ctx, DLContext type with device_type and device_id + // init device, DLDevice type with device_type and device_id auto place = tensor.place(); - t_.ctx = boost::apply_visitor(internal::DLContextVisitor(), place); + t_.device = boost::apply_visitor(internal::DLDeviceVisitor(), place); // init dtype t_.dtype = internal::GetDLDataTypeFromTypeIndex(tensor.type()); @@ -156,10 +152,8 @@ DLPackTensor::DLPackTensor(const Tensor &tensor, LaneType lanes) { t_.byte_offset = 0; } -::DLManagedTensor *DLPackTensor::ToCudfCompatibleDLManagedTensor() { - // init shape, tensor dims - // for DLManagedTensor shape need to be compatible with ndim - // refer to cupy and cudf, we new int64[ndim] +::DLManagedTensor *DLPackTensor::ToDLManagedTensor() { + // init shape 
auto shape = new int64_t[t_.ndim]; using DimType = decltype(t_.ndim); // int for (DimType i = 0; i < t_.ndim; ++i) { @@ -167,19 +161,15 @@ ::DLManagedTensor *DLPackTensor::ToCudfCompatibleDLManagedTensor() { } t_.shape = shape; - // init strides, nullptr means the tensor is compact - // refer to cupy and cudf, the compact tensor first dim's strides need to be 1 - // and second dim's strides need to be length of rows of cudf - // cudf now only support dim=2 - PADDLE_ENFORCE_LE(t_.ndim, 2, platform::errors::InvalidArgument( - "cudf now only supports dimension is 2, " - "but received dimension is %d.", - t_.ndim)); - - if (t_.ndim > 1) - t_.strides = new int64_t[2]{1, t_.shape[1]}; - else - t_.strides = new int64_t[1]{1}; + // init strides + auto strides = new int64_t[t_.ndim]; + for (DimType i = 0; i < t_.ndim; ++i) { + strides[i] = 1; + } + for (DimType i = t_.ndim - 2; i >= 0; --i) { + strides[i] = t_.shape[i + 1] * strides[i + 1]; + } + t_.strides = strides; auto tensor = new DLManagedTensor; tensor->dl_tensor = t_; diff --git a/paddle/fluid/framework/dlpack_tensor.h b/paddle/fluid/framework/dlpack_tensor.h index e342523718b34b..03ed8884925ce4 100644 --- a/paddle/fluid/framework/dlpack_tensor.h +++ b/paddle/fluid/framework/dlpack_tensor.h @@ -36,7 +36,7 @@ class DLPackTensor { inline operator ::DLTensor&() { return t_; } - ::DLManagedTensor* ToCudfCompatibleDLManagedTensor(); + ::DLManagedTensor* ToDLManagedTensor(); private: ::DLTensor t_; diff --git a/paddle/fluid/framework/dlpack_tensor_test.cc b/paddle/fluid/framework/dlpack_tensor_test.cc index 8265d105accae0..4e2d7bb979b617 100644 --- a/paddle/fluid/framework/dlpack_tensor_test.cc +++ b/paddle/fluid/framework/dlpack_tensor_test.cc @@ -30,7 +30,11 @@ template constexpr uint8_t GetDLDataTypeCode() { if (std::is_same>::value || std::is_same>::value) { - return static_cast(5); + return static_cast(kDLComplex); + } + + if (std::is_same::value) { + return static_cast(kDLBfloat); } return std::is_same::value || @@ -55,15 +59,15 @@ void TestMain(const platform::Place &place, uint16_t lanes) { CHECK_EQ(p, dl_tensor.data); if (platform::is_cpu_place(place)) { - CHECK_EQ(kDLCPU, dl_tensor.ctx.device_type); - CHECK_EQ(0, dl_tensor.ctx.device_id); + CHECK_EQ(kDLCPU, dl_tensor.device.device_type); + CHECK_EQ(0, dl_tensor.device.device_id); } else if (platform::is_gpu_place(place)) { - CHECK_EQ(kDLGPU, dl_tensor.ctx.device_type); + CHECK_EQ(kDLGPU, dl_tensor.device.device_type); CHECK_EQ(BOOST_GET_CONST(platform::CUDAPlace, place).device, - dl_tensor.ctx.device_id); + dl_tensor.device.device_id); } else if (platform::is_cuda_pinned_place(place)) { - CHECK_EQ(kDLCPUPinned, dl_tensor.ctx.device_type); - CHECK_EQ(0, dl_tensor.ctx.device_id); + CHECK_EQ(kDLCPUPinned, dl_tensor.device.device_type); + CHECK_EQ(0, dl_tensor.device.device_id); } else { CHECK_EQ(false, true); } @@ -83,8 +87,7 @@ void TestMain(const platform::Place &place, uint16_t lanes) { } template -void TestToCudfCompatibleDLManagedTensor(const platform::Place &place, - uint16_t lanes) { +void TestToDLManagedTensor(const platform::Place &place, uint16_t lanes) { DDim dims{6, 7}; Tensor tensor; tensor.Resize(dims); @@ -92,8 +95,7 @@ void TestToCudfCompatibleDLManagedTensor(const platform::Place &place, DLPackTensor dlpack_tensor(tensor, lanes); - ::DLManagedTensor *dl_managed_tensor = - dlpack_tensor.ToCudfCompatibleDLManagedTensor(); + ::DLManagedTensor *dl_managed_tensor = dlpack_tensor.ToDLManagedTensor(); CHECK_EQ(dl_managed_tensor->manager_ctx == nullptr, true); @@ -101,7 +103,8 @@ 
void TestToCudfCompatibleDLManagedTensor(const platform::Place &place, CHECK_EQ(dims[i], dl_managed_tensor->dl_tensor.shape[i]); } - CHECK_EQ(dl_managed_tensor->dl_tensor.strides[0] == 1, true); + CHECK_EQ(dl_managed_tensor->dl_tensor.strides[0] == 7, true); + CHECK_EQ(dl_managed_tensor->dl_tensor.strides[1] == 1, true); dl_managed_tensor->deleter(dl_managed_tensor); } @@ -122,7 +125,7 @@ void TestMainLoop() { for (auto &p : places) { for (auto &l : lanes) { TestMain(p, l); - TestToCudfCompatibleDLManagedTensor(p, l); + TestToDLManagedTensor(p, l); } } } diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index 15021b6267b656..ee30a82aff6ef0 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -1065,6 +1065,9 @@ void* GetDstPtrByDLDataType(DLDataType type, framework::Tensor* dst, if (type.code == kDLFloat) return static_cast( dst->mutable_data(dst_place)); + if (type.code == kDLBfloat) + return static_cast( + dst->mutable_data(dst_place)); PADDLE_THROW(platform::errors::Unimplemented( "DLDataType code <%d> is illegal when DLDataType.bits is <%d>.", type.code, type.bits)); @@ -1081,6 +1084,16 @@ void* GetDstPtrByDLDataType(DLDataType type, framework::Tensor* dst, return static_cast(dst->mutable_data(dst_place)); if (type.code == kDLFloat) return static_cast(dst->mutable_data(dst_place)); + if (type.code == kDLComplex) + return static_cast( + dst->mutable_data>(dst_place)); + PADDLE_THROW(platform::errors::Unimplemented( + "DLDataType code <%d> is illegal when DLDataType.bits is <%d>.", + type.code, type.bits)); + case 128: + if (type.code == kDLComplex) + return static_cast( + dst->mutable_data>(dst_place)); PADDLE_THROW(platform::errors::Unimplemented( "DLDataType code <%d> is illegal when DLDataType.bits is <%d>.", type.code, type.bits)); @@ -1107,15 +1120,15 @@ void TensorFromDLPack(const ::DLTensor& dl_tensor, framework::Tensor* dst) { auto src_ptr = static_cast(dl_tensor.data); auto size = paddle::framework::product(vddim) * type.bits / 8; - if (dl_tensor.ctx.device_type == kDLCPU) { + if (dl_tensor.device.device_type == kDLCPU) { memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - if (dl_tensor.ctx.device_type == kDLGPU) { + if (dl_tensor.device.device_type == kDLGPU) { platform::CUDAPlace dst_place = - platform::CUDAPlace(dl_tensor.ctx.device_id); + platform::CUDAPlace(dl_tensor.device.device_id); platform::CUDAPlace src_place = - platform::CUDAPlace(dl_tensor.ctx.device_id); + platform::CUDAPlace(dl_tensor.device.device_id); dst_ptr = GetDstPtrByDLDataType(type, dst, dst_place); auto* ctx = platform::DeviceContextPool::Instance().GetByPlace(dst_place); memory::Copy( diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index c00f529f61793f..16e42885c52fb7 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -537,11 +537,11 @@ PYBIND11_MODULE(core_noavx, m) { DLTensor dl = dmt->dl_tensor; framework::Tensor tensor; - if (dl.ctx.device_type == kDLCPU) { + if (dl.device.device_type == kDLCPU) { paddle::framework::TensorFromDLPack(dl, &tensor); } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - if (dl.ctx.device_type == kDLGPU) { + if (dl.device.device_type == kDLGPU) { paddle::framework::TensorFromDLPack(dl, &tensor); } #endif @@ -776,8 +776,7 @@ PYBIND11_MODULE(core_noavx, m) { .def("_to_dlpack", [](framework::Tensor &self) { DLPackTensor dlpack_tensor(self, 1); - 
DLManagedTensor *dmt = - dlpack_tensor.ToCudfCompatibleDLManagedTensor(); + DLManagedTensor *dmt = dlpack_tensor.ToDLManagedTensor(); auto capsule = py::capsule( static_cast(dmt), "dltensor", [](PyObject *ptr) { if (ptr) { diff --git a/python/paddle/tests/test_dlpack.py b/python/paddle/tests/test_dlpack.py index 2880901d1ad161..3a3f748bb991e7 100644 --- a/python/paddle/tests/test_dlpack.py +++ b/python/paddle/tests/test_dlpack.py @@ -22,6 +22,7 @@ class TestDLPack(unittest.TestCase): def test_dlpack_dygraph(self): + paddle.disable_static() tensor = paddle.to_tensor(np.array([1, 2, 3, 4]).astype('int')) dlpack = paddle.utils.dlpack.to_dlpack(tensor) out_from_dlpack = paddle.utils.dlpack.from_dlpack(dlpack) @@ -31,6 +32,15 @@ def test_dlpack_dygraph(self): np.array(out_from_dlpack), np.array([1, 2, 3, 4]).astype( 'int'))) + def test_dlpack_tensor_larger_than_2dim(self): + paddle.disable_static() + numpy_data = np.random.randn(4, 5, 6) + t = paddle.to_tensor(numpy_data) + # TODO: There may be a reference count problem of to_dlpack. + dlpack = paddle.utils.dlpack.to_dlpack(t) + out = paddle.utils.dlpack.from_dlpack(dlpack) + self.assertTrue(np.allclose(numpy_data, out.numpy())) + def test_dlpack_static(self): paddle.enable_static() tensor = fluid.create_lod_tensor( @@ -57,6 +67,37 @@ def test_dlpack_static(self): np.array(gout_from_dlpack), np.array([[1], [2], [3], [4]]).astype('int'))) + def test_dlpack_dtype_conversion(self): + paddle.disable_static() + # DLpack does not explicitly support bool data type. + dtypes = [ + "float16", + "float32", + "float64", + "int8", + "int16", + "int32", + "int64", + "uint8", + ] + data = np.ones((2, 3, 4)) + for dtype in dtypes: + x = paddle.to_tensor(data, dtype=dtype) + dlpack = paddle.utils.dlpack.to_dlpack(x) + o = paddle.utils.dlpack.from_dlpack(dlpack) + self.assertEqual(x.dtype, o.dtype) + self.assertTrue(np.allclose(x.numpy(), o.numpy())) + + complex_dtypes = ["complex64", "complex128"] + for dtype in complex_dtypes: + x = paddle.to_tensor( + [[1 + 6j, 2 + 5j, 3 + 4j], [4 + 3j, 5 + 2j, 6 + 1j]], + dtype=dtype) + dlpack = paddle.utils.dlpack.to_dlpack(x) + o = paddle.utils.dlpack.from_dlpack(dlpack) + self.assertEqual(x.dtype, o.dtype) + self.assertTrue(np.allclose(x.numpy(), o.numpy())) + class TestRaiseError(unittest.TestCase): def test_from_dlpack_raise_type_error(self): diff --git a/python/paddle/utils/dlpack.py b/python/paddle/utils/dlpack.py index ca2a1ae0e19ec5..01611be3ea56f1 100644 --- a/python/paddle/utils/dlpack.py +++ b/python/paddle/utils/dlpack.py @@ -28,7 +28,9 @@ def to_dlpack(x): Encodes a tensor to DLPack. Args: - x (Tensor): A tensor, and the data type is bool, float32, float64, int32, int64. + x (Tensor): The input tensor, and the data type can be `bool`, `float16`, `float32`, + `float64`, `int8`, `int16`, `int32`, `int64`, `uint8`, `complex64`, + `complex128`. Returns: dltensor, and the data type is PyCapsule. 
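To make the widened dtype coverage above concrete, a minimal round-trip sketch along the lines of the new unit tests (the shape, values and the float16 dtype are illustrative only):

    import numpy as np
    import paddle

    x = paddle.to_tensor(np.ones((2, 3)), dtype='float16')
    capsule = paddle.utils.dlpack.to_dlpack(x)    # encode to a DLPack PyCapsule
    y = paddle.utils.dlpack.from_dlpack(capsule)  # a capsule can be consumed once
    assert y.dtype == x.dtype
    assert np.allclose(x.numpy(), y.numpy())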
@@ -51,19 +53,9 @@ def to_dlpack(x): "The type of 'x' in to_dlpack must be paddle.Tensor," " but received {}.".format(type(x))) - dtype = convert_dtype(x.dtype) - - if dtype not in ['bool', 'int32', 'int64', 'float32', 'float64']: - raise TypeError( - "the dtype of 'x' in to_dlpack must be any of [bool, int32, int64, " - "float32, float64], but received {}.".format(dtype)) - return x.value().get_tensor()._to_dlpack() check_type(x, 'x', (LoDTensor), 'to_dlpack') - check_dtype(x._dtype(), 'x', - ['bool', 'int32', 'int64', 'float32', 'float64'], 'to_dlpack') - return x._to_dlpack() @@ -75,7 +67,9 @@ def from_dlpack(dlpack): dlpack (PyCapsule): a PyCapsule object with the dltensor. Returns: - out (Tensor): a tensor decoded from DLPack. + out (Tensor): a tensor decoded from DLPack. One thing to be noted, if we get + an input dltensor with data type as `bool`, we return the decoded + tensor as `uint8`. Examples: .. code-block:: python From 6f18b0414a9c5bd88d09f862a7f2bdadb3c6728f Mon Sep 17 00:00:00 2001 From: Huihuang Zheng Date: Tue, 28 Sep 2021 09:40:45 +0800 Subject: [PATCH 022/298] Add Basic CINN Runner Class (#35978) * Add Basic CINN Runner Class * Add CinnCacheKey * Add Cache logic and improve CinnCacheKey * Modify as reviewer commented * Implement hash_combine to fix MAC build. --- paddle/fluid/framework/CMakeLists.txt | 1 + .../framework/paddle2cinn/CMakeLists.txt | 7 ++ .../framework/paddle2cinn/cinn_cache_key.cc | 87 +++++++++++++++ .../framework/paddle2cinn/cinn_cache_key.h | 63 +++++++++++ .../paddle2cinn/cinn_cache_key_test.cc | 101 ++++++++++++++++++ .../paddle2cinn/cinn_compiled_object.cc | 50 +++++++++ .../paddle2cinn/cinn_compiled_object.h | 50 +++++++++ .../paddle2cinn/cinn_compiled_object_test.cc | 41 +++++++ .../framework/paddle2cinn/cinn_runner.cc | 46 ++++++++ .../fluid/framework/paddle2cinn/cinn_runner.h | 55 ++++++++++ .../framework/paddle2cinn/cinn_runner_test.cc | 41 +++++++ 11 files changed, 542 insertions(+) create mode 100644 paddle/fluid/framework/paddle2cinn/CMakeLists.txt create mode 100644 paddle/fluid/framework/paddle2cinn/cinn_cache_key.cc create mode 100644 paddle/fluid/framework/paddle2cinn/cinn_cache_key.h create mode 100644 paddle/fluid/framework/paddle2cinn/cinn_cache_key_test.cc create mode 100644 paddle/fluid/framework/paddle2cinn/cinn_compiled_object.cc create mode 100644 paddle/fluid/framework/paddle2cinn/cinn_compiled_object.h create mode 100644 paddle/fluid/framework/paddle2cinn/cinn_compiled_object_test.cc create mode 100644 paddle/fluid/framework/paddle2cinn/cinn_runner.cc create mode 100644 paddle/fluid/framework/paddle2cinn/cinn_runner.h create mode 100644 paddle/fluid/framework/paddle2cinn/cinn_runner_test.cc diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index de19c7a0e773e3..67073350d5a8aa 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -26,6 +26,7 @@ add_subdirectory(details) add_subdirectory(fleet) add_subdirectory(io) add_subdirectory(new_executor) +add_subdirectory(paddle2cinn) #ddim lib proto_library(framework_proto SRCS framework.proto) proto_library(pass_desc_proto SRCS pass_desc.proto DEPS framework_proto) diff --git a/paddle/fluid/framework/paddle2cinn/CMakeLists.txt b/paddle/fluid/framework/paddle2cinn/CMakeLists.txt new file mode 100644 index 00000000000000..8621c7363a09f1 --- /dev/null +++ b/paddle/fluid/framework/paddle2cinn/CMakeLists.txt @@ -0,0 +1,7 @@ +cc_library(cinn_cache_key SRCS cinn_cache_key.cc DEPS boost graph graph_helper 
lod_tensor proto_desc) +cc_library(cinn_compiled_object SRCS cinn_compiled_object.cc DEPS feed_fetch_method graph lod_tensor proto_desc) +cc_library(cinn_runner SRCS cinn_runner.cc DEPS cinn_cache_key cinn_compiled_object feed_fetch_method graph lod_tensor scope) + +cc_test(cinn_cache_key_test SRCS cinn_cache_key_test.cc DEPS cinn_cache_key) +cc_test(cinn_runner_test SRCS cinn_runner_test.cc DEPS cinn_runner proto_desc) +cc_test(cinn_compiled_object_test SRCS cinn_compiled_object_test.cc DEPS cinn_compiled_object) diff --git a/paddle/fluid/framework/paddle2cinn/cinn_cache_key.cc b/paddle/fluid/framework/paddle2cinn/cinn_cache_key.cc new file mode 100644 index 00000000000000..ac6c83be4fae3c --- /dev/null +++ b/paddle/fluid/framework/paddle2cinn/cinn_cache_key.cc @@ -0,0 +1,87 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/paddle2cinn/cinn_cache_key.h" + +#include +#include + +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/lod_tensor.h" + +namespace paddle { +namespace framework { +namespace paddle2cinn { + +CinnCacheKey::CinnCacheKey( + const ir::Graph& graph, + const std::map& feed_tensors) { + this->SetKey(graph, feed_tensors); +} + +CinnCacheKey::CinnCacheKey(const ir::Graph& graph, + const std::map& feed_shapes) { + this->SetKey(graph, feed_shapes); +} + +void CinnCacheKey::SetKey( + const ir::Graph& graph, + const std::map& feed_tensors) { + ProgramDesc program; + GraphToProgram(graph, &program); + program.Proto()->SerializeToString(&graph_serialize_str_); + for (const auto& name_tensor : feed_tensors) { + feed_shapes_[name_tensor.first] = name_tensor.second->dims(); + } +} + +void CinnCacheKey::SetKey(const ir::Graph& graph, + const std::map& feed_shapes) { + ProgramDesc program; + GraphToProgram(graph, &program); + program.Proto()->SerializeToString(&graph_serialize_str_); + feed_shapes_ = feed_shapes; +} + +bool CinnCacheKey::operator!=(const CinnCacheKey& other) const { + return !this->operator==(other); +} + +bool CinnCacheKey::operator==(const CinnCacheKey& other) const { + return graph_serialize_str_ == other.graph_serialize_str_ && + feed_shapes_ == other.feed_shapes_; +} + +size_t CinnCacheKey::Hash::hash_combine(size_t seed, size_t value) { + return seed ^ (value + 0x9e3779b9 + (seed << 6) + (seed >> 2)); +} + +size_t CinnCacheKey::Hash::operator()(const CinnCacheKey& key) const { + std::size_t ret = 0; + + std::hash string_hasher; + for (const auto& name_shape : key.feed_shapes_) { + ret = hash_combine(ret, string_hasher(name_shape.first)); + ret = hash_combine(ret, string_hasher(name_shape.second.to_str())); + } + + ret = hash_combine(ret, string_hasher(key.graph_serialize_str_)); + return ret; +} + +} // namespace paddle2cinn +} // namespace framework +} // namespace paddle diff --git 
a/paddle/fluid/framework/paddle2cinn/cinn_cache_key.h b/paddle/fluid/framework/paddle2cinn/cinn_cache_key.h new file mode 100644 index 00000000000000..9627ae92aaba25 --- /dev/null +++ b/paddle/fluid/framework/paddle2cinn/cinn_cache_key.h @@ -0,0 +1,63 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/lod_tensor.h" + +namespace paddle { +namespace framework { +namespace paddle2cinn { + +// Class to store the keys for compiling CINN. +// +// CINN cannot handle changable shape now, so CinnRunner keeps a cache mapping +// from CinnCacheKey to CinnCompiledObject. +// +// The CinnCacheKey contains a graph serialized string and the feeded tensor +// shapes. +class CinnCacheKey { + public: + CinnCacheKey(const ir::Graph& graph, + const std::map& feed_tensors); + CinnCacheKey(const ir::Graph& graph, + const std::map& feed_shapes); + + ~CinnCacheKey() {} + + void SetKey(const ir::Graph& graph, + const std::map& feed_tensors); + void SetKey(const ir::Graph& graph, + const std::map& feed_shapes); + + bool operator==(const CinnCacheKey& other) const; + bool operator!=(const CinnCacheKey& other) const; + + struct Hash { + static size_t hash_combine(size_t seed, size_t value); + size_t operator()(const CinnCacheKey& key) const; + }; + + private: + std::string graph_serialize_str_; + std::map feed_shapes_; +}; + +} // namespace paddle2cinn +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/paddle2cinn/cinn_cache_key_test.cc b/paddle/fluid/framework/paddle2cinn/cinn_cache_key_test.cc new file mode 100644 index 00000000000000..a84ade26bfd124 --- /dev/null +++ b/paddle/fluid/framework/paddle2cinn/cinn_cache_key_test.cc @@ -0,0 +1,101 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
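+// The checks below rely on two properties of CinnCacheKey:
+//   1. operator== compares the serialized graph and the feed shapes, so the
+//      LoDTensor-based and DDim-based constructors produce equal keys for
+//      identical shapes.
+//   2. CinnCacheKey::Hash mixes the feed names, feed shapes and the graph
+//      string with a boost-style hash_combine, so equal keys hash the same
+//      and the six keys built here collapse to three entries in the set.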
+ +#include +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/paddle2cinn/cinn_cache_key.h" +#include "paddle/fluid/framework/program_desc.h" + +namespace paddle { +namespace framework { +namespace paddle2cinn { + +TEST(CinnCacheKeyTest, TestAsUnorderedKey) { + std::unordered_set test_set; + + ProgramDesc empty_program; + ir::Graph empty_graph(empty_program); + + ProgramDesc program; + auto *global_block = program.MutableBlock(0); + auto *x = global_block->Var("X"); + x->SetType(proto::VarType::LOD_TENSOR); + ir::Graph graph(program); + + LoDTensor tensor; + tensor.Resize({1, 2, 3}); + const LoDTensor *tensor_pointer = &tensor; + std::map feed_tensors = { + {"X", tensor_pointer}}; + + DDim ddim = paddle::framework::make_ddim({1, 2, 3}); + std::map feed_shapes = {{"X", ddim}}; + + CinnCacheKey cache_key1(empty_graph, feed_tensors); + CinnCacheKey cache_key2(empty_graph, feed_shapes); + EXPECT_EQ(cache_key1, cache_key2); + + CinnCacheKey cache_key3(graph, feed_shapes); + CinnCacheKey cache_key4(graph, feed_tensors); + EXPECT_EQ(cache_key3, cache_key4); + + CinnCacheKey cache_key5(empty_graph, + std::map()); + CinnCacheKey cache_key6(empty_graph, std::map()); + EXPECT_EQ(cache_key5, cache_key6); + + EXPECT_NE(cache_key1, cache_key3); + EXPECT_NE(cache_key4, cache_key2); + + EXPECT_NE(cache_key3, cache_key5); + EXPECT_NE(cache_key6, cache_key4); + + EXPECT_NE(cache_key5, cache_key1); + EXPECT_NE(cache_key2, cache_key6); + + test_set.insert(cache_key1); + test_set.insert(cache_key2); + test_set.insert(cache_key3); + test_set.insert(cache_key4); + test_set.insert(cache_key5); + test_set.insert(cache_key6); + EXPECT_EQ(test_set.size(), 3U); + + auto iter = test_set.find(cache_key1); + EXPECT_NE(iter, test_set.end()); + test_set.erase(iter); + EXPECT_EQ(test_set.size(), 2U); + EXPECT_EQ(test_set.find(cache_key2), test_set.end()); + + iter = test_set.find(cache_key3); + EXPECT_NE(iter, test_set.end()); + test_set.erase(iter); + EXPECT_EQ(test_set.size(), 1U); + EXPECT_EQ(test_set.find(cache_key4), test_set.end()); + + iter = test_set.find(cache_key5); + EXPECT_NE(iter, test_set.end()); + test_set.erase(iter); + EXPECT_EQ(test_set.size(), 0U); + EXPECT_EQ(test_set.find(cache_key6), test_set.end()); +} + +} // namespace paddle2cinn +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiled_object.cc b/paddle/fluid/framework/paddle2cinn/cinn_compiled_object.cc new file mode 100644 index 00000000000000..a90494bafe9bb6 --- /dev/null +++ b/paddle/fluid/framework/paddle2cinn/cinn_compiled_object.cc @@ -0,0 +1,50 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/framework/paddle2cinn/cinn_compiled_object.h" + +#include + +#include "paddle/fluid/framework/feed_fetch_type.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" + +namespace paddle { +namespace framework { +namespace paddle2cinn { + +CinnCompiledObject::CinnCompiledObject() { + // TODO(zhhsplendid): complete this function after CINN interface is ready +} +CinnCompiledObject::~CinnCompiledObject() { + // TODO(zhhsplendid): complete this function after CINN interface is ready +} + +void CinnCompiledObject::Compile( + const ir::Graph& graph, + std::map* feed_targets) { + // TODO(zhhsplendid): complete this function after CINN interface is ready +} + +std::map CinnCompiledObject::Run( + Scope* scope, std::map* feed_targets) { + // TODO(zhhsplendid): complete this function after CINN interface is ready + return std::map(); +} + +} // namespace paddle2cinn +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiled_object.h b/paddle/fluid/framework/paddle2cinn/cinn_compiled_object.h new file mode 100644 index 00000000000000..21191d44345877 --- /dev/null +++ b/paddle/fluid/framework/paddle2cinn/cinn_compiled_object.h @@ -0,0 +1,50 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include "paddle/fluid/framework/feed_fetch_type.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" + +namespace paddle { +namespace framework { +namespace paddle2cinn { + +// Class to store and call CINN complied object +class CinnCompiledObject { + public: + CinnCompiledObject(); + ~CinnCompiledObject(); + + // Compiles use CINN. CINN compilation needs model graph, input names, and + // input_shapes + void Compile(const ir::Graph& graph, + std::map* feed_targets); + + // Feed LoDTensors to tun CINN compiled object and return fetched result + std::map Run( + Scope* scope, std::map* feed_targets); + + // Converts compiled object to Paddle Graph + // To be discussed + // ir::Graph ToGraph(); +}; + +} // namespace paddle2cinn +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiled_object_test.cc b/paddle/fluid/framework/paddle2cinn/cinn_compiled_object_test.cc new file mode 100644 index 00000000000000..5a7861edf210c4 --- /dev/null +++ b/paddle/fluid/framework/paddle2cinn/cinn_compiled_object_test.cc @@ -0,0 +1,41 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "gtest/gtest.h" + +#include "paddle/fluid/framework/feed_fetch_type.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/paddle2cinn/cinn_compiled_object.h" +#include "paddle/fluid/framework/program_desc.h" + +namespace paddle { +namespace framework { +namespace paddle2cinn { + +TEST(CinnCompiledObjecctTest, TodoTest) { + ProgramDesc empty_program; + ir::Graph empty_graph(empty_program); + std::map empty_feed; + Scope empty_scope; + + CinnCompiledObject compiled_obj; + compiled_obj.Compile(empty_graph, &empty_feed); + auto fetch = compiled_obj.Run(&empty_scope, &empty_feed); +} + +} // namespace paddle2cinn +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/paddle2cinn/cinn_runner.cc b/paddle/fluid/framework/paddle2cinn/cinn_runner.cc new file mode 100644 index 00000000000000..de5af910c99add --- /dev/null +++ b/paddle/fluid/framework/paddle2cinn/cinn_runner.cc @@ -0,0 +1,46 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/paddle2cinn/cinn_runner.h" + +#include + +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/tensor.h" + +namespace paddle { +namespace framework { +namespace paddle2cinn { + +using ir::Graph; + +std::map CinnRunner::Run( + const Graph& graph, Scope* scope, + std::map* feed_targets) { + CinnCacheKey cur_key(graph, *feed_targets); + std::shared_ptr obj_to_run; + if (cache_.find(cur_key) != cache_.end()) { + obj_to_run = cache_[cur_key]; + } else { + obj_to_run = std::make_shared(); + obj_to_run->Compile(graph, feed_targets); + cache_[cur_key] = obj_to_run; + } + return obj_to_run->Run(scope, feed_targets); +} + +} // namespace paddle2cinn +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/paddle2cinn/cinn_runner.h b/paddle/fluid/framework/paddle2cinn/cinn_runner.h new file mode 100644 index 00000000000000..5f63d64545ff75 --- /dev/null +++ b/paddle/fluid/framework/paddle2cinn/cinn_runner.h @@ -0,0 +1,55 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/paddle2cinn/cinn_cache_key.h" +#include "paddle/fluid/framework/paddle2cinn/cinn_compiled_object.h" +#include "paddle/fluid/framework/scope.h" + +namespace paddle { +namespace framework { +namespace paddle2cinn { + +// Entrance to run CINN. +// +// CINN cannot handle changable shape now, so CinnRunner keeps a cache mapping +// from CinnCacheKey to CinnCompiledObject. If cache hits, we will re-use cache +// stored CinnCompiledObject, otherwise we will compile again and put into +// cache. +class CinnRunner { + public: + CinnRunner() {} + ~CinnRunner() {} + + // Feed LoDTensors to tun CINN compiled object and return fetched result + std::map Run( + const ir::Graph& graph, Scope* scope, + std::map* feed_targets); + + private: + std::unordered_map, + CinnCacheKey::Hash> + cache_; +}; + +} // namespace paddle2cinn +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/paddle2cinn/cinn_runner_test.cc b/paddle/fluid/framework/paddle2cinn/cinn_runner_test.cc new file mode 100644 index 00000000000000..88aca0bd66b375 --- /dev/null +++ b/paddle/fluid/framework/paddle2cinn/cinn_runner_test.cc @@ -0,0 +1,41 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
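+// Smoke test only: CinnCompiledObject::Compile/Run are still TODO stubs, so
+// this just checks that CinnRunner::Run builds a CinnCacheKey from the graph
+// and feed tensors, misses the empty cache, compiles and returns. A second
+// Run with the same graph and feed shapes would reuse the cached object.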
+ +#include "gtest/gtest.h" + +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/paddle2cinn/cinn_runner.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" + +namespace paddle { +namespace framework { +namespace paddle2cinn { + +using ir::Graph; + +TEST(CinnRunnerTest, TodoTest) { + ProgramDesc empty_program; + Graph empty_graph(empty_program); + Scope empty_scope; + std::map empty_feed; + + CinnRunner cinn_runner; + cinn_runner.Run(empty_graph, &empty_scope, &empty_feed); +} + +} // namespace paddle2cinn +} // namespace framework +} // namespace paddle From 4cbed9e5422df6b3aacb170fa99a5915885d15b2 Mon Sep 17 00:00:00 2001 From: Yanxing Shi <48111042+Yanxing-Shi@users.noreply.github.com> Date: Tue, 28 Sep 2021 10:09:33 +0800 Subject: [PATCH 023/298] Add paddle.device.cuda.get_device_properties (#35661) * Initial Commit * add unittest and add error information * modify doc * fix some error * fix some word * fix bug cudaDeviceProp* and modify error explanation * fix cudaDeviceProp* error and unnitest samples * fix hip error and PADDLE_WITH_HIP * update style * fix error is_compiled_with_cuda * fix paddle.device.cuda.get_device_properties * fix error for multi thread safe * update style * merge conflict * modify after mentor review * update style * delete word * fix unittest error for windows * support string input and modify some code * modify doc to support string input * fix error for express information * fix error for express information * fix unnitest for windows * fix device.startswith('gpu:') * format error and doc * fix after review * format code * fix error for doc compile * fix error for doc compile * fix error for doc compile * fix error for doc compile * fix error for doc compile * fix py2 error * fix wrong words and doc * fix _gpuDeviceProperties --- paddle/fluid/platform/gpu_info.cc | 44 ++++++++++++ paddle/fluid/platform/gpu_info.h | 3 + paddle/fluid/platform/type_defs.h | 2 + paddle/fluid/pybind/pybind.cc | 25 +++++++ python/paddle/device/cuda/__init__.py | 67 ++++++++++++++++++ .../unittests/test_get_device_properties.py | 70 +++++++++++++++++++ 6 files changed, 211 insertions(+) create mode 100644 python/paddle/fluid/tests/unittests/test_get_device_properties.py diff --git a/paddle/fluid/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc index 76edb3910ccced..c4ac5aa3046a9c 100644 --- a/paddle/fluid/platform/gpu_info.cc +++ b/paddle/fluid/platform/gpu_info.cc @@ -14,6 +14,8 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/gpu_info.h" #include +#include +#include #include "gflags/gflags.h" #include "paddle/fluid/platform/cuda_device_guard.h" @@ -39,6 +41,10 @@ DECLARE_uint64(gpu_memory_limit_mb); constexpr static float fraction_reserve_gpu_memory = 0.05f; +static std::once_flag g_device_props_size_init_flag; +static std::vector> g_device_props_init_flags; +static std::vector g_device_props; + USE_GPU_MEM_STAT; namespace paddle { namespace platform { @@ -297,6 +303,44 @@ std::vector GetSelectedDevices() { return devices; } +const gpuDeviceProp &GetDeviceProperties(int id) { + std::call_once(g_device_props_size_init_flag, [&] { + int gpu_num = 0; + gpu_num = platform::GetCUDADeviceCount(); + g_device_props_init_flags.resize(gpu_num); + g_device_props.resize(gpu_num); + for (int i = 0; i < gpu_num; ++i) { + g_device_props_init_flags[i] = std::make_unique(); + } + }); + + if (id == -1) { + id = platform::GetCurrentDeviceId(); + } + + if (id < 0 || id >= static_cast(g_device_props.size())) { + PADDLE_THROW(platform::errors::OutOfRange( + "The device id %d is out of range [0, %d), where %d is the number of " + "devices on this machine. Because the device id should be greater than " + "or equal to zero and smaller than the number of gpus. Please input " + "appropriate device again!", + id, static_cast(g_device_props.size()), + static_cast(g_device_props.size()))); + } + + std::call_once(*(g_device_props_init_flags[id]), [&] { +#ifdef PADDLE_WITH_CUDA + PADDLE_ENFORCE_CUDA_SUCCESS( + cudaGetDeviceProperties(&g_device_props[id], id)); +#else + PADDLE_ENFORCE_CUDA_SUCCESS( + hipGetDeviceProperties(&g_device_props[id], id)); +#endif + }); + + return g_device_props[id]; +} + void SetDeviceId(int id) { // TODO(qijun): find a better way to cache the cuda device count PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), diff --git a/paddle/fluid/platform/gpu_info.h b/paddle/fluid/platform/gpu_info.h index ef7f93a61dbfb3..401873dcd77da2 100644 --- a/paddle/fluid/platform/gpu_info.h +++ b/paddle/fluid/platform/gpu_info.h @@ -67,6 +67,9 @@ dim3 GetGpuMaxGridDimSize(int); //! Get a list of device ids from environment variable or use all. std::vector GetSelectedDevices(); +//! Get the properties of the ith GPU device. +const gpuDeviceProp &GetDeviceProperties(int id); + //! Set the GPU device id for next execution. void SetDeviceId(int device_id); diff --git a/paddle/fluid/platform/type_defs.h b/paddle/fluid/platform/type_defs.h index 31784a04265803..f46bd1a0bdfa4a 100644 --- a/paddle/fluid/platform/type_defs.h +++ b/paddle/fluid/platform/type_defs.h @@ -27,11 +27,13 @@ namespace paddle { using gpuStream_t = hipStream_t; using gpuError_t = hipError_t; using gpuEvent_t = hipEvent_t; +using gpuDeviceProp = hipDeviceProp_t; #else #define gpuSuccess cudaSuccess using gpuStream_t = cudaStream_t; using gpuError_t = cudaError_t; using gpuEvent_t = cudaEvent_t; +using gpuDeviceProp = cudaDeviceProp; #endif } // namespace paddle diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 16e42885c52fb7..a16916ab33f831 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -2285,6 +2285,31 @@ All parameter, weight, gradient are variables in Paddle. 
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) m.def("get_cuda_device_count", platform::GetCUDADeviceCount); m.def("cuda_empty_cache", platform::EmptyCache); + m.def("get_device_properties", + [](int id) -> const gpuDeviceProp & { + return platform::GetDeviceProperties(id); + }, + py::return_value_policy::copy); + + py::class_(m, "_gpuDeviceProperties") + .def_readonly("name", &gpuDeviceProp::name) + .def_readonly("major", &gpuDeviceProp::major) + .def_readonly("minor", &gpuDeviceProp::minor) + .def_readonly("is_multi_gpu_board", &gpuDeviceProp::isMultiGpuBoard) + .def_readonly("is_integrated", &gpuDeviceProp::integrated) + .def_readonly("multi_processor_count", + &gpuDeviceProp::multiProcessorCount) + .def_readonly("total_memory", &gpuDeviceProp::totalGlobalMem) + .def("__repr__", [](const gpuDeviceProp &gpu_device_prop) { + std::ostringstream stream; + stream << "_gpuDeviceProperties(name='" << gpu_device_prop.name + << "', major=" << gpu_device_prop.major + << ", minor=" << gpu_device_prop.minor << ", total_memory=" + << gpu_device_prop.totalGlobalMem / (1024 * 1024) + << "MB, multi_processor_count=" + << gpu_device_prop.multiProcessorCount << ")"; + return stream.str(); + }); #if !defined(PADDLE_WITH_HIP) && !defined(_WIN32) m.def("nvprof_init", platform::CudaProfilerInit); diff --git a/python/paddle/device/cuda/__init__.py b/python/paddle/device/cuda/__init__.py index 4d1934aeed9fb5..a559df21ad2413 100644 --- a/python/paddle/device/cuda/__init__.py +++ b/python/paddle/device/cuda/__init__.py @@ -27,6 +27,7 @@ 'device_count', 'empty_cache', 'stream_guard', + 'get_device_properties', ] @@ -204,3 +205,69 @@ def stream_guard(stream): yield finally: stream = _set_current_stream(pre_stream) + + +def get_device_properties(device=None): + ''' + Return the properties of given device. + + Args: + device(paddle.CUDAPlace or int or str): The device, the id of the device + or the string name of device like 'gpu:x' which to get the properties of + the device from. If device is None, the device is the current device. + Default: None. + + Returns: + _gpuDeviceProperties: the properties of the device which include ASCII string + identifying device, major compute capability, minor compute capability, global + memory available on device and the number of multiprocessors on the device. + + Examples: + + .. code-block:: python + + # required: gpu + + import paddle + paddle.device.cuda.get_device_properties() + # _gpuDeviceProperties(name='A100-SXM4-40GB', major=8, minor=0, total_memory=40536MB, multi_processor_count=108) + + paddle.device.cuda.get_device_properties(0) + # _gpuDeviceProperties(name='A100-SXM4-40GB', major=8, minor=0, total_memory=40536MB, multi_processor_count=108) + + paddle.device.cuda.get_device_properties('gpu:0') + # _gpuDeviceProperties(name='A100-SXM4-40GB', major=8, minor=0, total_memory=40536MB, multi_processor_count=108) + + paddle.device.cuda.get_device_properties(paddle.CUDAPlace(0)) + # _gpuDeviceProperties(name='A100-SXM4-40GB', major=8, minor=0, total_memory=40536MB, multi_processor_count=108) + + ''' + + if not core.is_compiled_with_cuda(): + raise ValueError( + "The API paddle.device.cuda.get_device_properties is not supported in " + "CPU-only PaddlePaddle. 
Please reinstall PaddlePaddle with GPU support " + "to call this API.") + + if device is not None: + if isinstance(device, int): + device_id = device + elif isinstance(device, core.CUDAPlace): + device_id = device.get_device_id() + elif isinstance(device, str): + if device.startswith('gpu:'): + device_id = int(device[4:]) + else: + raise ValueError( + "The current string {} is not expected. Because paddle.device." + "cuda.get_device_properties only support string which is like 'gpu:x'. " + "Please input appropriate string again!".format(device)) + else: + raise ValueError( + "The device type {} is not expected. Because paddle.device.cuda." + "get_device_properties only support int, str or paddle.CUDAPlace. " + "Please input appropriate device again!".format(device)) + else: + device_id = -1 + + return core.get_device_properties(device_id) diff --git a/python/paddle/fluid/tests/unittests/test_get_device_properties.py b/python/paddle/fluid/tests/unittests/test_get_device_properties.py new file mode 100644 index 00000000000000..4cfb91bfae93e7 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_get_device_properties.py @@ -0,0 +1,70 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import unittest +from paddle.fluid import core +from paddle.device.cuda import device_count, get_device_properties + + +class TestGetDeviceProperties(unittest.TestCase): + def test_get_device_properties_default(self): + if core.is_compiled_with_cuda(): + props = get_device_properties() + self.assertIsNotNone(props) + + def test_get_device_properties_str(self): + if core.is_compiled_with_cuda(): + props = get_device_properties('gpu:0') + self.assertIsNotNone(props) + + def test_get_device_properties_int(self): + if core.is_compiled_with_cuda(): + gpu_num = device_count() + for i in range(gpu_num): + props = get_device_properties(i) + self.assertIsNotNone(props) + + def test_get_device_properties_CUDAPlace(self): + if core.is_compiled_with_cuda(): + device = core.CUDAPlace(0) + props = get_device_properties(device) + self.assertIsNotNone(props) + + +class TestGetDevicePropertiesError(unittest.TestCase): + def test_error_api(self): + if core.is_compiled_with_cuda(): + + def test_device_indexError_error(): + device_error = device_count() + 1 + props = get_device_properties(device_error) + + self.assertRaises(IndexError, test_device_indexError_error) + + def test_device_value_error1(): + device_error = 'gpu1' + props = get_device_properties(device_error) + + self.assertRaises(ValueError, test_device_value_error1) + + def test_device_value_error2(): + device_error = float(device_count()) + props = get_device_properties(device_error) + + self.assertRaises(ValueError, test_device_value_error2) + + +if __name__ == "__main__": + unittest.main() From ad128144d9aa5667c7c5fa3328a00bd2a7606b00 Mon Sep 17 00:00:00 2001 From: Zeng Jinle <32832641+sneaxiy@users.noreply.github.com> Date: Tue, 28 Sep 2021 10:15:23 +0800 Subject: [PATCH 024/298] 
rename scale loss grad (#36162) --- paddle/fluid/framework/details/scale_loss_grad_op_handle.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc index fcfbfd0557e256..c0c3e14c8bf231 100644 --- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc +++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc @@ -105,7 +105,7 @@ void ScaleLossGradOpHandle::RunImpl() { #endif } -std::string ScaleLossGradOpHandle::Name() const { return "Scale LossGrad"; } +std::string ScaleLossGradOpHandle::Name() const { return "ScaleLossGrad"; } } // namespace details } // namespace framework } // namespace paddle From d5268a6e0ebe77d25af677df9274031f21a08237 Mon Sep 17 00:00:00 2001 From: Guoxia Wang Date: Tue, 28 Sep 2021 10:42:29 +0800 Subject: [PATCH 025/298] fix bug of reduce_sum when src_dtype != dst_dtype and reduce_num == 1 (#36123) --- paddle/fluid/operators/reduce_ops/reduce_op.cu.h | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h index 4760270caa3c6d..28b6ebc2433224 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h @@ -34,6 +34,7 @@ namespace cub = hipcub; #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/amp/fp16_type_traits.h" +#include "paddle/fluid/operators/cast_op.h" #include "paddle/fluid/operators/kernel_primitives/kernel_primitives.h" #include "paddle/fluid/platform/cuda_device_function.h" #include "paddle/fluid/platform/fast_divmod.h" @@ -705,8 +706,16 @@ void TensorReduceFunctorImpl(const framework::Tensor& x, framework::Tensor* y, if (config.reduce_num == 1) { auto out_dims = y->dims(); - framework::TensorCopy(x, y->place(), y); - y->Resize(out_dims); + if (x.type() == y->type()) { + framework::TensorCopy(x, y->place(), y); + y->Resize(out_dims); + } else { + auto* dev_ctx = static_cast( + paddle::platform::DeviceContextPool::Instance().Get(x.place())); + framework::VisitDataType( + static_cast(y->type()), + CastOpFunctor(&x, y, *dev_ctx)); + } return; } From eef0a943084c02cd0469f89726118eca81101ba4 Mon Sep 17 00:00:00 2001 From: WangXi Date: Tue, 28 Sep 2021 10:45:44 +0800 Subject: [PATCH 026/298] [hybrid] optimizer sharding support optimize cast (#35878) --- .../sharding/offload_helper.py | 213 +++++++++++++++++- .../fleet/meta_optimizers/sharding/utils.py | 68 +++++- .../meta_optimizers/sharding_optimizer.py | 87 +++++-- .../test_fleet_hybrid_meta_optimizer.py | 76 +++++++ .../test_fleet_sharding_meta_optimizer.py | 50 ++-- 5 files changed, 440 insertions(+), 54 deletions(-) diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding/offload_helper.py b/python/paddle/distributed/fleet/meta_optimizers/sharding/offload_helper.py index 3816e9b3051abf..3ad6e320316c61 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/sharding/offload_helper.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding/offload_helper.py @@ -12,8 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
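+# NOTE: this patch threads the data-parallel ring_id into OffloadHelper so the
+# startup program broadcasts each fp32 param from rank 0 (c_broadcast, root 0)
+# before it is cast to fp16 and, optionally, offloaded; when ring_id is None
+# the broadcast is skipped.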
+import copy from ..common import is_optimizer_op, OP_ROLE_KEY, OpRole, is_update_op from paddle.fluid import core, unique_name +from .shard import Shard __all__ = [] @@ -23,11 +25,8 @@ class OffloadHelper(object): cuda_place_type = 1 cuda_pinned_place_type = 2 - def __init__(self): - pass - "0: dst is on CPUPlace. " - "1: dst is on CUDAPlace. " - "2: dst is on CUDAPinnedPlace. " + def __init__(self, ring_id=None): + self.ring_id = ring_id def _insert_cast_op(self, block, idx, src_name, dst_name): src_var = block.var(src_name) @@ -50,6 +49,21 @@ def _insert_cast_op(self, block, idx, src_name, dst_name): OP_ROLE_KEY: OpRole.Optimize }) + def _insert_broadcast_op(self, block, idx, param): + if self.ring_id is None: + return + block._insert_op_without_sync( + idx, + type="c_broadcast", + inputs={'X': param}, + outputs={'Out': param}, + attrs={ + 'ring_id': self.ring_id, + 'root': 0, + 'use_calc_stream': True, + OP_ROLE_KEY: OpRole.Forward, + }) + def _insert_memcpy_op(self, block, idx, src_name, dst_name, dst_place_type): src_var = block.var(src_name) dst_var = block.var(dst_name) @@ -206,6 +220,8 @@ def remove_param(input_name): # step5: startup_block add offload visited_vars = set() + # FIXME(wangxi): should insert in idx, need move comm init to the head. + insert_idx = len(startup_block.ops) for idx, op in reversed(list(enumerate(startup_block.ops))): for out_name in op.output_arg_names: if out_name in visited_vars: @@ -213,13 +229,16 @@ def remove_param(input_name): if out_name in param_name_to_offload_name: var_name = out_name - # FIXME(wangxi): offload should insert after broadcast param if offload: offload_var_name = param_name_to_offload_name[var_name] - self._insert_offload_op(startup_block, idx + 1, + self._insert_offload_op(startup_block, insert_idx, var_name, offload_var_name) - self._insert_cast_op(startup_block, idx + 1, var_name, + self._insert_cast_op(startup_block, insert_idx, var_name, param_to_fp16[var_name]) + # NOTE(wangxi): cast and offload should insert after broadcast param. 
+ # the insert op order is: broadcast, cast, offload + self._insert_broadcast_op(startup_block, insert_idx, + var_name) visited_vars.add(out_name) @@ -303,3 +322,181 @@ def offload(self, block, startup_block): block._sync_with_cpp() startup_block._sync_with_cpp() + + def opt_sharding_cast_fp32param(self, + block, + startup_block, + params, + offload=False): + """ + (p_fp16) = cast(p) + (p_fp16_recompute) = cast(p) + (pout,) = adam(p) + ===========================> + rename(p_fp16_recompute, p_fp16) + + (pout,) = adam(p) + (p_fp16) = cast(p) + broadcast(p_fp16) + """ + global_params = set() + local_params = set() + param_to_fp16 = dict() + # recompute_var which need rename to fp16_param + fp16_param_to_recompute = dict() + recompute_to_fp16 = dict() + + def remove_param(input_name): + global_params.remove(input_name) + if input_name in local_params: + local_params.remove(input_name) + if input_name in param_to_fp16: + fp16_param = param_to_fp16.pop(input_name) + if fp16_param in fp16_param_to_recompute: + recompute = fp16_param_to_recompute.pop(fp16_param) + recompute_to_fp16.pop(recompute) + + # step1: record param + global_params = set(params) + for idx, op in reversed(list(enumerate(block.ops))): + if is_update_op(op): + param = op.desc.input("Param")[0] + local_params.add(param) + + # step2: remove param which can't offload and + # record param->fp16param, fp16param->recompute_var + for idx, op in enumerate(block.ops): + if is_optimizer_op(op): + break + # TODO (Yuang Liu): tmp solution for fuse_grad_merge + optimize_cast + if op.type == 'coalesce_tensor': + continue + for input_name in op.desc.input_arg_names(): + if input_name not in global_params: + continue + + # param which will be used by fp32 op + if op.type != 'cast': + remove_param(input_name) + continue + + # param is only used by cast op, + # which to cast fp32_param to fp16_param + output_name = op.output_arg_names[0] + if 'cast_fp16' not in output_name: + remove_param(input_name) + continue + + if 'subprog' not in output_name: + assert output_name == input_name + '.cast_fp16' + assert input_name not in param_to_fp16, \ + "There must be only one cast op from fp32 param to fp16 param." 
+ param_to_fp16[input_name] = output_name + else: + # fp16-->recompute_var + assert input_name in param_to_fp16, \ + "param must first be cast to fp16" + fp16_param = param_to_fp16[input_name] + fp16_param_to_recompute[fp16_param] = output_name + recompute_to_fp16[output_name] = fp16_param + + param_name_to_offload_name = dict() + # step3: main_block add offload, cast op + # change recompute to fp16, remove cast(param) to fp16 + for idx, op in reversed(list(enumerate(block.ops))): + if is_update_op(op): + param = op.desc.input("Param")[0] + if param not in global_params: + continue + # step3.1: create offload_var + offload_var_name = self._get_offload_var_name(param) + param_name_to_offload_name[param] = offload_var_name + if offload: + self._create_offload_var(param, offload_var_name, + [block, startup_block]) + + # step3.2: insert cast op and offload op + self._insert_offload_op(block, idx + 1, param, + offload_var_name) + + assert param in param_to_fp16 + fp16_param_name = param_to_fp16[param] + fp16_param_var = block.var(fp16_param_name) + fp16_param_var.persistable = True + self._insert_cast_op(block, idx + 1, param, + param_to_fp16[param]) + + if offload: + # step3.3: insert fetch op + self._insert_fetch_op(block, idx, offload_var_name, param) + + continue + + # step3.4: remove cast op + if op.type == 'cast': + input_name = op.desc.input_arg_names()[0] + if input_name in global_params: + block._remove_op(idx, sync=False) + continue + + # step3.5: change recompute_param to fp16_param + for input_name in op.desc.input_arg_names(): + if input_name in recompute_to_fp16: + op._rename_input(input_name, recompute_to_fp16[input_name]) + for output_name in op.desc.output_arg_names(): + if output_name in recompute_to_fp16: + op._rename_output(output_name, + recompute_to_fp16[output_name]) + + # step4: remove recompute_param + for name in recompute_to_fp16.keys(): + block._remove_var(name, sync=False) + + # step5: remove fp32 param which not need + for idx, op in enumerate(block.ops): + if op.type not in ['coalesce_tensor', 'c_broadcast']: + continue + for input_name in op.desc.input_arg_names(): + if input_name in param_to_fp16: + op._rename_input(input_name, param_to_fp16[input_name]) + for output_name in op.desc.output_arg_names(): + if output_name in param_to_fp16: + op._rename_output(output_name, param_to_fp16[output_name]) + + for param in global_params: + assert param in param_to_fp16 + fp16_param_name = param_to_fp16[param] + fp16_param_var = block.var(fp16_param_name) + fp16_param_var.persistable = True + + if param not in local_params: + block._remove_var(param, sync=False) + + # step6: startup_block add offload + visited_vars = set() + insert_idx = len(startup_block.ops) + for idx, op in reversed(list(enumerate(startup_block.ops))): + for out_name in op.output_arg_names: + if out_name in visited_vars: continue + + if out_name in param_to_fp16: + var_name = out_name + if offload: + self._insert_offload_op( + startup_block, idx + 1, var_name, + param_name_to_offload_name[var_name]) + + self._insert_cast_op(startup_block, insert_idx, var_name, + param_to_fp16[var_name]) + + self._insert_broadcast_op(startup_block, insert_idx, + var_name) + + if var_name not in local_params: + param = startup_block.var(out_name) + param.persistable = False + + visited_vars.add(out_name) + + block._sync_with_cpp() + startup_block._sync_with_cpp() diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py b/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py index 
0b8f67a0a7cd9f..447b52ace69787 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py @@ -14,7 +14,7 @@ import paddle from paddle.fluid import core, unique_name from functools import reduce -from paddle.distributed.fleet.meta_optimizers.common import is_loss_grad_op, is_backward_op +from paddle.distributed.fleet.meta_optimizers.common import is_loss_grad_op, is_backward_op, is_optimizer_op from paddle.distributed.fleet.meta_optimizers.common import OpRole, OP_ROLE_KEY, OP_ROLE_VAR_KEY import re @@ -366,6 +366,24 @@ def insert_allreduce_ops(block, class FuseHelper(object): + @staticmethod + def sort_vars_by_dtype(block, vars_name): + fp32_vars = [] + fp16_vars = [] + other_vars = [] + for var in vars_name: + dtype = block.var(var).dtype + if dtype == paddle.float32: + fp32_vars.append(var) + elif dtype == paddle.float16: + fp16_vars.append(var) + else: + other_vars.append(var) + assert len(other_vars) == 0, "only support fp32/fp16 vars for fuse" + + fp32_vars.extend(fp16_vars) + return fp32_vars + @staticmethod def get_fused_groups(block, vars_name, fuse_size=32.): """ coalesce tensor, get fused group """ @@ -639,6 +657,54 @@ def insert_broadcast_param_ops(block, return param_in_this_device +def fuse_opt_broadcast_param_ops(block, + ring_id, + shard, + op_role=OpRole.Optimize, + strategy=None): + """ + fuse optimizer sharding broadcast param ops + """ + if strategy is None or not strategy.fuse_all_reduce_ops: + return + + fuse_size = strategy.fuse_grad_size_in_MB + + nranks = shard.worker_num + device_to_vars = [[] for _ in range(nranks)] + + for idx, op in reversed(list(enumerate(block.ops))): + if not is_optimizer_op(op) or op.type != 'c_broadcast': + break + var = op.input_arg_names[0] + root_id = op.attr('root') + device_to_vars[root_id].insert(0, var) + block._remove_op(idx, sync=False) + + insert_idx = idx + 1 + for root_id, vars_name in enumerate(device_to_vars): + vars_name = FuseHelper.sort_vars_by_dtype(block, vars_name) + groups = FuseHelper.get_fused_groups(block, vars_name, fuse_size) + + fused_vars, insert_num = FuseHelper.insert_coalesce_tensor( + block, insert_idx, groups, op_role, prefix="Param") + + for fused_var in fused_vars: + block._insert_op_without_sync( + insert_idx + insert_num, + type='c_broadcast', + inputs={'X': fused_var}, + outputs={'Out': fused_var}, + attrs={ + 'ring_id': ring_id, + 'root': root_id, + 'use_calc_stream': True, + OP_ROLE_KEY: op_role + }) + + block._sync_with_cpp() + + def get_grad_device(grad_name, shard): assert "@GRAD" in grad_name, "[{}] should be a grad variable.".format( grad_name) diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py index 1af646b3959e01..75a69e5527bc18 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py @@ -329,6 +329,7 @@ def _insert_allreduce_for_pp(self, params_grads): if self.pp_degree == 1: return strategy = self.user_defined_strategy + sharding_configs = strategy.sharding_configs main_block = self._main_program.global_block() startup_block = self._startup_program.global_block() @@ -399,6 +400,8 @@ def _insert_allreduce_for_pp(self, params_grads): first_optimize_op_index += (len(main_block.ops) - len_of_ops) len_of_ops = len(main_block.ops) + # NOTE(wangxi): we fused after optimize_cast + optimize_cast = 
sharding_configs['optimize_cast'] optimizer_param = utils.insert_broadcast_param_ops( main_block, len_of_ops, @@ -407,10 +410,10 @@ def _insert_allreduce_for_pp(self, params_grads): OpRole.Optimize, use_calc_stream=True, rank=self.dp_rank, - strategy=strategy) + strategy=None if optimize_cast else strategy) logger.info("Optimizer param in this rank {}".format( optimizer_param)) - if not strategy.fuse_grad_merge: + if not strategy.fuse_grad_merge and not optimize_cast: assert len(accumulated_grad_names) == len(optimizer_param) elif self.hybrid_dp and self.hybrid_dp_mode == "pp_hybrid_dp": insert_allreduce_ops( @@ -458,18 +461,20 @@ def _insert_loss_grad_scale_op(self): main_block._sync_with_cpp() - def _apply_optimize_offload_pass(self): + def _apply_optimize_offload_pass(self, params_grads): strategy = self.user_defined_strategy sharding_configs = strategy.sharding_configs main_block = self._main_program.global_block() startup_block = self._startup_program.global_block() + dp_ring_id = self.dp_ring_id if self.dp_degree > 1 else None + # optimize offload should be enable while gradient merge is enable and # acc_step is quite large (e.g. >> 100). Since its memcpy could not be # overlap with calc, otherwise it will slower down training severely. if sharding_configs["optimize_offload"]: logger.info("Sharding with optimize offload !") - offload_helper = OffloadHelper() + offload_helper = OffloadHelper(ring_id=dp_ring_id) offload_helper.offload(main_block, startup_block) # The optimize_cast is already included in offload_fp32param offload_helper.offload_fp32param(main_block, startup_block) @@ -477,8 +482,17 @@ def _apply_optimize_offload_pass(self): logger.info("Sharding with optimize cast !") # NOTE(wangxi): optimize_cast will persist fp16 param, it # will take more memory, but will be faster. Trade space for time. 
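            # NOTE: summary of the branch below (descriptive comment) -- when
            # _dp_as_optimizer_sharding is set, fp32 params are cast per rank
            # via opt_sharding_cast_fp32param and the resulting c_broadcast
            # ops are fused afterwards; otherwise the original per-parameter
            # cast_fp32param_in_optimize path is kept.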
- offload_helper = OffloadHelper() - offload_helper.cast_fp32param_in_optimize(main_block, startup_block) + offload_helper = OffloadHelper(ring_id=dp_ring_id) + if self._optimizer_sharding: + offload_helper.opt_sharding_cast_fp32param( + main_block, startup_block, + [x[0].name for x in params_grads]) + # NOTE(wangxi): fused after optimize_cast + utils.fuse_opt_broadcast_param_ops( + main_block, dp_ring_id, self._shard, strategy=strategy) + else: + offload_helper.cast_fp32param_in_optimize(main_block, + startup_block) def _dump_program_for_debug(self): main_block = self._main_program.global_block() @@ -525,7 +539,7 @@ def minimize_impl(self, self._insert_loss_grad_scale_op() # apply optimize offload or optimize cast - self._apply_optimize_offload_pass() + self._apply_optimize_offload_pass(params_grads) # step6: (optional) sharding gradient merge self._sharding_gradient_merge() @@ -1381,17 +1395,50 @@ def _initialization_broadcast(self): startup_block = self._startup_program.global_block() params = startup_block.all_parameters() + params_name = [] - broadcast_params = [] + # NOTE(wangxi): if param is not persistable, program.clone will + # failed, so we remove no persistable param, re add param as a var for param in params: - broadcast_params.append(param) - # optimize_cast need broadcast fp16 param - fp16_param_name = param.name + '.cast_fp16' - if startup_block.has_var(fp16_param_name): - fp16_param = startup_block.var(fp16_param_name) - broadcast_params.append(fp16_param) - - for param in broadcast_params: + params_name.append(param.name) + if not param.persistable: + name = param.name + shape = param.shape + dtype = param.dtype + type = param.type + lod_level = param.lod_level + stop_gradient = param.stop_gradient + trainable = param.trainable + optimize_attr = param.optimize_attr + regularizer = param.regularizer + + have_dist_attr = False + is_distributed = False + if hasattr(param, 'is_distributed'): + have_dist_attr = True + is_distributed = param.is_distributed + + startup_block._remove_var(name, sync=False) + var = startup_block.create_var( + name=name, + shape=shape, + dtype=dtype, + type=type, + lod_level=lod_level, + stop_gradient=stop_gradient, + trainable=trainable, + persistable=False) + if have_dist_attr: + var.is_distributed = is_distributed + + # offload and optimize_cast will insert broadcast op + broadcast_params = set() + for op in startup_block.ops: + if op.type == 'c_broadcast': + broadcast_params.add(op.desc.output_arg_names()[0]) + + for param in params_name: + if param in broadcast_params: continue startup_block.append_op( type='c_broadcast', inputs={'X': param}, @@ -1399,15 +1446,19 @@ def _initialization_broadcast(self): attrs={ 'ring_id': self.dp_ring_id, 'root': 0, + 'use_calc_stream': True, OP_ROLE_KEY: OpRole.Forward }) + startup_block.append_op( type='c_sync_comm_stream', - inputs={'X': broadcast_params}, - outputs={'Out': broadcast_params}, + inputs={'X': params_name}, + outputs={'Out': params_name}, attrs={'ring_id': self.dp_ring_id, OP_ROLE_KEY: OpRole.Forward}) + startup_block._sync_with_cpp() + # sharding gradient merge def create_persistable_gradients_and_insert_merge_ops( self, main_block, startup_block, insert_idx, grad_names, shard): diff --git a/python/paddle/fluid/tests/unittests/test_fleet_hybrid_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_hybrid_meta_optimizer.py index db8689c14c30f3..6eb566935d9d52 100755 --- a/python/paddle/fluid/tests/unittests/test_fleet_hybrid_meta_optimizer.py +++ 
b/python/paddle/fluid/tests/unittests/test_fleet_hybrid_meta_optimizer.py @@ -321,6 +321,82 @@ def test_opt_sharding_with_pp_amp_gclip_fuse_gm(self): 'c_broadcast' ]) + def test_opt_sharding_with_pp_amp_ckp_fuse_gm_optcast(self): + train_prog, startup_prog = static.Program(), static.Program() + avg_cost, strategy = self.pp_net(train_prog, startup_prog) + + self.set_strategy(strategy, 'pipeline') + self.set_strategy(strategy, 'amp') + strategy.amp_configs = {'custom_black_varnames': ['fc_6.b_0'], } + strategy.recompute = True + strategy.recompute_configs = { + "checkpoints": + ["fc_0.tmp_2", "fc_1.tmp_2", "fc_2.tmp_2", "fc_3.tmp_2"] + } + + strategy.sharding = True + strategy.sharding_configs = { + "sharding_degree": 1, + "pp_degree": 2, + "dp_degree": 2, + "_dp_as_optimizer_sharding": True, + 'optimize_cast': True, + } + strategy.fuse_all_reduce_ops = True + strategy.fuse_grad_size_in_MB = 32 + strategy.fuse_grad_merge = True + + self.optimizer(avg_cost, strategy, train_prog, startup_prog) + train_prog = train_prog._pipeline_opt['section_program'] + startup_prog = startup_prog._pipeline_opt['startup_program'] + + # self._debug = True + self.debug_program(train_prog, startup_prog) + + startup_prog_ops = startup_prog.global_block().ops + main_prog_ops = train_prog.global_block().ops + + # check program + startup_prog_op_types = [op.type for op in startup_prog_ops] + main_prog_op_types = [op.type for op in main_prog_ops] + + # global, sharding, pp_send, pp_recv + self.assertEqual(startup_prog_op_types, [ + 'uniform_random', 'fill_constant', 'uniform_random', + 'fill_constant', 'uniform_random', 'fill_constant', + 'uniform_random', 'fill_constant', 'fill_constant', 'fill_constant', + 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', + 'fill_constant', 'fill_constant', 'fill_constant', 'c_gen_nccl_id', + 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', + 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_broadcast', + 'cast', 'c_broadcast', 'cast', 'c_broadcast', 'cast', 'c_broadcast', + 'cast', 'c_broadcast', 'cast', 'c_broadcast', 'cast', 'c_broadcast', + 'cast', 'c_broadcast', 'c_sync_comm_stream' + ]) + + self.assertEqual(main_prog_op_types, [ + 'recv_v2', 'cast', 'mul', 'elementwise_add', 'cast', 'tanh', 'cast', + 'mul', 'elementwise_add', 'cast', 'tanh', 'cast', 'mul', + 'elementwise_add', 'cast', 'tanh', 'cast', 'mul', 'cast', + 'elementwise_add', 'cast', 'softmax', 'cast', 'cross_entropy2', + 'mean', 'elementwise_mul', 'coalesce_tensor', 'coalesce_tensor', + 'coalesce_tensor', 'coalesce_tensor', 'coalesce_tensor', + 'coalesce_tensor', 'fill_constant', 'elementwise_mul_grad', + 'mean_grad', 'cross_entropy_grad2', 'cast', 'softmax_grad', 'cast', + 'elementwise_add_grad', 'cast', 'mul_grad', 'cast', 'tanh_grad', + 'cast', 'elementwise_add_grad', 'mul_grad', 'cast', 'tanh_grad', + 'cast', 'elementwise_add_grad', 'mul_grad', 'cast', 'cast', 'mul', + 'elementwise_add', 'cast', 'tanh_grad', 'cast', + 'elementwise_add_grad', 'mul_grad', 'cast', 'c_sync_calc_stream', + 'send_v2', 'cast', 'sum', 'sum', 'cast', 'sum', 'c_reduce_sum', + 'c_reduce_sum', 'c_reduce_sum', 'c_sync_comm_stream', + 'check_finite_and_unscale', 'cast', 'c_allreduce_max', + 'c_allreduce_max', 'cast', 'update_loss_scaling', 'momentum', + 'cast', 'momentum', 'cast', 'momentum', 'cast', 'momentum', + 'momentum', 'cast', 'coalesce_tensor', 'c_broadcast', 'c_broadcast', + 'coalesce_tensor', 'c_broadcast' + ]) + class TestFleetHybridOptimizerBoundary(TestFleetMetaOptimizer): def setUp(self): diff 
--git a/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py index 61d98d32ec5fd7..73eacd118ecad5 100755 --- a/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py @@ -922,18 +922,17 @@ def test_hybrid_with_pp_dp_amp_fp16allreduce_optimize_cast(self): # ring: mp, pp_group, pp_pair, pp_pair self.assertEqual(startup_prog_op_types, [ - 'uniform_random', 'cast', 'fill_constant', 'cast', 'uniform_random', - 'cast', 'fill_constant', 'cast', 'uniform_random', 'cast', - 'fill_constant', 'cast', 'uniform_random', 'cast', 'fill_constant', - 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', + 'uniform_random', 'fill_constant', 'uniform_random', + 'fill_constant', 'uniform_random', 'fill_constant', + 'uniform_random', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', + 'fill_constant', 'fill_constant', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', - 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', - 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', - 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', - 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', - 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_sync_comm_stream' + 'c_gen_nccl_id', 'c_comm_init', 'c_broadcast', 'cast', + 'c_broadcast', 'cast', 'c_broadcast', 'cast', 'c_broadcast', 'cast', + 'c_broadcast', 'cast', 'c_broadcast', 'cast', 'c_broadcast', 'cast', + 'c_broadcast', 'c_sync_comm_stream' ]) self.assertEqual(main_prog_op_types, [ @@ -1019,19 +1018,17 @@ def test_hybrid_with_pp_dp_amp_fp16allreduce_optimize_offload(self): # ring: mp, pp_group, pp_pair, pp_pair self.assertEqual(startup_prog_op_types, [ - 'uniform_random', 'cast', 'memcpy', 'fill_constant', 'cast', - 'memcpy', 'uniform_random', 'cast', 'memcpy', 'fill_constant', - 'cast', 'memcpy', 'uniform_random', 'cast', 'memcpy', - 'fill_constant', 'cast', 'memcpy', 'uniform_random', 'cast', - 'memcpy', 'fill_constant', 'fill_constant', 'fill_constant', + 'uniform_random', 'fill_constant', 'uniform_random', + 'fill_constant', 'uniform_random', 'fill_constant', + 'uniform_random', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', - 'c_gen_nccl_id', 'c_comm_init', 'c_broadcast', 'c_broadcast', - 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', - 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', - 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', + 'c_gen_nccl_id', 'c_comm_init', 'c_broadcast', 'cast', 'memcpy', + 'c_broadcast', 'cast', 'memcpy', 'c_broadcast', 'cast', 'memcpy', + 'c_broadcast', 'cast', 'memcpy', 'c_broadcast', 'cast', 'memcpy', + 'c_broadcast', 'cast', 'memcpy', 'c_broadcast', 'cast', 'memcpy', 'c_broadcast', 'c_sync_comm_stream' ]) @@ -1122,18 +1119,17 @@ def test_hybrid_with_pp_dp_amp_fp16allreduce_optimize_cast_with_gradient_fuse( # ring: mp, pp_group, pp_pair, pp_pair self.assertEqual(startup_prog_op_types, [ - 'uniform_random', 'cast', 
'fill_constant', 'cast', 'uniform_random', - 'cast', 'fill_constant', 'cast', 'uniform_random', 'cast', - 'fill_constant', 'cast', 'uniform_random', 'cast', 'fill_constant', - 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', + 'uniform_random', 'fill_constant', 'uniform_random', + 'fill_constant', 'uniform_random', 'fill_constant', + 'uniform_random', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', + 'fill_constant', 'fill_constant', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', - 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', - 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', - 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', - 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', - 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_sync_comm_stream' + 'c_gen_nccl_id', 'c_comm_init', 'c_broadcast', 'cast', + 'c_broadcast', 'cast', 'c_broadcast', 'cast', 'c_broadcast', 'cast', + 'c_broadcast', 'cast', 'c_broadcast', 'cast', 'c_broadcast', 'cast', + 'c_broadcast', 'c_sync_comm_stream' ]) self.assertEqual(main_prog_op_types, [ From c719add76e470080d369b7a8e6dca34d0376864b Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Tue, 28 Sep 2021 10:46:50 +0800 Subject: [PATCH 027/298] reduce calls to SizeOfType (#36110) --- paddle/fluid/framework/tensor.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/framework/tensor.cc b/paddle/fluid/framework/tensor.cc index 4f6eb803d1c26e..fbd7aa588d49a8 100644 --- a/paddle/fluid/framework/tensor.cc +++ b/paddle/fluid/framework/tensor.cc @@ -29,14 +29,16 @@ void Tensor::check_memory_size() const { PADDLE_ENFORCE_NOT_NULL(holder_, platform::errors::PreconditionNotMet( "Tensor holds no memory. " "Call Tensor::mutable_data firstly.")); + size_t size = numel() * SizeOfType(type()); + PADDLE_ENFORCE_LE( - numel() * SizeOfType(type()), memory_size(), + size, memory_size(), platform::errors::PreconditionNotMet( "Tensor's dimension is out of bound." "Tensor's dimension must be equal or less than the size of its " "memory." "But received Tensor's dimension is d%, memory's size is %d.", - numel() * SizeOfType(type()), memory_size())); + size, memory_size())); } Tensor::Tensor(const proto::VarType::Type& dtype) From 53f9768d8aa4de1dddcd11b36ed693fef1c34292 Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Tue, 28 Sep 2021 11:05:21 +0800 Subject: [PATCH 028/298] [re-submit] auto read all public envs from flags_map in paddle_gtest_main (#36121) * read envs in flags_map * add flags to undefok --- paddle/testing/paddle_gtest_main.cc | 50 +++++++++++------------------ 1 file changed, 18 insertions(+), 32 deletions(-) diff --git a/paddle/testing/paddle_gtest_main.cc b/paddle/testing/paddle_gtest_main.cc index 6feef11a366d97..d7f9a25ac7a880 100644 --- a/paddle/testing/paddle_gtest_main.cc +++ b/paddle/testing/paddle_gtest_main.cc @@ -15,6 +15,7 @@ limitations under the License. 
*/ #include "gflags/gflags.h" #include "gtest/gtest.h" #include "paddle/fluid/memory/allocation/allocator_strategy.h" +#include "paddle/fluid/platform/flags.h" #include "paddle/fluid/platform/init.h" #include "paddle/fluid/platform/npu_info.h" @@ -22,7 +23,6 @@ int main(int argc, char** argv) { paddle::memory::allocation::UseAllocatorStrategyGFlag(); testing::InitGoogleTest(&argc, argv); std::vector new_argv; - std::string gflags_env; for (int i = 0; i < argc; ++i) { new_argv.push_back(argv[i]); } @@ -38,35 +38,23 @@ int main(int argc, char** argv) { } #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ - defined(PADDLE_WITH_ASCEND_CL) - envs.push_back("fraction_of_gpu_memory_to_use"); - envs.push_back("initial_gpu_memory_in_mb"); - envs.push_back("reallocate_gpu_memory_in_mb"); - envs.push_back("allocator_strategy"); - envs.push_back("selected_gpus"); -#elif __clang__ - envs.push_back("use_mkldnn"); - envs.push_back("initial_cpu_memory_in_mb"); - envs.push_back("allocator_strategy"); - - undefok.push_back("use_mkldnn"); - undefok.push_back("initial_cpu_memory_in_mb"); -#else - envs.push_back("use_pinned_memory"); - envs.push_back("use_mkldnn"); - envs.push_back("initial_cpu_memory_in_mb"); - envs.push_back("allocator_strategy"); - - undefok.push_back("use_pinned_memory"); - undefok.push_back("use_mkldnn"); - undefok.push_back("initial_cpu_memory_in_mb"); -#endif - -#if defined(PADDLE_WITH_ASCEND_CL) - envs.push_back("selected_npus"); - envs.push_back("npu_config_path"); -#endif + const auto& flag_map = paddle::platform::GetExportedFlagInfoMap(); + for (const auto& pair : flag_map) { + const std::string& name = pair.second.name; + // NOTE(zhiqiu): some names may not linked in some tests, so add to + // `undefok`. + // One way to handle that is to check each flag item by item, and put it in + // `envs` or `undefok`; + // another way is to add all flags to `envs` and `undeok`, basically it is + // not a good design, + // but it can simplify the procedure of creating new flag and seems no side + // effects. 
+ // see details: https://gflags.github.io/gflags/#special + if (pair.second.is_writable) { // means public + envs.push_back(name); + undefok.push_back(name); + } + } char* env_str = nullptr; if (envs.size() > 0) { @@ -103,9 +91,7 @@ int main(int argc, char** argv) { #ifdef PADDLE_WITH_ASCEND_CL paddle::platform::AclInstance::Instance().Finalize(); #endif - if (env_str) free(env_str); if (undefok_str) free(undefok_str); - return ret; } From 0e07f20e02cf00fd97b98f93daf7eb71d4573dca Mon Sep 17 00:00:00 2001 From: kuizhiqing Date: Tue, 28 Sep 2021 11:30:13 +0800 Subject: [PATCH 029/298] py2 to py3 bug and iface fix for pslib (#36102) --- .../fluid/incubate/fleet/base/role_maker.py | 15 ++++++++------- .../fleet/parameter_server/pslib/__init__.py | 1 + .../parameter_server/pslib/optimizer_factory.py | 2 +- python/paddle/fluid/incubate/fleet/utils/hdfs.py | 4 ++-- 4 files changed, 12 insertions(+), 10 deletions(-) diff --git a/python/paddle/fluid/incubate/fleet/base/role_maker.py b/python/paddle/fluid/incubate/fleet/base/role_maker.py index a5e508d0a0defc..77f9ab33c4c343 100644 --- a/python/paddle/fluid/incubate/fleet/base/role_maker.py +++ b/python/paddle/fluid/incubate/fleet/base/role_maker.py @@ -383,7 +383,7 @@ def _worker_num(self): return the current number of worker """ if self._check_role_generation(): - return self._get_size() / self._proc_per_node + return int(self._get_size() / self._proc_per_node) return 0 def _server_num(self): @@ -391,30 +391,30 @@ def _server_num(self): return the current number of server """ if self._check_role_generation(): - return self._get_size() / self._proc_per_node + return int(self._get_size() / self._proc_per_node) else: self.generate_role() - return self._get_size() / self._proc_per_node + return int(self._get_size() / self._proc_per_node) def worker_index(self): """ return the index of worker """ if self._check_role_generation(): - return self._rank / self._proc_per_node + return int(self._rank / self._proc_per_node) else: self.generate_role() - return self._get_size() / 2 + return int(self._get_size() / 2) def server_index(self): """ return the index of server """ if self._check_role_generation(): - return self._rank / self._proc_per_node + return int(self._rank / self._proc_per_node) else: self.generate_role() - return self._get_size() / self._proc_per_node + return int(self._get_size() / self._proc_per_node) def _all_reduce(self, input, output, mode="sum"): """ @@ -612,6 +612,7 @@ def __init__(self, **kwargs): # set running status of http server self._http_server_d["running"] = False self._iface = self.__get_default_iface() + self._iface = "" if self._iface == "lo" else self._iface # this environment variable can be empty self._prefix = os.getenv("SYS_JOB_ID", "") diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py index e8d9cc3b77b6a8..d245ce222ca6cf 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py @@ -270,6 +270,7 @@ def stop_worker(self): self._role_maker._barrier_worker() if self._role_maker.is_first_worker(): self._fleet_ptr.stop_server() + if self._heter_ptr: self._heter_ptr.stop_xpu_service() self._role_maker._barrier_worker() self._role_maker._barrier_all() diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py 
index e2fb29c5439e11..56d476210894e1 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py @@ -846,7 +846,7 @@ def _minimize(self, "user_define_dump_filename", "") opt_info["dump_fields_path"] = strategy.get("dump_fields_path", "") opt_info["dump_param"] = strategy.get("dump_param", []) - gpus_env = os.getenv("FLAGS_selected_gpus") + gpus_env = os.getenv("FLAGS_selected_gpus", "0") opt_info["worker_places"] = [int(s) for s in gpus_env.split(",")] opt_info["use_ps_gpu"] = strategy.get("use_ps_gpu", False) if server._server.downpour_server_param.downpour_table_param[ diff --git a/python/paddle/fluid/incubate/fleet/utils/hdfs.py b/python/paddle/fluid/incubate/fleet/utils/hdfs.py index fe09692531ad3a..e5b2129e857f4b 100644 --- a/python/paddle/fluid/incubate/fleet/utils/hdfs.py +++ b/python/paddle/fluid/incubate/fleet/utils/hdfs.py @@ -25,8 +25,8 @@ import time import logging import six -from . import fs -from .fs import FS, LocalFS, FSFileExistsError, FSFileNotExistsError, ExecuteError, FSTimeOut, FSShellCmdAborted +#from . import fs +from paddle.distributed.fleet.utils.fs import FS, LocalFS, FSFileExistsError, FSFileNotExistsError, ExecuteError, FSTimeOut, FSShellCmdAborted from paddle.fluid import core import functools From af4f018ade3d39f76233456ed2a8abb386afac51 Mon Sep 17 00:00:00 2001 From: Jiabin Yang Date: Tue, 28 Sep 2021 13:03:36 +0800 Subject: [PATCH 030/298] =?UTF-8?q?=E3=80=90Bug=20fix=E3=80=91Fix=20dygrap?= =?UTF-8?q?h=20double=20grad=20dtype=20error=20(#36125)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix dygraph double grad dtype error when calling for high differential senario * reinvoke ci * add test for partial_engine.cc --- paddle/fluid/framework/operator.cc | 17 +++++++++-------- paddle/fluid/imperative/partial_grad_engine.cc | 10 +++++++++- paddle/fluid/imperative/variable_wrapper.h | 1 + .../tests/unittests/autograd/test_jacobian.py | 4 ---- 4 files changed, 19 insertions(+), 13 deletions(-) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 670cb36dcc3aba..2a543d48791a3d 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -1589,14 +1589,15 @@ void OperatorWithKernel::ParseInputDataType( "not initialized.", Type(), name, ctx.InputNames(name).at(i))); proto::VarType::Type tmp = t->type(); - PADDLE_ENFORCE( - tmp == *data_type || *data_type == default_data_type, - platform::errors::InvalidArgument( - "The DataType of %s Op's duplicable Variable %s must be " - "consistent. The current variable type is (%s), but the " - "previous variable type is (%s).", - Type(), name, DataTypeToString(tmp), - DataTypeToString(*data_type))); + PADDLE_ENFORCE(tmp == *data_type || *data_type == default_data_type, + platform::errors::InvalidArgument( + "The DataType of %s Op's duplicable or different " + "slot Variable %s must be " + "consistent or reigster GetExpectedKernelType. 
The " + "current variable type is (%s), but the " + "previous variable type is (%s).", + Type(), name, DataTypeToString(tmp), + DataTypeToString(*data_type))); *data_type = tmp; } } diff --git a/paddle/fluid/imperative/partial_grad_engine.cc b/paddle/fluid/imperative/partial_grad_engine.cc index c1ec675a557070..45756083c9047f 100644 --- a/paddle/fluid/imperative/partial_grad_engine.cc +++ b/paddle/fluid/imperative/partial_grad_engine.cc @@ -307,7 +307,15 @@ static void FillConstantLike(const VariableWrapper &ref_var, auto *dst_tensor = dst_var->MutableVar()->GetMutable(); auto *dev_ctx = platform::DeviceContextPool::Instance().Get(place); dst_tensor->Resize(ref_tensor.dims()); - dst_tensor->mutable_data(place, ref_var.DataType()); + // TOOD(jiabin): Ugly fix here we have fwd_data_type_ and data_type, since in + // grad mission + // we can't get data_type_ directly. We need to check if we can only use + // default data_type for now. + if (ref_var.ForwardDataType() != -1) { + dst_tensor->mutable_data(place, ref_var.ForwardDataType()); + } else { + dst_tensor->mutable_data(place, ref_var.DataType()); + } operators::math::set_constant(*dev_ctx, dst_tensor, value); } diff --git a/paddle/fluid/imperative/variable_wrapper.h b/paddle/fluid/imperative/variable_wrapper.h index 5fa8b89a396d9b..758e8e62718e7a 100644 --- a/paddle/fluid/imperative/variable_wrapper.h +++ b/paddle/fluid/imperative/variable_wrapper.h @@ -162,6 +162,7 @@ class VariableWrapper { return tensor->type(); } else { VLOG(6) << "The tensor of variable " << name_ << " is not initialized"; + return data_type_; } } diff --git a/python/paddle/fluid/tests/unittests/autograd/test_jacobian.py b/python/paddle/fluid/tests/unittests/autograd/test_jacobian.py index 640292a47114a1..2722d2c83b130e 100644 --- a/python/paddle/fluid/tests/unittests/autograd/test_jacobian.py +++ b/python/paddle/fluid/tests/unittests/autograd/test_jacobian.py @@ -215,10 +215,6 @@ def setUpClass(self): self.x = paddle.rand(shape=self.shape, dtype=self.dtype) self.y = paddle.rand(shape=self.shape, dtype=self.dtype) - # NOTE(levi): skip this test case temporaryly. 
- def test_create_graph_true(self): - pass - if __name__ == "__main__": unittest.main() From 3bb4715e5725aae7ab4df9cd278c0de849923651 Mon Sep 17 00:00:00 2001 From: zhiboniu <31800336+zhiboniu@users.noreply.github.com> Date: Tue, 28 Sep 2021 13:24:49 +0800 Subject: [PATCH 031/298] remove new linalg api in paddle.__init__ (#36151) remove recent linalg api in paddle.init; add args 'name' in some new linalg api interface same change in develop branch to #36112 --- python/paddle/__init__.py | 7 ------- .../fluid/tests/unittests/test_linalg_cond.py | 16 ++++++++-------- python/paddle/tensor/linalg.py | 18 +++++++++--------- 3 files changed, 17 insertions(+), 24 deletions(-) diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index 024415664d8a66..ad8640f6f55848 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -94,18 +94,12 @@ from .tensor.linalg import norm # noqa: F401 from .tensor.linalg import transpose # noqa: F401 from .tensor.linalg import dist # noqa: F401 -from .tensor.linalg import cond # noqa: F401 from .tensor.linalg import t # noqa: F401 from .tensor.linalg import cross # noqa: F401 from .tensor.linalg import cholesky # noqa: F401 from .tensor.linalg import bmm # noqa: F401 from .tensor.linalg import histogram # noqa: F401 from .tensor.linalg import mv # noqa: F401 -from .tensor.linalg import det # noqa: F401 -from .tensor.linalg import slogdet # noqa: F401 -from .tensor.linalg import matrix_power # noqa: F401 -from .tensor.linalg import svd # noqa: F401 -from .tensor.linalg import solve # noqa: F401 from .tensor.logic import equal # noqa: F401 from .tensor.logic import greater_equal # noqa: F401 from .tensor.logic import greater_than # noqa: F401 @@ -504,7 +498,6 @@ 'stack', 'sqrt', 'cholesky', - 'matrix_power', 'randperm', 'linspace', 'reshape', diff --git a/python/paddle/fluid/tests/unittests/test_linalg_cond.py b/python/paddle/fluid/tests/unittests/test_linalg_cond.py index 2b42eca38e6fc6..237c96430249bc 100644 --- a/python/paddle/fluid/tests/unittests/test_linalg_cond.py +++ b/python/paddle/fluid/tests/unittests/test_linalg_cond.py @@ -28,7 +28,7 @@ def test_static_assert_true(self, x_list, p_list): for x in x_list: with static.program_guard(static.Program(), static.Program()): input_data = static.data("X", shape=x.shape, dtype=x.dtype) - output = paddle.cond(input_data, p) + output = paddle.linalg.cond(input_data, p) exe = static.Executor() result = exe.run(feed={"X": x}, fetch_list=[output]) expected_output = np.linalg.cond(x, p) @@ -39,7 +39,7 @@ def test_dygraph_assert_true(self, x_list, p_list): for p in p_list: for x in x_list: input_tensor = paddle.to_tensor(x) - output = paddle.cond(input_tensor, p) + output = paddle.linalg.cond(input_tensor, p) expected_output = np.linalg.cond(x, p) self.assertTrue(np.allclose(output, expected_output)) @@ -103,12 +103,12 @@ def test_dygraph_api_error(self): for p in p_list_error: for x in (x_list_n_n + x_list_m_n): x_tensor = paddle.to_tensor(x) - self.assertRaises(ValueError, paddle.cond, x_tensor, p) + self.assertRaises(ValueError, paddle.linalg.cond, x_tensor, p) for p in p_list_n_n: for x in x_list_m_n: x_tensor = paddle.to_tensor(x) - self.assertRaises(ValueError, paddle.cond, x_tensor, p) + self.assertRaises(ValueError, paddle.linalg.cond, x_tensor, p) def test_static_api_error(self): paddle.enable_static() @@ -119,13 +119,13 @@ def test_static_api_error(self): for x in (x_list_n_n + x_list_m_n): with static.program_guard(static.Program(), static.Program()): x_data = static.data("X", 
shape=x.shape, dtype=x.dtype) - self.assertRaises(ValueError, paddle.cond, x_data, p) + self.assertRaises(ValueError, paddle.linalg.cond, x_data, p) for p in p_list_n_n: for x in x_list_m_n: with static.program_guard(static.Program(), static.Program()): x_data = static.data("X", shape=x.shape, dtype=x.dtype) - self.assertRaises(ValueError, paddle.cond, x_data, p) + self.assertRaises(ValueError, paddle.linalg.cond, x_data, p) # it's not supported when input is an empty tensor in static mode def test_static_empty_input_error(self): @@ -136,13 +136,13 @@ def test_static_empty_input_error(self): for x in x_list_n_n: with static.program_guard(static.Program(), static.Program()): x_data = static.data("X", shape=x.shape, dtype=x.dtype) - self.assertRaises(ValueError, paddle.cond, x_data, p) + self.assertRaises(ValueError, paddle.linalg.cond, x_data, p) for p in (p_list_n_n + p_list_m_n): for x in x_list_n_n: with static.program_guard(static.Program(), static.Program()): x_data = static.data("X", shape=x.shape, dtype=x.dtype) - self.assertRaises(ValueError, paddle.cond, x_data, p) + self.assertRaises(ValueError, paddle.linalg.cond, x_data, p) class TestCondEmptyTensorInput(unittest.TestCase): diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py index 9f2c4316d542db..9ba9370a43087d 100644 --- a/python/paddle/tensor/linalg.py +++ b/python/paddle/tensor/linalg.py @@ -448,7 +448,7 @@ def p_matrix_norm(input, porder=1., axis=axis, keepdim=False, name=None): format(axis)) -def dist(x, y, p=2): +def dist(x, y, p=2, name=None): r""" This OP returns the p-norm of (x - y). It is not a norm in a strict sense, only as a measure @@ -1251,7 +1251,7 @@ def bmm(x, y, name=None): return out -def histogram(input, bins=100, min=0, max=0): +def histogram(input, bins=100, min=0, max=0, name=None): """ Computes the histogram of a tensor. The elements are sorted into equal width bins between min and max. If min and max are both zero, the minimum and maximum values of the data are used. @@ -1351,7 +1351,7 @@ def __check_input(x, vec): return out -def det(x): +def det(x, name=None): """ Calculates determinant value of a square matrix or batches of square matrices. Args: @@ -1367,7 +1367,7 @@ def det(x): x = paddle.randn([3,3,3]) - A = paddle.det(x) + A = paddle.linalg.det(x) print(A) @@ -1399,7 +1399,7 @@ def det(x): return out -def slogdet(x): +def slogdet(x, name=None): """ Calculates the sign and natural logarithm of the absolute value of a square matrix's or batches square matrices' determinant. The determinant can be computed with ``sign * exp(logabsdet) @@ -1422,7 +1422,7 @@ def slogdet(x): x = paddle.randn([3,3,3]) - A = paddle.slogdet(x) + A = paddle.linalg.slogdet(x) print(A) @@ -1563,17 +1563,17 @@ def matrix_power(x, n, name=None): x = paddle.to_tensor([[1, 2, 3], [1, 4, 9], [1, 8, 27]], dtype='float64') - print(paddle.matrix_power(x, 2)) + print(paddle.linalg.matrix_power(x, 2)) # [[6. , 34. , 102.], # [14. , 90. , 282.], # [36. , 250., 804.]] - print(paddle.matrix_power(x, 0)) + print(paddle.linalg.matrix_power(x, 0)) # [[1., 0., 0.], # [0., 1., 0.], # [0., 0., 1.]] - print(paddle.matrix_power(x, -2)) + print(paddle.linalg.matrix_power(x, -2)) # [[ 12.91666667, -12.75000000, 2.83333333 ], # [-7.66666667 , 8. 
, -1.83333333 ], # [ 1.80555556 , -1.91666667 , 0.44444444 ]] From 58c8f6b38ddd44834d822a8054858becc89cf550 Mon Sep 17 00:00:00 2001 From: xiayanming <41795079@qq.com> Date: Tue, 28 Sep 2021 13:54:46 +0800 Subject: [PATCH 032/298] [hybrid] seed and dropout op support force-cpu (#35820) * [HIP] fix op not support AMD GPU bug, the flag PADDLE_WITH_ROCM is invalid * [HIP] fix op not support AMD GPU bug, the flag PADDLE_WITH_ROCM is invalid * [HIP] fix op not support AMD GPU bug * [hybrid] seed and dropout op support force-cpu * [hybrid] seed and dropout op support force-cpu * [hybrid] seed and dropout op support force-cpu * [hybrid] seed and dropout op support force-cpu * [hybrid] seed and dropout op support force-cpu * [hybrid] fix seed ci failed issue * add AsExtra for force_cpu of seed op --- paddle/fluid/operators/dropout_impl.cu.h | 3 + paddle/fluid/operators/dropout_op.cc | 13 ++++ paddle/fluid/operators/seed_op.cc | 18 +++++ paddle/fluid/operators/seed_op.cu | 30 +++++--- paddle/fluid/operators/seed_op.h | 1 + python/paddle/fluid/backward.py | 9 ++- .../fluid/tests/unittests/test_dropout_op.py | 69 +++++++++++++++++++ .../fluid/tests/unittests/test_seed_op.py | 4 +- 8 files changed, 135 insertions(+), 12 deletions(-) diff --git a/paddle/fluid/operators/dropout_impl.cu.h b/paddle/fluid/operators/dropout_impl.cu.h index 4261a5f2534c85..7a93d2db0dd1ce 100644 --- a/paddle/fluid/operators/dropout_impl.cu.h +++ b/paddle/fluid/operators/dropout_impl.cu.h @@ -205,6 +205,9 @@ void DropoutFwGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx, TensorCopySync(*seed, platform::CPUPlace(), &seed_cpu_tensor); seed_data = static_cast(seed_cpu_tensor.data()[0]); increment = offset; + } else if (seed && platform::is_cpu_place(seed->place())) { + seed_data = *(seed->data()); + increment = offset; } else if (gen_cuda->GetIsInitPy() && (!is_fix_seed)) { auto seed_offset = gen_cuda->IncrementOffset(offset); seed_data = seed_offset.first; diff --git a/paddle/fluid/operators/dropout_op.cc b/paddle/fluid/operators/dropout_op.cc index 9700b9a2f7a1c2..cbfb795d6a23e1 100644 --- a/paddle/fluid/operators/dropout_op.cc +++ b/paddle/fluid/operators/dropout_op.cc @@ -42,6 +42,19 @@ class DropoutOp : public framework::OperatorWithKernel { return framework::OpKernelType( OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); } + + framework::OpKernelType GetKernelTypeForVar( + const std::string& var_name, const Tensor& tensor, + const framework::OpKernelType& expected_kernel_type) const override { + if (var_name == "Seed") { + VLOG(10) << "var_name:" << var_name + << " does not need to transform in dropout op"; + return expected_kernel_type; + } + + return framework::OpKernelType(expected_kernel_type.data_type_, + tensor.place(), tensor.layout()); + } }; class DropoutOpMaker : public framework::OpProtoAndCheckerMaker { diff --git a/paddle/fluid/operators/seed_op.cc b/paddle/fluid/operators/seed_op.cc index 2f3e4c9ba88c39..32daa8c3934aed 100644 --- a/paddle/fluid/operators/seed_op.cc +++ b/paddle/fluid/operators/seed_op.cc @@ -39,6 +39,12 @@ class SeedOpMaker : public framework::OpProtoAndCheckerMaker { void Make() override { AddOutput("Out", "The output of seed op."); AddAttr("seed", "Dropout random seed.").SetDefault(0); + AddAttr("force_cpu", + "(bool, default false) Force fill output variable to cpu " + "memory. Otherwise, fill output variable to the running " + "device") + .SetDefault(false) + .AsExtra(); AddComment(R"DOC( Seed Operator. 
)DOC"); @@ -55,3 +61,15 @@ REGISTER_OPERATOR( paddle::framework::EmptyGradOpMaker); REGISTER_OP_CPU_KERNEL( seed, ops::CPUSeedKernel); + +/* ========================== register checkpoint ===========================*/ +REGISTER_OP_VERSION(seed) + .AddCheckpoint( + R"ROC( + Upgrade seed add a new attribute [force_cpu])ROC", + paddle::framework::compatible::OpVersionDesc().NewAttr( + "force_cpu", + "If true, Force fill output variable to cpu." + "memory. Otherwise, fill output variable to the running " + "device", + false)); diff --git a/paddle/fluid/operators/seed_op.cu b/paddle/fluid/operators/seed_op.cu index c84407ba52dfd6..4593b88019621a 100644 --- a/paddle/fluid/operators/seed_op.cu +++ b/paddle/fluid/operators/seed_op.cu @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/seed_op.h" namespace paddle { @@ -20,10 +21,10 @@ namespace operators { template class GPUSeedKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& context) const override { - auto* out = context.Output("Out"); - auto* out_data = out->mutable_data(context.GetPlace()); + void Compute(const framework::ExecutionContext &context) const override { + auto *out = context.Output("Out"); int user_seed = context.Attr("seed"); + auto force_cpu = context.Attr("force_cpu"); std::random_device rnd; int seed; if (user_seed != 0) { @@ -31,11 +32,24 @@ class GPUSeedKernel : public framework::OpKernel { } else { seed = rnd(); } - auto target_gpu_place = - BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()); - auto stream = context.cuda_device_context().stream(); - memory::Copy(target_gpu_place, out_data, platform::CPUPlace(), &seed, - sizeof(int), stream); + + bool cpu_place = force_cpu || context.GetPlace() == platform::CPUPlace(); + if (cpu_place) { + platform::DeviceContextPool &pool = + platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(context.GetPlace()); + out->mutable_data(platform::CPUPlace()); + math::SetConstant functor; + functor(reinterpret_cast(dev_ctx), + out, static_cast(seed)); + } else { + auto *out_data = out->mutable_data(context.GetPlace()); + auto target_gpu_place = + BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()); + auto stream = context.cuda_device_context().stream(); + memory::Copy(target_gpu_place, out_data, platform::CPUPlace(), &seed, + sizeof(int), stream); + } } }; diff --git a/paddle/fluid/operators/seed_op.h b/paddle/fluid/operators/seed_op.h index f8b513fca4824c..671f397d4eaffc 100644 --- a/paddle/fluid/operators/seed_op.h +++ b/paddle/fluid/operators/seed_op.h @@ -14,6 +14,7 @@ #pragma once #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" namespace paddle { namespace operators { diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py index 8bf27f6d2fd988..7aa3c888f2ad18 100755 --- a/python/paddle/fluid/backward.py +++ b/python/paddle/fluid/backward.py @@ -197,13 +197,18 @@ def modify_forward_desc_for_recompute(self): if op.desc.has_attr(op_device_attr_name): op_device = op.desc.attr(op_device_attr_name) + # Setting the force_cpu of seed to true will make the output of seed in cpu memory, + # reduce the synchronous copy from GPU to CPU in dropout, and reduce the communication hang added_op = self.block._insert_op( index=op.idx, type='seed', inputs={}, outputs={'Out': [added_var]}, - 
attrs={'seed': seed, - 'op_device': op_device}) + attrs={ + 'seed': seed, + 'op_device': op_device, + 'force_cpu': True + }) self.ops.insert(op_idx, added_op) # modify dropout op desc so that it accept a seed var as input op.desc.set_input("Seed", [var_unique_name]) diff --git a/python/paddle/fluid/tests/unittests/test_dropout_op.py b/python/paddle/fluid/tests/unittests/test_dropout_op.py index 89755d0365f2cb..396d55b3d0a8b5 100644 --- a/python/paddle/fluid/tests/unittests/test_dropout_op.py +++ b/python/paddle/fluid/tests/unittests/test_dropout_op.py @@ -232,6 +232,75 @@ def init_test_case(self): self.fix_seed = False +class TestDropoutOpWithSeedOnCPUPlace(unittest.TestCase): + def test_seed_cpu_place(self): + paddle.enable_static() + main_program = Program() + with program_guard(main_program): + seed_input_name = "tensor@SeedInput" + x_var_name = "tensor@X" + x_out_var = "tensor@XOut" + + mask_var_name = "tensor@Mask" + seed_input_var = main_program.global_block().create_var( + name=seed_input_name, + shape=[1], + dtype='int32', + persistable=False, + stop_gradient=True) + x_out_var = main_program.global_block().create_var( + name=x_out_var, + shape=[40, 40], + dtype='float32', + persistable=False, + stop_gradient=True) + x_var = main_program.global_block().create_var( + name=x_var_name, + shape=[40, 40], + dtype='float32', + persistable=False, + stop_gradient=True) + mask_var = main_program.global_block().create_var( + name=mask_var_name, + shape=[1], + dtype='int', + persistable=False, + stop_gradient=True) + + main_program.global_block().append_op( + type="fill_constant", + outputs={"Out": x_var_name}, + attrs={ + "shape": [40, 40], + "dtype": x_var.dtype, + "value": 1.0, + "place_type": 0 + }) + main_program.global_block().append_op( + type='seed', + inputs={}, + outputs={'Out': seed_input_var}, + attrs={'seed': 1, + 'force_cpu': True}) + main_program.global_block().append_op( + type='dropout', + inputs={'X': x_var, + 'Seed': seed_input_var}, + attrs={'dropout_prob': 0.}, + outputs={'Out': x_out_var, + 'Mask': mask_var}) + place = fluid.CPUPlace() + if core.is_compiled_with_cuda(): + place = fluid.CUDAPlace(0) + exe = fluid.Executor(place) + x_out, mask_out = exe.run( + main_program, + feed={}, + fetch_list=[x_out_var.name, mask_var.name]) + x_in_np = np.ones([40, 40]).astype("float32") + self.assertTrue(np.allclose(x_out, x_in_np)) + + class TestDropoutOpError(unittest.TestCase): def test_errors(self): with program_guard(Program(), Program()): diff --git a/python/paddle/fluid/tests/unittests/test_seed_op.py b/python/paddle/fluid/tests/unittests/test_seed_op.py index 7d6705f72569b6..08478d7140d434 100644 --- a/python/paddle/fluid/tests/unittests/test_seed_op.py +++ b/python/paddle/fluid/tests/unittests/test_seed_op.py @@ -25,7 +25,7 @@ def setUp(self): self.op_type = "seed" self.inputs = {} self.attrs = {"seed": 123} - self.outputs = {"Out": np.asarray((123)).astype('int32')} + self.outputs = {"Out": np.asarray((123)).astype('int')} def test_check_output(self): self.check_output() @@ -36,7 +36,7 @@ def setUp(self): self.op_type = "seed" self.inputs = {} self.attrs = {"seed": 0} - self.outputs = {"Out": np.asarray((123)).astype('int32')} + self.outputs = {"Out": np.asarray((123)).astype('int')} def test_check_output(self): self.check_output(no_check_set=["Out"]) From 97d306025f71d454aa51615c02fc8fcd683dfde8 Mon Sep 17 00:00:00 2001 From: Thunderbrook <52529258+Thunderbrook@users.noreply.github.com> Date: Tue, 28 Sep 2021 15:21:48 +0800 Subject: [PATCH 033/298] [HeterPs]ps gpu dump 
(#36157) * ps gpu dump * remove log --- paddle/fluid/framework/device_worker.h | 8 ----- paddle/fluid/framework/ps_gpu_trainer.cc | 45 ++++++++++++++++++++++-- paddle/fluid/framework/ps_gpu_worker.cc | 34 ++++++------------ paddle/fluid/framework/trainer.h | 8 +++-- 4 files changed, 59 insertions(+), 36 deletions(-) diff --git a/paddle/fluid/framework/device_worker.h b/paddle/fluid/framework/device_worker.h index 810e9a087d1220..11beb84d74914a 100644 --- a/paddle/fluid/framework/device_worker.h +++ b/paddle/fluid/framework/device_worker.h @@ -454,7 +454,6 @@ class PSGPUWorker : public HogwildWorker { virtual void Initialize(const TrainerDesc& desc); virtual void TrainFiles(); virtual void TrainFilesWithProfiler(); - virtual void SetNeedDump(bool need_dump_field); virtual void SetChannelWriter(ChannelObject* queue); virtual void SetWorkerNum(int num) { worker_num_ = num; } virtual void CacheProgram(const ProgramDesc& main_program) { @@ -467,7 +466,6 @@ class PSGPUWorker : public HogwildWorker { protected: void PushGradients(); - void DumpParam(); void CopySparseTable(); void CopyDenseTable(); void CopyDenseVars(); @@ -475,18 +473,12 @@ class PSGPUWorker : public HogwildWorker { private: int mpi_rank_; std::mutex mutex_; - std::vector send_var_list_; int worker_num_; ProgramDesc program_; HeterObjectPool object_pool_; - bool need_dump_param_; - std::vector dump_param_; bool need_to_push_dense_; - bool need_dump_field_; bool dump_slot_; bool need_to_push_sparse_; - std::vector dump_fields_; - ChannelWriter writer_; DownpourWorkerParameter param_; float scale_datanorm_; // just save the value in param_ for easy access diff --git a/paddle/fluid/framework/ps_gpu_trainer.cc b/paddle/fluid/framework/ps_gpu_trainer.cc index 8b16b6a5d007ff..dc7b86d344d771 100644 --- a/paddle/fluid/framework/ps_gpu_trainer.cc +++ b/paddle/fluid/framework/ps_gpu_trainer.cc @@ -29,9 +29,12 @@ namespace framework { void PSGPUTrainer::Initialize(const TrainerDesc& trainer_desc, Dataset* dataset) { - dataset_ = dataset; + SetDataset(dataset); thread_num_ = trainer_desc.thread_num(); param_ = trainer_desc.downpour_param(); + ParseDumpConfig(trainer_desc); + mpi_rank_ = trainer_desc.mpi_rank(); + mpi_size_ = trainer_desc.mpi_size(); for (int i = 0; i < param_.dense_table_size(); ++i) { uint64_t table_id = static_cast(param_.dense_table(i).table_id()); auto table = param_.dense_table(i); @@ -44,6 +47,8 @@ void PSGPUTrainer::Initialize(const TrainerDesc& trainer_desc, int place_num = trainer_desc.worker_places_size(); const std::vector readers = dataset->GetReaders(); + dump_file_num_ = trainer_desc.dump_file_num(); + user_define_dump_filename_ = trainer_desc.user_define_dump_filename(); std::vector dev_ids; for (int i = 0; i < place_num; ++i) { int num = trainer_desc.worker_places(i); @@ -64,6 +69,11 @@ void PSGPUTrainer::Initialize(const TrainerDesc& trainer_desc, workers_[i] = DeviceWorkerFactory::CreateDeviceWorker( trainer_desc.device_worker_name()); workers_[i]->SetDeviceIndex(i); + workers_[i]->SetNeedDumpField(need_dump_field_); + workers_[i]->SetNeedDumpParam(need_dump_param_); + workers_[i]->SetDumpFieldVector(dump_fields_); + workers_[i]->SetDumpParamVector(dump_param_); + workers_[i]->InitRandomDumpConfig(trainer_desc); workers_[i]->SetDataFeed(readers[i]); workers_[i]->Initialize(trainer_desc); workers_[i]->SetWorkerNum(place_num); @@ -71,7 +81,14 @@ void PSGPUTrainer::Initialize(const TrainerDesc& trainer_desc, return; } -void PSGPUTrainer::DumpWork(int tid) {} +std::string PSGPUTrainer::GetDumpPath(int tid) { + 
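+  // NOTE: dump files are sharded per MPI rank and dump thread. With a
+  // user-defined filename the path is
+  // "<dump_fields_path>/part-<user_define_dump_filename>-<tid>", otherwise
+  // "<dump_fields_path>/part-<mpi_rank>-<tid>".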
if (user_define_dump_filename_ != "") { + return string::format_string("%s/part-%s-%05d", dump_fields_path_.c_str(), + user_define_dump_filename_.c_str(), tid); + } + return string::format_string("%s/part-%03d-%05d", dump_fields_path_.c_str(), + mpi_rank_, tid); +} void PSGPUTrainer::RegisterHeterCallback() { /* @@ -124,7 +141,28 @@ void PSGPUTrainer::InitTrainerEnv(const ProgramDesc& main_program, return; } +void PSGPUTrainer::InitDumpEnv() { + queue_ = paddle::framework::MakeChannel(); + for (size_t i = 0; i < places_.size(); ++i) { + workers_[i]->SetChannelWriter(queue_.get()); + } + dump_thread_num_ = 1; + if (dump_file_num_ > mpi_size_) { + dump_thread_num_ = dump_file_num_ / mpi_size_; + if (dump_file_num_ % mpi_size_ > mpi_rank_) { + dump_thread_num_ += 1; + } + } + for (int i = 0; i < dump_thread_num_; i++) { + dump_thread_.push_back( + std::thread(std::bind(&TrainerBase::DumpWork, this, i))); + } +} + void PSGPUTrainer::InitOtherEnv(const ProgramDesc& main_program) { + if (need_dump_field_ || need_dump_param_) { + InitDumpEnv(); + } VLOG(3) << "init other env done."; } @@ -204,6 +242,9 @@ void PSGPUTrainer::Finalize() { } } MergeDenseParam(); + if (need_dump_field_ || need_dump_param_) { + FinalizeDumpEnv(); + } root_scope_->DropKids(); } } // namespace framework diff --git a/paddle/fluid/framework/ps_gpu_worker.cc b/paddle/fluid/framework/ps_gpu_worker.cc index 66d8a40dda1607..e41768810c6d2c 100644 --- a/paddle/fluid/framework/ps_gpu_worker.cc +++ b/paddle/fluid/framework/ps_gpu_worker.cc @@ -34,11 +34,6 @@ void PSGPUWorker::Initialize(const TrainerDesc& desc) { dev_ctx_ = platform::DeviceContextPool::Instance().Get(place_); mpi_rank_ = desc.mpi_rank(); trainer_desc_ = desc; - /* - for (int i = 0; i < trainer_desc_.xpu_recv_list_size(); ++i) { - send_var_list_.push_back(trainer_desc_.xpu_recv_list(i)); - } - */ for (int i = 0; i < param_.sparse_table_size(); ++i) { uint64_t table_id = static_cast(param_.sparse_table(i).table_id()); @@ -89,19 +84,7 @@ void PSGPUWorker::Initialize(const TrainerDesc& desc) { no_cvm_ = desc.no_cvm(); scale_datanorm_ = desc.scale_datanorm(); dump_slot_ = desc.dump_slot(); - dump_fields_.resize(desc.dump_fields_size()); - for (int i = 0; i < desc.dump_fields_size(); ++i) { - dump_fields_[i] = desc.dump_fields(i); - } adjust_ins_weight_config_ = desc.adjust_ins_weight_config(); - need_dump_param_ = false; - dump_param_.resize(desc.dump_param_size()); - for (int i = 0; i < desc.dump_param_size(); ++i) { - dump_param_[i] = desc.dump_param(i); - } - if (desc.dump_param_size() != 0) { - need_dump_param_ = true; - } for (int i = 0; i < desc.check_nan_var_names_size(); ++i) { check_nan_var_names_.push_back(desc.check_nan_var_names(i)); } @@ -134,12 +117,6 @@ void PSGPUWorker::SetChannelWriter(ChannelObject* queue) { writer_.Reset(queue); } -void PSGPUWorker::SetNeedDump(bool need_dump_field) { - need_dump_field_ = need_dump_field; -} - -void PSGPUWorker::DumpParam() {} - void PSGPUWorker::TrainFiles() { platform::SetNumThreads(1); platform::Timer timeline; @@ -150,6 +127,7 @@ void PSGPUWorker::TrainFiles() { // how to accumulate fetched values here device_reader_->Start(); int cur_batch; + int batch_cnt = 0; while ((cur_batch = device_reader_->Next()) > 0) { total_ins_num += cur_batch; for (auto& op : ops_) { @@ -164,9 +142,19 @@ void PSGPUWorker::TrainFiles() { op->Run(*thread_scope_, place_); } } + if (need_dump_field_) { + DumpField(*thread_scope_, dump_mode_, dump_interval_); + } + if (need_dump_param_ && thread_id_ == 0) { + DumpParam(*thread_scope_, 
batch_cnt); + } PrintFetchVars(); thread_scope_->DropKids(); + ++batch_cnt; + } + if (need_dump_field_ || need_dump_param_) { + writer_.Flush(); } timeline.Pause(); VLOG(1) << "GpuPs worker " << thread_id_ << " train cost " diff --git a/paddle/fluid/framework/trainer.h b/paddle/fluid/framework/trainer.h index 0f34c84549f2b9..f6e274e6257e4c 100644 --- a/paddle/fluid/framework/trainer.h +++ b/paddle/fluid/framework/trainer.h @@ -258,13 +258,12 @@ class PSGPUTrainer : public TrainerBase { virtual void Run(); virtual void Finalize(); virtual void RegisterHeterCallback(); - virtual void DumpWork(int tid); virtual Scope* GetWorkerScope(int thread_id); virtual void CacheProgram(const ProgramDesc& main_program) { new (&program_) ProgramDesc(main_program); } - virtual std::string GetDumpPath(int tid) { return ""; } - virtual void InitDumpEnv() {} + virtual std::string GetDumpPath(int tid); + virtual void InitDumpEnv() override; virtual void MergeDenseParam(); template @@ -286,6 +285,9 @@ class PSGPUTrainer : public TrainerBase { std::vector threads_; int use_ps_gpu_; int thread_num_; + int mpi_rank_; + int mpi_size_; + int dump_file_num_; }; #endif From 36791fddea73f23337d5a6cf77441af0507fce09 Mon Sep 17 00:00:00 2001 From: ronnywang <524019753@qq.com> Date: Tue, 28 Sep 2021 16:18:01 +0800 Subject: [PATCH 034/298] [ROCM] bugfix for arg_min_max (#36098) --- .../fluid/operators/arg_min_max_op_base.cu.h | 21 +++++++++++-------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/operators/arg_min_max_op_base.cu.h b/paddle/fluid/operators/arg_min_max_op_base.cu.h index b19ba1e1590fe1..2c34d6f8300a74 100644 --- a/paddle/fluid/operators/arg_min_max_op_base.cu.h +++ b/paddle/fluid/operators/arg_min_max_op_base.cu.h @@ -89,22 +89,25 @@ void ComputeFullArg(const platform::CUDADeviceContext& ctx, const Tensor& input, const int64_t n) { auto cu_stream = ctx.stream(); auto ComputeBlockSize = [](int64_t col) { + auto block_size = 8; if (col > 512) - return 1024; + block_size = 1024; else if (col > 256) - return 512; + block_size = 512; else if (col > 128) - return 256; + block_size = 256; else if (col > 64) - return 128; + block_size = 128; else if (col > 32) - return 64; + block_size = 64; else if (col > 16) - return 32; + block_size = 32; else if (col > 8) - return 16; - else - return 8; + block_size = 16; +#ifdef __HIPCC__ + block_size = std::min(block_size, 256); +#endif + return block_size; }; int64_t max_grid_dimx = ctx.GetCUDAMaxGridDimSize().x; From bc7e2b921d4b450f082c61d92b27a9b9479a5c7b Mon Sep 17 00:00:00 2001 From: Lijunhui <1578034415@qq.com> Date: Tue, 28 Sep 2021 17:05:59 +0800 Subject: [PATCH 035/298] add API paddle.linalg.eig (#35674) * Add paddle.linalg.eig op * remove comments * remove comments * extend batch_size to the origin * add real times complex functor & destroy the backward complex output bug * terminate output diff when input real tensors * correct tiny doc errors * move functions from eig_helper to svd_helper and remove eig_helper * remove tensor.Resize * remove no longer used code * use existing lapack functions * reply review comments 21/27 * remove .cu as this op is only executed on CPU * remove const_cast & add const in argument list for read-only references * fix sample code error in CI * remove template typename Tbase and more * remove eig exposure in paddle.* * add 'name=None' in eig python implementation * handle the unittest * try to solve the unittest * solve CI coverage * remove no longer used code * polish API doc and more * reply review comments 
* polish unittest, commit plan B * polish unittest --- paddle/fluid/operators/eig_op.cc | 168 +++++++++ paddle/fluid/operators/eig_op.h | 330 ++++++++++++++++++ paddle/fluid/operators/math/matrix_solve.h | 40 +++ paddle/fluid/operators/svd_helper.h | 66 ++++ .../paddle/fluid/tests/unittests/op_test.py | 4 + .../fluid/tests/unittests/test_eig_op.py | 250 +++++++++++++ python/paddle/linalg.py | 2 + python/paddle/tensor/__init__.py | 2 + python/paddle/tensor/linalg.py | 67 ++++ 9 files changed, 929 insertions(+) create mode 100644 paddle/fluid/operators/eig_op.cc create mode 100644 paddle/fluid/operators/eig_op.h create mode 100644 python/paddle/fluid/tests/unittests/test_eig_op.py diff --git a/paddle/fluid/operators/eig_op.cc b/paddle/fluid/operators/eig_op.cc new file mode 100644 index 00000000000000..c1aac4546e36e3 --- /dev/null +++ b/paddle/fluid/operators/eig_op.cc @@ -0,0 +1,168 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/eig_op.h" +#include +#include +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +class EigOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Eig"); + OP_INOUT_CHECK(ctx->HasOutput("Eigenvalues"), "Output", "Eigenvalues", + "Eig"); + OP_INOUT_CHECK(ctx->HasOutput("Eigenvectors"), "Output", "Eigenvectors", + "Eig"); + + auto x_dims = ctx->GetInputDim("X"); + int rank = x_dims.size(); + PADDLE_ENFORCE_GE(rank, 2, platform::errors::InvalidArgument( + "Expects input tensor x to be not less than " + "2 dimentions, but got dimention %d", + rank)); + PADDLE_ENFORCE_EQ(x_dims[rank - 2], x_dims[rank - 1], + platform::errors::InvalidArgument( + "The input matrix must be a square matrix, " + "but receive a matrix with %d rows and %d colums", + x_dims[rank - 2], x_dims[rank - 1])); + + std::vector batch_dims_vec{}; + for (int i = 0; i < rank - 1; ++i) { + batch_dims_vec.emplace_back(x_dims[i]); + } + + ctx->SetOutputDim("Eigenvectors", x_dims); + ctx->SetOutputDim("Eigenvalues", framework::make_ddim(batch_dims_vec)); + } + + protected: + // The output of eig is always complex-valued even for real-valued inputs + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto dtype = OperatorWithKernel::IndicateVarDataType(ctx, "X"); + if (dtype != framework::proto::VarType::FP32 && + dtype != framework::proto::VarType::FP64 && + dtype != framework::proto::VarType::COMPLEX64 && + dtype != framework::proto::VarType::COMPLEX128) { + PADDLE_THROW(platform::errors::InvalidArgument( + "unsupported data type: %s!", dtype)); + } + return framework::OpKernelType(dtype, ctx.GetPlace()); + } +}; + +class EigOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + 
AddInput( + "X", + "(Tensor), A complex-valued or real-valued tensor with shape (*, " + "n, n). The accepted datatype is one of float32, float64, complex64 " + "or complex128"); + AddOutput("Eigenvalues", + "(Tensor), The output eigenvalues tensor with shape (*, n). The " + "datatype is complex64 or complex128"); + AddOutput("Eigenvectors", + "(Tensor), The output eigenvectors tensor with shape (*, n, n). " + "The datatype is complex64 or complex128"); + + AddComment(R"DOC( + Eig Operator. + +This API processes eigen decomposition for general square matrices. + +)DOC"); + } +}; + +class EigGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("Eigenvalues"), "Input", "Eigenvalues", + "EigGrad"); + OP_INOUT_CHECK(ctx->HasInput("Eigenvectors"), "Input", "Eigenvectors", + "EigGrad"); + OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Eigenvalues")), + "Input", "Eigenvalues@GRAD", "EigGrad"); + OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Eigenvectors")), + "Input", "Eigenvectors@GRAD", "EigGrad"); + + auto dims = ctx->GetInputDim("Eigenvectors"); + auto x_grad_name = framework::GradVarName("X"); + if (ctx->HasOutput(x_grad_name)) { + ctx->SetOutputDim(x_grad_name, dims); + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType( + ctx, framework::GradVarName("Eigenvectors")), + ctx.device_context()); + } +}; + +template +class EigGradOpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType(this->ForwardOpType() + "_grad"); + op->SetInput("Eigenvalues", this->Output("Eigenvalues")); + op->SetInput("Eigenvectors", this->Output("Eigenvectors")); + op->SetInput(framework::GradVarName("Eigenvalues"), + this->OutputGrad("Eigenvalues")); + op->SetInput(framework::GradVarName("Eigenvectors"), + this->OutputGrad("Eigenvectors")); + op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); + } +}; + +} // namespace operators +} // namespace paddle + +using complex64 = paddle::platform::complex; +using complex128 = paddle::platform::complex; + +namespace ops = paddle::operators; +REGISTER_OPERATOR(eig, ops::EigOp, ops::EigOpMaker, + ops::EigGradOpMaker, + ops::EigGradOpMaker); + +REGISTER_OPERATOR(eig_grad, ops::EigGradOp); + +REGISTER_OP_CPU_KERNEL( + eig, ops::EigKernel, + ops::EigKernel, + ops::EigKernel, + ops::EigKernel); + +REGISTER_OP_CPU_KERNEL( + eig_grad, + ops::EigGradKernel, + ops::EigGradKernel, + ops::EigGradKernel, + ops::EigGradKernel); diff --git a/paddle/fluid/operators/eig_op.h b/paddle/fluid/operators/eig_op.h new file mode 100644 index 00000000000000..b9a3cb300b4c21 --- /dev/null +++ b/paddle/fluid/operators/eig_op.h @@ -0,0 +1,330 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include "paddle/fluid/operators/math/complex_functors.h" +#include "paddle/fluid/operators/math/lapack_function.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/matrix_solve.h" +#include "paddle/fluid/operators/svd_helper.h" +#include "paddle/fluid/operators/transpose_op.h" +#include "paddle/fluid/platform/for_range.h" +#define EPSILON 1e-6 + +namespace paddle { +namespace operators { + +using paddle::framework::Tensor; + +inline int BatchCount(const Tensor& matrix) { + int count = 1; + int num_dims = matrix.dims().size(); + for (int i = 0; i < num_dims - 2; ++i) { + count *= matrix.dims()[i]; + } + return count; +} + +inline int MatrixStride(const Tensor& matrix) { + framework::DDim dims_list = matrix.dims(); + int num_dims = dims_list.size(); + return dims_list[num_dims - 1] * dims_list[num_dims - 2]; +} + +// Transpose two axis of a Tensor +template +void TransposeTwoAxis(const Tensor& input, Tensor* transposed_input, + const int axis1, const int axis2, + const framework::ExecutionContext& context) { + std::vector permute(input.dims().size()); + std::iota(permute.begin(), permute.end(), 0); + permute[axis1] = axis2; + permute[axis2] = axis1; + + transposed_input->mutable_data(input.dims(), context.GetPlace()); + auto& dev_ctx = context.template device_context(); + + TransCompute(input.dims().size(), dev_ctx, input, + transposed_input, permute); +} + +// Apply eig to a batch of matrices, values, vectors and (intermidiate +// tensor) info are overritten +template +void LapackEig(Tensor* input, Tensor* values, Tensor* vectors, int info, + const framework::ExecutionContext& context) { + char jobvl = 'N'; + char jobvr = 'V'; // only right eigenvectors are computed + int num_dims = input->dims().size(); + int order = input->dims()[num_dims - 1]; + + T* input_data = input->data(); + int lda = std::max(1, order); + T* values_data = values->mutable_data(context.GetPlace()); + T* lvector_data = nullptr; + int ldvl = 1; + T* rvector_data = vectors->mutable_data(context.GetPlace()); + int ldvr = lda; + int lwork = -1; + + int batch_count = BatchCount(*input); + int matrix_stride = MatrixStride(*input); + int values_stride = values->dims()[values->dims().size() - 1]; + + Tensor rwork; + math::Real* rwork_data = nullptr; + + rwork.Resize(framework::make_ddim({lda * 2})); + rwork_data = rwork.mutable_data>(context.GetPlace()); + + // call lapackEig once to compute the size of work; + T computed_work_size; + math::lapackEig>( + jobvl, jobvr, order, input_data, lda, values_data, lvector_data, ldvl, + rvector_data, ldvr, &computed_work_size, lwork, rwork_data, &info); + + lwork = std::max(1, static_cast(math::Real(computed_work_size))); + Tensor work; + work.Resize(framework::make_ddim({lwork})); + T* work_data = work.mutable_data(context.GetPlace()); + + for (auto i = 0; i < batch_count; ++i) { + T* current_matrix = &input_data[i * matrix_stride]; + T* current_values = &values_data[i * values_stride]; + T* current_rvectors = &rvector_data[i * matrix_stride]; + + math::lapackEig>( + jobvl, 
jobvr, order, current_matrix, lda, current_values, lvector_data, + ldvl, current_rvectors, ldvr, work_data, lwork, rwork_data, &info); + PADDLE_ENFORCE_EQ( + info, 0, + platform::errors::PreconditionNotMet( + "current info is not 0, computation failed. " + "= 0: successful exit." + "< 0: if INFO = -i, the i-th argument had an illegal value." + "> 0: if INFO = i, the QR algorithm failed to compute all the " + "eigenvalues, and no eigenvectors have been computed; " + "elements i+1:N of WR and WI contain eigenvalues which " + "have converged.")); + } +} + +template +void ApplyEigKernel(const Tensor& input, Tensor* values, Tensor* vectors, + const framework::ExecutionContext& context) { + Tensor input_column_major; + Tensor vectors_row_major; + int num_dims = input.dims().size(); + + // transfer to column-major memory layout i.e. make_ddim from tranposed_input: + // [batch,row,col]->[batch,col,row] + TransposeTwoAxis(input, &input_column_major, num_dims - 1, + num_dims - 2, context); + // make sure 'vectors_row_major' holds memory before passed to LapackEig() + vectors_row_major.Resize(input.dims()); + int info = 0; + LapackEig(&input_column_major, values, &vectors_row_major, info, context); + + // transfer column-major layout back + // vectors_row_major: column-major layout + // vector: original layout + TransposeTwoAxis(vectors_row_major, vectors, num_dims - 1, + num_dims - 2, context); +} + +template +void ConstructComplexVectors(Tensor* c_vectors, const Tensor& c_values, + const Tensor& r_vectors, + const framework::ExecutionContext& ctx, + int batch_count, int order) { + int matrix_stride = MatrixStride(r_vectors); + + auto* c_vectors_data = c_vectors->mutable_data(ctx.GetPlace()); + auto* c_values_data = c_values.data(); + auto* r_v_data = r_vectors.data(); + + for (int b = 0; b < batch_count; b++) { + auto* vecs = &r_v_data[b * matrix_stride]; + auto* res = &c_vectors_data[b * matrix_stride]; + auto* vals = &c_values_data[b * order]; + + for (int j = 0; j < order; j++) { + if (vals[j].imag < EPSILON) { + for (int i = 0; i < order; i++) { + res[j * order + i] = platform::complex(vecs[j * order + i], 0); + } + } else { + for (int i = 0; i < order; i++) { + res[j * order + i] = platform::complex(vecs[j * order + i], + vecs[(j + 1) * order + i]); + res[(j + 1) * order + i] = platform::complex( + vecs[j * order + i], -vecs[(j + 1) * order + i]); + } + j++; + } + } + } +} + +template +class EigKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* x = context.Input("X"); + auto* out_values = context.Output("Eigenvalues"); + auto* out_vectors = context.Output("Eigenvectors"); + + if (!framework::IsComplexType(x->type())) { + out_values->mutable_data(context.GetPlace()); + out_vectors->mutable_data(context.GetPlace()); + + int batch_count = BatchCount(*x); + int order = x->dims()[x->dims().size() - 1]; + + Tensor real_values; + Tensor real_vectors; + // double the size of real_values, the first half stores the real part, + // the next half stores the imag part + std::vector origin_dim = + framework::vectorize(out_values->dims()); + int last_item = origin_dim.back(); + origin_dim.pop_back(); + origin_dim.push_back(last_item * 2); + framework::DDim big_dim = framework::make_ddim(origin_dim); + + real_values.mutable_data>(big_dim, context.GetPlace()); + real_vectors.mutable_data>(x->dims(), context.GetPlace()); + + ApplyEigKernel>(*x, &real_values, + &real_vectors, context); + auto dito = + 
math::DeviceIndependenceTensorOperations, + Tout>(context); + + // 1. extract real part & imag part from real_values + Tensor real_part = dito.Slice(real_values, {-1}, {0}, {order}); + Tensor imag_part = dito.Slice(real_values, {-1}, {order}, {order * 2}); + + // 2. construct complex values + auto* real_part_data = real_part.data>(); + auto* imag_part_data = imag_part.data>(); + int out_values_numel = out_values->numel(); + platform::ForRange for_range( + context.template device_context(), out_values_numel); + math::RealImagToComplexFunctor functor( + real_part_data, imag_part_data, + out_values->mutable_data(context.GetPlace()), out_values_numel); + for_range(functor); + + // 3. construct complex vectors + Tensor real_vector_trans = dito.Transpose(real_vectors); + Tensor out_vectors_trans; + out_vectors_trans.mutable_data(x->dims(), context.GetPlace()); + ConstructComplexVectors, Tout>( + &out_vectors_trans, *out_values, real_vector_trans, context, + batch_count, order); + TransposeTwoAxis(out_vectors_trans, out_vectors, + x->dims().size() - 1, + x->dims().size() - 2, context); + } else { + out_values->mutable_data(context.GetPlace()); + out_vectors->mutable_data(context.GetPlace()); + + ApplyEigKernel(*x, out_values, out_vectors, context); + } + } +}; + +template +void ComputeBackwardForComplexInput( + const Tensor& V, const Tensor& L, const Tensor& gL, const Tensor& gV, + Tout* x_grad_data, int batch_count, int order, + const framework::ExecutionContext& context) { + auto dito = + math::DeviceIndependenceTensorOperations( + context); + + Tensor trans_v = dito.Transpose(V); + Tensor Vh = dito.Conj(trans_v); + Tensor Lconj = dito.Conj(L); + Tensor Econj = dito.Sub(dito.Unsqueeze(Lconj, -2), dito.Unsqueeze(Lconj, -1)); + Tensor VhgV = dito.Matmul(Vh, gV); + Tensor diag_real = dito.Real(VhgV); + Tensor diag_res = dito.BatchDiag(diag_real, batch_count); + Tensor diag_unsqueezed = dito.Unsqueeze(diag_res, -2); + + // turn diag_unsqueezed into complex + auto numel = diag_unsqueezed.numel(); + Tensor diag_unsqueezed_complex; + auto* data_diag_un = diag_unsqueezed.data>(); + auto* data_diag_un_com = diag_unsqueezed_complex.mutable_data( + diag_unsqueezed.dims(), context.GetPlace(), + static_cast(numel * sizeof(Tout))); + auto& dev_ctx = context.template device_context(); + platform::ForRange for_range(dev_ctx, numel); + math::RealToComplexFunctor functor(data_diag_un, data_diag_un_com, + numel); + for_range(functor); + // real tensor multiply complex tensor in broadcast manner + Tensor res1 = dito.RealMulComplex(V, diag_unsqueezed_complex); + Tensor res2 = dito.Matmul(Vh, res1); + Tensor result = dito.Sub(VhgV, res2); + + result.mutable_data(V.dims(), context.GetPlace()); + result = dito.Div(result, Econj); + result = dito.DiagFill(order, order, order, 0, gL, result); + Tensor rhs = dito.Matmul(result, Vh); + + // solve linear system + // solve(Vh, rhs, out, m, k) + // Vh: matrix with shape [m,m] + // rhs: rhs with shape [m,k] + // x_grad: out + int m = Vh.dims()[Vh.dims().size() - 1]; + int k = rhs.dims()[rhs.dims().size() - 1]; + auto* matrix_data = Vh.data(); + auto* rhs_data = rhs.data(); + math::SolveLinearSystem(matrix_data, rhs_data, x_grad_data, m, k, + batch_count); +} + +template +class EigGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto& L = *context.Input("Eigenvalues"); + auto& V = *context.Input("Eigenvectors"); + auto& gL = *context.Input(framework::GradVarName("Eigenvalues")); + auto& gV = 
*context.Input(framework::GradVarName("Eigenvectors")); + + auto& x_grad = *context.Output(framework::GradVarName("X")); + auto* x_grad_data = x_grad.mutable_data(context.GetPlace()); + + auto& dims = V.dims(); + framework::DDim dim_origin = dims; + int num_dims = dim_origin.size(); + int batch_count = BatchCount(V); + const int order = dim_origin[num_dims - 1]; + + ComputeBackwardForComplexInput( + V, L, gL, gV, x_grad_data, batch_count, order, context); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/matrix_solve.h b/paddle/fluid/operators/math/matrix_solve.h index 93c37ae425640f..415d0c6dd8e0cf 100644 --- a/paddle/fluid/operators/math/matrix_solve.h +++ b/paddle/fluid/operators/math/matrix_solve.h @@ -70,6 +70,46 @@ void compute_solve_eigen(const DeviceContext& context, } } +// only used for complex input +template +void SolveLinearSystem(T* matrix_data, T* rhs_data, T* out_data, int order, + int rhs_cols, int batch) { + using Treal = typename Eigen::NumTraits::Real; + + // cast paddle::complex into std::complex + std::complex* matrix_data_ = + reinterpret_cast*>(matrix_data); + std::complex* rhs_data_ = + reinterpret_cast*>(rhs_data); + std::complex* out_data_ = + reinterpret_cast*>(out_data); + + using Matrix = Eigen::Matrix, Eigen::Dynamic, + Eigen::Dynamic, Eigen::RowMajor>; + using InputMatrixMap = Eigen::Map; + using OutputMatrixMap = Eigen::Map; + + for (int i = 0; i < batch; ++i) { + auto input_matrix = + InputMatrixMap(matrix_data_ + i * order * order, order, order); + auto input_rhs = + InputMatrixMap(rhs_data_ + i * order * rhs_cols, order, rhs_cols); + auto output = + OutputMatrixMap(out_data_ + i * order * rhs_cols, order, rhs_cols); + + Eigen::PartialPivLU lu_decomposition(order); + lu_decomposition.compute(input_matrix); + + const Treal min_abs_piv = + lu_decomposition.matrixLU().diagonal().cwiseAbs().minCoeff(); + PADDLE_ENFORCE_GT(min_abs_piv, Treal(0), + platform::errors::InvalidArgument( + "Something's wrong with SolveLinearSystem. 
")); + + output = lu_decomposition.solve(input_rhs); + } +} + template class MatrixSolveFunctor { public: diff --git a/paddle/fluid/operators/svd_helper.h b/paddle/fluid/operators/svd_helper.h index d592c62d499b35..9ba7c9a3062a04 100644 --- a/paddle/fluid/operators/svd_helper.h +++ b/paddle/fluid/operators/svd_helper.h @@ -96,6 +96,20 @@ struct PowFunctor { float exp_; }; +template +struct RealMulComplexFunctor { + // x: complex number (a+bj) + // y: complex number (c+0j) pretend to be a real number + // out: complex number (ac+bcj) + inline HOSTDEVICE T operator()(T x, T y) { + PADDLE_ENFORCE_LT(y.imag, 1e-6, platform::errors::InvalidArgument( + "The image part of y must to be 0" + "but got [%d]", + y.imag)); + return platform::complex>(x.real * y.real, x.imag * y.real); + } +}; + static std::vector GetBroadcastShape(InTensors ins) { PADDLE_ENFORCE_EQ(ins.size(), 2, platform::errors::InvalidArgument( "GetBroadcastShape Receive 2 tensors" @@ -286,6 +300,45 @@ struct DeviceIndependenceTensorOperations { for_range(DiagFunctor(x.data(), x.numel(), output)); return ret; } + + // batch_diag for CPU only + Tensor BatchDiag(const Tensor& x, int batch) { + Tensor out; + auto* x_data = x.data>(); + auto numel = x.numel(); + auto* out_data = out.mutable_data>( + x.dims(), context.GetPlace(), + static_cast(numel * sizeof(math::Real))); + + auto x_dims = x.dims(); + int num_dims = x_dims.size(); + std::vector out_shape; + + for (int i = 0; i < num_dims - 1; ++i) { + out_shape.push_back(x.dims()[i]); + } + out.Resize(framework::make_ddim(out_shape)); + int order = x.dims()[num_dims - 1]; + int stride_out = order * order; + int stride_in = order + 1; + for (int i = 0; i < batch; ++i) { + for (int j = 0; j < order; ++j) { + out_data[i * order + j] = x_data[stride_out * i + stride_in * j]; + } + } + return out; + } + + // a complex number x times a real number y, which is represented as (a+0j) + Tensor RealMulComplex(const Tensor& x, const Tensor& y) { + framework::Tensor ret; + std::vector out_shape = GetBroadcastShape({&x, &y}); + ret.Resize(framework::make_ddim(out_shape)); + ElementwiseComputeEx, DeviceContext, T>( + context, &x, &y, -1, RealMulComplexFunctor(), &ret); + return ret; + } + framework::Tensor Div(const framework::Tensor& x, const framework::Tensor& y) { framework::Tensor ret; @@ -459,6 +512,19 @@ struct DeviceIndependenceTensorOperations { return out; } + Tensor Real(const Tensor& x) { + Tensor out; + auto numel = x.numel(); + auto* out_data = out.mutable_data>( + x.dims(), context.GetPlace(), + static_cast(numel * sizeof(math::Real))); + auto* x_data = x.data(); + auto for_range = GetForRange(numel); + math::RealFunctor functor(x_data, out_data, numel); + for_range(functor); + return out; + } + Tensor DiagFill(const int m, const int n, const int num_lower_diags, const int num_upper_diags, const Tensor& scale, const Tensor& input) { diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index a50a667f663eed..3621d20fa24721 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -134,6 +134,10 @@ def product(dim): delta = np.array(delta).astype(np.float16) elif tensor_to_check_dtype == core.VarDesc.VarType.BF16: tensor_to_check_dtype = np.float32 + elif tensor_to_check_dtype == core.VarDesc.VarType.COMPLEX64: + tensor_to_check_dtype = np.complex64 + elif tensor_to_check_dtype == core.VarDesc.VarType.COMPLEX128: + tensor_tp_check_dtype = np.complex128 else: raise 
ValueError("Not supported data type " + str( tensor_to_check_dtype)) diff --git a/python/paddle/fluid/tests/unittests/test_eig_op.py b/python/paddle/fluid/tests/unittests/test_eig_op.py new file mode 100644 index 00000000000000..bb83de7d0dd674 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_eig_op.py @@ -0,0 +1,250 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +from op_test import OpTest, skip_check_grad_ci +import unittest +from paddle.fluid.op import Operator +from paddle.fluid import compiler, Program, program_guard + + +# cast output to complex for numpy.linalg.eig +def cast_to_complex(input, output): + if (input.dtype == np.float32): + output = output.astype(np.complex64) + elif (input.dtype == np.float64): + output = output.astype(np.complex128) + return output + + +# define eig backward function for a single square matrix +def eig_backward(w, v, grad_w, grad_v): + v_tran = np.transpose(v) + v_tran = np.conjugate(v_tran) + w_conj = np.conjugate(w) + w_conj_l = w_conj.reshape(1, w.size) + w_conj_r = w_conj.reshape(w.size, 1) + w_conj_2d = w_conj_l - w_conj_r + + vhgv = np.matmul(v_tran, grad_v) + real_vhgv = np.real(vhgv) + diag_real = real_vhgv.diagonal() + + diag_2d = diag_real.reshape(1, w.size) + rhs = v * diag_2d + mid = np.matmul(v_tran, rhs) + result = vhgv - mid + + res = np.divide(result, w_conj_2d) + row, col = np.diag_indices_from(res) + res[row, col] = 1.0 + + tmp = np.matmul(res, v_tran) + dx = np.linalg.solve(v_tran, tmp) + return dx + + +class TestEigOp(OpTest): + def setUp(self): + paddle.enable_static() + paddle.device.set_device("cpu") + self.op_type = "eig" + self.__class__.op_type = self.op_type + self.init_input() + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(self.x)} + self.outputs = {'Eigenvalues': self.out[0], 'Eigenvectors': self.out[1]} + + def init_input(self): + self.set_dtype() + self.set_dims() + self.x = np.random.random(self.shape).astype(self.dtype) + self.out = np.linalg.eig(self.x) + self.out = (cast_to_complex(self.x, self.out[0]), + cast_to_complex(self.x, self.out[1])) + + # for the real input, a customized checker is needed + def checker(self, outs): + actual_out_w = outs[0].flatten() + expect_out_w = self.out[0].flatten() + actual_out_v = outs[1].flatten() + expect_out_v = self.out[1].flatten() + + length_w = len(expect_out_w) + act_w_real = np.sort( + np.array([np.abs(actual_out_w[i].real) for i in range(length_w)])) + act_w_imag = np.sort( + np.array([np.abs(actual_out_w[i].imag) for i in range(length_w)])) + exp_w_real = np.sort( + np.array([np.abs(expect_out_w[i].real) for i in range(length_w)])) + exp_w_imag = np.sort( + np.array([np.abs(expect_out_w[i].imag) for i in range(length_w)])) + + for i in range(length_w): + self.assertTrue( + np.allclose(act_w_real[i], exp_w_real[i], 1e-6, 1e-5), + "The eigenvalues real part have diff: \nExpected " + + 
str(act_w_real[i]) + "\n" + "But got: " + str(exp_w_real[i])) + self.assertTrue( + np.allclose(act_w_imag[i], exp_w_imag[i], 1e-6, 1e-5), + "The eigenvalues image part have diff: \nExpected " + + str(act_w_imag[i]) + "\n" + "But got: " + str(exp_w_imag[i])) + + length_v = len(expect_out_v) + act_v_real = np.sort( + np.array([np.abs(actual_out_v[i].real) for i in range(length_v)])) + act_v_imag = np.sort( + np.array([np.abs(actual_out_v[i].imag) for i in range(length_v)])) + exp_v_real = np.sort( + np.array([np.abs(expect_out_v[i].real) for i in range(length_v)])) + exp_v_imag = np.sort( + np.array([np.abs(expect_out_v[i].imag) for i in range(length_v)])) + + for i in range(length_v): + self.assertTrue( + np.allclose(act_v_real[i], exp_v_real[i], 1e-6, 1e-5), + "The eigenvectors real part have diff: \nExpected " + + str(act_v_real[i]) + "\n" + "But got: " + str(exp_v_real[i])) + self.assertTrue( + np.allclose(act_v_imag[i], exp_v_imag[i], 1e-6, 1e-5), + "The eigenvectors image part have diff: \nExpected " + + str(act_v_imag[i]) + "\n" + "But got: " + str(exp_v_imag[i])) + + def set_dtype(self): + self.dtype = np.complex64 + + def set_dims(self): + self.shape = (10, 10) + + def init_grad(self): + # grad_w, grad_v complex dtype + gtype = self.dtype + if self.dtype == np.float32: + gtype = np.complex64 + elif self.dtype == np.float64: + gtype = np.complex128 + self.grad_w = np.ones(self.out[0].shape, gtype) + self.grad_v = np.ones(self.out[1].shape, gtype) + self.grad_x = eig_backward(self.out[0], self.out[1], self.grad_w, + self.grad_v) + + def test_check_output(self): + self.check_output_with_place_customized( + checker=self.checker, place=core.CPUPlace()) + + def test_check_grad(self): + self.init_grad() + self.check_grad( + ['X'], ['Eigenvalues', 'Eigenvectors'], + user_defined_grads=[self.grad_x], + user_defined_grad_outputs=[self.grad_w, self.grad_v]) + + +class TestComplex128(TestEigOp): + def set_dtype(self): + self.dtype = np.complex128 + + +@skip_check_grad_ci( + reason="For float dtype, numpy.linalg.eig forward outputs real or complex when input is real, therefore the grad computation may be not the same with paddle.linalg.eig" +) +class TestDouble(TestEigOp): + def set_dtype(self): + self.dtype = np.float64 + + def test_check_grad(self): + pass + + +@skip_check_grad_ci( + reason="For float dtype, numpy.linalg.eig forward outputs real or complex when input is real, therefore the grad computation may be not the same with paddle.linalg.eig" +) +class TestEigBatchMarices(TestEigOp): + def set_dtype(self): + self.dtype = np.float64 + + def set_dims(self): + self.shape = (3, 10, 10) + + def test_check_grad(self): + pass + + +@skip_check_grad_ci( + reason="For float dtype, numpy.linalg.eig forward outputs real or complex when input is real, therefore the grad computation may be not the same with paddle.linalg.eig" +) +class TestFloat(TestEigOp): + def set_dtype(self): + self.dtype = np.float32 + + def test_check_grad(self): + pass + + +class TestEigStatic(TestEigOp): + def test_check_output_with_place(self): + paddle.enable_static() + place = core.CPUPlace() + input_np = np.random.random([3, 3]).astype('complex') + expect_val, expect_vec = np.linalg.eig(input_np) + with fluid.program_guard(fluid.Program(), fluid.Program()): + input = fluid.data(name="input", shape=[3, 3], dtype='complex') + act_val, act_vec = paddle.linalg.eig(input) + + exe = fluid.Executor(place) + fetch_val, fetch_vec = exe.run(fluid.default_main_program(), + feed={"input": input_np}, + fetch_list=[act_val, act_vec]) + 
self.assertTrue( + np.allclose(expect_val, fetch_val, 1e-6, 1e-6), + "The eigen values have diff: \nExpected " + str(expect_val) + "\n" + + "But got: " + str(fetch_val)) + self.assertTrue( + np.allclose(np.abs(expect_vec), np.abs(fetch_vec), 1e-6, 1e-6), + "The eigen vectors have diff: \nExpected " + + str(np.abs(expect_vec)) + "\n" + "But got: " + + str(np.abs(fetch_vec))) + + +class TestEigWrongDimsError(unittest.TestCase): + def test_error(self): + paddle.device.set_device("cpu") + paddle.disable_static() + a = np.random.random((3)).astype('float32') + x = paddle.to_tensor(a) + self.assertRaises(ValueError, paddle.linalg.eig, x) + + +class TestEigNotSquareError(unittest.TestCase): + def test_error(self): + paddle.device.set_device("cpu") + paddle.disable_static() + a = np.random.random((1, 2, 3)).astype('float32') + x = paddle.to_tensor(a) + self.assertRaises(ValueError, paddle.linalg.eig, x) + + +class TestEigUnsupportedDtypeError(unittest.TestCase): + def test_error(self): + paddle.device.set_device("cpu") + paddle.disable_static() + a = (np.random.random((3, 3)) * 10).astype('int64') + x = paddle.to_tensor(a) + self.assertRaises(ValueError, paddle.linalg.eig, x) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/linalg.py b/python/paddle/linalg.py index d57d9a4bdb6780..726355379e7b63 100644 --- a/python/paddle/linalg.py +++ b/python/paddle/linalg.py @@ -14,6 +14,7 @@ from .tensor.linalg import cholesky # noqa: F401 from .tensor.linalg import norm # noqa: F401 +from .tensor.linalg import eig # noqa: F401 from .tensor.linalg import cond # noqa: F401 from .tensor.linalg import matrix_power # noqa: F401 from .tensor.linalg import solve # noqa: F401 @@ -32,6 +33,7 @@ 'norm', 'cond', 'inv', + 'eig', 'eigvals', 'multi_dot', 'matrix_rank', diff --git a/python/paddle/tensor/__init__.py b/python/paddle/tensor/__init__.py index 080a06455a681a..b5d79b60393202 100755 --- a/python/paddle/tensor/__init__.py +++ b/python/paddle/tensor/__init__.py @@ -45,6 +45,7 @@ from .linalg import bmm # noqa: F401 from .linalg import histogram # noqa: F401 from .linalg import mv # noqa: F401 +from .linalg import eig # noqa: F401 from .linalg import matrix_power # noqa: F401 from .linalg import eigvals # noqa: F401 from .linalg import multi_dot # noqa: F401 @@ -386,6 +387,7 @@ 'bitwise_xor', 'bitwise_not', 'broadcast_tensors', + 'eig', 'uniform_', 'multi_dot', 'solve', diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py index 9ba9370a43087d..f112603fbb60f1 100644 --- a/python/paddle/tensor/linalg.py +++ b/python/paddle/tensor/linalg.py @@ -23,6 +23,7 @@ from paddle.common_ops_import import core from paddle.common_ops_import import VarDesc from paddle import _C_ops +import paddle __all__ = [] @@ -1593,6 +1594,72 @@ def matrix_power(x, n, name=None): return out +def eig(x, name=None): + """ + This API performs the eigenvalue decomposition of a square matrix or a batch of square matrices. + + .. note:: + If the matrix is a Hermitian or a real symmetric matrix, please use :ref:`paddle.linalg.eigh` instead, which is much faster. + If only eigenvalues is needed, please use :ref:`paddle.linalg.eigvals` instead. + If the matrix is of any shape, please use :ref:`paddle.linalg.svd`. + This API is only supported on CPU device. + The output datatype is always complex for both real and complex input. + + Args: + x (Tensor): A tensor with shape math:`[*, N, N]`, The data type of the x should be one of ``float32``, + ``float64``, ``compplex64`` or ``complex128``. 
+        name (str, optional): The default value is `None`. Normally there is no need for the user to set
+            this property. For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        Eigenvalues(Tensor): A tensor with shape :math:`[*, N]` that holds the eigenvalues.
+        Eigenvectors(Tensor): A tensor with shape :math:`[*, N, N]` that holds the eigenvectors.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+
+            paddle.device.set_device("cpu")
+
+            x_data = np.array([[1.6707249, 7.2249975, 6.5045543],
+                               [9.956216, 8.749598, 6.066444 ],
+                               [4.4251957, 1.7983172, 0.370647 ]]).astype("float64")
+            x = paddle.to_tensor(x_data)
+            w, v = paddle.linalg.eig(x)
+            print(w)
+            # Tensor(shape=[3], dtype=complex128, place=CPUPlace, stop_gradient=False,
+            #       [ (16.50471283351188+0j)  , (-5.5034820550763515+0j) ,
+            #         (-0.21026087843552282+0j)])
+
+            print(v)
+            # Tensor(shape=[3, 3], dtype=complex128, place=CPUPlace, stop_gradient=False,
+            #       [[(-0.5061363550800655+0j) , (-0.7971760990842826+0j) ,
+            #         (0.18518077798279986+0j) ],
+            #        [(-0.8308237755993192+0j) , (0.3463813401919749+0j)  ,
+            #         (-0.6837005269141947+0j) ],
+            #        [(-0.23142567697893396+0j), (0.4944999840400175+0j)  ,
+            #         (0.7058765252952796+0j)  ]])
+    """
+    if in_dygraph_mode():
+        w, v = _C_ops.eig(x)
+        return w, v
+
+    check_variable_and_dtype(
+        x, 'X', ['float32', 'float64', 'complex64', 'complex128'], 'eig')
+    helper = LayerHelper('eig', **locals())
+
+    w = helper.create_variable_for_type_inference(x.dtype)
+    v = helper.create_variable_for_type_inference(x.dtype)
+
+    inputs = {'X': x}
+    outputs = {'Eigenvalues': w, 'Eigenvectors': v}
+    helper.append_op(type='eig', inputs=inputs, outputs=outputs)
+
+    return w, v
+
+
 def eigvals(x, name=None):
     """
     Compute the eigenvalues of one or more general matrices.
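[editor's note] A quick way to sanity-check the new paddle.linalg.eig API from the patch above is to verify the defining relation A v_i = w_i v_i column by column. The snippet below is a minimal sketch rather than part of the patch; it assumes a CPU build with this patch applied and uses NumPy only for the comparison.

    import numpy as np
    import paddle

    paddle.device.set_device("cpu")  # eig runs on CPU only in this patch

    a = np.random.rand(4, 4).astype("float32")
    w, v = paddle.linalg.eig(paddle.to_tensor(a))  # w: eigenvalues [4], v: eigenvectors [4, 4]

    # Each column i of v should satisfy a @ v[:, i] == w[i] * v[:, i].
    lhs = a.astype("complex64") @ v.numpy()
    rhs = v.numpy() * w.numpy()            # broadcasting scales column i by w[i]
    print(np.allclose(lhs, rhs, atol=1e-4))  # expected: True
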
From 6b587e93d4b3c92ee8c6302339e42a140ee52062 Mon Sep 17 00:00:00 2001 From: Liu-xiandong <85323580+Liu-xiandong@users.noreply.github.com> Date: Tue, 28 Sep 2021 19:18:22 +0800 Subject: [PATCH 036/298] Add sparse_attention api, test=develop (#35676) Add sparse_attention OPs, python api will be added in next pr --- cmake/operators.cmake | 2 +- paddle/fluid/operators/CMakeLists.txt | 6 +- paddle/fluid/operators/sparse_attention_op.cc | 193 +++++++ paddle/fluid/operators/sparse_attention_op.cu | 537 ++++++++++++++++++ .../unittests/test_sparse_attention_op.py | 205 +++++++ .../white_list/op_threshold_white_list.py | 1 + 6 files changed, 942 insertions(+), 2 deletions(-) create mode 100644 paddle/fluid/operators/sparse_attention_op.cc create mode 100644 paddle/fluid/operators/sparse_attention_op.cu create mode 100644 python/paddle/fluid/tests/unittests/test_sparse_attention_op.py diff --git a/cmake/operators.cmake b/cmake/operators.cmake index 2c010a1e6297f0..7541b234ceaa69 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -214,7 +214,7 @@ function(op_library TARGET) foreach(manual_pybind_op "compare_all_op" "compare_op" "logical_op" "bitwise_op" "nccl_op" "tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op" "fusion_transpose_flatten_concat_op" "fusion_conv_inception_op" -"sync_batch_norm_op" "dgc_op" "fused_fc_elementwise_layernorm_op" +"sync_batch_norm_op" "sparse_attention_op" "dgc_op" "fused_fc_elementwise_layernorm_op" "skip_layernorm_op" "multihead_matmul_op" "fusion_group_op" "fused_bn_activation_op" "fused_embedding_eltwise_layernorm_op" "fusion_gru_op" "fusion_lstm_op" "fused_bn_add_activation_op") if ("${TARGET}" STREQUAL "${manual_pybind_op}") diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 0d7d0a5e13bf3d..c487313f91c588 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -78,7 +78,7 @@ if(WITH_UNITY_BUILD) include(unity_build_rule.cmake) endif() -register_operators(EXCLUDES py_layer_op py_func_op warpctc_op dgc_op lstm_op run_program_op eye_op recurrent_op +register_operators(EXCLUDES py_layer_op py_func_op warpctc_op dgc_op sparse_attention_op lstm_op run_program_op eye_op recurrent_op sync_batch_norm_op spectral_op ${OP_MKL_DEPS} DEPS ${OP_HEADER_DEPS}) op_library(run_program_op SRCS run_program_op.cc run_program_op.cu.cc DEPS executor_cache ${OP_HEADER_DEPS}) @@ -94,6 +94,10 @@ if (WITH_GPU OR WITH_ROCM) endif() op_library(sync_batch_norm_op) file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(sync_batch_norm);\n") + if ((NOT WIN32) AND (NOT WITH_ROCM) AND (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_LESS 11.2) ) + op_library(sparse_attention_op) + file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(sparse_attention);\n") + endif() else() op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale) endif() diff --git a/paddle/fluid/operators/sparse_attention_op.cc b/paddle/fluid/operators/sparse_attention_op.cc new file mode 100644 index 00000000000000..9b6bc1b6290451 --- /dev/null +++ b/paddle/fluid/operators/sparse_attention_op.cc @@ -0,0 +1,193 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +class SparseAttentionOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput( + "Q", + "(Tensor), The input tensor of query in attention, " + "whose dimension : `[batch_size, num_heads, target_len, head_dim]`."); + AddInput( + "K", + "(Tensor), The input tensor of key in attention, " + "whose dimension : `[batch_size, num_heads, target_len, head_dim]`."); + AddInput( + "V", + "(Tensor), The input tensor of value in attention, " + "whose dimension : `[batch_size, num_heads, target_len, head_dim]`."); + AddInput("Offset", + "(Tensor, default: Tensor), The input tensor of offset in " + "CSR sparse format, " + "whose dimension : `[batch_size, num_heads, target_len + 1]`."); + AddInput("Columns", + "(Tensor, default: Tensor), The input tensor of columns in " + "CSR sparse format, " + "whose dimension : `[batch_size, num_heads, sparse_nnz_num]`."); + AddOutput( + "Out", + "(Tensor), The output tensor of result in attention, " + "whose dimension : `[batch_size, num_heads, target_len, head_dim]`."); + AddOutput("SparseDotSdd", + "(Tensor), The output tensor of result in SparseDotSdd step, " + "whose dimension : `[batch_size, num_heads, sparse_nnz_dim]`.") + .AsIntermediate(); + AddOutput("Softmax", + "(Tensor), The output tensor of result in Softmax step, " + "whose dimension : `[batch_size, num_heads, sparse_nnz_dim]`.") + .AsIntermediate(); + AddComment(R"DOC( + Compute the value of the sparse attention module. Its input value includes five tensors. + Q, K, and V represent query, key, and value in the Attention module, respectively. + The CSR format is used to represent the sparsity feature in the Attention module. + The CSR format contains two tensors, offset and columns. 
+ )DOC"); + } +}; + +class SparseAttentionOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("Q"), "Input", "Q", "sparse_attention"); + OP_INOUT_CHECK(ctx->HasInput("K"), "Input", "K", "sparse_attention"); + OP_INOUT_CHECK(ctx->HasInput("V"), "Input", "V", "sparse_attention"); + OP_INOUT_CHECK(ctx->HasInput("Offset"), "Input", "Offset", + "sparse_attention"); + OP_INOUT_CHECK(ctx->HasInput("Columns"), "Input", "Columns", + "sparse_attention"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "sparse_attention"); + OP_INOUT_CHECK(ctx->HasOutput("SparseDotSdd"), "Output", "SparseDotSdd", + "sparse_attention"); + OP_INOUT_CHECK(ctx->HasOutput("Softmax"), "Output", "Softmax", + "sparse_attention"); + + auto dims_q = ctx->GetInputDim("Q"); + auto dims_k = ctx->GetInputDim("K"); + auto dims_v = ctx->GetInputDim("V"); + auto dims_columns = ctx->GetInputDim("Columns"); + + PADDLE_ENFORCE_EQ(dims_q.size(), static_cast(4), + platform::errors::InvalidArgument( + "Dimension in query' shapes should be 4.")); + PADDLE_ENFORCE_EQ(dims_k.size(), static_cast(4), + platform::errors::InvalidArgument( + "Dimension in key' shapes should be 4.")); + PADDLE_ENFORCE_EQ(dims_v.size(), static_cast(4), + platform::errors::InvalidArgument( + "Dimension in value' shapes should be 4.")); + + auto batch_size = dims_q[0]; + auto num_heads = dims_q[1]; + auto M = dims_q[2]; + auto N = dims_q[3]; + auto sparse_nnz = dims_columns[2]; + ctx->SetOutputDim("Out", {batch_size, num_heads, M, N}); + ctx->SetOutputDim("SparseDotSdd", {batch_size, num_heads, sparse_nnz}); + ctx->SetOutputDim("Softmax", {batch_size, num_heads, sparse_nnz}); + ctx->ShareLoD("Q", "Out"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto input_data_type = + OperatorWithKernel::IndicateOrPromoteVarDataTypes(ctx, "Q", "K"); + return framework::OpKernelType(input_data_type, ctx.GetPlace()); + } +}; + +class SparseAttentionOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("Q"), "Input", "Q", "sparse_attention_grad"); + OP_INOUT_CHECK(ctx->HasInput("K"), "Input", "K", "sparse_attention_grad"); + OP_INOUT_CHECK(ctx->HasInput("V"), "Input", "V", "sparse_attention_grad"); + OP_INOUT_CHECK(ctx->HasInput("Offset"), "Input", "Offset", + "sparse_attention_grad"); + OP_INOUT_CHECK(ctx->HasInput("Columns"), "Input", "Columns", + "sparse_attention_grad"); + OP_INOUT_CHECK(ctx->HasInput("SparseDotSdd"), "Input", "SparseDotSdd", + "sparse_attention_grad"); + OP_INOUT_CHECK(ctx->HasInput("Softmax"), "Input", "Softmax", + "sparse_attention_grad"); + OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), "Input", + "Out@GRAD", "sparse_attention_grad"); + + auto x_grad_name = framework::GradVarName("Q"); + auto y_grad_name = framework::GradVarName("K"); + auto z_grad_name = framework::GradVarName("V"); + + if (ctx->HasOutput(x_grad_name)) { + ctx->SetOutputDim(x_grad_name, ctx->GetInputDim("Q")); + } + if (ctx->HasOutput(y_grad_name)) { + ctx->SetOutputDim(y_grad_name, ctx->GetInputDim("K")); + } + if (ctx->HasOutput(z_grad_name)) { + ctx->SetOutputDim(z_grad_name, ctx->GetInputDim("V")); + } + } + + framework::OpKernelType 
GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType( + ctx, framework::GradVarName("Out")), + ctx.GetPlace()); + } +}; + +template +class SparseAttentionGradOpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType("sparse_attention_grad"); + op->SetInput("Q", this->Input("Q")); + op->SetInput("K", this->Input("K")); + op->SetInput("V", this->Input("V")); + op->SetInput("Offset", this->Input("Offset")); + op->SetInput("Columns", this->Input("Columns")); + op->SetInput("SparseDotSdd", this->Output("SparseDotSdd")); + op->SetInput("Softmax", this->Output("Softmax")); + op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + op->SetOutput(framework::GradVarName("Q"), this->InputGrad("Q")); + op->SetOutput(framework::GradVarName("K"), this->InputGrad("K")); + op->SetOutput(framework::GradVarName("V"), this->InputGrad("V")); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(sparse_attention, ops::SparseAttentionOp, + ops::SparseAttentionOpMaker, + ops::SparseAttentionGradOpMaker, + ops::SparseAttentionGradOpMaker); + +REGISTER_OPERATOR(sparse_attention_grad, ops::SparseAttentionOpGrad); diff --git a/paddle/fluid/operators/sparse_attention_op.cu b/paddle/fluid/operators/sparse_attention_op.cu new file mode 100644 index 00000000000000..88ee8999c5f4af --- /dev/null +++ b/paddle/fluid/operators/sparse_attention_op.cu @@ -0,0 +1,537 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include +#include +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/op_registry.h" +#if defined(PADDLE_WITH_CUDA) +#include "paddle/fluid/platform/dynload/cusparse.h" +#endif + +namespace ops = paddle::operators; +namespace plf = paddle::platform; + +namespace paddle { +namespace operators { + +template +__forceinline__ __device__ T CudaShuffleXorSync(unsigned mask, T val, + int width = warpSize) { + return __shfl_xor_sync(mask, val, width); +} + +template +__device__ __forceinline__ void WarpReduceSum(T* sum) { +#pragma unroll + for (int offset = warp_size / 2; offset > 0; offset /= 2) { +#pragma unroll + for (int i = 0; i < batch_size; ++i) { + T sum_val = CudaShuffleXorSync(0xFFFFFFFF, sum[i], offset); + sum[i] = sum[i] + sum_val; + } + } +} + +template +__device__ __forceinline__ void WarpReduceMax(T* sum) { +#pragma unroll + for (int offset = warp_size / 2; offset > 0; offset /= 2) { +#pragma unroll + for (int i = 0; i < batch_size; ++i) { + T max_val = CudaShuffleXorSync(0xFFFFFFFF, sum[i], offset); + sum[i] = max(sum[i], max_val); + } + } +} + +template +__global__ void BlockSparseSoftmaxForward(T* softmax, const T* src, T scale, + const T* kp_mask, const T* attn_mask, + const int* layout_rowptr, + const int* layout_colindex, + int num_rows) { + // current thread related info + const int WarpSize = 32; + const int cur_row = blockIdx.x * blockDim.y + threadIdx.y; + if (cur_row < num_rows) { + const int cur_block_row = cur_row / BlockSize; + const int cur_block_nnz = + layout_rowptr[cur_block_row + 1] - layout_rowptr[cur_block_row]; + + T srcdata[(BlockSize * BlockNnzMax + WarpSize - 1) / WarpSize]; + T attndata[(BlockSize * BlockNnzMax + WarpSize - 1) / WarpSize]; + + // read kp mask + T cur_kp_mask = (kp_mask == nullptr) ? 0 : kp_mask[cur_row]; + + // read tensor data, attn mask + const int iter = (cur_block_nnz + WarpSize - 1) / WarpSize; + const T* srcptr = src + layout_rowptr[cur_block_row]; + T* attnptr = nullptr; + if (attn_mask != nullptr) { + const T* attnptr = attn_mask + cur_block_row * num_rows; + } + const int* colindex = layout_colindex + layout_rowptr[cur_block_row]; + for (int j = 0; j < iter; j++) { + int cur_block_col = j * WarpSize + threadIdx.x; + int cur_reg_index = j; + if (cur_block_col < cur_block_nnz) { + if ((attnptr != nullptr) && + std::abs(attnptr[colindex[cur_block_col]]) < + std::numeric_limits::epsilon()) { + srcdata[cur_reg_index] = + -std::numeric_limits::infinity() * scale + cur_kp_mask; + } else { + srcdata[cur_reg_index] = scale * srcptr[cur_block_col] + cur_kp_mask; + } + } else { + srcdata[cur_reg_index] = -std::numeric_limits::infinity(); + } + } + + // max value + T max_value = srcdata[0]; + const int kIteration = + (cur_block_nnz * BlockSize + WarpSize - 1) / WarpSize; +#pragma unroll + for (int it = 1; it < kIteration; ++it) { + max_value = (max_value > srcdata[it]) ? 
max_value : srcdata[it]; + } + WarpReduceMax(&max_value); + + // exp sum + T sum = 0; +#pragma unroll + for (int it = 0; it < kIteration; ++it) { + srcdata[it] = std::exp(srcdata[it] - max_value); + sum += srcdata[it]; + } + WarpReduceSum(&sum); + + // compute softmax and write out + T* softmaxptr = softmax + layout_rowptr[cur_block_row]; + for (int j = 0; j < iter; j++) { + int cur_block_col = j * WarpSize + threadIdx.x; + int cur_reg_index = j; + if (cur_block_col < cur_block_nnz) { + softmaxptr[cur_block_col] = srcdata[cur_reg_index] / sum; + } + } + } +} + +template +__global__ void BlockSparseSoftmaxBackward(T* dst, const T* grad, const T* src, + T scale, const int* layout_rowptr, + const int* layout_colindex, + int num_rows) { + // current thread related info + const int WarpSize = 32; + const int cur_row = blockIdx.x * blockDim.y + threadIdx.y; + if (cur_row < num_rows) { + const int cur_block_row = cur_row / BlockSize; + const int cur_block_nnz = + layout_rowptr[cur_block_row + 1] - layout_rowptr[cur_block_row]; + + T srcdata[(BlockSize * BlockNnzMax + WarpSize - 1) / WarpSize]; + T graddata[(BlockSize * BlockNnzMax + WarpSize - 1) / WarpSize]; + + // read tensor data, attn mask + const int iter = (cur_block_nnz + WarpSize - 1) / WarpSize; + const T* srcptr = src + layout_rowptr[cur_block_row]; + const T* gradptr = grad + layout_rowptr[cur_block_row]; + for (int j = 0; j < iter; j++) { + int cur_block_col = j * WarpSize + threadIdx.x; + int cur_reg_index = j; + if (cur_block_col < cur_block_nnz) { + srcdata[cur_reg_index] = srcptr[cur_block_col]; + graddata[cur_reg_index] = gradptr[cur_block_col]; + } else { + srcdata[cur_reg_index] = 0; + graddata[cur_reg_index] = 0; + } + } + + T sum = 0; + const int kIteration = + (cur_block_nnz * BlockSize + WarpSize - 1) / WarpSize; +#pragma unroll + for (int it = 0; it < kIteration; ++it) { + sum += srcdata[it] * graddata[it]; + } + WarpReduceSum(&sum); + + // compute softmax and write out + T* dstptr = dst + layout_rowptr[cur_block_row]; + for (int j = 0; j < iter; j++) { + int cur_block_col = j * WarpSize + threadIdx.x; + int cur_reg_index = j; + if (cur_block_col < cur_block_nnz) { + dstptr[cur_block_col] = + scale * srcdata[cur_reg_index] * (graddata[cur_reg_index] - sum); + } + } + } +} + +using Tensor = framework::Tensor; +/* +input: sparse C in CSR format (num_rows,num_rows) +output: sparse C after softmax operation +*/ +template +void SparseSoftmaxForward(const platform::CUDADeviceContext& ctx, + const Tensor* offset, const Tensor* columns, + Tensor* input, Tensor* output, const int blocksize, + const int num_rows, const int num_cols) { + const int* offset_data = offset->data(); + const int* columns_data = columns->data(); + T* input_data = input->data(); + T* output_data = output->data(); + + const int block_size = 1; + dim3 blocks(32, 4, 1); + int grid = (num_rows * block_size + 3) / 4; + T scaling = static_cast(1.0) / sqrt(static_cast(num_cols)); + + const int block_nnz_max = 256; + BlockSparseSoftmaxForward<<>>( + output_data, input_data, scaling, nullptr, nullptr, offset_data, + columns_data, num_rows); +} + +template +void SparseSoftmaxBackward(const platform::CUDADeviceContext& ctx, + const Tensor* offset, const Tensor* columns, + Tensor* dx, const Tensor* dout, const Tensor* out, + const int blocksize, const int num_rows, + const int num_cols) { + const int* offset_data = offset->data(); + const int* columns_data = columns->data(); + T* dx_data = dx->data(); + const T* dout_data = dout->data(); + const T* out_data = 
out->data(); + + const int block_size = 1; + dim3 blocks(32, 4, 1); + int grid = (num_rows * block_size + 3) / 4; + T scaling = static_cast(1.0) / sqrt(static_cast(num_cols)); + + const int block_nnz_max = 256; + BlockSparseSoftmaxBackward<<>>( + dx_data, dout_data, out_data, scaling, offset_data, columns_data, + num_rows); +} + +using VarType = framework::proto::VarType; +inline cudaDataType_t GetGpuType(const VarType::Type data_type) { + if (data_type == VarType::FP32) { + return CUDA_R_32F; + } else if (data_type == VarType::FP64) { + return CUDA_R_64F; + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Not support tensor type in sparse_attention OP: %s", + framework::DataTypeToString(data_type))); + } +} + +inline cusparseOperation_t GetTransposeOperation(const bool transpose) { + if (transpose) { + return CUSPARSE_OPERATION_TRANSPOSE; + } else { + return CUSPARSE_OPERATION_NON_TRANSPOSE; + } +} + +void CusparseDestroy(cusparseDnMatDescr_t* dn_mat_first, + cusparseDnMatDescr_t* dn_mat_second, + cusparseSpMatDescr_t* sp_mat) { + platform::dynload::cusparseDestroyDnMat(*dn_mat_first); + platform::dynload::cusparseDestroyDnMat(*dn_mat_second); + platform::dynload::cusparseDestroySpMat(*sp_mat); +} + +/* +input: dense A (num_rows,num_cols), dense B (num_rows,num_cols) +output: sparse C in CSR format (num_rows,num_rows) +*/ +template +void DotSdd(const platform::CUDADeviceContext& ctx, const Tensor* a, + const Tensor* b, const Tensor* c_offset, const Tensor* c_columns, + Tensor* c_value, const int num_rows, const int num_cols, + const bool a_transpose, const bool b_transpose) { + const T* a_data = a->data(); + const T* b_data = b->data(); + const int* c_offset_data = c_offset->data(); + const int* c_columns_data = c_columns->data(); + T* c_value_data = c_value->data(); + + cudaDataType_t gpu_type = GetGpuType(c_value->type()); + cusparseHandle_t handle = nullptr; + cusparseDnMatDescr_t mat_a, mat_b; + cusparseSpMatDescr_t mat_c; + platform::dynload::cusparseCreate(&handle); + + // Create dense matrix A + platform::dynload::cusparseCreateDnMat(&mat_a, num_rows, num_cols, num_cols, + const_cast(a_data), gpu_type, + CUSPARSE_ORDER_ROW); + // Create dense matrix B + platform::dynload::cusparseCreateDnMat(&mat_b, num_rows, num_cols, num_cols, + const_cast(b_data), gpu_type, + CUSPARSE_ORDER_ROW); + // Create sparse matrix C in CSR format + int c_nnz = c_columns->dims()[1]; + platform::dynload::cusparseCreateCsr( + &mat_c, num_rows, num_rows, c_nnz, const_cast(c_offset_data), + const_cast(c_columns_data), c_value_data, CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, gpu_type); + + T alpha = 1; + T beta = 0; + + size_t buffer_size = 0; + platform::dynload::cusparseSDDMM_bufferSize( + handle, GetTransposeOperation(a_transpose), + GetTransposeOperation(b_transpose), &alpha, mat_a, mat_b, &beta, mat_c, + gpu_type, CUSPARSE_SDDMM_ALG_DEFAULT, &buffer_size); + auto d_buffer_ptr = paddle::memory::Alloc(ctx, buffer_size); + void* d_buffer = static_cast(d_buffer_ptr->ptr()); + + platform::dynload::cusparseSDDMM(handle, GetTransposeOperation(a_transpose), + GetTransposeOperation(b_transpose), &alpha, + mat_a, mat_b, &beta, mat_c, gpu_type, + CUSPARSE_SDDMM_ALG_DEFAULT, d_buffer); + + CusparseDestroy(&mat_a, &mat_b, &mat_c); + platform::dynload::cusparseDestroy(handle); +} + +/* +input: sparse A in CSR format (num_rows,num_rows), dense B (num_rows,num_cols) +output: dense C (num_rows,num_cols) +*/ +template +void DotDsd(const platform::CUDADeviceContext& ctx, const Tensor* 
a_offset, + const Tensor* a_columns, const Tensor* a_value, const Tensor* b, + Tensor* c, const int num_rows, const int num_cols, + const bool a_transpose, const bool b_transpose) { + const int* a_offset_data = a_offset->data(); + const int* a_columns_data = a_columns->data(); + const T* a_value_data = a_value->data(); + const T* b_data = b->data(); + T* c_data = c->data(); + + cudaDataType_t gpu_type = GetGpuType(c->type()); + cusparseHandle_t handle = nullptr; + cusparseSpMatDescr_t mat_a; + cusparseDnMatDescr_t mat_b, mat_c; + platform::dynload::cusparseCreate(&handle); + + // Create sparse matrix A in CSR format + int a_nnz = a_columns->dims()[1]; + platform::dynload::cusparseCreateCsr( + &mat_a, num_rows, num_rows, a_nnz, const_cast(a_offset_data), + const_cast(a_columns_data), const_cast(a_value_data), + CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, + gpu_type); + + // Create dense matrix B + platform::dynload::cusparseCreateDnMat(&mat_b, num_rows, num_cols, num_cols, + const_cast(b_data), gpu_type, + CUSPARSE_ORDER_ROW); + // Create dense matrix C + platform::dynload::cusparseCreateDnMat(&mat_c, num_rows, num_cols, num_cols, + c_data, gpu_type, CUSPARSE_ORDER_ROW); + + T alpha = 1; + T beta = 0; + + size_t buffer_size = 0; + // allocate an external buffer if needed + platform::dynload::cusparseSpMM_bufferSize( + handle, GetTransposeOperation(a_transpose), + GetTransposeOperation(b_transpose), &alpha, mat_a, mat_b, &beta, mat_c, + gpu_type, CUSPARSE_SPMM_ALG_DEFAULT, &buffer_size); + auto d_buffer_ptr = paddle::memory::Alloc(ctx, buffer_size); + void* d_buffer = static_cast(d_buffer_ptr->ptr()); + + platform::dynload::cusparseSpMM(handle, GetTransposeOperation(a_transpose), + GetTransposeOperation(b_transpose), &alpha, + mat_a, mat_b, &beta, mat_c, gpu_type, + CUSPARSE_SPMM_ALG_DEFAULT, d_buffer); + + CusparseDestroy(&mat_b, &mat_c, &mat_a); + platform::dynload::cusparseDestroy(handle); +} + +std::vector GetSplitTensor(Tensor* input) { + auto dims = input->dims(); + int batch_size = dims[0]; + int num_heads = dims[1]; + std::vector new_dims(dims.size() - 1); + new_dims[0] = batch_size * num_heads; + for (int i = 1; i < new_dims.size(); i++) { + new_dims[i] = dims[i + 1]; + } + input->Resize(framework::make_ddim(new_dims)); + return input->Split(1, 0); +} + +template +class SparseAttentionCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto query = *ctx.Input("Q"); + auto key = *ctx.Input("K"); + auto value = *ctx.Input("V"); + auto offset = *ctx.Input("Offset"); + auto columns = *ctx.Input("Columns"); + auto output_ptr = ctx.Output("Out"); + output_ptr->mutable_data(ctx.GetPlace()); + auto sparse_dot_sdd_ptr = ctx.Output("SparseDotSdd"); + sparse_dot_sdd_ptr->mutable_data(ctx.GetPlace()); + auto softmax_ptr = ctx.Output("Softmax"); + softmax_ptr->mutable_data(ctx.GetPlace()); + + auto output = *output_ptr; + auto result_sdd = *sparse_dot_sdd_ptr; + auto result_softmax = *softmax_ptr; + + auto query_dims = query.dims(); + int batch_size = query_dims[0]; + int num_heads = query_dims[1]; + int M = query_dims[2]; + int N = query_dims[3]; + + std::vector query_lists = GetSplitTensor(&query); + std::vector key_lists = GetSplitTensor(&key); + std::vector value_lists = GetSplitTensor(&value); + std::vector offset_lists = GetSplitTensor(&offset); + std::vector columns_lists = GetSplitTensor(&columns); + std::vector result_sdd_lists = GetSplitTensor(&result_sdd); + std::vector result_softmax_lists 
= GetSplitTensor(&result_softmax); + std::vector output_lists = GetSplitTensor(&output); + + const auto& dev_ctx = ctx.cuda_device_context(); + const int iter_num = batch_size * num_heads; + for (int i = 0; i < iter_num; i++) { + DotSdd(dev_ctx, &query_lists[i], &key_lists[i], + &offset_lists[i], &columns_lists[i], + &result_sdd_lists[i], M, N, false, true); + + SparseSoftmaxForward( + dev_ctx, &offset_lists[i], &columns_lists[i], &result_sdd_lists[i], + &result_softmax_lists[i], 1, M, N); + + DotDsd(dev_ctx, &offset_lists[i], &columns_lists[i], + &result_softmax_lists[i], &value_lists[i], + &output_lists[i], M, N, false, false); + } + } +}; + +template +class SparseAttentionGradCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto query = *ctx.Input("Q"); + auto key = *ctx.Input("K"); + auto value = *ctx.Input("V"); + auto offset = *ctx.Input("Offset"); + auto columns = *ctx.Input("Columns"); + auto sparse_dot_sdd = *ctx.Input("SparseDotSdd"); + auto softmax = *ctx.Input("Softmax"); + auto dout = *ctx.Input(framework::GradVarName("Out")); + auto* dquery_ptr = ctx.Output(framework::GradVarName("Q")); + auto* dkey_ptr = ctx.Output(framework::GradVarName("K")); + auto* dvalue_ptr = ctx.Output(framework::GradVarName("V")); + dquery_ptr->mutable_data(ctx.GetPlace()); + dkey_ptr->mutable_data(ctx.GetPlace()); + dvalue_ptr->mutable_data(ctx.GetPlace()); + auto dquery = *dquery_ptr; + auto dkey = *dkey_ptr; + auto dvalue = *dvalue_ptr; + + auto query_dims = query.dims(); + int batch_size = query_dims[0]; + int num_heads = query_dims[1]; + int M = query_dims[2]; + int N = query_dims[3]; + + std::vector query_lists = GetSplitTensor(&query); + std::vector key_lists = GetSplitTensor(&key); + std::vector value_lists = GetSplitTensor(&value); + std::vector offset_lists = GetSplitTensor(&offset); + std::vector columns_lists = GetSplitTensor(&columns); + std::vector sparse_dot_sdd_lists = GetSplitTensor(&sparse_dot_sdd); + std::vector softmax_lists = GetSplitTensor(&softmax); + std::vector dout_lists = GetSplitTensor(&dout); + std::vector dquery_lists = GetSplitTensor(&dquery); + std::vector dkey_lists = GetSplitTensor(&dkey); + std::vector dvalue_lists = GetSplitTensor(&dvalue); + + const int iter_num = batch_size * num_heads; + const auto& dev_ctx = ctx.cuda_device_context(); + for (int i = 0; i < iter_num; i++) { + // dValue = transpose(result_softmax) * dOut + DotDsd(dev_ctx, &offset_lists[i], &columns_lists[i], + &softmax_lists[i], &dout_lists[i], + &dvalue_lists[i], M, N, true, false); + + // dSoftmax = dOut * transpose(Value) + int nnz_num = columns.dims()[0]; + Tensor dsoftmax; + dsoftmax.Resize({nnz_num}); + dsoftmax.mutable_data(ctx.GetPlace()); + DotSdd(dev_ctx, &dout_lists[i], &value_lists[i], + &offset_lists[i], &columns_lists[i], &dsoftmax, + M, N, false, true); + + // dSparseDotSdd = dSoftmax * softmax'(SparseDotSdd) + Tensor dsparse_dot_sdd; + dsparse_dot_sdd.Resize({nnz_num}); + dsparse_dot_sdd.mutable_data(ctx.GetPlace()); + SparseSoftmaxBackward( + dev_ctx, &offset_lists[i], &columns_lists[i], &dsparse_dot_sdd, + &dsoftmax, &softmax_lists[i], 1, M, N); + + // dQuery = dSparseDotSdd * Key + DotDsd(dev_ctx, &offset_lists[i], &columns_lists[i], + &dsparse_dot_sdd, &key_lists[i], + &dquery_lists[i], M, N, false, false); + + // dKey = transpose(dSparseDotSdd) * Query + DotDsd(dev_ctx, &offset_lists[i], &columns_lists[i], + &dsparse_dot_sdd, &query_lists[i], + &dkey_lists[i], M, N, true, false); + } + } +}; + +} // 
namespace operators +} // namespace paddle +REGISTER_OP_CUDA_KERNEL( + sparse_attention, + ops::SparseAttentionCUDAKernel, + ops::SparseAttentionCUDAKernel); + +REGISTER_OP_CUDA_KERNEL( + sparse_attention_grad, + ops::SparseAttentionGradCUDAKernel, + ops::SparseAttentionGradCUDAKernel); diff --git a/python/paddle/fluid/tests/unittests/test_sparse_attention_op.py b/python/paddle/fluid/tests/unittests/test_sparse_attention_op.py new file mode 100644 index 00000000000000..ad618edd24d55b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_sparse_attention_op.py @@ -0,0 +1,205 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np +from op_test import OpTest +import paddle.fluid.core as core +import paddle +import os +import re +import platform + + +def get_cuda_version(): + result = os.popen("nvcc --version").read() + regex = r'release (\S+),' + match = re.search(regex, result) + if match: + num = str(match.group(1)) + integer, decimal = num.split('.') + return int(integer) * 1000 + int(float(decimal) * 10) + else: + return -1 + + +def get_linux_platform(): + if platform.system().lower() == 'windows': + return 0 + elif platform.system().lower() == 'linux': + return 1 + else: + return -1 + + +def get_suitable_env(): + if get_cuda_version() >= 11020 and get_linux_platform() == 1: + return True + else: + return False + + +def softmax(x): + max = np.max(x, axis=1, keepdims=True) + e_x = np.exp(x - max) + sum = np.sum(e_x, axis=1, keepdims=True) + f_x = e_x / sum + return f_x + + +def get_csr_value(mat, layout, nnz): + row, col = mat.shape[0], mat.shape[1] + value = np.zeros(nnz) + ptr = 0 + for i in range(row): + for j in range(col): + if layout[i][j] == 1: + value[ptr] = mat[i][j] + ptr += 1 + return value + + +def ref_sparse_attention(q, k, v, offset, columns): + row, col, nnz = q.shape[0], q.shape[1], columns.shape[0] + mat = np.zeros((row, row)) + for cur_row in range(row): + start_ptr = int(offset[cur_row]) + end_ptr = int(offset[cur_row + 1]) + for ptr in range(start_ptr, end_ptr): + cur_col = int(columns[ptr]) + mat[cur_row][cur_col] = 1 + a = np.dot(q, k.T) * mat + a_value = get_csr_value(a, mat, nnz) + scaling = float(col)**-0.5 + a = scaling * a + for i in range(row): + for j in range(row): + if mat[i][j] == 0: + a[i][j] = float('-inf') + b = softmax(a) + b_value = get_csr_value(b, mat, nnz) + result = np.dot(b, v) + return result, a_value, b_value + + +def ref_batch_sparse_attention(q, k, v, offset, columns): + batch_size, num_heads, row, col = q.shape + nnz = columns.shape[2] + result = np.zeros((batch_size, num_heads, row, col)) + result_sdd = np.zeros((batch_size, num_heads, nnz)) + result_softmax = np.zeros((batch_size, num_heads, nnz)) + for i in range(batch_size): + for j in range(num_heads): + cur_q, cur_k, cur_v, = q[i][j], k[i][j], v[i][j] + cur_offset, cur_columns = offset[i][j], columns[i][j] + cur_result, cur_sdd, cur_softmax = ref_sparse_attention( + cur_q, cur_k, cur_v, 
cur_offset, cur_columns) + result[i][j] = cur_result + result_sdd[i][j], result_softmax[i][j] = cur_sdd, cur_softmax + return result, result_sdd, result_softmax + + +def init_csr_format(batch_size, num_heads, rows, blocksize): + block_num, block_last = rows / blocksize, rows % blocksize + nnz_num = block_num * blocksize * blocksize + block_last * block_last + offset = np.zeros(rows + 1) + columns = np.zeros(int(nnz_num)) + mat = np.zeros((rows, rows)) + for i in range(0, rows, blocksize): + for x in range(blocksize): + for y in range(blocksize): + p_x, p_y = i + x, i + y + if (p_x < rows) and (p_y < rows): + mat[p_x][p_y] = 1 + p_offset, p_column, count = 0, 0, 0 + for i in range(rows): + for j in range(rows): + if mat[i][j] != 0: + count += 1 + columns[p_column] = j + p_column += 1 + p_offset += 1 + offset[p_offset] = count + offset = np.expand_dims(np.expand_dims(offset, 0), 0) + offset = offset.repeat(num_heads, axis=1) + offset = offset.repeat(batch_size, axis=0) + columns = np.expand_dims(np.expand_dims(columns, 0), 0) + columns = columns.repeat(num_heads, axis=1) + columns = columns.repeat(batch_size, axis=0) + return offset, columns + + +@unittest.skipIf( + not core.is_compiled_with_cuda() or get_suitable_env() == False, + "core is not compiled with CUDA and cuda version need >= 11.2 in windows") +class TestSparseAttentionOp(OpTest): + def config(self): + self.shape = (1, 1, 16, 8) + self.blocksize = 2 + self.dtype = "float64" + + def setUp(self): + paddle.enable_static() + self.config() + self.op_type = "sparse_attention" + self.place = paddle.CUDAPlace(0) + self.q = np.random.random(self.shape).astype(self.dtype) + self.k = np.random.random(self.shape).astype(self.dtype) + self.v = np.random.random(self.shape).astype(self.dtype) + offset, columns = init_csr_format(self.shape[0], self.shape[1], + self.shape[2], self.blocksize) + self.offset = offset.astype('int32') + self.columns = columns.astype('int32') + + result, result_sdd, result_softmax = ref_batch_sparse_attention( + self.q, self.k, self.v, self.offset, self.columns) + + self.inputs = { + 'Q': self.q, + 'K': self.k, + 'V': self.v, + 'offset': self.offset, + 'columns': self.columns + } + self.outputs = { + 'Out': result.astype(self.dtype), + 'ResultSdd': result_sdd.astype(self.dtype), + 'ResultSoftmax': result_softmax.astype(self.dtype) + } + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ['Q'], 'Out') + self.check_grad_with_place(self.place, ['K'], 'Out') + self.check_grad_with_place(self.place, ['V'], 'Out') + + +class TestSparseAttentionOpFp32Test(TestSparseAttentionOp): + def config(self): + self.shape = (1, 1, 8, 16) + self.blocksize = 2 + self.dtype = "float32" + + +class TestSparseAttentionOpShapeTest(TestSparseAttentionOp): + def config(self): + self.shape = (2, 2, 32, 8) + self.blocksize = 8 + self.dtype = "float64" + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/white_list/op_threshold_white_list.py b/python/paddle/fluid/tests/unittests/white_list/op_threshold_white_list.py index 26d63826cc87a9..1c8c89d13abc7f 100644 --- a/python/paddle/fluid/tests/unittests/white_list/op_threshold_white_list.py +++ b/python/paddle/fluid/tests/unittests/white_list/op_threshold_white_list.py @@ -46,6 +46,7 @@ 'cudnn_lstm', \ 'rnn', \ 'lgamma', \ + 'sparse_attention', \ 'svd', \ 'matrix_power', \ 'solve', \ From f068e08d725faf61ccf3128efd70fdcd89cd8a1c Mon Sep 17 00:00:00 2001 From: 
Feng Ni Date: Tue, 28 Sep 2021 20:18:26 +0800 Subject: [PATCH 037/298] add roi_align (#35102) * add roi_align in vision/ops.py --- python/paddle/tests/test_ops_roi_align.py | 108 +++++++++++++++ python/paddle/vision/ops.py | 159 ++++++++++++++++++++++ 2 files changed, 267 insertions(+) create mode 100644 python/paddle/tests/test_ops_roi_align.py diff --git a/python/paddle/tests/test_ops_roi_align.py b/python/paddle/tests/test_ops_roi_align.py new file mode 100644 index 00000000000000..4a37831a0ccf21 --- /dev/null +++ b/python/paddle/tests/test_ops_roi_align.py @@ -0,0 +1,108 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np + +import paddle +from paddle.vision.ops import roi_align, RoIAlign + + +class TestRoIAlign(unittest.TestCase): + def setUp(self): + self.data = np.random.rand(1, 256, 32, 32).astype('float32') + boxes = np.random.rand(3, 4) + boxes[:, 2] += boxes[:, 0] + 3 + boxes[:, 3] += boxes[:, 1] + 4 + self.boxes = boxes.astype('float32') + self.boxes_num = np.array([3], dtype=np.int32) + + def roi_align_functional(self, output_size): + if isinstance(output_size, int): + output_shape = (3, 256, output_size, output_size) + else: + output_shape = (3, 256, output_size[0], output_size[1]) + + if paddle.in_dynamic_mode(): + data = paddle.to_tensor(self.data) + boxes = paddle.to_tensor(self.boxes) + boxes_num = paddle.to_tensor(self.boxes_num) + + align_out = roi_align( + data, boxes, boxes_num=boxes_num, output_size=output_size) + np.testing.assert_equal(align_out.shape, output_shape) + + else: + data = paddle.static.data( + shape=self.data.shape, dtype=self.data.dtype, name='data') + boxes = paddle.static.data( + shape=self.boxes.shape, dtype=self.boxes.dtype, name='boxes') + boxes_num = paddle.static.data( + shape=self.boxes_num.shape, + dtype=self.boxes_num.dtype, + name='boxes_num') + + align_out = roi_align( + data, boxes, boxes_num=boxes_num, output_size=output_size) + + place = paddle.CPUPlace() + exe = paddle.static.Executor(place) + + align_out = exe.run(paddle.static.default_main_program(), + feed={ + 'data': self.data, + 'boxes': self.boxes, + 'boxes_num': self.boxes_num + }, + fetch_list=[align_out]) + + np.testing.assert_equal(align_out[0].shape, output_shape) + + def test_roi_align_functional_dynamic(self): + self.roi_align_functional(3) + self.roi_align_functional(output_size=(3, 4)) + + def test_roi_align_functional_static(self): + paddle.enable_static() + self.roi_align_functional(3) + paddle.disable_static() + + def test_RoIAlign(self): + roi_align_c = RoIAlign(output_size=(4, 3)) + data = paddle.to_tensor(self.data) + boxes = paddle.to_tensor(self.boxes) + boxes_num = paddle.to_tensor(self.boxes_num) + + align_out = roi_align_c(data, boxes, boxes_num) + np.testing.assert_equal(align_out.shape, (3, 256, 4, 3)) + + def test_value(self, ): + data = np.array([i for i in range(1, 17)]).reshape(1, 1, 4, + 4).astype(np.float32) + boxes = np.array( + [[1., 1., 2., 2.], [1.5, 
1.5, 3., 3.]]).astype(np.float32) + boxes_num = np.array([2]).astype(np.int32) + output = np.array([[[[6.]]], [[[9.75]]]], dtype=np.float32) + + data = paddle.to_tensor(data) + boxes = paddle.to_tensor(boxes) + boxes_num = paddle.to_tensor(boxes_num) + + roi_align_c = RoIAlign(output_size=1) + align_out = roi_align_c(data, boxes, boxes_num) + np.testing.assert_almost_equal(align_out.numpy(), output) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/vision/ops.py b/python/paddle/vision/ops.py index 84dcdfa4cfcc4f..965cf8b55e7936 100644 --- a/python/paddle/vision/ops.py +++ b/python/paddle/vision/ops.py @@ -34,6 +34,8 @@ 'RoIPool', 'psroi_pool', 'PSRoIPool', + 'roi_align', + 'RoIAlign', ] @@ -1138,3 +1140,160 @@ def forward(self, x, boxes, boxes_num): def extra_repr(self): main_str = 'output_size={_output_size}, spatial_scale={_spatial_scale}' return main_str.format(**self.__dict__) + + +def roi_align(x, + boxes, + boxes_num, + output_size, + spatial_scale=1.0, + sampling_ratio=-1, + aligned=True, + name=None): + """ + This operator implements the roi_align layer. + Region of Interest (RoI) Align operator (also known as RoI Align) is to + perform bilinear interpolation on inputs of nonuniform sizes to obtain + fixed-size feature maps (e.g. 7*7), as described in Mask R-CNN. + + Dividing each region proposal into equal-sized sections with the pooled_width + and pooled_height. Location remains the origin result. + + In each ROI bin, the value of the four regularly sampled locations are + computed directly through bilinear interpolation. The output is the mean of + four locations. Thus avoid the misaligned problem. + + Args: + x (Tensor): Input feature, 4D-Tensor with the shape of [N,C,H,W], + where N is the batch size, C is the input channel, H is Height, + W is weight. The data type is float32 or float64. + boxes (Tensor): Boxes (RoIs, Regions of Interest) to pool over. It + should be a 2-D Tensor of shape (num_boxes, 4). The data type is + float32 or float64. Given as [[x1, y1, x2, y2], ...], (x1, y1) is + the top left coordinates, and (x2, y2) is the bottom right coordinates. + boxes_num (Tensor): The number of boxes contained in each picture in + the batch, the data type is int32. + output_size (int or Tuple[int, int]): The pooled output size(h, w), data + type is int32. If int, h and w are both equal to output_size. + spatial_scale (float32): Multiplicative spatial scale factor to translate + ROI coords from their input scale to the scale used when pooling. + Default: 1.0 + sampling_ratio (int32): number of sampling points in the interpolation + grid used to compute the output value of each pooled output bin. + If > 0, then exactly ``sampling_ratio x sampling_ratio`` sampling + points per bin are used. + If <= 0, then an adaptive number of grid points are used (computed + as ``ceil(roi_width / output_width)``, and likewise for height). + Default: -1 + aligned (bool): If False, use the legacy implementation. If True, pixel + shift the box coordinates it by -0.5 for a better alignment with the + two neighboring pixel indices. This version is used in Detectron2. + Default: True + name(str, optional): For detailed information, please refer to : + ref:`api_guide_Name`. Usually name is no need to set and None by + default. + + Returns: + Tensor: The output of ROIAlignOp is a 4-D tensor with shape (num_boxes, + channels, pooled_h, pooled_w). The data type is float32 or float64. + + Examples: + .. 
code-block:: python + + import paddle + from paddle.vision.ops import roi_align + + data = paddle.rand([1, 256, 32, 32]) + boxes = paddle.rand([3, 4]) + boxes[:, 2] += boxes[:, 0] + 3 + boxes[:, 3] += boxes[:, 1] + 4 + boxes_num = paddle.to_tensor([3]).astype('int32') + align_out = roi_align(data, boxes, boxes_num, output_size=3) + assert align_out.shape == [3, 256, 3, 3] + """ + + check_type(output_size, 'output_size', (int, tuple), 'roi_align') + if isinstance(output_size, int): + output_size = (output_size, output_size) + + pooled_height, pooled_width = output_size + if in_dygraph_mode(): + assert boxes_num is not None, "boxes_num should not be None in dygraph mode." + align_out = core.ops.roi_align( + x, boxes, boxes_num, "pooled_height", pooled_height, "pooled_width", + pooled_width, "spatial_scale", spatial_scale, "sampling_ratio", + sampling_ratio, "aligned", aligned) + return align_out + + else: + check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'roi_align') + check_variable_and_dtype(boxes, 'boxes', ['float32', 'float64'], + 'roi_align') + helper = LayerHelper('roi_align', **locals()) + dtype = helper.input_dtype() + align_out = helper.create_variable_for_type_inference(dtype) + inputs = { + "X": x, + "ROIs": boxes, + } + if boxes_num is not None: + inputs['RoisNum'] = boxes_num + helper.append_op( + type="roi_align", + inputs=inputs, + outputs={"Out": align_out}, + attrs={ + "pooled_height": pooled_height, + "pooled_width": pooled_width, + "spatial_scale": spatial_scale, + "sampling_ratio": sampling_ratio, + "aligned": aligned, + }) + return align_out + + +class RoIAlign(Layer): + """ + This interface is used to construct a callable object of the `RoIAlign` class. + Please refer to :ref:`api_paddle_vision_ops_roi_align`. + + Args: + output_size (int or tuple[int, int]): The pooled output size(h, w), + data type is int32. If int, h and w are both equal to output_size. + spatial_scale (float32, optional): Multiplicative spatial scale factor + to translate ROI coords from their input scale to the scale used + when pooling. Default: 1.0 + + Returns: + align_out (Tensor): The output of ROIAlign operator is a 4-D tensor with + shape (num_boxes, channels, pooled_h, pooled_w). + + Examples: + .. 
code-block:: python + + import paddle + from paddle.vision.ops import RoIAlign + + data = paddle.rand([1, 256, 32, 32]) + boxes = paddle.rand([3, 4]) + boxes[:, 2] += boxes[:, 0] + 3 + boxes[:, 3] += boxes[:, 1] + 4 + boxes_num = paddle.to_tensor([3]).astype('int32') + roi_align = RoIAlign(output_size=(4, 3)) + align_out = roi_align(data, boxes, boxes_num) + assert align_out.shape == [3, 256, 4, 3] + """ + + def __init__(self, output_size, spatial_scale=1.0): + super(RoIAlign, self).__init__() + self._output_size = output_size + self._spatial_scale = spatial_scale + + def forward(self, x, boxes, boxes_num, aligned=True): + return roi_align( + x=x, + boxes=boxes, + boxes_num=boxes_num, + output_size=self._output_size, + spatial_scale=self._spatial_scale, + aligned=aligned) From 1b1210ea72e215f35b7fdb019794f60c1282a4fa Mon Sep 17 00:00:00 2001 From: Zeng Jinle <32832641+sneaxiy@users.noreply.github.com> Date: Wed, 29 Sep 2021 08:18:06 +0800 Subject: [PATCH 038/298] fix flags approval (#36192) --- tools/check_file_diff_approvals.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/check_file_diff_approvals.sh b/tools/check_file_diff_approvals.sh index 53b5cb9a722c4e..6104b168798c99 100644 --- a/tools/check_file_diff_approvals.sh +++ b/tools/check_file_diff_approvals.sh @@ -88,7 +88,7 @@ function run_tools_test() { cd ${CUR_PWD} } -changed_env_var_count=`git diff -U0 upstream/$BRANCH ${PADDLE_ROOT}/paddle | grep 'DEFINE_EXPORTED' | wc -l` +changed_env_var_count=`git diff -U0 upstream/$BRANCH ${PADDLE_ROOT}/paddle | grep 'DEFINE_EXPORTED' | grep -v '@@' | wc -l` if [[ $changed_env_var_count -gt 0 ]]; then echo_line="You must have one RD (lanxianghit (Recommend), phlrain or luotao1) approval for changing the FLAGS, which manages the environment variables.\n" check_approval 1 6836917 47554610 43953930 From 5e1d0b5cae8d68928f27d7fc2d01db6a8be86b8b Mon Sep 17 00:00:00 2001 From: ronnywang <524019753@qq.com> Date: Wed, 29 Sep 2021 10:00:25 +0800 Subject: [PATCH 039/298] [ROCM] bugfix for bilinear_interp_v2_grad (#36160) --- paddle/fluid/operators/interpolate_v2_op.cu | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/interpolate_v2_op.cu b/paddle/fluid/operators/interpolate_v2_op.cu index 6f8b89ce64523d..fe9228135606dc 100644 --- a/paddle/fluid/operators/interpolate_v2_op.cu +++ b/paddle/fluid/operators/interpolate_v2_op.cu @@ -1198,7 +1198,12 @@ static void Interpolate2DCUDAFwd(const framework::ExecutionContext& ctx, input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n, out_chw, c, ratio_h, ratio_w, align_corners, align_mode, data_layout); } else if ("bicubic" == interp_method) { - KeBicubicInterpFw<<<<>>( input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n, out_chw, c, ratio_h, ratio_w, align_corners, data_layout); @@ -1606,9 +1611,11 @@ static void Interpolate2DCUDABwd(const framework::ExecutionContext& ctx, const T align_type_value = (align_mode == 0 && !align_corners) ? 0.5f : 0; bool is_nchw = (data_layout == DataLayout::kNCHW) ? true : false; bool optimize_flag = false; +#ifndef __HIPCC__ optimize_flag = (in_h < (out_h >> 6) && in_w < (out_w >> 6)) ? true : ((in_h == 1 && in_w == 1) ? 
true : false); +#endif if (optimize_flag & is_nchw) { KeBilinearInterpBwShareMemory< @@ -1623,7 +1630,12 @@ static void Interpolate2DCUDABwd(const framework::ExecutionContext& ctx, ratio_h, ratio_w, align_type_value, is_nchw); } } else if ("bicubic" == interp_method) { - KeBicubicInterpBw<<<<>>( input_grad_data, in_h, in_w, n, in_chw, output_grad_data, out_h, out_w, n, out_chw, c, ratio_h, ratio_w, align_corners, data_layout); From 092d45c3947724537a04633826a4666099a2bcda Mon Sep 17 00:00:00 2001 From: Li Min <11663212+limin2021@users.noreply.github.com> Date: Wed, 29 Sep 2021 10:11:22 +0800 Subject: [PATCH 040/298] Add fused_dropout wrapper to ease use. (#36185) --- paddle/fluid/operators/dropout_impl.cu.h | 29 +- paddle/fluid/operators/dropout_impl_util.h | 53 ++++ .../operators/fused/fused_dropout_helper.h | 282 ++++++++++++++++++ 3 files changed, 339 insertions(+), 25 deletions(-) create mode 100644 paddle/fluid/operators/dropout_impl_util.h create mode 100644 paddle/fluid/operators/fused/fused_dropout_helper.h diff --git a/paddle/fluid/operators/dropout_impl.cu.h b/paddle/fluid/operators/dropout_impl.cu.h index 7a93d2db0dd1ce..695d29b294a51a 100644 --- a/paddle/fluid/operators/dropout_impl.cu.h +++ b/paddle/fluid/operators/dropout_impl.cu.h @@ -30,6 +30,7 @@ limitations under the License. */ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/operators/dropout_impl_util.h" #include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/platform/aligned_vector.h" #include "paddle/fluid/platform/gpu_launch_config.h" @@ -196,31 +197,9 @@ void DropoutFwGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx, config.thread_per_block.x * vec_size) + 1) * vec_size; - int device_id = - BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()).GetDeviceId(); - auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); - - if ((seed) && platform::is_gpu_place(seed->place())) { - framework::Tensor seed_cpu_tensor; - TensorCopySync(*seed, platform::CPUPlace(), &seed_cpu_tensor); - seed_data = static_cast(seed_cpu_tensor.data()[0]); - increment = offset; - } else if (seed && platform::is_cpu_place(seed->place())) { - seed_data = *(seed->data()); - increment = offset; - } else if (gen_cuda->GetIsInitPy() && (!is_fix_seed)) { - auto seed_offset = gen_cuda->IncrementOffset(offset); - seed_data = seed_offset.first; - increment = seed_offset.second; - } else { - if (seed) { - seed_data = *(seed->data()); - } else { - std::random_device rnd; - seed_data = is_fix_seed ? seed_val : rnd(); - } - increment = offset; - } + + GetSeedDataAndIncrement(dev_ctx, seed, is_fix_seed, seed_val, offset, + &seed_data, &increment); #ifdef __HIPCC__ if (vec_size == 4 && size % 4 == 0) { diff --git a/paddle/fluid/operators/dropout_impl_util.h b/paddle/fluid/operators/dropout_impl_util.h new file mode 100644 index 00000000000000..a7188efe7139c7 --- /dev/null +++ b/paddle/fluid/operators/dropout_impl_util.h @@ -0,0 +1,53 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/generator.h" +#include "paddle/fluid/framework/tensor_util.h" + +namespace paddle { +namespace operators { + +inline void GetSeedDataAndIncrement(const platform::CUDADeviceContext& dev_ctx, + const framework::Tensor* seed, + const bool is_fix_seed, const int seed_val, + const int offset, uint64_t* seed_data, + uint64_t* increment) { + int device_id = + BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()).GetDeviceId(); + auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); + + if ((seed) && platform::is_gpu_place(seed->place())) { + framework::Tensor seed_cpu_tensor; + TensorCopySync(*seed, platform::CPUPlace(), &seed_cpu_tensor); + *seed_data = static_cast(seed_cpu_tensor.data()[0]); + *increment = offset; + } else if (gen_cuda->GetIsInitPy() && (!is_fix_seed)) { + auto seed_offset = gen_cuda->IncrementOffset(offset); + *seed_data = seed_offset.first; + *increment = seed_offset.second; + } else { + if (seed) { + *seed_data = *(seed->data()); + } else { + std::random_device rnd; + *seed_data = is_fix_seed ? seed_val : rnd(); + } + *increment = offset; + } +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/fused/fused_dropout_helper.h b/paddle/fluid/operators/fused/fused_dropout_helper.h new file mode 100644 index 00000000000000..fcfa405a52f9b1 --- /dev/null +++ b/paddle/fluid/operators/fused/fused_dropout_helper.h @@ -0,0 +1,282 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/generator.h" +#include "paddle/fluid/operators/dropout_impl_util.h" +#include "paddle/fluid/operators/fused/fused_dropout_act_bias.h" +#include "paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h" +#include "paddle/fluid/operators/fused/fused_residual_dropout_bias.h" +#include "paddle/fluid/operators/math/functors.h" + +namespace paddle { +namespace operators { + +/** + * Support two Dropouts in the use senarieo. + * This warpper can be used in FFN op. + * The DropoutParam will be used in the fused_dropout_act_bias, + * fused_residual_dropout_bias(pre_layer_norm=ture) or + * fused_layernorm_residual_dropout_bias(pre_layer_norm=false). 
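+ *
+ * Illustrative call pattern (a sketch only; the mask type uint8_t, the
+ * dropout index 1 and the variable names below are assumptions, not taken
+ * from a real call site):
+ *
+ *   DropoutParam dropout_param(exe_ctx, 1);  // reads the dropout1_* attrs
+ *   FusedDropoutHelper<T, uint8_t> helper(dev_ctx, rows, cols, dropout_param);
+ *   helper.ResidualDropoutBias(dev_ctx, src, residual, bias, out, mask);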
+*/ +struct DropoutParam { + uint64_t seed; + float dropout_prob; + bool is_upscale_in_train; + bool is_test; + bool fix_seed; + int increment; + const framework::Tensor* tensor_seed; + int seed_val; + + DropoutParam() { + fix_seed = false; + seed = 0; + is_test = false; + is_upscale_in_train = false; + dropout_prob = 0.5; + tensor_seed = nullptr; + seed_val = 0; + } + + /** + * dropout_index: can be 0, 1, 2. 0 means there is only one dropout, + * 1 and 2 represent two dropout, the parameter name of dropout + * will be "dropout" + dropout_index + param name, such as dropout1_seed, + * dropout1_is_test. + */ + DropoutParam(const framework::ExecutionContext& context, + const int dropout_index) { + std::string pre_fix = "dropout"; + std::string str_index = std::to_string(dropout_index); + if (dropout_index > 0) { + pre_fix = pre_fix + str_index + "_"; + } else { + pre_fix = pre_fix + "_"; + } + dropout_prob = context.Attr(pre_fix + "prob"); + auto& dropout_implementation = + context.Attr(pre_fix + "implementation"); + is_upscale_in_train = (dropout_implementation == "upscale_in_train"); + is_test = context.Attr(pre_fix + "is_test"); + fix_seed = context.Attr(pre_fix + "fix_seed"); + + std::string str_seed = "Dropout"; + if (dropout_index > 0) { + str_seed = str_seed + str_index + "Seed"; + } else { + str_seed = str_seed + "Seed"; + } + tensor_seed = + context.HasInput(str_seed) ? context.Input(str_seed) : nullptr; + seed_val = context.Attr(pre_fix + "seed"); + } + + int UpdateSeedAndIncrement(const platform::CUDADeviceContext& ctx, + const int offset) { + uint64_t tmp_increment; + GetSeedDataAndIncrement(ctx, tensor_seed, fix_seed, seed_val, offset, &seed, + &tmp_increment); + increment = static_cast(tmp_increment); + return increment; + } +}; + +template +class FusedDropoutHelper { + private: + int GetIncrement(const platform::CUDADeviceContext& ctx) { + const int VecSize = MAX_CACHE_BYTES / sizeof(T); + const int real_vec_size = cols_ % VecSize == 0 ? 
VecSize : 1; + auto config = + Get1DBlocksAnd2DGrids(ctx, static_cast(rows_), + static_cast(cols_), real_vec_size); + int increment = ((cols_ - 1) / (config.thread_per_block.x * + config.block_per_grid.x * real_vec_size) + + 1) * + real_vec_size; + increment = dropout_param_.UpdateSeedAndIncrement(ctx, increment); + return increment; + } + + public: + FusedDropoutHelper() {} + FusedDropoutHelper(const platform::CUDADeviceContext& ctx, const int rows, + const int cols, const DropoutParam& dropout_param) { + rows_ = rows; + cols_ = cols; + dropout_param_ = dropout_param; + } + + // out = residual + dropout( src + bias ) + void ResidualDropoutBias(const platform::CUDADeviceContext& ctx, const T* src, + const T* residual, const T* bias, T* out, + MaskType* mask) { + auto increment = GetIncrement(ctx); + LaunchResidualDropoutBias( + rows_, cols_, increment, dropout_param_.seed, + dropout_param_.dropout_prob, dropout_param_.is_test, + dropout_param_.is_upscale_in_train, src, residual, bias, mask, out, + ctx); + } + + void ResidualDropoutBiasGrad(const platform::CUDADeviceContext& ctx, + const T* d_out, const MaskType* mask, T* d_src, + T* d_residual, T* d_bias) { + LaunchResidualDropoutBiasGrad( + d_out, mask, dropout_param_.dropout_prob, + dropout_param_.is_upscale_in_train, rows_, cols_, d_src, d_bias, ctx); + auto cuda_place = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); + memory::Copy(cuda_place, d_residual, cuda_place, d_out, + rows_ * cols_ * sizeof(T), ctx.stream()); + } + + // out = dropout(activation(src + bias)) + void DropoutActBias(const platform::CUDADeviceContext& ctx, const T* src, + const T* bias, const std::string& act_method, T* out, + MaskType* mask) { + auto increment = GetIncrement(ctx); + if (act_method == "gelu") { + GeluFunctor gelu; + LaunchDropoutActBias>( + gelu, dropout_param_.seed, rows_, cols_, dropout_param_.increment, + dropout_param_.dropout_prob, dropout_param_.is_upscale_in_train, + dropout_param_.is_test, src, bias, out, mask, ctx); + } else if (act_method == "relu") { + math::ReluFunctor relu; + LaunchDropoutActBias>( + relu, dropout_param_.seed, rows_, cols_, increment, + dropout_param_.dropout_prob, dropout_param_.is_upscale_in_train, + dropout_param_.is_test, src, bias, out, mask, ctx); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Currently only supports gelu or relu activation functions!")); + } + } + + void DropoutActBiasGrad(const platform::CUDADeviceContext& ctx, const T* dout, + const T* src, const T* bias, const MaskType* mask, + T* d_src, T* d_bias, const std::string& act_method) { + if (act_method == "gelu") { + GeluGradFunctor gelu_grad; + LaunchDropoutActBiasGrad>( + gelu_grad, dout, mask, src, bias, dropout_param_.dropout_prob, + dropout_param_.is_upscale_in_train, rows_, cols_, d_src, d_bias, ctx); + } else if (act_method == "relu") { + math::ReluGradFunctor relu_grad; + LaunchDropoutActBiasGrad>( + relu_grad, dout, mask, src, bias, dropout_param_.dropout_prob, + dropout_param_.is_upscale_in_train, rows_, cols_, d_src, d_bias, ctx); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Currently only supports gelu or relu activation functions!")); + } + } + + protected: + int rows_; + int cols_; + DropoutParam dropout_param_; +}; + +template +class FusedDropoutLayerNormHelper : public FusedDropoutHelper { + public: + FusedDropoutLayerNormHelper() {} + FusedDropoutLayerNormHelper(const int rows, const int cols, + const float epsilon) { + using U = LayerNormParamType; + this->rows_ = rows; + this->cols_ = 
cols; + epsilon_ = epsilon; + } + + FusedDropoutLayerNormHelper(const platform::CUDADeviceContext& ctx, + const int rows, const int cols, + const DropoutParam& dropout_param, + const float epsilon) + : FusedDropoutHelper(ctx, rows, cols, dropout_param) { + using U = LayerNormParamType; + epsilon_ = epsilon; + } + + // call layer_norm + void LayerNorm(const platform::CUDADeviceContext& ctx, const T* src, + const LayerNormParamType* gamma, + const LayerNormParamType* beta, T* out, + LayerNormParamType* mean, LayerNormParamType* variance) { + using U = LayerNormParamType; + switch (GetDesiredBlockDim(this->cols_)) { + FIXED_BLOCK_DIM_CASE( + LayerNormForward< + T, U, kBlockDim><<rows_, kBlockDim, 0, ctx.stream()>>>( + src, gamma, beta, out, mean, variance, epsilon_, this->cols_)); + } + } + + void LayerNormGrad(const platform::CUDADeviceContext& ctx, const T* dout, + const T* src, const LayerNormParamType* gamma, + const LayerNormParamType* mean, + const LayerNormParamType* variance, T* d_src, + LayerNormParamType* d_scale, + LayerNormParamType* d_bias) { + using U = LayerNormParamType; + LayerNormBackward(src, dout, gamma, mean, variance, d_src, d_scale, + d_bias, epsilon_, this->rows_, this->cols_, ctx); + } + + // out = layernorm(residual + dropout(src + bias)) + void LayernormResidualDropoutBias( + const platform::CUDADeviceContext& ctx, const T* src, const T* residual, + const T* bias, const LayerNormParamType* gamma, + const LayerNormParamType* beta, T* dropout_out, MaskType* mask, T* out, + LayerNormParamType* mean, LayerNormParamType* variance) { + using U = LayerNormParamType; + int vec_size = MAX_CACHE_BYTES / sizeof(T); + if (this->cols_ % vec_size != 0) { + vec_size = 1; + } + int threads = GetDesiredBlockDim(this->cols_ / vec_size); + int increment = ((this->cols_ - 1) / (threads * vec_size) + 1) * vec_size; + increment = this->dropout_param_.UpdateSeedAndIncrement(ctx, increment); + LaunchLayernormResidualDropoutBias( + this->rows_, this->cols_, increment, this->dropout_param_.seed, + this->dropout_param_.dropout_prob, epsilon_, + this->dropout_param_.is_upscale_in_train, this->dropout_param_.is_test, + src, residual, bias, gamma, beta, mask, dropout_out, out, mean, + variance, ctx); + } + + void LayernormResidualDropoutBiasGrad( + const platform::CUDADeviceContext& ctx, const T* d_out, + const T* layernorm_src, const MaskType* mask, + const LayerNormParamType* gamma, const LayerNormParamType* mean, + const LayerNormParamType* variance, T* d_layernorm_src, + LayerNormParamType* d_scale, LayerNormParamType* d_layernorm_bias, + T* d_dropout_src, T* d_bias, T* d_residual) { + using U = LayerNormParamType; + LayerNormBackward(layernorm_src, d_out, gamma, mean, variance, + d_layernorm_src, d_scale, d_layernorm_bias, + epsilon_, this->rows_, this->cols_, ctx); + this->ResidualDropoutBiasGrad(ctx, d_layernorm_src, mask, d_dropout_src, + d_residual, d_bias); + } + + protected: + float epsilon_; +}; + +} // namespace operators +} // namespace paddle From 7e60cc63c33f0c17df36b0ee52ae50a3d04a6697 Mon Sep 17 00:00:00 2001 From: Zeng Jinle <32832641+sneaxiy@users.noreply.github.com> Date: Wed, 29 Sep 2021 10:13:07 +0800 Subject: [PATCH 041/298] refine case when thread_num = 1 (#36201) --- .../fast_threaded_ssa_graph_executor.cc | 20 ++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc index 120bdd2bc9f563..a690b3026dbc2f 100644 
--- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc @@ -47,7 +47,16 @@ FastThreadedSSAGraphExecutor::FastThreadedSSAGraphExecutor( << "Change thread number to 1 because the toposort order is unique"; strategy_.num_threads_ = 1; } - pool_.reset(new ::ThreadPool(strategy.num_threads_)); + if (strategy_.num_threads_ > 1) { + pool_.reset(new ::ThreadPool(strategy.num_threads_)); + } else { + auto nodes = ir::TopologySortOperations(*graph_); + traced_ops_.clear(); + traced_ops_.reserve(nodes.size()); + for (auto *node : nodes) { + traced_ops_.push_back(&node->Wrapper()); + } + } for (auto &op : ir::FilterByNodeWrapper(*graph_)) { int dep = static_cast(op->NotReadyInputSize()); op_deps_.emplace(op, dep); @@ -228,7 +237,7 @@ void FastThreadedSSAGraphExecutor::RunOpAsync( OpHandleBase *op, const std::shared_ptr> &complete_q) { ++remaining_; - this->pool_->enqueue([=] { + auto func = [=] { std::deque op_queue; op_queue.push_front(op); @@ -287,7 +296,12 @@ void FastThreadedSSAGraphExecutor::RunOpAsync( } --remaining_; complete_q->Push(complete); - }); + }; + if (pool_) { + pool_->enqueue(func); + } else { + func(); + } } void FastThreadedSSAGraphExecutor::PrepareAtomicOpDeps() { From 667bf1883cc69e75c50198cb4726358bd54e58c2 Mon Sep 17 00:00:00 2001 From: baoachun <962571062@qq.com> Date: Wed, 29 Sep 2021 10:22:17 +0800 Subject: [PATCH 042/298] fix nullptr block in op_teller (#36197) --- paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc b/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc index 5958728946c2ed..1864899b07e018 100644 --- a/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc +++ b/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc @@ -548,7 +548,8 @@ void QuantDequantFusePass::FuseDequant(ir::Graph* graph, Scope* scope, std::string new_input = quantized_op_input_node->Name(); std::string new_output = dequant_op_out_node->Name(); - framework::OpDesc new_op_desc(base_op_desc, nullptr); + framework::OpDesc new_op_desc(base_op_desc, + quantized_op_node->Op()->Block()); new_op_desc.SetType(quantized_op_type); new_op_desc.SetAttr("enable_int8", true); if (quantized_op_type == "conv2d" || quantized_op_type == "conv2d_fusion" || From b3d2dc7b7a15ed26db3f51e855dbfa337c5e3ad5 Mon Sep 17 00:00:00 2001 From: Zeng Jinle <32832641+sneaxiy@users.noreply.github.com> Date: Wed, 29 Sep 2021 10:36:25 +0800 Subject: [PATCH 043/298] remove wait if no fetch (#36150) --- .../framework/details/fast_threaded_ssa_graph_executor.cc | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc index a690b3026dbc2f..eb027d7c2f636a 100644 --- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc @@ -139,10 +139,12 @@ FetchResultType FastThreadedSSAGraphExecutor::Run( } } // Wait FetchOps. 
- ClearFetchOp(graph_, &fetch_ops); + if (!fetch_ops.empty()) { + ClearFetchOp(graph_, &fetch_ops); - for (auto &place : places_) { - fetch_ctxs_.Get(place)->Wait(); + for (auto &place : places_) { + fetch_ctxs_.Get(place)->Wait(); + } } return fetches; From 767050d934222464e866a8dc73cafeed3e943c69 Mon Sep 17 00:00:00 2001 From: Yiqun Liu Date: Wed, 29 Sep 2021 10:37:16 +0800 Subject: [PATCH 044/298] Implement the grad and enhance the cache of norm_convolution fusion ops. (#36168) --- .../fluid/framework/operator_kernel_configs.h | 2 + .../operators/fused/cudnn_fusion_helper.h | 65 +-- .../operators/fused/cudnn_norm_conv.cu.h | 357 ++++++++++---- .../operators/fused/cudnn_norm_conv_test.cc | 459 ++++++++++++------ 4 files changed, 630 insertions(+), 253 deletions(-) diff --git a/paddle/fluid/framework/operator_kernel_configs.h b/paddle/fluid/framework/operator_kernel_configs.h index 68edb7c89dd872..ab812a30981f0d 100644 --- a/paddle/fluid/framework/operator_kernel_configs.h +++ b/paddle/fluid/framework/operator_kernel_configs.h @@ -15,8 +15,10 @@ limitations under the License. */ #pragma once #include +#include #include #include +#include "glog/logging.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/operators/fused/cudnn_fusion_helper.h b/paddle/fluid/operators/fused/cudnn_fusion_helper.h index 4434681e60b3b1..fcd354df938ace 100644 --- a/paddle/fluid/operators/fused/cudnn_fusion_helper.h +++ b/paddle/fluid/operators/fused/cudnn_fusion_helper.h @@ -14,10 +14,8 @@ limitations under the License. */ #pragma once -#include #include -#include "paddle/fluid/platform/cudnn_desc.h" -#include "paddle/fluid/platform/cudnn_helper.h" +#include "paddle/fluid/framework/operator_kernel_configs.h" #include "paddle/fluid/platform/dynload/cudnn.h" #include "paddle/fluid/platform/enforce.h" @@ -41,12 +39,9 @@ class CudnnFusionOp { } ~CudnnFusionOp() { - // New 'fused op' descriptor destruction - PADDLE_ENFORCE_CUDA_SUCCESS( - dynload::cudnnDestroyFusedOpsVariantParamPack(op_variant_params_)); - PADDLE_ENFORCE_CUDA_SUCCESS( - dynload::cudnnDestroyFusedOpsConstParamPack(op_const_params_)); - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroyFusedOpsPlan(op_)); + dynload::cudnnDestroyFusedOpsVariantParamPack(op_variant_params_); + dynload::cudnnDestroyFusedOpsConstParamPack(op_const_params_); + dynload::cudnnDestroyFusedOpsPlan(op_); } // Execute fused op @@ -121,41 +116,49 @@ class CudnnFusionOp { // Get the workspace, which is required before Execute(). 
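+  // The fused-op plan is built lazily on the first call and the computed
+  // size is cached in workspace_bytes_, so later calls return the cached
+  // value without invoking cudnnMakeFusedOpsPlan again.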
size_t GetWorkspaceSizeInBytes(cudnnHandle_t cudnn_handle) { - size_t workspace_bytes = 0U; - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnMakeFusedOpsPlan( - cudnn_handle, op_, op_const_params_, &workspace_bytes)); - plan_created_ = true; - return workspace_bytes; + if (!plan_created_) { + workspace_bytes_ = 0U; + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnMakeFusedOpsPlan( + cudnn_handle, op_, op_const_params_, &workspace_bytes_)); + plan_created_ = true; + } + return workspace_bytes_; } private: bool plan_created_; + size_t workspace_bytes_; cudnnFusedOpsPlan_t op_; cudnnFusedOpsConstParamPack_t op_const_params_; cudnnFusedOpsVariantParamPack_t op_variant_params_; }; -static inline std::vector GetStrides(const std::vector &shape) { - if (shape.size() < 1) { - return {}; +class CudnnFusionOpCache { + public: + static CudnnFusionOpCache &Instance() { + static CudnnFusionOpCache instance; + return instance; + } + + framework::AlgorithmsCache *GetForward() { + return &forward_cache_; } - int dim = static_cast(shape.size()); - std::vector pro_shape(shape); - std::vector strides(dim); - int temp = pro_shape[1]; - pro_shape.erase(pro_shape.begin() + 1); - pro_shape.push_back(temp); - strides.back() = 1; - for (int i = dim - 2; i >= 0; --i) { - strides[i] = strides[i + 1] * pro_shape[i + 1]; + framework::AlgorithmsCache *GetBackward() { + return &backward_cache_; } - strides.pop_back(); - strides.insert(strides.begin() + 1, 1); - return strides; -} -static inline int64_t AlignUp(int64_t a, int64_t b) { return (a + b - 1) / b; } + private: + CudnnFusionOpCache() {} + ~CudnnFusionOpCache() { + // Need to delete the memory of cache. + } + CudnnFusionOpCache(const CudnnFusionOpCache &) {} + + private: + framework::AlgorithmsCache forward_cache_; + framework::AlgorithmsCache backward_cache_; +}; #endif // CUDNN_VERSION >= 8000 } // namespace operators diff --git a/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h b/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h index 1ead78b8b64e18..1a73281cb8dc64 100644 --- a/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h +++ b/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h @@ -15,125 +15,320 @@ limitations under the License. */ #pragma once #include "paddle/fluid/operators/fused/cudnn_fusion_helper.h" +#include "paddle/fluid/platform/cudnn_desc.h" +#include "paddle/fluid/platform/cudnn_helper.h" namespace paddle { namespace operators { using Tensor = framework::Tensor; namespace dynload = platform::dynload; +template +using ScalingParamType = typename platform::CudnnDataType::ScalingParamType; + #if CUDNN_VERSION >= 8000 + +static size_t RoundUp(int64_t a, int64_t b) { return (a + b - 1) / b * b; } + template -class CudnnNormConvolutionOp { +struct NormConvolutionArgs { + NormConvolutionArgs() { + dtype = platform::CudnnDataType::type; + format = CUDNN_TENSOR_NHWC; + compute_type = platform::CudnnDataType::type; + } + + void Set(const std::vector &input_shape, + const std::vector &filter_shape, + const std::vector &output_shape, int padding, int stride, + int dilation, int group) { + PADDLE_ENFORCE_EQ( + input_shape.size(), 4U, + platform::errors::InvalidArgument( + "The size of input_shape is expected to 4. But recieved " + "input_shape's size is %d, input_shape is [%s].", + input_shape.size(), framework::make_ddim(input_shape))); + PADDLE_ENFORCE_EQ( + filter_shape.size(), 4U, + platform::errors::InvalidArgument( + "The size of filter_shape is expected to 4. 
But recieved " + "filter_shape's size is %d, filter_shape is [%s].", + filter_shape.size(), framework::make_ddim(filter_shape))); + PADDLE_ENFORCE_EQ(filter_shape[1] == filter_shape[2] && + (filter_shape[1] == 1 || filter_shape[1] == 3), + true, + platform::errors::InvalidArgument( + "The filter_shape is expected to store as nhwc, and " + "h = w = 1 or 3. But recieved filter_shape is [%s].", + framework::make_ddim(filter_shape))); + PADDLE_ENFORCE_EQ( + output_shape.size(), 4U, + platform::errors::InvalidArgument( + "The size of output_shape is expected to 4. But recieved " + "filter_shape's size is %d, filter_shape is [%s].", + output_shape.size(), framework::make_ddim(output_shape))); + + for (size_t i = 0; i < input_shape.size(); ++i) { + in_dims.push_back(input_shape[i]); + } + for (size_t i = 0; i < filter_shape.size(); ++i) { + filter_dims.push_back(filter_shape[i]); + } + paddings = {padding, padding}; + strides = {stride, stride}; + dilations = {dilation, dilation}; + + in_desc.set(input_shape, format, dtype); + filter_desc.set(filter_shape, format, dtype, group); + out_desc.set(output_shape, format, dtype); + + int output_channel = filter_shape[0]; + std::vector stats_shape = {1, 1, 1, output_channel}; + out_stats_desc.set(stats_shape, format, compute_type); + + conv_desc.set(dtype, paddings, strides, dilations, false, group); + } + + cudnnDataType_t dtype; + cudnnTensorFormat_t format; + cudnnDataType_t compute_type; + + std::vector in_dims; + std::vector filter_dims; + std::vector strides; + std::vector paddings; + std::vector dilations; + + platform::TensorDescriptor in_desc; + platform::FilterDescriptor filter_desc; + platform::TensorDescriptor out_desc; + platform::TensorDescriptor out_stats_desc; + platform::ConvolutionDescriptor conv_desc; +}; + +template +class CudnnNormConvolution { public: - CudnnNormConvolutionOp() - : fwd_op_(CUDNN_FUSED_SCALE_BIAS_ACTIVATION_CONV_BNSTATS) {} - ~CudnnNormConvolutionOp() {} - - void Init(const platform::CUDADeviceContext &ctx, - const std::vector &input_shape, - const std::vector &filter_shape, - const std::vector &output_shape, const int &pad, - const int &stride, const int &dilate, const int &group) { - cudnn_fwd_compute_type_ = platform::CudnnDataType::type; - dtype_ = platform::CudnnDataType::type; - format_ = CUDNN_TENSOR_NHWC; - - InitDescriptors(ctx, input_shape, filter_shape, output_shape, pad, stride, - dilate, group); - GetWorkspaceSize(ctx); + CudnnNormConvolution(const platform::CUDADeviceContext &ctx, + const std::vector &input_shape, + const std::vector &filter_shape, + const std::vector &output_shape, const int &padding, + const int &stride, const int &dilation, + const int &group) { + args_.Set(input_shape, filter_shape, output_shape, padding, stride, + dilation, group); } + ~CudnnNormConvolution() {} void Forward(const platform::CUDADeviceContext &ctx, T *input_ptr, T *filter_ptr, T *output_ptr, float *sum_ptr, float *sum_of_squares_ptr) { - auto handle = ctx.cudnn_handle(); - auto workspace_handle = ctx.cudnn_workspace_handle(); + auto cudnn_handle = ctx.cudnn_handle(); + + CudnnFusionOp *fwd_op = GetForwardOp(ctx); + size_t workspace_size = RoundUp( + static_cast(fwd_op->GetWorkspaceSizeInBytes(cudnn_handle)), + 512); + // Set variant_param // input ptr - fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_XDATA, input_ptr); - fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_WDATA, filter_ptr); - fwd_op_.SetOpVariantParamAttrPtr( - CUDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES, &fwd_workspace_byte_); + 
fwd_op->SetOpVariantParamAttrPtr(CUDNN_PTR_XDATA, input_ptr); + fwd_op->SetOpVariantParamAttrPtr(CUDNN_PTR_WDATA, filter_ptr); + fwd_op->SetOpVariantParamAttrPtr( + CUDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES, &workspace_size); + // output ptr - fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_YDATA, output_ptr); - fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_YSUM, sum_ptr); - fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_YSQSUM, sum_of_squares_ptr); - workspace_handle.RunFunc( + fwd_op->SetOpVariantParamAttrPtr(CUDNN_PTR_YDATA, output_ptr); + fwd_op->SetOpVariantParamAttrPtr(CUDNN_PTR_YSUM, sum_ptr); + fwd_op->SetOpVariantParamAttrPtr(CUDNN_PTR_YSQSUM, sum_of_squares_ptr); + + ctx.cudnn_workspace_handle().RunFunc( [&](void *workspace_ptr) { // workspace ptr - fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_WORKSPACE, workspace_ptr); + fwd_op->SetOpVariantParamAttrPtr(CUDNN_PTR_WORKSPACE, workspace_ptr); // fused op execute - fwd_op_.Execute(handle); + fwd_op->Execute(cudnn_handle); }, - fwd_workspace_byte_); + workspace_size); } - // TBD - void Backward(const platform::CUDADeviceContext &ctx) {} + private: + CudnnFusionOp *GetForwardOp(const platform::CUDADeviceContext &ctx) { + framework::AlgorithmsCache &cache = + *(CudnnFusionOpCache::Instance().GetForward()); + + CudnnFusionOp *fwd_op = cache.GetAlgorithm( + args_.in_dims, args_.filter_dims, args_.strides, args_.paddings, + args_.dilations, 0, static_cast(args_.dtype), [&]() { + CudnnFusionOp *fwd_op = + new CudnnFusionOp(CUDNN_FUSED_SCALE_BIAS_ACTIVATION_CONV_BNSTATS); + + // Set constant_param + fwd_op->SetOpConstParamAttr( + {CUDNN_PARAM_XDATA_PLACEHOLDER, CUDNN_PARAM_WDATA_PLACEHOLDER, + CUDNN_PARAM_YDATA_PLACEHOLDER}, + CUDNN_PTR_16B_ALIGNED); + fwd_op->SetOpConstParamAttr( + {CUDNN_PARAM_YSUM_PLACEHOLDER, CUDNN_PARAM_YSQSUM_PLACEHOLDER}, + CUDNN_PTR_16B_ALIGNED); + + // conv desc + fwd_op->SetOpConstParamDesc(CUDNN_PARAM_CONV_DESC, + args_.conv_desc.desc()); + // input desc + fwd_op->SetOpConstParamDesc(CUDNN_PARAM_XDESC, args_.in_desc.desc()); + // filter desc + fwd_op->SetOpConstParamDesc(CUDNN_PARAM_WDESC, + args_.filter_desc.desc()); + // output desc + fwd_op->SetOpConstParamDesc(CUDNN_PARAM_YDESC, args_.out_desc.desc()); + // output_stats desc + fwd_op->SetOpConstParamDesc(CUDNN_PARAM_YSTATS_DESC, + args_.out_stats_desc.desc()); + // batch_norm mode + fwd_op->SetOpConstParamAttr(CUDNN_PARAM_BN_MODE, + CUDNN_BATCHNORM_SPATIAL_PERSISTENT); + + // Make cudnn fused ops plan + fwd_op->GetWorkspaceSizeInBytes(ctx.cudnn_handle()); + return fwd_op; + }); + return fwd_op; + } private: - void InitDescriptors(const platform::CUDADeviceContext &ctx, - const std::vector &input_shape, - const std::vector &filter_shape, - const std::vector &output_shape, const int &pad, - const int &stride, const int &dilate, const int &group) { - // Set constant_param - fwd_op_.SetOpConstParamAttr( - {CUDNN_PARAM_XDATA_PLACEHOLDER, CUDNN_PARAM_WDATA_PLACEHOLDER, - CUDNN_PARAM_YDATA_PLACEHOLDER}, - CUDNN_PTR_16B_ALIGNED); - fwd_op_.SetOpConstParamAttr( - {CUDNN_PARAM_YSUM_PLACEHOLDER, CUDNN_PARAM_YSQSUM_PLACEHOLDER}, - CUDNN_PTR_16B_ALIGNED); - - std::vector pad_vec = {pad, pad}; - std::vector stride_vec = {stride, stride}; - std::vector dilate_vec = {dilate, dilate}; - int output_channel = filter_shape[0]; - std::vector stats_shape = {1, 1, 1, output_channel}; + NormConvolutionArgs args_; +}; - // set conv desc - conv_desc_.set(dtype_, pad_vec, stride_vec, dilate_vec, false, group); - fwd_op_.SetOpConstParamDesc(CUDNN_PARAM_CONV_DESC, conv_desc_.desc()); +template 
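+// Backward of the fused norm-convolution: the filter gradient reuses the
+// cuDNN fused op CUDNN_FUSED_SCALE_BIAS_ACTIVATION_WGRAD, while the data
+// gradient is computed with cudnnConvolutionBackwardData using
+// CUDNN_CONVOLUTION_BWD_DATA_ALGO_1; when use_addto is set the data gradient
+// is accumulated into input_grad_ptr instead of overwriting it.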
+class CudnnNormConvolutionGrad { + public: + CudnnNormConvolutionGrad(const platform::CUDADeviceContext &ctx, + const std::vector &input_shape, + const std::vector &filter_shape, + const std::vector &output_shape, + const int &padding, const int &stride, + const int &dilation, const int &group) { + args_.Set(input_shape, filter_shape, output_shape, padding, stride, + dilation, group); + dgrad_algo_ = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1; + } + ~CudnnNormConvolutionGrad() {} - // set input desc - in_desc_.set(input_shape, format_, dtype_); - fwd_op_.SetOpConstParamDesc(CUDNN_PARAM_XDESC, in_desc_.desc()); + void Backward(const platform::CUDADeviceContext &ctx, T *input_ptr, + T *output_grad_ptr, T *filter_ptr, T *input_grad_ptr, + T *filter_grad_ptr, bool use_addto = false) { + if (filter_grad_ptr) { + BackwardFilter(ctx, input_ptr, output_grad_ptr, filter_ptr, + filter_grad_ptr); + } + if (input_grad_ptr) { + BackwardData(ctx, input_ptr, output_grad_ptr, filter_ptr, input_grad_ptr, + use_addto); + } + } - // set filter desc - filter_desc_.set(filter_shape, format_, dtype_, group); - fwd_op_.SetOpConstParamDesc(CUDNN_PARAM_WDESC, filter_desc_.desc()); + private: + void BackwardFilter(const platform::CUDADeviceContext &ctx, T *input_ptr, + T *output_grad_ptr, T *filter_ptr, T *filter_grad_ptr) { + auto cudnn_handle = ctx.cudnn_handle(); - // set output desc - out_desc_.set(output_shape, format_, dtype_); - fwd_op_.SetOpConstParamDesc(CUDNN_PARAM_YDESC, out_desc_.desc()); + CudnnFusionOp *wgrad_op = GetBackwardFilterOp(ctx); + size_t workspace_size = RoundUp( + static_cast(wgrad_op->GetWorkspaceSizeInBytes(cudnn_handle)), + 512); - // set output_stats desc - out_stats_desc_.set(stats_shape, format_, cudnn_fwd_compute_type_); - fwd_op_.SetOpConstParamDesc(CUDNN_PARAM_YSTATS_DESC, - out_stats_desc_.desc()); + wgrad_op->SetOpVariantParamAttrPtr(CUDNN_PTR_XDATA, input_ptr); + wgrad_op->SetOpVariantParamAttrPtr(CUDNN_PTR_DYDATA, output_grad_ptr); + wgrad_op->SetOpVariantParamAttrPtr(CUDNN_PTR_DWDATA, filter_grad_ptr); + wgrad_op->SetOpVariantParamAttrPtr( + CUDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES, &workspace_size); - fwd_op_.SetOpConstParamAttr(CUDNN_PARAM_BN_MODE, CUDNN_BATCHNORM_SPATIAL); + ctx.cudnn_workspace_handle().RunFunc( + [&](void *workspace_ptr) { + // workspace ptr + wgrad_op->SetOpVariantParamAttrPtr(CUDNN_PTR_WORKSPACE, + workspace_ptr); + // fused op execute + wgrad_op->Execute(cudnn_handle); + }, + workspace_size); } - void GetWorkspaceSize(const platform::CUDADeviceContext &ctx) { - auto handle = ctx.cudnn_handle(); - fwd_workspace_byte_ = fwd_op_.GetWorkspaceSizeInBytes(handle); + void BackwardData(const platform::CUDADeviceContext &ctx, T *input_ptr, + T *output_grad_ptr, T *filter_ptr, T *input_grad_ptr, + bool use_addto = false) { + auto cudnn_handle = ctx.cudnn_handle(); + size_t workspace_size = GetWorkspaceSizeBwdData(ctx); + + // Convolution dgrad followed optionally by batchnorm dgrad + ScalingParamType alpha = 1.0f; + ScalingParamType beta = use_addto ? 
1.0f : 0.0f; + ctx.cudnn_workspace_handle().RunFunc( + [&](void *cudnn_workspace_ptr) { + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnConvolutionBackwardData( + cudnn_handle, &alpha, args_.filter_desc.desc(), filter_ptr, + args_.out_desc.desc(), output_grad_ptr, + args_.conv_desc.desc(), dgrad_algo_, cudnn_workspace_ptr, + workspace_size, &beta, args_.in_desc.desc(), input_grad_ptr)); + }, + workspace_size); } - size_t fwd_workspace_byte_ = 0; + CudnnFusionOp *GetBackwardFilterOp(const platform::CUDADeviceContext &ctx) { + framework::AlgorithmsCache &cache = + *(CudnnFusionOpCache::Instance().GetBackward()); + + CudnnFusionOp *wgrad_op = cache.GetAlgorithm( + args_.in_dims, args_.filter_dims, args_.strides, args_.paddings, + args_.dilations, 0, static_cast(args_.dtype), [&]() { + CudnnFusionOp *wgrad_op = + new CudnnFusionOp(CUDNN_FUSED_SCALE_BIAS_ACTIVATION_WGRAD); + + wgrad_op->SetOpConstParamAttr( + {CUDNN_PARAM_DYDATA_PLACEHOLDER, CUDNN_PARAM_XDATA_PLACEHOLDER, + CUDNN_PARAM_DWDATA_PLACEHOLDER}, + CUDNN_PTR_16B_ALIGNED); - cudnnDataType_t dtype_; - cudnnDataType_t cudnn_fwd_compute_type_; - platform::TensorDescriptor in_desc_; - platform::FilterDescriptor filter_desc_; - platform::TensorDescriptor out_desc_; - platform::TensorDescriptor out_stats_desc_; - platform::ConvolutionDescriptor conv_desc_; - cudnnTensorFormat_t format_; + // conv desc + wgrad_op->SetOpConstParamDesc(CUDNN_PARAM_CONV_DESC, + args_.conv_desc.desc()); + // input desc + wgrad_op->SetOpConstParamDesc(CUDNN_PARAM_XDESC, + args_.in_desc.desc()); + // filter desc + wgrad_op->SetOpConstParamDesc(CUDNN_PARAM_DWDESC, + args_.filter_desc.desc()); + // output desc + wgrad_op->SetOpConstParamDesc(CUDNN_PARAM_DYDESC, + args_.out_desc.desc()); + wgrad_op->SetOpConstParamAttr(CUDNN_PARAM_BN_MODE, + CUDNN_BATCHNORM_SPATIAL_PERSISTENT); - CudnnFusionOp fwd_op_; + // Make cudnn fused ops plan + wgrad_op->GetWorkspaceSizeInBytes(ctx.cudnn_handle()); + return wgrad_op; + }); + return wgrad_op; + } + + size_t GetWorkspaceSizeBwdData(const platform::CUDADeviceContext &ctx) { + size_t workspace_size = 0U; + auto handle = ctx.cudnn_handle(); + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize( + handle, args_.filter_desc.desc(), args_.out_desc.desc(), + args_.conv_desc.desc(), args_.in_desc.desc(), dgrad_algo_, + &workspace_size)); + return RoundUp(workspace_size, 512); + } + + private: + NormConvolutionArgs args_; + cudnnConvolutionBwdDataAlgo_t dgrad_algo_; }; + #endif } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc b/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc index 125ed856422920..fff7b327f3f2ec 100644 --- a/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc +++ b/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc @@ -11,6 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ + #include #include @@ -29,23 +30,80 @@ namespace op = paddle::operators; using Tensor = paddle::framework::Tensor; USE_OP(conv2d); +USE_OP(conv2d_grad); USE_OP_DEVICE_KERNEL(conv2d, CUDNN); +USE_OP_DEVICE_KERNEL(conv2d_grad, CUDNN); + +template +void InitRandomTensor(const std::vector &dims, + framework::Tensor *cpu_out) { + T *cpu_out_ptr = cpu_out->mutable_data(framework::make_ddim(dims), + platform::CPUPlace()); + + std::default_random_engine random(0); + std::uniform_real_distribution dis(0.0, 1.0); + for (int i = 0; i < cpu_out->numel(); ++i) { + cpu_out_ptr[i] = static_cast(dis(random)); + } +} -// get paddle conv2d op results as baseline template -void Conv2DForwardCompute(const Tensor &x, const Tensor &w, Tensor *y, - const platform::CUDADeviceContext &ctx) { +void TransposeNchwToNhwc(const framework::Tensor &cpu_in, + framework::Tensor *cpu_out) { + auto in_dims = cpu_in.dims(); + EXPECT_EQ(cpu_in.dims().size(), 4); + + const T *cpu_in_ptr = cpu_in.data(); + T *cpu_out_ptr = cpu_out->mutable_data( + {in_dims[0], in_dims[2], in_dims[3], in_dims[1]}, platform::CPUPlace()); + + int64_t n = in_dims[0]; + int64_t c = in_dims[1]; + int64_t hw = in_dims[2] * in_dims[3]; + for (int i = 0; i < n; ++i) { + for (int j = 0; j < hw; ++j) { + for (int k = 0; k < c; ++k) { + int dst_idx = i * hw * c + j * c + k; + int src_idx = i * c * hw + k * hw + j; + cpu_out_ptr[dst_idx] = cpu_in_ptr[src_idx]; + } + } + } +} + +template +void CheckOutput(const framework::Tensor &cpu_res, + const framework::Tensor &cpu_base, float diff, + bool is_relative_atol = false) { + EXPECT_EQ(cpu_res.dims(), cpu_base.dims()); + + const T *cpu_res_ptr = cpu_res.data(); + const T *cpu_base_ptr = cpu_base.data(); + for (int i = 0; i < cpu_res.numel(); ++i) { + if (is_relative_atol) { + EXPECT_LT(static_cast(std::abs((cpu_res_ptr[i] - cpu_base_ptr[i]) / + cpu_base_ptr[i])), + diff); + } else { + EXPECT_LT(static_cast(std::abs(cpu_res_ptr[i] - cpu_base_ptr[i])), + diff); + } + } +} + +// Use Paddle conv2d op results as baseline +template +void ComputeConv2DForward(const platform::CUDADeviceContext &ctx, + const Tensor &cpu_input, const Tensor &cpu_filter, + Tensor *cpu_output) { framework::Scope scope; - auto var_x = scope.Var("Input"); - auto tensor_x = var_x->GetMutable(); - auto var_w = scope.Var("Filter"); - auto tensor_w = var_w->GetMutable(); - auto var_y = scope.Var("Output"); - auto tensor_y = var_y->GetMutable(); + auto *input = scope.Var("Input")->GetMutable(); + auto *filter = scope.Var("Filter")->GetMutable(); + auto *output = scope.Var("Output")->GetMutable(); auto place = ctx.GetPlace(); - TensorCopySync(x, place, tensor_x); - TensorCopySync(w, place, tensor_w); + TensorCopySync(cpu_input, place, input); + TensorCopySync(cpu_filter, place, filter); framework::AttributeMap attrs; bool use_cudnn = true; @@ -60,25 +118,94 @@ void Conv2DForwardCompute(const Tensor &x, const Tensor &w, Tensor *y, {{"Output", {"Output"}}}, attrs); op->Run(scope, ctx.GetPlace()); - TensorCopySync(*tensor_y, place, y); - ctx.Wait(); + TensorCopySync(*output, platform::CPUPlace(), cpu_output); } +// Use Paddle conv2d_grad op results as baseline template -class TestCudnnNormConvOpForward { - public: - TestCudnnNormConvOpForward() { - batch_size_ = 2; - height_ = 8; - width_ = 8; - input_channels_ = 8; - output_channels_ = 32; - kernel_size_ = 1; - stride_ = 1; - pad_ = 0; +void ComputeConv2DBackward(const platform::CUDADeviceContext &ctx, + const Tensor &cpu_input, const Tensor &cpu_filter, + const Tensor &cpu_output_grad, + 
framework::Tensor *cpu_input_grad, + framework::Tensor *cpu_filter_grad, int stride, + int padding, int dilation) { + framework::Scope scope; + auto *input = scope.Var("Input")->GetMutable(); + auto *filter = scope.Var("Filter")->GetMutable(); + auto *output_grad = + scope.Var("Output@GRAD")->GetMutable(); + auto *input_grad = + scope.Var("Input@GRAD")->GetMutable(); + auto *filter_grad = + scope.Var("Filter@GRAD")->GetMutable(); + + auto place = ctx.GetPlace(); + TensorCopySync(cpu_input, place, input); + TensorCopySync(cpu_filter, place, filter); + TensorCopySync(cpu_output_grad, place, output_grad); + + framework::AttributeMap attrs; + bool use_cudnn = true; + std::string data_format = "NHWC"; + std::string padding_algorithm = "SAME"; + std::vector strides = {stride, stride}; + std::vector paddings = {padding, padding}; + std::vector dilations = {dilation, dilation}; + int groups = 1; + bool exhaustive_search = false; + bool use_addto = false; + attrs.insert({"use_cudnn", use_cudnn}); + attrs.insert({"data_format", data_format}); + attrs.insert({"padding_algorithm", padding_algorithm}); + attrs.insert({"strides", strides}); + attrs.insert({"paddings", paddings}); + attrs.insert({"dilations", dilations}); + attrs.insert({"groups", groups}); + attrs.insert({"exhaustive_search", exhaustive_search}); + attrs.insert({"use_addto", use_addto}); + + auto op = framework::OpRegistry::CreateOp( + "conv2d_grad", {{"Input", {"Input"}}, + {"Filter", {"Filter"}}, + {"Output@GRAD", {"Output@GRAD"}}}, + {{"Input@GRAD", {"Input@GRAD"}}, {"Filter@GRAD", {"Filter@GRAD"}}}, + attrs); + op->Run(scope, ctx.GetPlace()); + + TensorCopySync(*input_grad, platform::CPUPlace(), cpu_input_grad); + TensorCopySync(*filter_grad, platform::CPUPlace(), cpu_filter_grad); +} + +template +void ComputeSumAndSquareSum(const framework::Tensor &cpu_out, + framework::Tensor *cpu_sum, + framework::Tensor *cpu_sum_of_square) { + auto dims = cpu_out.dims(); + int64_t c = dims[3]; + + const T *cpu_out_ptr = cpu_out.data(); + float *cpu_sum_ptr = + cpu_sum->mutable_data({1, 1, 1, c}, platform::CPUPlace()); + float *cpu_sum_square_ptr = cpu_sum_of_square->mutable_data( + {1, 1, 1, c}, platform::CPUPlace()); + + for (int j = 0; j < c; ++j) { + float tmp_sum = 0.0f; + float tmp_sum_of_squares = 0.0f; + for (int i = 0; i < cpu_out.numel() / c; ++i) { + float tmp_out = static_cast(cpu_out_ptr[i * c + j]); + tmp_sum += tmp_out; + tmp_sum_of_squares += tmp_out * tmp_out; + } + cpu_sum_ptr[j] = tmp_sum; + cpu_sum_square_ptr[j] = tmp_sum_of_squares; } +} - TestCudnnNormConvOpForward(int batch_size, int height, int width, +template +class CudnnNormConvolutionTester { + public: + CudnnNormConvolutionTester(int batch_size, int height, int width, int input_channels, int output_channels, int kernel_size, int stride) { batch_size_ = batch_size; @@ -88,133 +215,183 @@ class TestCudnnNormConvOpForward { output_channels_ = output_channels; kernel_size_ = kernel_size; stride_ = stride; - pad_ = (kernel_size_ - 1) / 2; + padding_ = (kernel_size_ - 1) / 2; + SetUp(); } - ~TestCudnnNormConvOpForward() {} + ~CudnnNormConvolutionTester() {} + + void CheckForward(float diff, bool is_relative_atol = false) { + platform::CUDADeviceContext *ctx = + static_cast( + platform::DeviceContextPool::Instance().Get( + platform::CUDAPlace(0))); + + framework::Tensor cpu_output_base; + framework::Tensor cpu_sum_base; + framework::Tensor cpu_sum_of_square_base; + BaselineForward(*ctx, &cpu_output_base, &cpu_sum_base, + &cpu_sum_of_square_base); + + framework::Tensor 
cpu_output; + framework::Tensor cpu_sum; + framework::Tensor cpu_sum_of_square; + FusedForward(*ctx, &cpu_output, &cpu_sum, &cpu_sum_of_square); + + // Check forward correctness between baseline and results of normconv. + CheckOutput(cpu_output, cpu_output_base, diff, is_relative_atol); + CheckOutput(cpu_sum, cpu_sum_base, diff, is_relative_atol); + CheckOutput(cpu_sum_of_square, cpu_sum_of_square_base, diff, + is_relative_atol); + } + + void CheckBackward(float diff, bool is_relative_atol = false) { + platform::CUDADeviceContext *ctx = + static_cast( + platform::DeviceContextPool::Instance().Get( + platform::CUDAPlace(0))); + + framework::Tensor cpu_input_grad_base; + framework::Tensor cpu_filter_nchw_grad_base; + framework::Tensor cpu_filter_nhwc_grad_base; + BaselineBackward(*ctx, &cpu_input_grad_base, &cpu_filter_nchw_grad_base); + TransposeNchwToNhwc(cpu_filter_nchw_grad_base, + &cpu_filter_nhwc_grad_base); + + framework::Tensor cpu_input_grad; + framework::Tensor cpu_filter_nhwc_grad; + FusedBackward(*ctx, &cpu_input_grad, &cpu_filter_nhwc_grad); + + // Check backward correctness between baseline and results of normconv. + CheckOutput(cpu_input_grad, cpu_input_grad_base, diff, is_relative_atol); + CheckOutput(cpu_filter_nhwc_grad, cpu_filter_nhwc_grad_base, diff, + is_relative_atol); + } + private: void SetUp() { - input_size_ = batch_size_ * height_ * width_ * input_channels_; - filter_size_ = - output_channels_ * input_channels_ * kernel_size_ * kernel_size_; - output_size_ = batch_size_ * height_ * width_ * output_channels_; - param_size_ = output_channels_; - - input_vec_.resize(input_size_); - filter_raw_vec_.resize(filter_size_); - filter_pro_vec_.resize(filter_size_); - - std::default_random_engine random(0); - std::uniform_real_distribution dis(0.0, 1.0); - for (int i = 0; i < input_size_; ++i) { - input_vec_[i] = static_cast(dis(random)); - } - for (int i = 0; i < filter_size_; ++i) { - filter_raw_vec_[i] = static_cast(dis(random)); - } - // transpoes for filter - // NCHW->NHWC - for (int oc = 0; oc < output_channels_; ++oc) { - for (int kh = 0; kh < kernel_size_; ++kh) { - for (int kw = 0; kw < kernel_size_; ++kw) { - for (int ic = 0; ic < input_channels_; ++ic) { - int dst_idx = oc * kernel_size_ * kernel_size_ * input_channels_ + - kh * kernel_size_ * input_channels_ + - kw * input_channels_ + ic; - int src_idx = oc * kernel_size_ * kernel_size_ * input_channels_ + - ic * kernel_size_ * kernel_size_ + kh * kernel_size_ + - kw; - filter_pro_vec_[dst_idx] = filter_raw_vec_[src_idx]; - } - } - } - } + InitRandomTensor({batch_size_, height_, width_, input_channels_}, + &cpu_input_); + InitRandomTensor( + {output_channels_, input_channels_, kernel_size_, kernel_size_}, + &cpu_filter_nchw_); + // transpoes for filter, NCHW -> NHWC + TransposeNchwToNhwc(cpu_filter_nchw_, &cpu_filter_nhwc_); + InitRandomTensor({batch_size_, height_, width_, output_channels_}, + &cpu_output_grad_); + } - framework::TensorFromVector(input_vec_, *ctx_, &input_); - input_.Resize({batch_size_, height_, width_, input_channels_}); - framework::TensorFromVector(filter_raw_vec_, *ctx_, &filter_raw_); - filter_raw_.Resize( - {output_channels_, input_channels_, kernel_size_, kernel_size_}); - framework::TensorFromVector(filter_pro_vec_, *ctx_, &filter_pro_); - filter_pro_.Resize( - {output_channels_, kernel_size_, kernel_size_, input_channels_}); - output_.Resize({batch_size_, height_, width_, output_channels_}); - base_output_.Resize({batch_size_, height_, width_, output_channels_}); - sum_.Resize({1, 1, 
1, output_channels_}); - sum_of_squares_.Resize({1, 1, 1, output_channels_}); - ctx_->Wait(); + void BaselineForward(const platform::CUDADeviceContext &ctx, + framework::Tensor *cpu_output_base, + framework::Tensor *cpu_sum_base, + framework::Tensor *cpu_sum_of_square_base) { + ComputeConv2DForward(ctx, cpu_input_, cpu_filter_nchw_, cpu_output_base); + ComputeSumAndSquareSum(*cpu_output_base, cpu_sum_base, + cpu_sum_of_square_base); } - void BaselineForward() { - Conv2DForwardCompute(input_, filter_raw_, &base_output_, *ctx_); - ctx_->Wait(); + void BaselineBackward(const platform::CUDADeviceContext &ctx, + framework::Tensor *cpu_input_grad_base, + framework::Tensor *cpu_filter_grad_base) { + ComputeConv2DBackward(ctx, cpu_input_, cpu_filter_nchw_, + cpu_output_grad_, cpu_input_grad_base, + cpu_filter_grad_base, stride_, padding_, + dilation_); } // get forward results of cudnn_norm_conv - void FusedForward() { - auto input_shape = framework::vectorize(input_.dims()); - auto filter_shape = framework::vectorize(filter_pro_.dims()); - auto output_shape = framework::vectorize(output_.dims()); - T *input_ptr = input_.data(); - T *filter_ptr = filter_pro_.data(); - T *output_ptr = output_.mutable_data(place_); - float *sum_ptr = sum_.mutable_data(place_); - float *sum_of_squares_ptr = sum_of_squares_.mutable_data(place_); - - std::shared_ptr> conv_op( - new op::CudnnNormConvolutionOp()); - conv_op->Init(*ctx_, input_shape, filter_shape, output_shape, pad_, stride_, - dilate_, group_); - conv_op->Forward(*ctx_, input_ptr, filter_ptr, output_ptr, sum_ptr, - sum_of_squares_ptr); - ctx_->Wait(); - } + void FusedForward(const platform::CUDADeviceContext &ctx, + framework::Tensor *cpu_output, framework::Tensor *cpu_sum, + framework::Tensor *cpu_sum_of_square) { + framework::Tensor input; + framework::Tensor filter_nhwc; + framework::Tensor output; + framework::Tensor sum; + framework::Tensor sum_of_square; - void Run() { - SetUp(); - BaselineForward(); - FusedForward(); + auto place = ctx.GetPlace(); + TensorCopySync(cpu_input_, place, &input); + TensorCopySync(cpu_filter_nhwc_, place, &filter_nhwc); + + T *input_ptr = input.data(); + T *filter_ptr = filter_nhwc.data(); + T *output_ptr = output.mutable_data( + {batch_size_, height_, width_, output_channels_}, place); + float *sum_ptr = + sum.mutable_data({1, 1, 1, output_channels_}, place); + float *sum_of_square_ptr = + sum_of_square.mutable_data({1, 1, 1, output_channels_}, place); + + auto input_shape = framework::vectorize(input.dims()); + auto filter_shape = framework::vectorize(filter_nhwc.dims()); + auto output_shape = framework::vectorize(output.dims()); + op::CudnnNormConvolution conv_op(ctx, input_shape, filter_shape, + output_shape, padding_, stride_, + dilation_, group_); + conv_op.Forward(ctx, input_ptr, filter_ptr, output_ptr, sum_ptr, + sum_of_square_ptr); + + TensorCopySync(output, platform::CPUPlace(), cpu_output); + TensorCopySync(sum, platform::CPUPlace(), cpu_sum); + TensorCopySync(sum_of_square, platform::CPUPlace(), cpu_sum_of_square); } - // check forward correctness between baseline and results of normconv. 
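// (The per-class CheckOut removed below is superseded by the free-standing CheckOutput
// helper above, which compares results element-wise with either an absolute or a
// relative tolerance depending on is_relative_atol.)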
- void CheckOut(const T diff, bool is_relative_atol = false) { - std::vector base_output_vec, output_vec; - output_vec.resize(output_size_); - base_output_vec.resize(output_size_); - TensorToVector(base_output_, *ctx_, &base_output_vec); - TensorToVector(output_, *ctx_, &output_vec); - ctx_->Wait(); - - for (int i = 0; i < output_size_; ++i) { - if (is_relative_atol) { - EXPECT_LT( - std::abs((output_vec[i] - base_output_vec[i]) / base_output_vec[i]), - diff); - } else { - EXPECT_LT(std::abs(output_vec[i] - base_output_vec[i]), diff); - } - } + void FusedBackward(const platform::CUDADeviceContext &ctx, + framework::Tensor *cpu_input_grad, + framework::Tensor *cpu_filter_grad) { + framework::Tensor input; + framework::Tensor filter_nhwc; + framework::Tensor output_grad; + framework::Tensor input_grad; + framework::Tensor filter_grad; + + auto place = ctx.GetPlace(); + TensorCopySync(cpu_input_, place, &input); + TensorCopySync(cpu_filter_nhwc_, place, &filter_nhwc); + TensorCopySync(cpu_output_grad_, place, &output_grad); + + T *input_ptr = input.data(); + T *filter_ptr = filter_nhwc.data(); + T *output_grad_ptr = output_grad.data(); + T *input_grad_ptr = input_grad.mutable_data(input.dims(), place); + T *filter_grad_ptr = filter_grad.mutable_data(filter_nhwc.dims(), place); + + auto input_shape = framework::vectorize(input.dims()); + auto filter_shape = framework::vectorize(filter_nhwc.dims()); + auto output_shape = framework::vectorize(output_grad.dims()); + op::CudnnNormConvolutionGrad conv_grad_op(ctx, input_shape, filter_shape, + output_shape, padding_, + stride_, dilation_, group_); + conv_grad_op.Backward(ctx, input_ptr, output_grad_ptr, filter_ptr, + input_grad_ptr, filter_grad_ptr); + + TensorCopySync(input_grad, platform::CPUPlace(), cpu_input_grad); + TensorCopySync(filter_grad, platform::CPUPlace(), cpu_filter_grad); } private: - int batch_size_, height_, width_, input_channels_, output_channels_; - int kernel_size_, stride_, pad_; - const int dilate_ = 1; + int batch_size_; + int height_; + int width_; + int input_channels_; + int output_channels_; + int kernel_size_; + int stride_; + int padding_; + const int dilation_ = 1; const int group_ = 1; - int input_size_, filter_size_, output_size_, param_size_; - framework::Tensor input_, filter_raw_, filter_pro_, output_, base_output_; - framework::Tensor sum_, sum_of_squares_; - std::vector input_vec_, filter_raw_vec_, filter_pro_vec_; + // Forward input + framework::Tensor cpu_input_; + framework::Tensor cpu_filter_nchw_; + framework::Tensor cpu_filter_nhwc_; - platform::CUDAPlace place_ = platform::CUDAPlace(0); - platform::CUDADeviceContext *ctx_ = - static_cast( - platform::DeviceContextPool::Instance().Get(place_)); + // Backward input + framework::Tensor cpu_output_grad_; }; // test for fp16, kernel = 1, output_channels = input_channels -TEST(CudnnNormConvForward, GPUCudnnNormConvForward1Fp16) { +TEST(CudnnNormConvFp16, K1S1) { int batch_size = 4; int height = 56; int width = 56; @@ -222,15 +399,15 @@ TEST(CudnnNormConvForward, GPUCudnnNormConvForward1Fp16) { int output_channels = 32; int kernel_size = 1; int stride = 1; - TestCudnnNormConvOpForward test( + CudnnNormConvolutionTester test( batch_size, height, width, input_channels, output_channels, kernel_size, stride); - test.Run(); - test.CheckOut(static_cast(1e-3), true); + test.CheckForward(1e-3, true); + test.CheckBackward(1e-3, true); } // test for fp16, kernel = 3, output_channels = input_channels -TEST(CudnnNormConvForward, GPUCudnnNormConvForward2Fp16) { 
+TEST(CudnnNormConvFp16, K3S1) { int batch_size = 4; int height = 56; int width = 56; @@ -238,15 +415,15 @@ TEST(CudnnNormConvForward, GPUCudnnNormConvForward2Fp16) { int output_channels = 32; int kernel_size = 3; int stride = 1; - TestCudnnNormConvOpForward test( + CudnnNormConvolutionTester test( batch_size, height, width, input_channels, output_channels, kernel_size, stride); - test.Run(); - test.CheckOut(static_cast(1e-3), true); + test.CheckForward(1e-3, true); + test.CheckBackward(1e-3, true); } // test for fp16, kernel = 1, output_channels = input_channels * 4 -TEST(CudnnNormConvForward, GPUCudnnNormConvForward3Fp16) { +TEST(CudnnNormConvFp16, K1S1O4) { int batch_size = 4; int height = 56; int width = 56; @@ -254,9 +431,9 @@ TEST(CudnnNormConvForward, GPUCudnnNormConvForward3Fp16) { int output_channels = 128; int kernel_size = 1; int stride = 1; - TestCudnnNormConvOpForward test( + CudnnNormConvolutionTester test( batch_size, height, width, input_channels, output_channels, kernel_size, stride); - test.Run(); - test.CheckOut(static_cast(1e-3), true); + test.CheckForward(1e-3, true); + test.CheckBackward(1e-3, true); } From 6d4435ac0f76fc2bebe0eeb7fef46b000456b278 Mon Sep 17 00:00:00 2001 From: Yanxing Shi <48111042+Yanxing-Shi@users.noreply.github.com> Date: Wed, 29 Sep 2021 10:42:33 +0800 Subject: [PATCH 045/298] fix paddle.device.cuda.get_device_properties doc (#36178) * Initial Commit * add unittest and add error information * modify doc * fix some error * fix some word * fix bug cudaDeviceProp* and modify error explanation * fix cudaDeviceProp* error and unnitest samples * fix hip error and PADDLE_WITH_HIP * update style * fix error is_compiled_with_cuda * fix paddle.device.cuda.get_device_properties * fix error for multi thread safe * update style * merge conflict * modify after mentor review * update style * delete word * fix unittest error for windows * support string input and modify some code * modify doc to support string input * fix error for express information * fix error for express information * fix unnitest for windows * fix device.startswith('gpu:') * format error and doc * fix after review * format code * fix error for doc compile * fix error for doc compile * fix error for doc compile * fix error for doc compile * fix error for doc compile * fix py2 error * fix wrong words and doc * fix _gpuDeviceProperties * test=document_fix --- python/paddle/device/cuda/__init__.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/python/paddle/device/cuda/__init__.py b/python/paddle/device/cuda/__init__.py index a559df21ad2413..4a65f53fe58d02 100644 --- a/python/paddle/device/cuda/__init__.py +++ b/python/paddle/device/cuda/__init__.py @@ -212,15 +212,15 @@ def get_device_properties(device=None): Return the properties of given device. Args: - device(paddle.CUDAPlace or int or str): The device, the id of the device - or the string name of device like 'gpu:x' which to get the properties of - the device from. If device is None, the device is the current device. + device(paddle.CUDAPlace or int or str): The device, the id of the device or + the string name of device like 'gpu:x' which to get the properties of the + device from. If device is None, the device is the current device. Default: None. 
Returns: - _gpuDeviceProperties: the properties of the device which include ASCII string + _gpuDeviceProperties: The properties of the device which include ASCII string identifying device, major compute capability, minor compute capability, global - memory available on device and the number of multiprocessors on the device. + memory available and the number of multiprocessors on the device. Examples: From f703558dd037ee6d13c4711964d0abad6bbc9466 Mon Sep 17 00:00:00 2001 From: hlygit66666 <32728786+hlygit66666@users.noreply.github.com> Date: Wed, 29 Sep 2021 11:14:48 +0800 Subject: [PATCH 046/298] Add op paddle.device.cuda.get_device_name and paddle.device.cuda.get_device_capability. (#35672) * add op paddle.device.cuda.get_device_name * fix some bugs * fix some bugs * fix error message bugs * fix en docs * fix bugs * fix bugs * fix bugs * add error message test case * add get_device_name and get_device_capability * fix review * fix docs bug * fix docs * fix docs --- python/paddle/device/cuda/__init__.py | 60 +++++++++++++++++++ .../test_cuda_device_name_capability.py | 55 +++++++++++++++++ 2 files changed, 115 insertions(+) create mode 100644 python/paddle/fluid/tests/unittests/test_cuda_device_name_capability.py diff --git a/python/paddle/device/cuda/__init__.py b/python/paddle/device/cuda/__init__.py index 4a65f53fe58d02..970fb35bfaeb1a 100644 --- a/python/paddle/device/cuda/__init__.py +++ b/python/paddle/device/cuda/__init__.py @@ -28,6 +28,8 @@ 'empty_cache', 'stream_guard', 'get_device_properties', + 'get_device_name', + 'get_device_capability', ] @@ -271,3 +273,61 @@ def get_device_properties(device=None): device_id = -1 return core.get_device_properties(device_id) + + +def get_device_name(device=None): + ''' + Return the name of the device which is got from CUDA function `cudaDeviceProp `_. + + Parameters: + device(paddle.CUDAPlace|int, optional): The device or the ID of the device. If device is None (default), the device is the current device. + + Returns: + str: The name of the device. + + Examples: + + .. code-block:: python + + # required: gpu + + import paddle + + paddle.device.cuda.get_device_name() + + paddle.device.cuda.get_device_name(0) + + paddle.device.cuda.get_device_name(paddle.CUDAPlace(0)) + + ''' + + return get_device_properties(device).name + + +def get_device_capability(device=None): + ''' + Return the major and minor revision numbers defining the device's compute capability which are got from CUDA function `cudaDeviceProp `_. + + Parameters: + device(paddle.CUDAPlace|int, optional): The device or the ID of the device. If device is None (default), the device is the current device. + + Returns: + tuple(int,int): the major and minor revision numbers defining the device's compute capability. + + Examples: + + .. code-block:: python + + # required: gpu + + import paddle + + paddle.device.cuda.get_device_capability() + + paddle.device.cuda.get_device_capability(0) + + paddle.device.cuda.get_device_capability(paddle.CUDAPlace(0)) + + ''' + prop = get_device_properties(device) + return prop.major, prop.minor diff --git a/python/paddle/fluid/tests/unittests/test_cuda_device_name_capability.py b/python/paddle/fluid/tests/unittests/test_cuda_device_name_capability.py new file mode 100644 index 00000000000000..88f71f28412e34 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_cuda_device_name_capability.py @@ -0,0 +1,55 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import unittest + + +class TestDeviceName(unittest.TestCase): + def test_device_name_default(self): + if paddle.is_compiled_with_cuda(): + name = paddle.device.cuda.get_device_name() + self.assertIsNotNone(name) + + def test_device_name_int(self): + if paddle.is_compiled_with_cuda(): + name = paddle.device.cuda.get_device_name(0) + self.assertIsNotNone(name) + + def test_device_name_CUDAPlace(self): + if paddle.is_compiled_with_cuda(): + name = paddle.device.cuda.get_device_name(paddle.CUDAPlace(0)) + self.assertIsNotNone(name) + + +class TestDeviceCapability(unittest.TestCase): + def test_device_capability_default(self): + if paddle.is_compiled_with_cuda(): + capability = paddle.device.cuda.get_device_capability() + self.assertIsNotNone(capability) + + def test_device_capability_int(self): + if paddle.is_compiled_with_cuda(): + capability = paddle.device.cuda.get_device_capability(0) + self.assertIsNotNone(capability) + + def test_device_capability_CUDAPlace(self): + if paddle.is_compiled_with_cuda(): + capability = paddle.device.cuda.get_device_capability( + paddle.CUDAPlace(0)) + self.assertIsNotNone(capability) + + +if __name__ == "__main__": + unittest.main() From bec9fc9a902daf5f6669f1a34067f3411da21cc7 Mon Sep 17 00:00:00 2001 From: WangXi Date: Wed, 29 Sep 2021 11:51:20 +0800 Subject: [PATCH 047/298] [hybrid] Fix model parallel non-distributed param broadcast (#36186) --- .../sharding/offload_helper.py | 48 ++++++---- .../meta_optimizers/sharding_optimizer.py | 96 ++++++++++++------- .../test_fleet_hybrid_meta_optimizer.py | 16 ++-- .../test_fleet_sharding_meta_optimizer.py | 14 +-- 4 files changed, 105 insertions(+), 69 deletions(-) diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding/offload_helper.py b/python/paddle/distributed/fleet/meta_optimizers/sharding/offload_helper.py index 3ad6e320316c61..bb6af1b3195f70 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/sharding/offload_helper.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding/offload_helper.py @@ -25,8 +25,9 @@ class OffloadHelper(object): cuda_place_type = 1 cuda_pinned_place_type = 2 - def __init__(self, ring_id=None): - self.ring_id = ring_id + def __init__(self, mp_ring_id=None, dp_ring_id=None): + self.mp_ring_id = mp_ring_id + self.dp_ring_id = dp_ring_id def _insert_cast_op(self, block, idx, src_name, dst_name): src_var = block.var(src_name) @@ -49,20 +50,31 @@ def _insert_cast_op(self, block, idx, src_name, dst_name): OP_ROLE_KEY: OpRole.Optimize }) - def _insert_broadcast_op(self, block, idx, param): - if self.ring_id is None: - return - block._insert_op_without_sync( - idx, - type="c_broadcast", - inputs={'X': param}, - outputs={'Out': param}, - attrs={ - 'ring_id': self.ring_id, - 'root': 0, - 'use_calc_stream': True, - OP_ROLE_KEY: OpRole.Forward, - }) + def _insert_broadcast_op(self, block, idx, param_name): + rings = [] + + if self.dp_ring_id is not None: + rings.append(self.dp_ring_id) + + # need sync 
non distributed param in mp group + if self.mp_ring_id is not None: + param = block.var(param_name) + if not hasattr(param, 'is_distributed') or not param.is_distributed: + rings.append(self.mp_ring_id) + + # the insert op order is: mp, dp + for ring in rings: + block._insert_op_without_sync( + idx, + type="c_broadcast", + inputs={'X': param_name}, + outputs={'Out': param_name}, + attrs={ + 'ring_id': ring, + 'root': 0, + 'use_calc_stream': True, + OP_ROLE_KEY: OpRole.Forward, + }) def _insert_memcpy_op(self, block, idx, src_name, dst_name, dst_place_type): src_var = block.var(src_name) @@ -236,7 +248,7 @@ def remove_param(input_name): self._insert_cast_op(startup_block, insert_idx, var_name, param_to_fp16[var_name]) # NOTE(wangxi): cast and offload should insert after broadcast param. - # the insert op order is: broadcast, cast, offload + # the insert op order is: {mp, dp}broadcast, cast, offload self._insert_broadcast_op(startup_block, insert_idx, var_name) @@ -489,6 +501,8 @@ def remove_param(input_name): self._insert_cast_op(startup_block, insert_idx, var_name, param_to_fp16[var_name]) + # NOTE(wangxi): cast and offload should insert after broadcast param. + # the insert op order is: {mp, dp}broadcast, cast, offload self._insert_broadcast_op(startup_block, insert_idx, var_name) diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py index 75a69e5527bc18..18211459a4e083 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py @@ -467,14 +467,16 @@ def _apply_optimize_offload_pass(self, params_grads): main_block = self._main_program.global_block() startup_block = self._startup_program.global_block() + mp_ring_id = self.mp_ring_id if self.mp_degree > 1 else None dp_ring_id = self.dp_ring_id if self.dp_degree > 1 else None + offload_helper = OffloadHelper( + mp_ring_id=mp_ring_id, dp_ring_id=dp_ring_id) # optimize offload should be enable while gradient merge is enable and # acc_step is quite large (e.g. >> 100). Since its memcpy could not be # overlap with calc, otherwise it will slower down training severely. if sharding_configs["optimize_offload"]: logger.info("Sharding with optimize offload !") - offload_helper = OffloadHelper(ring_id=dp_ring_id) offload_helper.offload(main_block, startup_block) # The optimize_cast is already included in offload_fp32param offload_helper.offload_fp32param(main_block, startup_block) @@ -482,7 +484,6 @@ def _apply_optimize_offload_pass(self, params_grads): logger.info("Sharding with optimize cast !") # NOTE(wangxi): optimize_cast will persist fp16 param, it # will take more memory, but will be faster. Trade space for time. 
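# Note: the OffloadHelper above is now constructed once with both ring ids; for a
# parameter that is not distributed across the mp group it broadcasts in the mp ring
# first and then in the dp ring (see _insert_broadcast_op), so every rank holds an
# identical copy before any cast/offload op is inserted.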
- offload_helper = OffloadHelper(ring_id=dp_ring_id) if self._optimizer_sharding: offload_helper.opt_sharding_cast_fp32param( main_block, startup_block, @@ -554,6 +555,10 @@ def minimize_impl(self, # init param broadcast should be called after startup pruning self._initialization_broadcast() + # NOTE(wangxi): if param is not persistable, program.clone will + # failed, so we remove no persistable param, recreate param as a var + self._recreate_not_persist_param_as_var() + self._dump_program_for_debug() # GPU need to wait server ready, GPU and NPU is Layered connection @@ -1385,23 +1390,14 @@ def _build_groups(self): return - def _initialization_broadcast(self): - """ - this funtion is to ensure the initialization between dp group to be - identical when hybrid-dp is used. - """ - if not self.hybrid_dp: - return - - startup_block = self._startup_program.global_block() - params = startup_block.all_parameters() - params_name = [] + def _recreate_not_persist_param_as_var(self): + def recreate_not_persist_param_as_var(program): + block = program.global_block() + params = block.all_parameters() + for param in params: + if param.persistable: + continue - # NOTE(wangxi): if param is not persistable, program.clone will - # failed, so we remove no persistable param, re add param as a var - for param in params: - params_name.append(param.name) - if not param.persistable: name = param.name shape = param.shape dtype = param.dtype @@ -1411,15 +1407,14 @@ def _initialization_broadcast(self): trainable = param.trainable optimize_attr = param.optimize_attr regularizer = param.regularizer - have_dist_attr = False is_distributed = False if hasattr(param, 'is_distributed'): have_dist_attr = True is_distributed = param.is_distributed - startup_block._remove_var(name, sync=False) - var = startup_block.create_var( + block._remove_var(name, sync=False) + var = block.create_var( name=name, shape=shape, dtype=dtype, @@ -1431,6 +1426,31 @@ def _initialization_broadcast(self): if have_dist_attr: var.is_distributed = is_distributed + block._sync_with_cpp() + + recreate_not_persist_param_as_var(self._startup_program) + recreate_not_persist_param_as_var(self._main_program) + + def _initialization_broadcast(self): + """ + this funtion is to ensure the initialization between dp group to be + identical when hybrid-dp is used, and the initialization of + not distributed param between mp group to be identical. 
+ """ + if self.dp_degree <= 1 and self.mp_degree <= 1: + return + + startup_block = self._startup_program.global_block() + + params = startup_block.all_parameters() + params_name = [] + not_dist_param_name = set() + + for param in params: + params_name.append(param.name) + if not hasattr(param, 'is_distributed') or not param.is_distributed: + not_dist_param_name.add(param.name) + # offload and optimize_cast will insert broadcast op broadcast_params = set() for op in startup_block.ops: @@ -1439,23 +1459,25 @@ def _initialization_broadcast(self): for param in params_name: if param in broadcast_params: continue - startup_block.append_op( - type='c_broadcast', - inputs={'X': param}, - outputs={'Out': param}, - attrs={ - 'ring_id': self.dp_ring_id, - 'root': 0, - 'use_calc_stream': True, - OP_ROLE_KEY: OpRole.Forward - }) - startup_block.append_op( - type='c_sync_comm_stream', - inputs={'X': params_name}, - outputs={'Out': params_name}, - attrs={'ring_id': self.dp_ring_id, - OP_ROLE_KEY: OpRole.Forward}) + rings = [] + # need sync not distributed param in mp group + if self.mp_degree > 1 and param in not_dist_param_name: + rings.append(self.mp_ring_id) + if self.dp_degree > 1: + rings.append(self.dp_ring_id) + + for ring in rings: + startup_block.append_op( + type='c_broadcast', + inputs={'X': param}, + outputs={'Out': param}, + attrs={ + 'ring_id': ring, + 'root': 0, + 'use_calc_stream': True, + OP_ROLE_KEY: OpRole.Forward + }) startup_block._sync_with_cpp() diff --git a/python/paddle/fluid/tests/unittests/test_fleet_hybrid_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_hybrid_meta_optimizer.py index 6eb566935d9d52..35b74eac4b0750 100755 --- a/python/paddle/fluid/tests/unittests/test_fleet_hybrid_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_hybrid_meta_optimizer.py @@ -72,8 +72,7 @@ def test_opt_sharding_with_pp(self): 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', - 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', - 'c_sync_comm_stream' + 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast' ]) self.assertEqual(main_prog_op_types, [ @@ -155,8 +154,7 @@ def test_opt_sharding_with_pp_with_allreduce_fuse(self): 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', - 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', - 'c_sync_comm_stream' + 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast' ]) self.assertEqual(main_prog_op_types, [ @@ -218,7 +216,7 @@ def test_opt_sharding_with_pp_amp_gclip(self): 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', - 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_sync_comm_stream' + 'c_broadcast', 'c_broadcast', 'c_broadcast' ]) self.assertEqual(main_prog_op_types, [ @@ -292,7 +290,7 @@ def test_opt_sharding_with_pp_amp_gclip_fuse_gm(self): 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', - 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_sync_comm_stream' + 'c_broadcast', 'c_broadcast', 'c_broadcast' ]) self.assertEqual(main_prog_op_types, [ @@ 
-371,7 +369,7 @@ def test_opt_sharding_with_pp_amp_ckp_fuse_gm_optcast(self): 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_broadcast', 'cast', 'c_broadcast', 'cast', 'c_broadcast', 'cast', 'c_broadcast', 'cast', 'c_broadcast', 'cast', 'c_broadcast', 'cast', 'c_broadcast', - 'cast', 'c_broadcast', 'c_sync_comm_stream' + 'cast', 'c_broadcast' ]) self.assertEqual(main_prog_op_types, [ @@ -460,7 +458,7 @@ def test_opt_sharding_with_pp_amp_gclip_boundary(self): 'uniform_random', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', - 'c_comm_init', 'c_broadcast', 'c_sync_comm_stream' + 'c_comm_init', 'c_broadcast' ]) self.assertEqual(main_prog_op_types, [ @@ -511,7 +509,7 @@ def test_opt_sharding_with_pp_amp_gclip_boundary_card1(self): 'uniform_random', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', - 'c_gen_nccl_id', 'c_comm_init', 'c_broadcast', 'c_sync_comm_stream' + 'c_gen_nccl_id', 'c_comm_init', 'c_broadcast' ]) self.assertEqual(main_prog_op_types, [ diff --git a/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py index 73eacd118ecad5..7cb033b748874c 100755 --- a/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py @@ -655,7 +655,9 @@ def test_hybrid_with_mp_pp_amp_gclip(self): 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', - 'c_gen_nccl_id', 'c_comm_init' + 'c_gen_nccl_id', 'c_comm_init', 'c_broadcast', 'c_broadcast', + 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', + 'c_broadcast', 'c_broadcast' ]) self.assertEqual(main_prog_op_types, [ @@ -764,7 +766,7 @@ def test_hybrid_with_pp_dp_amp_fp16allreduce(self): 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', - 'c_broadcast', 'c_broadcast', 'c_sync_comm_stream' + 'c_broadcast', 'c_broadcast' ]) self.assertEqual(main_prog_op_types, [ @@ -932,7 +934,7 @@ def test_hybrid_with_pp_dp_amp_fp16allreduce_optimize_cast(self): 'c_gen_nccl_id', 'c_comm_init', 'c_broadcast', 'cast', 'c_broadcast', 'cast', 'c_broadcast', 'cast', 'c_broadcast', 'cast', 'c_broadcast', 'cast', 'c_broadcast', 'cast', 'c_broadcast', 'cast', - 'c_broadcast', 'c_sync_comm_stream' + 'c_broadcast' ]) self.assertEqual(main_prog_op_types, [ @@ -1029,7 +1031,7 @@ def test_hybrid_with_pp_dp_amp_fp16allreduce_optimize_offload(self): 'c_broadcast', 'cast', 'memcpy', 'c_broadcast', 'cast', 'memcpy', 'c_broadcast', 'cast', 'memcpy', 'c_broadcast', 'cast', 'memcpy', 'c_broadcast', 'cast', 'memcpy', 'c_broadcast', 'cast', 'memcpy', - 'c_broadcast', 'c_sync_comm_stream' + 'c_broadcast' ]) self.assertEqual(main_prog_op_types, [ @@ -1129,7 +1131,7 @@ def test_hybrid_with_pp_dp_amp_fp16allreduce_optimize_cast_with_gradient_fuse( 'c_gen_nccl_id', 'c_comm_init', 'c_broadcast', 'cast', 'c_broadcast', 'cast', 'c_broadcast', 'cast', 'c_broadcast', 'cast', 'c_broadcast', 'cast', 'c_broadcast', 'cast', 'c_broadcast', 'cast', - 'c_broadcast', 
'c_sync_comm_stream' + 'c_broadcast' ]) self.assertEqual(main_prog_op_types, [ @@ -1221,7 +1223,7 @@ def test_hybrid_with_pp_dp_amp_with_gradient_fuse(self): 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', - 'c_broadcast', 'c_broadcast', 'c_sync_comm_stream' + 'c_broadcast', 'c_broadcast' ]) self.assertEqual(main_prog_op_types, [ From 7bddf2e88fe1ee64cf695b4198cc398504cf90b5 Mon Sep 17 00:00:00 2001 From: Aganlengzi Date: Wed, 29 Sep 2021 14:42:51 +0800 Subject: [PATCH 048/298] [NPU] mod for model bert (#36165) * merge conflict of paddle_gtest_main.cc * modify FLAGS_npu_precision_mode and default not to call aclSetCompileopt --- .../elementwise/elementwise_sub_op_npu.cc | 4 +- .../fluid/operators/fill_any_like_op_npu.cc | 12 +- paddle/fluid/operators/npu_op_runner.cc | 8 + paddle/fluid/operators/slice_op_npu.cc | 27 ++- paddle/fluid/platform/flags.cc | 7 + .../npu/test_elementwise_sub_op_npu.py | 5 + .../npu/test_fill_any_like_op_npu.py | 6 + .../tests/unittests/npu/test_slice_op_npu.py | 226 ++++++++++++++++++ 8 files changed, 290 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc index 94e78defbbee5d..48b98dafc7bb56 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc @@ -166,9 +166,11 @@ class ElementwiseSubGradNPUKernel : public framework::OpKernel { namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_NPU_KERNEL(elementwise_sub, ops::ElementwiseSubNPUKernel, +REGISTER_OP_NPU_KERNEL(elementwise_sub, ops::ElementwiseSubNPUKernel, + ops::ElementwiseSubNPUKernel, ops::ElementwiseSubNPUKernel); REGISTER_OP_NPU_KERNEL(elementwise_sub_grad, + ops::ElementwiseSubGradNPUKernel, ops::ElementwiseSubGradNPUKernel, ops::ElementwiseSubGradNPUKernel); diff --git a/paddle/fluid/operators/fill_any_like_op_npu.cc b/paddle/fluid/operators/fill_any_like_op_npu.cc index d5204f5cacfc68..566b265bfdba9b 100644 --- a/paddle/fluid/operators/fill_any_like_op_npu.cc +++ b/paddle/fluid/operators/fill_any_like_op_npu.cc @@ -63,9 +63,12 @@ class FillAnyLikeNPUKernel : public framework::OpKernel { .stream(); auto shape = out->dims(); - const auto& runner = NpuOpRunner("FillD", {tensor_tmp}, {*out}, - {{"dims", framework::vectorize(shape)}}); - runner.Run(stream); + NpuOpRunner runner; + runner.SetType("Fill") + .AddInput(framework::vectorize(shape)) + .AddInput(tensor_tmp) + .AddOutput(*out) + .Run(stream); } }; @@ -75,5 +78,8 @@ class FillAnyLikeNPUKernel : public framework::OpKernel { namespace ops = paddle::operators; REGISTER_OP_NPU_KERNEL(fill_any_like, ops::FillAnyLikeNPUKernel, +#ifdef PADDLE_WITH_ASCEND_INT64 + ops::FillAnyLikeNPUKernel, +#endif ops::FillAnyLikeNPUKernel, ops::FillAnyLikeNPUKernel); diff --git a/paddle/fluid/operators/npu_op_runner.cc b/paddle/fluid/operators/npu_op_runner.cc index bb6549c111988e..d10e94962d6a6d 100644 --- a/paddle/fluid/operators/npu_op_runner.cc +++ b/paddle/fluid/operators/npu_op_runner.cc @@ -26,6 +26,8 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/framework.pb.h" +DECLARE_string(npu_precision_mode); + namespace paddle { namespace operators { @@ -404,6 +406,12 @@ void NpuOpRunner::Run(aclrtStream stream) const { VLOG(4) << "attr: " << attr_; VLOG(4) << "stream: " << stream; + if (!FLAGS_npu_precision_mode.empty()) { + PADDLE_ENFORCE_NPU_SUCCESS( + aclSetCompileopt(ACL_PRECISION_MODE, FLAGS_npu_precision_mode.c_str())); + VLOG(4) << "set ACL_PRECISION_MODE: " << FLAGS_npu_precision_mode; + } + aclError ret = aclopCompileAndExecute( op_type_.c_str(), input_descs_.size(), input_descs_.data(), input_buffers_.data(), output_descs_.size(), output_descs_.data(), diff --git a/paddle/fluid/operators/slice_op_npu.cc b/paddle/fluid/operators/slice_op_npu.cc index 1084eadc55c5bc..f8bf46da4a6383 100644 --- a/paddle/fluid/operators/slice_op_npu.cc +++ b/paddle/fluid/operators/slice_op_npu.cc @@ -181,12 +181,37 @@ class SliceGradNPUKernel : public framework::OpKernel { paddings[i][1] = static_cast(in_dims[i] - size[i] - offsets[i]); } + Tensor tmp_dout; + tmp_dout.ShareDataWith(*dout); + auto out_dims = dout->dims(); + auto decrease_axis = ctx.Attr>("decrease_axis"); + auto decrease_size = decrease_axis.size(); + if (decrease_size > 0) { + if (decrease_size == static_cast(in_dims.size())) { + out_dims = framework::make_ddim(std::vector(decrease_size, 1)); + } else { + std::vector origin_out_shape(out_dims.size() + decrease_size, -1); + for (size_t i = 0; i < decrease_size; ++i) { + origin_out_shape[decrease_axis[i]] = 1; + } + int index = 0; + for (size_t i = 0; i < origin_out_shape.size(); ++i) { + if (origin_out_shape[i] == -1) { + origin_out_shape[i] = out_dims[index]; + ++index; + } + } + out_dims = framework::make_ddim(origin_out_shape); + } + tmp_dout.Resize(out_dims); + } + dinput->mutable_data(ctx.GetPlace()); auto stream = ctx.template device_context() .stream(); const auto& runner = - NpuOpRunner("PadD", {*dout}, {*dinput}, {{"paddings", paddings}}); + NpuOpRunner("PadD", {tmp_dout}, {*dinput}, {{"paddings", paddings}}); runner.Run(stream); } }; diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc index b97c3106439bed..89a829f9490f9f 100644 --- a/paddle/fluid/platform/flags.cc +++ b/paddle/fluid/platform/flags.cc @@ -121,6 +121,13 @@ PADDLE_DEFINE_EXPORTED_string( "If proveided, it will be passed to aclInit()."); PADDLE_DEFINE_EXPORTED_int32(min_loss_scaling, 1, "set minmum loss scaling value!"); +PADDLE_DEFINE_EXPORTED_string( + npu_precision_mode, "", + "NPU operator precision mode, options are 'force_fp32', 'force_fp16', " + "'allow_fp32_to_fp16', 'must_keep_origin_dtype' and " + "'allow_mix_precision'. If you want to use the default mode (" + "allow_fp32_to_fp16), set this to empty string. 
For more details, " + "please refer to the documents"); #endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) diff --git a/python/paddle/fluid/tests/unittests/npu/test_elementwise_sub_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_elementwise_sub_op_npu.py index 6faa77b4602137..7c8710fd42b36e 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_elementwise_sub_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_elementwise_sub_op_npu.py @@ -90,6 +90,11 @@ def test_check_output(self): # max_relative_error=0.006,) +class TestElementwiseSubOpInt32(TestElementwiseSubOp): + def init_dtype(self): + self.dtype = np.int32 + + class TestSubtractAPI(unittest.TestCase): def test_name(self): with paddle.static.program_guard(paddle.static.Program()): diff --git a/python/paddle/fluid/tests/unittests/npu/test_fill_any_like_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_fill_any_like_op_npu.py index a687509e6ae9c6..c3074db1aaff68 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_fill_any_like_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_fill_any_like_op_npu.py @@ -57,6 +57,12 @@ def init(self): self.value = -1 +class TestFillAnyLikeNPUOpInt64(TestFillAnyLikeNPUOp): + def init(self): + self.dtype = np.int64 + self.value = -1 + + class TestFillAnyLikeNPUOpFloat32(TestFillAnyLikeNPUOp): def init(self): self.dtype = np.float32 diff --git a/python/paddle/fluid/tests/unittests/npu/test_slice_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_slice_op_npu.py index 5a38f14868bb8a..055c3015f82f5a 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_slice_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_slice_op_npu.py @@ -301,5 +301,231 @@ def test_npu(self): self.assertTrue(np.allclose(npu_loss, cpu_loss)) +class TestSliceOpDecsDim(OpTest): + def setUp(self): + self.op_type = "slice" + self.set_npu() + self.init_dtype() + self.config() + self.set_inputs() + self.set_outputs() + self.set_attrs() + + def set_inputs(self): + self.inputs = {'Input': self.input} + + def set_outputs(self): + self.outputs = {'Out': self.out} + + def set_attrs(self): + self.attrs = { + 'axes': self.axes, + 'starts': self.starts, + 'ends': self.ends, + 'infer_flags': self.infer_flags, + 'decrease_axis': self.decrease_axis, + } + + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype) + self.starts = [1, 0, 2] + self.ends = [2, 3, 4] + self.axes = [0, 1, 2] + self.decrease_axis = [0] + self.infer_flags = [1, 1, 1] + self.out = self.input[1, 0:3, 2:4, :] + + def init_dtype(self): + self.dtype = np.float32 + + def set_npu(self): + self.__class__.use_npu = True + self.place = paddle.NPUPlace(0) + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad_normal(self): + if self.dtype == np.float16: + return + self.check_grad_with_place(self.place, ['Input'], 'Out') + + +class TestSliceOpDecsDimFp16(TestSliceOpDecsDim): + def init_dtype(self): + self.dtype = np.float16 + + +class TestSliceOpDecsDim2(TestSliceOpDecsDim): + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype) + self.starts = [1, 0, 2] + self.ends = [2, 1, 4] + self.axes = [0, 1, 2] + self.decrease_axis = [0, 1] + self.infer_flags = [1, 1, 1] + self.out = self.input[1, 0, 2:4, :] + + +class TestSliceOpDecsDim3(TestSliceOpDecsDim): + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype) + self.starts = [-1, 0, 2] + self.ends = [1000000, 1, 4] + self.axes = [0, 1, 2] 
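        # decrease_axis marks the axes whose length-1 slice is squeezed away, so the
        # expected result below is input[-1, 0, 2:4, :] rather than a 4-D tensor with
        # two leading singleton dims (same convention as the other DecsDim cases).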
+ self.decrease_axis = [0, 1] + self.infer_flags = [1, 1, 1] + self.out = self.input[-1, 0, 2:4, :] + + +class TestSliceOpDecsDim4(TestSliceOpDecsDim): + def config(self): + self.input = np.random.random([3, 4, 5, 7]).astype(self.dtype) + self.starts = [0, 1, 2, 3] + self.ends = [1, 2, 3, 4] + self.axes = [0, 1, 2, 3] + self.decrease_axis = [0, 1, 2, 3] + self.infer_flags = [1, 1, 1] + self.out = self.input[0, 1, 2, 3:4] + + +class TestSliceOpDecsDim5(TestSliceOpDecsDim): + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype) + self.starts = [-1] + self.ends = [1000000] + self.axes = [3] + self.decrease_axis = [3] + self.infer_flags = [1, 1, 1] + self.out = self.input[:, :, :, -1] + + +class TestSliceOpDecsDim6(TestSliceOpDecsDim): + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype) + self.starts = [0, 1, 2, 3] + self.ends = [1, 2, 3, 4] + self.axes = [0, 1, 2, 3] + self.decrease_axis = [0, 1, 2, 3] + self.infer_flags = [1, 1, 1] + self.out = self.input[0, 1, 2, 3:4] + + +class TestSliceOpDecsDimStartsTensor(TestSliceOpDecsDim): + def set_inputs(self): + self.inputs = { + 'Input': self.input, + "StartsTensor": np.array( + self.starts, dtype='int32') + } + + def set_attrs(self): + self.attrs = { + 'axes': self.axes, + #'starts': self.starts, + 'ends': self.ends, + 'infer_flags': self.infer_flags, + 'decrease_axis': self.decrease_axis, + } + + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype) + self.starts = [1, 0, 2] + self.ends = [2, 3, 4] + self.axes = [0, 1, 2] + self.decrease_axis = [0] + self.infer_flags = [-1, -1, -1] + self.out = self.input[1, 0:3, 2:4, :] + + +class TestSliceOpDecsDimStartsTensorFP16(TestSliceOpDecsDimStartsTensor): + def init_dtype(self): + self.dtype = np.float16 + + +class TestSliceOpDecsDimStartsTensorStartsAndEndsTensor(TestSliceOpDecsDim): + def set_inputs(self): + self.inputs = { + 'Input': self.input, + "StartsTensor": np.array( + self.starts, dtype='int64'), + "EndsTensor": np.array( + self.ends, dtype='int32') + } + + def set_attrs(self): + self.attrs = { + 'axes': self.axes, + #'starts': self.starts, + #'ends': self.ends, + 'infer_flags': self.infer_flags, + 'decrease_axis': self.decrease_axis, + } + + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype) + self.starts = [1, 0, 2] + self.ends = [2, 1, 4] + self.axes = [0, 1, 2] + self.decrease_axis = [0, 1] + self.infer_flags = [-1, -1, -1] + self.out = self.input[1, 0, 2:4, :] + + +class TestSliceOpDecsDimStartsTensorStartsAndEndsTensorFP16( + TestSliceOpDecsDimStartsTensorStartsAndEndsTensor): + def init_dtype(self): + self.dtype = np.float16 + + +class TestSliceOpDecsDimStartsListTensor(TestSliceOpDecsDim): + def set_inputs(self): + starts_tensor = [] + for index, ele in enumerate(self.starts): + starts_tensor.append(("x" + str(index), np.ones( + (1)).astype('int32') * ele)) + + self.inputs = {'Input': self.input, 'StartsTensorList': starts_tensor} + + def set_attrs(self): + self.attrs = { + 'axes': self.axes, + 'starts': self.starts_infer, + 'ends': self.ends, + 'infer_flags': self.infer_flags, + 'decrease_axis': self.decrease_axis, + } + + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype) + self.starts = [1, 0, 2] + self.ends = [2, 3, 4] + self.axes = [0, 1, 2] + self.decrease_axis = [0] + self.infer_flags = [1, -1, 1] + self.out = self.input[1, 0:3, 2:4, :] + + self.starts_infer = [1, -1, 2] + + +class 
TestSliceOpDecsDimStartsListTensor2(TestSliceOpDecsDimStartsListTensor): + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype) + self.starts = [-1] + self.ends = [1000000] + self.axes = [3] + self.decrease_axis = [3] + self.infer_flags = [-1] + self.out = self.input[:, :, :, -1] + + self.starts_infer = [-1] + + +class TestSliceOpDecsDimStartsListTensorFP16( + TestSliceOpDecsDimStartsListTensor): + def init_dtype(self): + self.dtype = np.float16 + + if __name__ == '__main__': unittest.main() From c79de7286e4463119639f97143ef1f91cc70d6a9 Mon Sep 17 00:00:00 2001 From: zhulei <563755780@qq.com> Date: Wed, 29 Sep 2021 14:44:27 +0800 Subject: [PATCH 049/298] [NPU] Add group norm (#35937) * [NPU] Add group norm * [NPU] Add group norm * [NPU] Add group norm * [NPU] Add group norm * [NPU] Add group_norm op --- paddle/fluid/operators/group_norm_op_npu.cc | 306 ++++++++++++++++++ .../unittests/npu/test_group_norm_op_npu.py | 217 +++++++++++++ 2 files changed, 523 insertions(+) create mode 100644 paddle/fluid/operators/group_norm_op_npu.cc create mode 100644 python/paddle/fluid/tests/unittests/npu/test_group_norm_op_npu.py diff --git a/paddle/fluid/operators/group_norm_op_npu.cc b/paddle/fluid/operators/group_norm_op_npu.cc new file mode 100644 index 00000000000000..4ef8320cbdecd6 --- /dev/null +++ b/paddle/fluid/operators/group_norm_op_npu.cc @@ -0,0 +1,306 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/group_norm_op.h" +#include +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +struct GroupNormFunction { + public: + explicit GroupNormFunction(const framework::ExecutionContext& ctx) + : ctx(ctx) { + place = ctx.GetPlace(); + stream = ctx.template device_context() + .stream(); + } + void ReduceMean(const Tensor* x, Tensor* y, const std::vector& dim, + bool keep_dims = true) { + // y should be init first + const auto& runner = NpuOpRunner("ReduceMeanD", {*x}, {*y}, + {{"axes", dim}, {"keep_dims", keep_dims}}); + runner.Run(stream); + } + void ReduceSum(const Tensor* x, Tensor* y, const std::vector& dim, + bool keep_dims = true) { + // y should be init first + const auto& runner = NpuOpRunner("ReduceSumD", {*x}, {*y}, + {{"axes", dim}, {"keep_dims", keep_dims}}); + runner.Run(stream); + } + void Add(const Tensor* x, const Tensor* y, Tensor* z) { + // y should be init first + const auto& runner = NpuOpRunner("AddV2", {*x, *y}, {*z}, {}); + runner.Run(stream); + } + void Sub(const Tensor* x, const Tensor* y, Tensor* z) { + // y should be init first + const auto& runner = NpuOpRunner("Sub", {*x, *y}, {*z}, {}); + runner.Run(stream); + } + void Mul(const Tensor* x, const Tensor* y, Tensor* z) { + // y should be init first + const auto& runner = NpuOpRunner("Mul", {*x, *y}, {*z}, {}); + runner.Run(stream); + } + void Div(const Tensor* x, const Tensor* y, Tensor* z) { + // y should be init first + const auto& runner = NpuOpRunner("Div", {*x, *y}, {*z}, {}); + runner.Run(stream); + } + void DivNoNan(const Tensor* x, const Tensor* y, Tensor* z) { + // y should be init first + const auto& runner = NpuOpRunner("DivNoNan", {*x, *y}, {*z}, {}); + runner.Run(stream); + } + void Transpose(const Tensor* x, Tensor* y, const std::vector& axis) { + // y should be init first + const auto& runner = + NpuOpRunner("TransposeD", {*x}, {*y}, {{"perm", axis}}); + runner.Run(stream); + } + void Sqrt(const Tensor* x, Tensor* y) { + // y should be init first + const auto& runner = NpuOpRunner("Sqrt", {*x}, {*y}, {}); + runner.Run(stream); + } + void Adds(const Tensor* x, float scalar, Tensor* y) { + // y should be init first + const auto& runner = NpuOpRunner("Adds", {*x}, {*y}, {{"value", scalar}}); + runner.Run(stream); + } + Tensor ReduceMeanToNG(const Tensor* x, const DataLayout& data_layout, + const int64_t N, const int64_t C, const int64_t H, + const int64_t W, const int G) { + Tensor y(x->type()); + // y.mutable_data( {N,G,1}, place ); + if (data_layout == DataLayout::kNCHW) { + y.mutable_data({N, G, 1}, place); + // shape of x is [N, G, C*H*W/G] + this->ReduceMean(x, &y, std::vector{2}); + } else { + y.mutable_data({N, 1, G}, place); + // shape of x is [N, C*H*W/G, G] + Tensor x_trans(x->type()); + x_trans.mutable_data({N, G, C * H * W / G}, place); + this->Transpose(x, &x_trans, std::vector{0, 2, 1}); + this->ReduceMean(&x_trans, &y, std::vector{2}); + } + return y; + } + + private: + platform::Place place; + aclrtStream stream; + const framework::ExecutionContext& ctx; +}; + +template +class GroupNormNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const std::string data_layout_str = ctx.Attr("data_layout"); + const DataLayout data_layout = + framework::StringToDataLayout(data_layout_str); + const float epsilon = ctx.Attr("epsilon"); + auto* scale = ctx.Input("Scale"); + auto* bias = ctx.Input("Bias"); 
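+    // Forward pass (below): reshape x to [N * groups, C * H * W / groups],
+    // compute the per-group mean and variance with ReduceMeanD, then apply
+    // y = (x - mean) / sqrt(var + epsilon) * scale + bias with elementwise NPU ops.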
+ auto* x = ctx.Input("X"); + + auto* y = ctx.Output("Y"); + auto* mean = ctx.Output("Mean"); + auto* var = ctx.Output("Variance"); + const auto groups = ctx.Attr("groups"); + + auto place = ctx.GetPlace(); + Tensor xnorm(x->type()); + xnorm.mutable_data(x->dims(), place); + GroupNormFunction F(ctx); + if (data_layout != DataLayout::kNCHW) { + xnorm.Resize({x->dims()[0], x->dims()[3], x->dims()[1], x->dims()[2]}); + F.Transpose(x, &xnorm, std::vector{0, 3, 1, 2}); + } else { + TensorCopy(*x, platform::NPUPlace(), &xnorm); + } + auto N = xnorm.dims()[0]; + auto C = xnorm.dims()[1]; + auto H = xnorm.dims()[2]; + auto W = xnorm.dims()[3]; + xnorm.Resize({N * groups, C * H * W / groups}); + std::vector axis = {1}; + auto reduce_dim = mean->dims(); + + mean->mutable_data({N * groups, 1}, place); + var->mutable_data({N * groups, 1}, place); + y->mutable_data(place); + F.ReduceMean(&xnorm, mean, axis); + + F.Sub(&xnorm, mean, &xnorm); + Tensor sqr(x->type()); + sqr.mutable_data(xnorm.dims(), place); + + F.Mul(&xnorm, &xnorm, &sqr); + F.ReduceMean(&sqr, var, axis); + Tensor std(x->type()); + std.mutable_data(var->dims(), place); + F.Adds(var, epsilon, &std); + F.Sqrt(&std, &std); + y->Resize(xnorm.dims()); + F.Div(&xnorm, &std, y); + y->Resize({N, C, H, W}); + if (scale) { + Tensor scale_t(scale->type()); + scale_t.ShareDataWith(*scale); + scale_t.Resize({C, 1, 1}); + F.Mul(y, &scale_t, y); + } + if (bias) { + Tensor bias_t(bias->type()); + bias_t.ShareDataWith(*bias); + bias_t.Resize({C, 1, 1}); + F.Add(y, &bias_t, y); + } + if (data_layout != DataLayout::kNCHW) { + F.Transpose(y, y, std::vector{0, 2, 3, 1}); + y->Resize({x->dims()}); + } + mean->Resize(reduce_dim); + var->Resize(reduce_dim); + } +}; + +template +class GroupNormGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const std::string data_layout_str = ctx.Attr("data_layout"); + const DataLayout data_layout = + framework::StringToDataLayout(data_layout_str); + const float epsilon = ctx.Attr("epsilon"); + auto* y = ctx.Input("Y"); + auto* var = ctx.Input("Variance"); + + auto* scale = ctx.Input("Scale"); + auto* bias = ctx.Input("Bias"); + auto* d_y = ctx.Input(framework::GradVarName("Y")); + const auto G = ctx.Attr("groups"); + + // init output + auto* d_x = ctx.Output(framework::GradVarName("X")); + auto* d_scale = ctx.Output(framework::GradVarName("Scale")); + auto* d_bias = ctx.Output(framework::GradVarName("Bias")); + + GroupNormFunction F(ctx); + auto place = ctx.GetPlace(); + auto _type = y->type(); + + Tensor xnorm(_type); + xnorm.mutable_data(y->dims(), place); + Tensor scale_share(_type); + scale_share.ShareDataWith(*scale); + Tensor bias_share(_type); + bias_share.ShareDataWith(*bias); + + int64_t N = y->dims()[0]; + int64_t C, H, W; + framework::DDim scale_bias_dim; + if (data_layout == DataLayout::kNCHW) { + C = y->dims()[1]; + H = y->dims()[2]; + W = y->dims()[3]; + scale_bias_dim = framework::make_ddim({C, 1, 1}); + } else { + C = y->dims()[3]; + H = y->dims()[1]; + W = y->dims()[2]; + scale_bias_dim = framework::make_ddim({1, 1, C}); + } + scale_share.Resize(scale_bias_dim); + bias_share.Resize(scale_bias_dim); + F.Sub(y, &bias_share, &xnorm); + F.DivNoNan(&xnorm, &scale_share, &xnorm); + + if (d_bias) { + d_bias->mutable_data(place); + if (data_layout == DataLayout::kNCHW) { + F.ReduceSum(d_y, d_bias, std::vector{0, 2, 3}, false); + } else { + F.ReduceSum(d_y, d_bias, std::vector{0, 1, 2}, false); + } + } + if (d_scale) { + 
d_scale->mutable_data(place); + Tensor dy_xnorm(_type); + dy_xnorm.mutable_data(d_y->dims(), place); + F.Mul(d_y, &xnorm, &dy_xnorm); + if (data_layout == DataLayout::kNCHW) { + F.ReduceSum(&dy_xnorm, d_scale, std::vector{0, 2, 3}); + } else { + F.ReduceSum(&dy_xnorm, d_scale, std::vector{0, 1, 2}); + } + } + + // std = Sqrt(var+epsilon), init shape = [ N, G ] + Tensor std(_type); + std.mutable_data(var->dims(), place); + F.Adds(var, epsilon, &std); + F.Sqrt(&std, &std); + // d_xnorm_std = dy_proc * scale / std + Tensor d_xnorm_std(_type); + d_xnorm_std.mutable_data(y->dims(), place); + F.Mul(d_y, &scale_share, &d_xnorm_std); + if (data_layout == DataLayout::kNCHW) { + xnorm.Resize({N, G, C * H * W / G}); + d_xnorm_std.Resize({N, G, C * H * W / G}); + std.Resize({N, G, 1}); + } else { + xnorm.Resize({N, C * H * W / G, G}); + d_xnorm_std.Resize({N, C * H * W / G, G}); + std.Resize({N, 1, G}); + } + F.Div(&d_xnorm_std, &std, &d_xnorm_std); + + // d_x = d_xnorm_std + // - Mean ( d_xnorm_std * x_norm, axis=1, keepdim=True ) * x_norm + // - Mean ( d_xnorm_std, axis=1, keepdim=True ) + d_x->mutable_data(place); + d_x->Resize(xnorm.dims()); + F.Mul(&d_xnorm_std, &xnorm, d_x); + Tensor dx1 = F.ReduceMeanToNG(d_x, data_layout, N, C, H, W, G); + F.Mul(&dx1, &xnorm, d_x); + + Tensor dx2 = F.ReduceMeanToNG(&d_xnorm_std, data_layout, N, C, H, W, G); + + F.Sub(&d_xnorm_std, d_x, d_x); + F.Sub(d_x, &dx2, d_x); + + d_x->Resize(y->dims()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(group_norm, ops::GroupNormNPUKernel, + ops::GroupNormNPUKernel); +REGISTER_OP_NPU_KERNEL(group_norm_grad, ops::GroupNormGradNPUKernel, + ops::GroupNormGradNPUKernel); diff --git a/python/paddle/fluid/tests/unittests/npu/test_group_norm_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_group_norm_op_npu.py new file mode 100644 index 00000000000000..9ab1161be36dd8 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_group_norm_op_npu.py @@ -0,0 +1,217 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
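For reference, the NCHW branch of GroupNormGradNPUKernel above computes
d_x = d_xnorm_std - mean(d_xnorm_std * x_norm) * x_norm - mean(d_xnorm_std),
with the means taken over each (N, group) slice. An equivalent NumPy sketch of that
code path (the helper name and signature are illustrative, not part of this patch):

import numpy as np

def group_norm_grad_naive(y, d_y, scale, bias, var, epsilon, groups):
    # NumPy restatement of the data_layout == NCHW branch of the NPU grad kernel.
    N, C, H, W = y.shape
    G = groups
    # Recover x_norm from the forward output: y = x_norm * scale + bias.
    x_norm = (y - bias.reshape((-1, 1, 1))) / scale.reshape((-1, 1, 1))
    std = np.sqrt(var + epsilon).reshape((N, G, 1))  # var has shape [N, G]
    d_xnorm_std = (d_y * scale.reshape((-1, 1, 1))).reshape((N, G, -1)) / std
    xn = x_norm.reshape((N, G, -1))
    d_x = (d_xnorm_std
           - np.mean(d_xnorm_std * xn, axis=2, keepdims=True) * xn
           - np.mean(d_xnorm_std, axis=2, keepdims=True))
    d_scale = np.sum(d_y * x_norm, axis=(0, 2, 3))
    d_bias = np.sum(d_y, axis=(0, 2, 3))
    return d_x.reshape((N, C, H, W)), d_scale, d_bias
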
+ +from __future__ import print_function +import unittest +import numpy as np + +import sys +sys.path.append("..") + +from operator import mul +from op_test import OpTest +import paddle +import paddle.fluid.core as core +import paddle.fluid as fluid + +paddle.enable_static() + + +def group_norm_naive(x, scale, bias, epsilon, groups, data_layout): + if data_layout == "NHWC": + x = np.transpose(x, (0, 3, 1, 2)) # NHWC => NCHW + N, C, H, W = x.shape + G = groups + x = x.reshape((N * G, -1)) + mean = np.mean(x, axis=1, keepdims=True) + var = np.var(x, axis=1, keepdims=True) + xnorm = (x - mean) / np.sqrt(var + epsilon) + xnorm = xnorm.reshape((N, C, H, W)) + output = xnorm * scale.reshape((-1, 1, 1)) + bias.reshape((-1, 1, 1)) + if data_layout == "NHWC": + output = np.transpose(output, (0, 2, 3, 1)) # NCHW => NHWC + xnorm = np.transpose(xnorm, (0, 2, 3, 1)) + return output, mean.reshape((N, G)), var.reshape((N, G)) + + +class TestGroupNormOpError(unittest.TestCase): + def test_errors(self): + with fluid.program_guard(fluid.Program(), fluid.Program()): + + def test_x_type(): + input = np.random.random(2, 100, 3, 5).astype('float32') + groups = 2 + fluid.layers.group_norm(input, groups) + + self.assertRaises(TypeError, test_x_type) + + def test_x_dtype(): + x2 = fluid.layers.data( + name='x2', shape=[2, 100, 3, 5], dtype='int32') + groups = 2 + fluid.layers.group_norm(x2, groups) + + self.assertRaises(TypeError, test_x_dtype) + + +class TestGroupNormOp(OpTest): + def setUp(self): + self.set_npu() + self.op_type = 'group_norm' + self.place = paddle.NPUPlace(0) + + self.init_dtype() + + self.data_format = "NCHW" + self.atol = 1e-6 + self.max_relative_error = 0.005 + self.shape = (2, 100, 3, 5) + self.attrs = {'epsilon': 1e-5, 'groups': 2, 'data_layout': "NCHW"} + self.compare_between_place = False + self.init_test_case() + + input = np.random.random(self.shape).astype(self.dtype) + if self.data_format == "NHWC": + input = np.transpose(input, (0, 2, 3, 1)) + scale = np.random.random([self.shape[1]]).astype(self.dtype) + bias = np.random.random([self.shape[1]]).astype(self.dtype) + output, mean, var = group_norm_naive( + input, scale, bias, self.attrs['epsilon'], self.attrs['groups'], + self.data_format) + + self.inputs = { + 'X': OpTest.np_dtype_to_fluid_dtype(input), + 'Scale': OpTest.np_dtype_to_fluid_dtype(scale), + 'Bias': OpTest.np_dtype_to_fluid_dtype(bias) + } + self.outputs = {'Y': output, 'Mean': mean, 'Variance': var} + self.attrs['data_layout'] = self.data_format + + def set_npu(self): + self.__class__.use_npu = True + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place, atol=self.atol) + + def test_check_grad(self): + if self.dtype == np.float16: + return + + self.__class__.exist_check_grad = True + inputs_to_check = ['X', 'Scale', 'Bias'] + output_names = 'Y' + no_grad_set = set() + cpu_place = fluid.CPUPlace() + cpu_grads = self._get_gradient(inputs_to_check, cpu_place, output_names, + no_grad_set) + npu_grads = self._get_gradient(inputs_to_check, self.place, + output_names, no_grad_set) + + self._assert_is_close(cpu_grads, npu_grads, inputs_to_check, + self.max_relative_error, + "Gradient Check between places") + + def init_test_case(self): + pass + + +class TestGroupNormOp1(TestGroupNormOp): + def init_test_case(self): + self.attrs['groups'] = 1 + + +class TestGroupNormOp2(TestGroupNormOp): + def init_test_case(self): + self.attrs['groups'] = 4 + + +class TestGroupNormOpBigEps1(TestGroupNormOp): + def 
init_test_case(self): + self.attrs['groups'] = 1 + self.attrs['epsilon'] = 0.5 + + +class TestGroupNormOpBigEps2(TestGroupNormOp): + def init_test_case(self): + self.attrs['groups'] = 4 + self.attrs['epsilon'] = 0.5 + + +class TestGroupNormOpBigEps3(TestGroupNormOp): + def init_test_case(self): + self.attrs['epsilon'] = 0.5 + + +class TestGroupNormOp1_With_NHWC(TestGroupNormOp): + def init_test_case(self): + self.attrs['groups'] = 1 + self.data_format = "NHWC" + + +class TestGroupNormOp2_With_NHWC(TestGroupNormOp): + def init_test_case(self): + self.attrs['groups'] = 4 + self.data_format = "NHWC" + + +class TestGroupNormOpBigEps1_With_NHWC(TestGroupNormOp): + def init_test_case(self): + self.attrs['groups'] = 1 + self.attrs['epsilon'] = 0.5 + self.data_format = "NHWC" + + +class TestGroupNormOpBigEps2_With_NHWC(TestGroupNormOp): + def init_test_case(self): + self.attrs['groups'] = 4 + self.attrs['epsilon'] = 0.5 + self.data_format = "NHWC" + + +class TestGroupNormOpBigEps3_With_NHWC(TestGroupNormOp): + def init_test_case(self): + self.attrs['epsilon'] = 0.5 + self.data_format = "NHWC" + + +class TestGroupNormOpFP16(TestGroupNormOp): + def init_dtype(self): + self.dtype = np.float16 + + +class TestGroupNormOpFP16_With_NHWC(TestGroupNormOp): + def init_dtype(self): + self.dtype = np.float16 + + def init_test_case(self): + self.data_format = "NHWC" + + +class TestGroupNormException(unittest.TestCase): + # data_layout is not NHWC or NCHW + def test_exception(self): + data = fluid.data(name='data', shape=[None, 3, 3, 4], dtype="float64") + + def attr_data_format(): + out = fluid.layers.group_norm( + input=data, groups=2, data_layout="NDHW") + + self.assertRaises(ValueError, attr_data_format) + + +if __name__ == '__main__': + unittest.main() From 2b8fd704d0ec555b5b27d50fca261741a7fbbf28 Mon Sep 17 00:00:00 2001 From: pangyoki Date: Wed, 29 Sep 2021 14:50:43 +0800 Subject: [PATCH 050/298] fix bug of top_k npu op (#36175) --- paddle/fluid/operators/top_k_op_npu.cc | 4 ++- .../tests/unittests/npu/test_top_k_op_npu.py | 36 +++++++++++++++++++ 2 files changed, 39 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/operators/top_k_op_npu.cc b/paddle/fluid/operators/top_k_op_npu.cc index ca3a5f957685d9..a7d8fe01edd4cd 100644 --- a/paddle/fluid/operators/top_k_op_npu.cc +++ b/paddle/fluid/operators/top_k_op_npu.cc @@ -51,7 +51,9 @@ class TopkNPUKernel : public framework::OpKernel { indices->mutable_data(ctx.GetPlace()); // prepare assit - auto dim = input->dims().size(); + auto size = input->dims().size(); + // dim is the last dimension of input + auto dim = input->dims()[size - 1]; framework::Tensor assist_seq_tensor; assist_seq_tensor.Resize({2 * dim}); assist_seq_tensor.mutable_data(ctx.GetPlace()); diff --git a/python/paddle/fluid/tests/unittests/npu/test_top_k_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_top_k_op_npu.py index b735adf76d6c12..c8a620d9dbb351 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_top_k_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_top_k_op_npu.py @@ -22,6 +22,7 @@ import paddle import paddle.fluid as fluid from paddle.fluid import core +from test_top_k_v2_op_npu import numpy_topk paddle.enable_static() SEED = 2021 @@ -87,5 +88,40 @@ def test_check_output(self): self.check_output_with_place(self.place) +class TestTopkV3(OpTest): + def setUp(self): + self.set_npu() + self.place = paddle.NPUPlace(0) + self.op_type = "top_k" + + self.init_dtype() + self.set_input_data() + self.set_attrs() + output, indices = numpy_topk( + self.input_data, 
axis=self.axis, k=self.k, largest=True) + + self.inputs = {'X': self.input_data} + self.attrs = {'k': self.k, 'axis': self.axis} + self.outputs = {'Out': output, 'Indices': indices} + + def set_npu(self): + self.__class__.use_npu = True + self.__class__.no_need_check_grad = True + + def init_dtype(self): + self.dtype = np.float16 + + def test_check_output(self): + self.check_output_with_place(self.place) + + def set_attrs(self): + self.k = 3 + self.axis = 1 + + def set_input_data(self): + self.input_data = np.random.choice( + 10000, size=(10, 20), replace=False).astype(self.dtype) + + if __name__ == '__main__': unittest.main() From 83578cfad12bf1925171c1501cea2bef4a679d3f Mon Sep 17 00:00:00 2001 From: zhulei <563755780@qq.com> Date: Wed, 29 Sep 2021 14:52:05 +0800 Subject: [PATCH 051/298] [npu] add box coder (#36171) * [npu] add box coder * [npu] add box coder --- .../fluid/operators/detection/CMakeLists.txt | 7 +- .../operators/detection/box_coder_op_npu.cc | 373 ++++++++++++++++++ .../unittests/npu/test_box_coder_op_npu.py | 252 ++++++++++++ 3 files changed, 631 insertions(+), 1 deletion(-) create mode 100644 paddle/fluid/operators/detection/box_coder_op_npu.cc create mode 100644 python/paddle/fluid/tests/unittests/npu/test_box_coder_op_npu.py diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt index c04d04f8413882..4e951f6318cc9c 100644 --- a/paddle/fluid/operators/detection/CMakeLists.txt +++ b/paddle/fluid/operators/detection/CMakeLists.txt @@ -15,8 +15,13 @@ function(detection_library TARGET_NAME) PARENT_SCOPE) endfunction() +if (WITH_ASCEND_CL) + detection_library(box_coder_op SRCS box_coder_op.cc box_coder_op.cu box_coder_op_npu.cc) +else() + detection_library(box_coder_op SRCS box_coder_op.cc box_coder_op.cu) +endif() + detection_library(bipartite_match_op SRCS bipartite_match_op.cc) -detection_library(box_coder_op SRCS box_coder_op.cc box_coder_op.cu) detection_library(mine_hard_examples_op SRCS mine_hard_examples_op.cc) detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op.cu) detection_library(density_prior_box_op SRCS density_prior_box_op.cc density_prior_box_op.cu) diff --git a/paddle/fluid/operators/detection/box_coder_op_npu.cc b/paddle/fluid/operators/detection/box_coder_op_npu.cc new file mode 100644 index 00000000000000..9d97c7af9630c9 --- /dev/null +++ b/paddle/fluid/operators/detection/box_coder_op_npu.cc @@ -0,0 +1,373 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/detection/box_coder_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +struct BoxCoderFunction { + public: + explicit BoxCoderFunction(const framework::ExecutionContext& ctx) : ctx(ctx) { + place = ctx.GetPlace(); + stream = ctx.template device_context() + .stream(); + } + Tensor Adds(const Tensor& x, float scalar) { + Tensor y; + y.mutable_data(x.dims(), place); + const auto& runner = NpuOpRunner("Adds", {x}, {y}, {{"value", scalar}}); + runner.Run(stream); + return y; + } + Tensor Muls(const Tensor& x, float scalar) { + Tensor y; + y.mutable_data(x.dims(), place); + const auto& runner = NpuOpRunner("Muls", {x}, {y}, {{"value", scalar}}); + runner.Run(stream); + return y; + } + Tensor Mul(const Tensor& x, const Tensor& y) { + Tensor z; + z.mutable_data(x.dims(), place); + const auto& runner = NpuOpRunner("Mul", {x, y}, {z}, {}); + runner.Run(stream); + return z; + } + Tensor SubWithBroadCast(const Tensor& x, const Tensor& y, + const framework::DDim& shape) { + Tensor z; + z.mutable_data(shape, place); + const auto& runner = NpuOpRunner("Sub", {x, y}, {z}, {}); + runner.Run(stream); + return z; + } + void DivWithBroadCastVoid(const Tensor& x, const Tensor& y, + const framework::DDim& shape, Tensor* z) { + z->mutable_data(shape, place); + const auto& runner = NpuOpRunner("Div", {x, y}, {*z}, {}); + runner.Run(stream); + } + Tensor DivWithBroadCast(const Tensor& x, const Tensor& y, + const framework::DDim& shape) { + Tensor z; + DivWithBroadCastVoid(x, y, shape, &z); + return z; + } + void MulWithBroadCastVoid(const Tensor& x, const Tensor& y, + const framework::DDim& shape, Tensor* z) { + z->mutable_data(shape, place); + const auto& runner = NpuOpRunner("Mul", {x, y}, {*z}, {}); + runner.Run(stream); + } + Tensor MulWithBroadCast(const Tensor& x, const Tensor& y, + const framework::DDim& shape) { + Tensor z; + MulWithBroadCastVoid(x, y, shape, &z); + return z; + } + void AddWithBroadCastVoid(const Tensor& x, const Tensor& y, + const framework::DDim& shape, Tensor* z) { + z->mutable_data(shape, place); + const auto& runner = NpuOpRunner("AddV2", {x, y}, {*z}, {}); + runner.Run(stream); + } + Tensor AddWithBroadCast(const Tensor& x, const Tensor& y, + const framework::DDim& shape) { + Tensor z; + AddWithBroadCastVoid(x, y, shape, &z); + return z; + } + Tensor Abs(const Tensor& x) { + Tensor y; + y.mutable_data(x.dims(), place); + const auto& runner = NpuOpRunner("Abs", {x}, {y}, {}); + runner.Run(stream); + return y; + } + Tensor Log(const Tensor& x) { + Tensor t_x_m1 = Adds(x, -1); + Tensor y; + y.mutable_data(x.dims(), place); + const auto& runner = NpuOpRunner("Log1p", {t_x_m1}, {y}, {}); + runner.Run(stream); + return y; + } + Tensor Exp(const Tensor& x) { + Tensor y; + y.mutable_data(x.dims(), place); + const auto& runner = NpuOpRunner("Exp", {x}, {y}, {}); + runner.Run(stream); + return y; + } + Tensor Dot(const Tensor& x, const Tensor& y) { + auto dim_x = x.dims(); + auto dim_y = y.dims(); + PADDLE_ENFORCE_EQ( + dim_x.size(), 2, + platform::errors::InvalidArgument( + "x should be a 2-dim tensor, but got %d-dim.", dim_x.size())); + PADDLE_ENFORCE_EQ( + dim_y.size(), 2, + platform::errors::InvalidArgument( + "y should be a 2-dim tensor, but got %d-dim.", dim_y.size())); + PADDLE_ENFORCE_EQ( + dim_x[1], dim_y[0], + platform::errors::InvalidArgument("Expect dim_x[1] == dim_y[0], but " + "got dim_x[1] = %d, dim_y[0] = %d.", + dim_x[1], dim_y[0])); + 
Tensor z; + z.mutable_data({dim_x[0], dim_y[1]}, place); + const auto& runner = + NpuOpRunner("MatMul", {x, y}, {z}, + {{"transpose_x1", false}, {"transpose_x2", false}}); + runner.Run(stream); + return z; + } + void ConcatVoid(const std::vector& inputs, + const framework::DDim& shape_out, int axis, Tensor* output) { + output->mutable_data(shape_out, place); + std::vector names; + for (size_t i = 0; i < inputs.size(); i++) { + names.push_back("x" + std::to_string(i)); + } + NpuOpRunner runner{ + "ConcatD", + {inputs}, + {*output}, + {{"concat_dim", axis}, {"N", static_cast(inputs.size())}}}; + runner.AddInputNames(names); + runner.Run(stream); + } + Tensor Concat(const std::vector& inputs, + const framework::DDim& shape_out, int axis) { + Tensor output; + ConcatVoid(inputs, shape_out, axis, &output); + return output; + } + Tensor Slice(const Tensor& x, const std::vector& offsets, + const std::vector& size, const framework::DDim& shape) { + Tensor y; + y.mutable_data(shape, place); + const auto& runner = + NpuOpRunner("SliceD", {x}, {y}, {{"offsets", offsets}, {"size", size}}); + runner.Run(stream); + return y; + } + + private: + platform::Place place; + aclrtStream stream; + const framework::ExecutionContext& ctx; +}; + +template +void Vector2Tensor(const framework::ExecutionContext& ctx, + const std::vector& vec, const framework::DDim& ddim, + Tensor* tsr) { + framework::TensorFromVector(vec, ctx.device_context(), tsr); + ctx.template device_context().Wait(); + tsr->Resize(ddim); +} + +template +void BoxCoderEnc(const framework::ExecutionContext& ctx, const Tensor* tb, + const Tensor* pb, const Tensor* pbv, const bool norm, + const std::vector& variance, Tensor* out) { + auto M = pb->dims()[0]; + auto N = tb->dims()[0]; + auto shape_0 = framework::make_ddim({4, 2}); + Tensor m_diff; + Tensor m_aver; + std::vector vec_diff = {static_cast(-1), static_cast(0), + static_cast(0), static_cast(-1), + static_cast(1), static_cast(0), + static_cast(0), static_cast(1)}; + std::vector vec_aver = {static_cast(0.5), static_cast(0), + static_cast(0), static_cast(0.5), + static_cast(0.5), static_cast(0), + static_cast(0), static_cast(0.5)}; + Vector2Tensor(ctx, vec_diff, shape_0, &m_diff); + Vector2Tensor(ctx, vec_aver, shape_0, &m_aver); + + BoxCoderFunction F(ctx); + Tensor pb_xy = F.Adds(F.Dot(*pb, m_aver), (norm ? 0 : 0.5)); + Tensor pb_wh = F.Adds(F.Dot(*pb, m_diff), (norm ? 0 : 1)); + Tensor tb_xy = F.Dot(*tb, m_aver); + Tensor tb_wh = F.Adds(F.Dot(*tb, m_diff), (norm ? 
0 : 1)); + + pb_xy.Resize({1, M, 2}); + pb_wh.Resize({1, M, 2}); + tb_xy.Resize({N, 1, 2}); + tb_wh.Resize({N, 1, 2}); + + auto shape_half = framework::make_ddim({N, M, 2}); + auto shape_full = framework::make_ddim({N, M, 4}); + + Tensor out_xy_0 = F.DivWithBroadCast( + F.SubWithBroadCast(tb_xy, pb_xy, shape_half), pb_wh, shape_half); + Tensor out_wh_0 = F.Log(F.Abs(F.DivWithBroadCast(tb_wh, pb_wh, shape_half))); + Tensor out_0 = F.Concat({out_xy_0, out_wh_0}, shape_full, 2); + + if (pbv) { + F.DivWithBroadCastVoid(out_0, *pbv, shape_full, out); + } else { + Tensor t_var; + std::vector vec_var(4); + for (auto i = 0; i < 4; i++) { + vec_var[i] = static_cast(variance[i]); + } + Vector2Tensor(ctx, vec_var, framework::make_ddim({1, 1, 4}), &t_var); + F.DivWithBroadCastVoid(out_0, t_var, shape_full, out); + } +} + +template +void BoxCoderDec(const framework::ExecutionContext& ctx, const Tensor* tb, + const Tensor* pb, const Tensor* pbv, const bool norm, + const std::vector& variance, int axis, Tensor* out) { + auto shape_0 = framework::make_ddim({4, 2}); + Tensor m_diff; + Tensor m_aver; + std::vector vec_diff = {static_cast(-1), static_cast(0), + static_cast(0), static_cast(-1), + static_cast(1), static_cast(0), + static_cast(0), static_cast(1)}; + std::vector vec_aver = {static_cast(0.5), static_cast(0), + static_cast(0), static_cast(0.5), + static_cast(0.5), static_cast(0), + static_cast(0), static_cast(0.5)}; + Vector2Tensor(ctx, vec_diff, shape_0, &m_diff); + Vector2Tensor(ctx, vec_aver, shape_0, &m_aver); + + BoxCoderFunction F(ctx); + Tensor pb_xy = F.Adds(F.Dot(*pb, m_aver), (norm ? 0 : 0.5)); + Tensor pb_wh = F.Adds(F.Dot(*pb, m_diff), (norm ? 0 : 1)); + auto pb_resize_shape = axis == 0 + ? framework::make_ddim({1, pb->dims()[0], 2}) + : framework::make_ddim({pb->dims()[0], 1, 2}); + pb_xy.Resize(pb_resize_shape); + pb_wh.Resize(pb_resize_shape); + + auto tbox_slice_shape = + framework::make_ddim({tb->dims()[0], tb->dims()[1], 2}); + std::vector tbox_slice_size = {static_cast(tb->dims()[0]), + static_cast(tb->dims()[1]), 2}; + Tensor tbox01 = F.Slice(*tb, {0, 0, 0}, tbox_slice_size, tbox_slice_shape); + Tensor tbox23 = F.Slice(*tb, {0, 0, 2}, tbox_slice_size, tbox_slice_shape); + + Tensor tb_xy; + Tensor tb_wh; + if (pbv) { + auto pbvt_slice_shape = framework::make_ddim({pbv->dims()[0], 2}); + auto pbvt_resize_shape = axis == 0 + ? 
framework::make_ddim({1, pbv->dims()[0], 2}) + : framework::make_ddim({pbv->dims()[0], 1, 2}); + std::vector pbvt_slice_size = {static_cast(pbv->dims()[0]), 2}; + Tensor pbv_t01 = F.Slice(*pbv, {0, 0}, pbvt_slice_size, pbvt_slice_shape); + Tensor pbv_t23 = F.Slice(*pbv, {0, 2}, pbvt_slice_size, pbvt_slice_shape); + pbv_t01.Resize(pbvt_resize_shape); + pbv_t23.Resize(pbvt_resize_shape); + + F.AddWithBroadCastVoid( + F.MulWithBroadCast(tbox01, F.Mul(pb_wh, pbv_t01), tbox_slice_shape), + pb_xy, tbox_slice_shape, &tb_xy); + F.MulWithBroadCastVoid( + F.Exp(F.MulWithBroadCast(pbv_t23, tbox23, tbox_slice_shape)), pb_wh, + tbox_slice_shape, &tb_wh); + } else if (variance.empty()) { + F.AddWithBroadCastVoid(F.MulWithBroadCast(tbox01, pb_wh, tbox_slice_shape), + pb_xy, tbox_slice_shape, &tb_xy); + F.MulWithBroadCastVoid(F.Exp(tbox23), pb_wh, tbox_slice_shape, &tb_wh); + } else { + Tensor t_var01, t_var23; + auto t_var_shape = framework::make_ddim({1, 1, 2}); + std::vector vec_var01 = {static_cast(variance[0]), + static_cast(variance[1])}; + std::vector vec_var23 = {static_cast(variance[2]), + static_cast(variance[3])}; + Vector2Tensor(ctx, vec_var01, t_var_shape, &t_var01); + Vector2Tensor(ctx, vec_var23, t_var_shape, &t_var23); + F.AddWithBroadCastVoid( + F.MulWithBroadCast(tbox01, + F.MulWithBroadCast(pb_wh, t_var01, pb_resize_shape), + tbox_slice_shape), + pb_xy, tbox_slice_shape, &tb_xy); + F.MulWithBroadCastVoid( + F.Exp(F.MulWithBroadCast(t_var23, tbox23, tbox_slice_shape)), pb_wh, + tbox_slice_shape, &tb_wh); + } + Tensor obox01 = + F.AddWithBroadCast(tb_xy, F.Muls(tb_wh, -0.5), tbox_slice_shape); + Tensor obox23 = + F.Adds(F.AddWithBroadCast(tb_xy, F.Muls(tb_wh, 0.5), tbox_slice_shape), + (norm ? 0 : -1)); + F.ConcatVoid({obox01, obox23}, out->dims(), 2, out); +} + +template +class BoxCoderNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* prior_box = ctx.Input("PriorBox"); + auto* prior_box_var = ctx.Input("PriorBoxVar"); + auto* target_box = ctx.Input("TargetBox"); + auto* output_box = ctx.Output("OutputBox"); + std::vector variance = ctx.Attr>("variance"); + const int axis = ctx.Attr("axis"); + + if (prior_box_var) { + PADDLE_ENFORCE_EQ(variance.empty(), true, + platform::errors::InvalidArgument( + "Input 'PriorBoxVar' and attribute 'variance'" + " of BoxCoder operator should not be used at the " + "same time.")); + } + if (!(variance.empty())) { + PADDLE_ENFORCE_EQ(static_cast(variance.size()), 4, + platform::errors::InvalidArgument( + "Size of attribute 'variance' in BoxCoder operator" + " should be 4. 
But received size is %d", + variance.size())); + } + + if (target_box->lod().size()) { + PADDLE_ENFORCE_EQ(target_box->lod().size(), 1, + platform::errors::InvalidArgument( + "Input 'TargetBox' of BoxCoder operator only" + " supports LoD with one level.")); + } + + auto code_type = GetBoxCodeType(ctx.Attr("code_type")); + bool normalized = ctx.Attr("box_normalized"); + + if (code_type == BoxCodeType::kEncodeCenterSize) { + BoxCoderEnc(ctx, target_box, prior_box, prior_box_var, normalized, + variance, output_box); + } else { + BoxCoderDec(ctx, target_box, prior_box, prior_box_var, normalized, + variance, axis, output_box); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(box_coder, ops::BoxCoderNPUKernel, + ops::BoxCoderNPUKernel); diff --git a/python/paddle/fluid/tests/unittests/npu/test_box_coder_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_box_coder_op_npu.py new file mode 100644 index 00000000000000..4d4d61ace841e8 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_box_coder_op_npu.py @@ -0,0 +1,252 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
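Beyond the operator test below, a minimal static-graph sketch of driving the new NPU
kernel from Python, assuming the existing fluid.layers.box_coder API (names, shapes
and the random feed data are illustrative only):

import numpy as np
import paddle
import paddle.fluid as fluid

paddle.enable_static()
prior_box = fluid.data(name='prior_box', shape=[81, 4], dtype='float32')
target_box = fluid.data(name='target_box', shape=[20, 81, 4], dtype='float32')
# Variance passed as an attribute; decoding maps offsets back to corner-form boxes.
decoded = fluid.layers.box_coder(
    prior_box=prior_box,
    prior_box_var=[0.1, 0.1, 0.2, 0.2],
    target_box=target_box,
    code_type='decode_center_size',
    box_normalized=False)
exe = fluid.Executor(paddle.NPUPlace(0))
exe.run(fluid.default_startup_program())
out, = exe.run(feed={'prior_box': np.random.rand(81, 4).astype('float32'),
                     'target_box': np.random.rand(20, 81, 4).astype('float32')},
               fetch_list=[decoded])
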
+ +from __future__ import print_function + +import unittest +import numpy as np +import sys +sys.path.append("..") +import math +import paddle +from op_test import OpTest + +paddle.enable_static() + +np.random.seed(2021) + + +def box_decoder(t_box, p_box, pb_v, output_box, norm, axis=0): + pb_w = p_box[:, 2] - p_box[:, 0] + (norm == False) + pb_h = p_box[:, 3] - p_box[:, 1] + (norm == False) + pb_x = pb_w * 0.5 + p_box[:, 0] + pb_y = pb_h * 0.5 + p_box[:, 1] + + shape = (1, p_box.shape[0]) if axis == 0 else (p_box.shape[0], 1) + + pb_w = pb_w.reshape(shape) + pb_h = pb_h.reshape(shape) + pb_x = pb_x.reshape(shape) + pb_y = pb_y.reshape(shape) + + if pb_v.ndim == 2: + var_shape = (1, pb_v.shape[0], pb_v.shape[1]) if axis == 0 else ( + pb_v.shape[0], 1, pb_v.shape[1]) + pb_v = pb_v.reshape(var_shape) + if pb_v.ndim == 1: + tb_x = pb_v[0] * t_box[:, :, 0] * pb_w + pb_x + tb_y = pb_v[1] * t_box[:, :, 1] * pb_h + pb_y + tb_w = np.exp(pb_v[2] * t_box[:, :, 2]) * pb_w + tb_h = np.exp(pb_v[3] * t_box[:, :, 3]) * pb_h + else: + tb_x = pb_v[:, :, 0] * t_box[:, :, 0] * pb_w + pb_x + tb_y = pb_v[:, :, 1] * t_box[:, :, 1] * pb_h + pb_y + tb_w = np.exp(pb_v[:, :, 2] * t_box[:, :, 2]) * pb_w + tb_h = np.exp(pb_v[:, :, 3] * t_box[:, :, 3]) * pb_h + output_box[:, :, 0] = tb_x - tb_w / 2 + output_box[:, :, 1] = tb_y - tb_h / 2 + output_box[:, :, 2] = tb_x + tb_w / 2 - (not norm) + output_box[:, :, 3] = tb_y + tb_h / 2 - (not norm) + + +def box_encoder(t_box, p_box, pb_v, output_box, norm): + pb_w = p_box[:, 2] - p_box[:, 0] + (norm == False) + pb_h = p_box[:, 3] - p_box[:, 1] + (norm == False) + pb_x = pb_w * 0.5 + p_box[:, 0] + pb_y = pb_h * 0.5 + p_box[:, 1] + shape = (1, p_box.shape[0]) + + pb_w = pb_w.reshape(shape) + pb_h = pb_h.reshape(shape) + pb_x = pb_x.reshape(shape) + pb_y = pb_y.reshape(shape) + + if pb_v.ndim == 2: + pb_v = pb_v.reshape(1, pb_v.shape[0], pb_v.shape[1]) + tb_x = ((t_box[:, 2] + t_box[:, 0]) / 2).reshape(t_box.shape[0], 1) + tb_y = ((t_box[:, 3] + t_box[:, 1]) / 2).reshape(t_box.shape[0], 1) + tb_w = (t_box[:, 2] - t_box[:, 0]).reshape(t_box.shape[0], 1) + (not norm) + tb_h = (t_box[:, 3] - t_box[:, 1]).reshape(t_box.shape[0], 1) + (not norm) + if pb_v.ndim == 1: + output_box[:, :, 0] = (tb_x - pb_x) / pb_w / pb_v[0] + output_box[:, :, 1] = (tb_y - pb_y) / pb_h / pb_v[1] + output_box[:, :, 2] = np.log(np.fabs(tb_w / pb_w)) / pb_v[2] + output_box[:, :, 3] = np.log(np.fabs(tb_h / pb_h)) / pb_v[3] + else: + output_box[:, :, 0] = (tb_x - pb_x) / pb_w / pb_v[:, :, 0] + output_box[:, :, 1] = (tb_y - pb_y) / pb_h / pb_v[:, :, 1] + output_box[:, :, 2] = np.log(np.fabs(tb_w / pb_w)) / pb_v[:, :, 2] + output_box[:, :, 3] = np.log(np.fabs(tb_h / pb_h)) / pb_v[:, :, 3] + + +def batch_box_coder(p_box, pb_v, t_box, lod, code_type, norm, axis=0): + n = t_box.shape[0] + m = p_box.shape[0] + if code_type == "decode_center_size": + m = t_box.shape[1] + output_box = np.zeros((n, m, 4), dtype=np.float32) + cur_offset = 0 + + for i in range(len(lod)): + if (code_type == "encode_center_size"): + box_encoder(t_box[cur_offset:(cur_offset + lod[i]), :], p_box, pb_v, + output_box[cur_offset:(cur_offset + lod[i]), :, :], + norm) + elif (code_type == "decode_center_size"): + box_decoder(t_box, p_box, pb_v, output_box, norm, axis) + cur_offset += lod[i] + return output_box + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestBoxCoderOp(OpTest): + def setUp(self): + self.op_type = "box_coder" + self.set_npu() + self.init_dtype() + + self.set_init_config() + 
self.set_inputs() + self.set_attrs() + self.set_outputs() + + def set_npu(self): + self.__class__.use_npu = True + self.place = paddle.NPUPlace(0) + + def init_dtype(self): + self.dtype = np.float32 + + def set_init_config(self): + self.M = 81 + self.N = 20 + self.code_type = 'decode_center_size' + self.box_normalized = False + self.lod = [[1, 1, 1, 1, 1]] + self.axis = 0 + self.use_variance = False + self.without_prior_box_var = False + self.atol = 1e-5 + + def set_inputs(self): + self.inputs = {} + assert (self.code_type in ['decode_center_size', 'encode_center_size']) + assert (self.axis in [0, 1]) + if self.code_type == 'decode_center_size': + assert (not self.use_variance or not self.without_prior_box_var) + + self.prior_box = np.random.random((self.M, 4)).astype(self.dtype) + + if self.use_variance: + self.prior_box_var = np.random.random(4).astype(self.dtype) + else: + if self.without_prior_box_var: + self.prior_box_var = np.ones((self.M, 4)).astype(self.dtype) + else: + self.prior_box_var = np.random.random( + (self.M, 4)).astype(self.dtype) + + if self.axis == 0: + self.target_box = np.random.random( + (self.N, self.M, 4)).astype(self.dtype) + else: + self.target_box = np.random.random( + (self.M, self.N, 4)).astype(self.dtype) + self.inputs['PriorBox'] = self.prior_box + self.inputs['TargetBox'] = self.target_box + if (not self.use_variance and not self.without_prior_box_var): + self.inputs['PriorBoxVar'] = self.prior_box_var + else: + #encode_center_size + self.prior_box = np.random.random((self.M, 4)).astype(self.dtype) + if self.use_variance: + self.prior_box_var = np.random.random(4).astype(self.dtype) + else: + self.prior_box_var = np.random.random( + (self.M, 4)).astype(self.dtype) + self.target_box = np.random.random((self.N, 4)).astype(self.dtype) + self.inputs['PriorBox'] = self.prior_box + #self.inputs['PriorBoxVar'] = self.prior_box_var + self.inputs['TargetBox'] = (self.target_box, self.lod) + if (not self.use_variance): + self.inputs['PriorBoxVar'] = self.prior_box_var + + def set_attrs(self): + self.attrs = { + 'code_type': self.code_type, + 'box_normalized': self.box_normalized + } + if self.use_variance: + self.attrs['variance'] = self.prior_box_var.astype( + np.float).flatten() + if self.axis != 0: + self.attrs['axis'] = self.axis + + def set_outputs(self): + output_box = batch_box_coder( + self.prior_box, self.prior_box_var, self.target_box, self.lod[0], + self.code_type, self.box_normalized, self.axis) + self.outputs = {'OutputBox': output_box.astype(self.dtype)} + + def test_check_output(self): + self.check_output_with_place(self.place, atol=self.atol) + + +class TestBoxCoderOpWithoutBoxVar(TestBoxCoderOp): + def set_init_config(self): + super(TestBoxCoderOpWithoutBoxVar, self).set_init_config() + self.without_prior_box_var = True + self.lod = [[0, 1, 2, 3, 4, 5]] + + +class TestBoxCoderOpWithLoD(TestBoxCoderOp): + def set_init_config(self): + super(TestBoxCoderOpWithLoD, self).set_init_config() + self.M = 20 + self.N = 50 + self.lod = [[10, 20, 20]] + self.code_type = 'encode_center_size' + self.box_normalized = True + + +class TestBoxCoderOpWithLoDWithVariance(TestBoxCoderOpWithLoD): + def set_init_config(self): + super(TestBoxCoderOpWithLoDWithVariance, self).set_init_config() + self.use_variance = True + + +class TestBoxCoderOpWithAxis(TestBoxCoderOp): + def set_init_config(self): + super(TestBoxCoderOpWithAxis, self).set_init_config() + self.axis = 1 + + +class TestBoxCoderOpWithVariance(TestBoxCoderOp): + def set_init_config(self): + 
super(TestBoxCoderOpWithVariance, self).set_init_config() + self.use_variance = True + + +class TestBoxCoderOpFP16(TestBoxCoderOp): + def init_dtype(self): + self.dtype = np.float16 + + def set_init_config(self): + super(TestBoxCoderOpFP16, self).set_init_config() + self.atol = 1e-2 + + +if __name__ == '__main__': + unittest.main() From 79bd5f90f304c239f2b51778c977648016174381 Mon Sep 17 00:00:00 2001 From: yaoxuefeng Date: Wed, 29 Sep 2021 14:59:53 +0800 Subject: [PATCH 052/298] add slot record dataset (#36200) --- paddle/fluid/framework/channel.h | 20 +- paddle/fluid/framework/data_feed.cc | 112 +++++++- paddle/fluid/framework/data_feed.h | 317 +++++++++++++++++++++- paddle/fluid/framework/data_set.cc | 166 +++++++++-- paddle/fluid/framework/data_set.h | 40 ++- paddle/fluid/framework/dataset_factory.cc | 3 +- paddle/fluid/platform/flags.cc | 8 + paddle/fluid/pybind/data_set_py.cc | 2 - 8 files changed, 622 insertions(+), 46 deletions(-) diff --git a/paddle/fluid/framework/channel.h b/paddle/fluid/framework/channel.h index 503f1513aad20c..80fee94f1c85d9 100644 --- a/paddle/fluid/framework/channel.h +++ b/paddle/fluid/framework/channel.h @@ -157,7 +157,19 @@ class ChannelObject { p.resize(finished); return finished; } + // read once only + size_t ReadOnce(std::vector& p, size_t size) { // NOLINT + if (size == 0) { + return 0; + } + std::unique_lock lock(mutex_); + p.resize(size); + size_t finished = Read(size, &p[0], lock, true); + p.resize(finished); + Notify(); + return finished; + } size_t ReadAll(std::vector& p) { // NOLINT p.clear(); size_t finished = 0; @@ -241,17 +253,21 @@ class ChannelObject { return !closed_; } - size_t Read(size_t n, T* p, std::unique_lock& lock) { // NOLINT + size_t Read(size_t n, T* p, std::unique_lock& lock, // NOLINT + bool once = false) { // NOLINT size_t finished = 0; CHECK(n <= MaxCapacity() - reading_count_); reading_count_ += n; while (finished < n && WaitForRead(lock)) { - size_t m = std::min(n - finished, data_.size()); + size_t m = (std::min)(n - finished, data_.size()); for (size_t i = 0; i < m; i++) { p[finished++] = std::move(data_.front()); data_.pop_front(); } reading_count_ -= m; + if (once && m > 0) { + break; + } } reading_count_ -= n - finished; return finished; diff --git a/paddle/fluid/framework/data_feed.cc b/paddle/fluid/framework/data_feed.cc index fdb24ee18eca7d..4463fd9fd53409 100644 --- a/paddle/fluid/framework/data_feed.cc +++ b/paddle/fluid/framework/data_feed.cc @@ -36,6 +36,107 @@ DLManager& global_dlmanager_pool() { return manager; } +class BufferedLineFileReader { + typedef std::function SampleFunc; + static const int MAX_FILE_BUFF_SIZE = 4 * 1024 * 1024; + class FILEReader { + public: + explicit FILEReader(FILE* fp) : fp_(fp) {} + int read(char* buf, int len) { return fread(buf, sizeof(char), len, fp_); } + + private: + FILE* fp_; + }; + + public: + typedef std::function LineFunc; + + private: + template + int read_lines(T* reader, LineFunc func, int skip_lines) { + int lines = 0; + size_t ret = 0; + char* ptr = NULL; + char* eol = NULL; + total_len_ = 0; + error_line_ = 0; + + SampleFunc spfunc = get_sample_func(); + std::string x; + while (!is_error() && (ret = reader->read(buff_, MAX_FILE_BUFF_SIZE)) > 0) { + total_len_ += ret; + ptr = buff_; + eol = reinterpret_cast(memchr(ptr, '\n', ret)); + while (eol != NULL) { + int size = static_cast((eol - ptr) + 1); + x.append(ptr, size - 1); + ++lines; + if (lines > skip_lines && spfunc()) { + if (!func(x)) { + ++error_line_; + } + } + + x.clear(); + ptr += size; + ret -= size; + 
eol = reinterpret_cast(memchr(ptr, '\n', ret)); + } + if (ret > 0) { + x.append(ptr, ret); + } + } + if (!is_error() && !x.empty()) { + ++lines; + if (lines > skip_lines && spfunc()) { + if (!func(x)) { + ++error_line_; + } + } + } + return lines; + } + + public: + BufferedLineFileReader() + : random_engine_(std::random_device()()), + uniform_distribution_(0.0f, 1.0f) { + total_len_ = 0; + sample_line_ = 0; + buff_ = + reinterpret_cast(calloc(MAX_FILE_BUFF_SIZE + 1, sizeof(char))); + } + ~BufferedLineFileReader() { free(buff_); } + + int read_file(FILE* fp, LineFunc func, int skip_lines) { + FILEReader reader(fp); + return read_lines(&reader, func, skip_lines); + } + uint64_t file_size(void) { return total_len_; } + void set_sample_rate(float r) { sample_rate_ = r; } + size_t get_sample_line() { return sample_line_; } + bool is_error(void) { return (error_line_ > 10); } + + private: + SampleFunc get_sample_func() { + if (std::abs(sample_rate_ - 1.0f) < 1e-5f) { + return [this](void) { return true; }; + } + return [this](void) { + return (uniform_distribution_(random_engine_) < sample_rate_); + }; + } + + private: + char* buff_ = nullptr; + uint64_t total_len_ = 0; + + std::default_random_engine random_engine_; + std::uniform_real_distribution uniform_distribution_; + float sample_rate_ = 1.0f; + size_t sample_line_ = 0; + size_t error_line_ = 0; +}; void RecordCandidateList::ReSize(size_t length) { mutex_.lock(); capacity_ = length; @@ -301,7 +402,7 @@ int InMemoryDataFeed::Next() { << ", thread_id=" << thread_id_; } } else { - VLOG(3) << "enable heter NEXT: " << offset_index_ + VLOG(3) << "enable heter next: " << offset_index_ << " batch_offsets: " << batch_offsets_.size(); if (offset_index_ >= batch_offsets_.size()) { VLOG(3) << "offset_index: " << offset_index_ @@ -318,14 +419,7 @@ int InMemoryDataFeed::Next() { VLOG(3) << "finish reading for heterps, batch size zero, thread_id=" << thread_id_; } - /* - if (offset_index_ == batch_offsets_.size() - 1) { - std::vector data; - output_channel_->ReadAll(data); - consume_channel_->Write(std::move(data)); - } - */ - VLOG(3) << "#15 enable heter NEXT: " << offset_index_ + VLOG(3) << "enable heter next: " << offset_index_ << " batch_offsets: " << batch_offsets_.size() << " baych_size: " << this->batch_size_; } diff --git a/paddle/fluid/framework/data_feed.h b/paddle/fluid/framework/data_feed.h index 198bc51463af35..5527eaf1f6fa4d 100644 --- a/paddle/fluid/framework/data_feed.h +++ b/paddle/fluid/framework/data_feed.h @@ -39,8 +39,14 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/platform/timer.h" #include "paddle/fluid/string/string_helper.h" +DECLARE_int32(record_pool_max_size); +DECLARE_int32(slotpool_thread_num); +DECLARE_bool(enable_slotpool_wait_release); +DECLARE_bool(enable_slotrecord_reset_shrink); + namespace paddle { namespace framework { class DataFeedDesc; @@ -69,6 +75,50 @@ namespace framework { // while (reader->Next()) { // // trainer do something // } + +template +struct SlotValues { + std::vector slot_values; + std::vector slot_offsets; + + void add_values(const T* values, uint32_t num) { + if (slot_offsets.empty()) { + slot_offsets.push_back(0); + } + if (num > 0) { + slot_values.insert(slot_values.end(), values, values + num); + } + slot_offsets.push_back(static_cast(slot_values.size())); + } + T* get_values(int idx, size_t* size) { + uint32_t& offset = slot_offsets[idx]; + (*size) = slot_offsets[idx + 1] - offset; + return &slot_values[offset]; + } + void add_slot_feasigns(const std::vector>& slot_feasigns, + uint32_t fea_num) { + slot_values.reserve(fea_num); + int slot_num = static_cast(slot_feasigns.size()); + slot_offsets.resize(slot_num + 1); + for (int i = 0; i < slot_num; ++i) { + auto& slot_val = slot_feasigns[i]; + slot_offsets[i] = static_cast(slot_values.size()); + uint32_t num = static_cast(slot_val.size()); + if (num > 0) { + slot_values.insert(slot_values.end(), slot_val.begin(), slot_val.end()); + } + } + slot_offsets[slot_num] = slot_values.size(); + } + void clear(bool shrink) { + slot_offsets.clear(); + slot_values.clear(); + if (shrink) { + slot_values.shrink_to_fit(); + slot_offsets.shrink_to_fit(); + } + } +}; union FeatureFeasign { uint64_t uint64_feasign_; float float_feasign_; @@ -97,6 +147,38 @@ struct FeatureItem { uint16_t slot_; }; +struct AllSlotInfo { + std::string slot; + std::string type; + int used_idx; + int slot_value_idx; +}; +struct UsedSlotInfo { + int idx; + int slot_value_idx; + std::string slot; + std::string type; + bool dense; + std::vector local_shape; + int total_dims_without_inductive; + int inductive_shape_index; +}; +struct SlotRecordObject { + uint64_t search_id; + uint32_t rank; + uint32_t cmatch; + std::string ins_id_; + SlotValues slot_uint64_feasigns_; + SlotValues slot_float_feasigns_; + + ~SlotRecordObject() { clear(true); } + void reset(void) { clear(FLAGS_enable_slotrecord_reset_shrink); } + void clear(bool shrink) { + slot_uint64_feasigns_.clear(shrink); + slot_float_feasigns_.clear(shrink); + } +}; +using SlotRecord = SlotRecordObject*; // sizeof Record is much less than std::vector struct Record { std::vector uint64_feasigns_; @@ -108,6 +190,179 @@ struct Record { uint32_t cmatch; }; +inline SlotRecord make_slotrecord() { + static const size_t slot_record_byte_size = sizeof(SlotRecordObject); + void* p = malloc(slot_record_byte_size); + new (p) SlotRecordObject; + return reinterpret_cast(p); +} + +inline void free_slotrecord(SlotRecordObject* p) { + p->~SlotRecordObject(); + free(p); +} + +template +class SlotObjAllocator { + public: + explicit SlotObjAllocator(std::function deleter) + : free_nodes_(NULL), capacity_(0), deleter_(deleter) {} + ~SlotObjAllocator() { clear(); } + + void clear() { + T* tmp = NULL; + while (free_nodes_ != NULL) { + tmp = reinterpret_cast(reinterpret_cast(free_nodes_)); + free_nodes_ = free_nodes_->next; + deleter_(tmp); + --capacity_; + } + CHECK_EQ(capacity_, static_cast(0)); + } + T* 
acquire(void) { + T* x = NULL; + x = reinterpret_cast(reinterpret_cast(free_nodes_)); + free_nodes_ = free_nodes_->next; + --capacity_; + return x; + } + void release(T* x) { + Node* node = reinterpret_cast(reinterpret_cast(x)); + node->next = free_nodes_; + free_nodes_ = node; + ++capacity_; + } + size_t capacity(void) { return capacity_; } + + private: + struct alignas(T) Node { + union { + Node* next; + char data[sizeof(T)]; + }; + }; + Node* free_nodes_; // a list + size_t capacity_; + std::function deleter_ = nullptr; +}; +static const int OBJPOOL_BLOCK_SIZE = 10000; +class SlotObjPool { + public: + SlotObjPool() + : max_capacity_(FLAGS_record_pool_max_size), alloc_(free_slotrecord) { + ins_chan_ = MakeChannel(); + ins_chan_->SetBlockSize(OBJPOOL_BLOCK_SIZE); + for (int i = 0; i < FLAGS_slotpool_thread_num; ++i) { + threads_.push_back(std::thread([this]() { run(); })); + } + disable_pool_ = false; + count_ = 0; + } + ~SlotObjPool() { + ins_chan_->Close(); + for (auto& t : threads_) { + t.join(); + } + } + void disable_pool(bool disable) { disable_pool_ = disable; } + void set_max_capacity(size_t max_capacity) { max_capacity_ = max_capacity; } + void get(std::vector* output, int n) { + output->resize(n); + return get(&(*output)[0], n); + } + void get(SlotRecord* output, int n) { + int size = 0; + mutex_.lock(); + int left = static_cast(alloc_.capacity()); + if (left > 0) { + size = (left >= n) ? n : left; + for (int i = 0; i < size; ++i) { + output[i] = alloc_.acquire(); + } + } + mutex_.unlock(); + count_ += n; + if (size == n) { + return; + } + for (int i = size; i < n; ++i) { + output[i] = make_slotrecord(); + } + } + void put(std::vector* input) { + size_t size = input->size(); + if (size == 0) { + return; + } + put(&(*input)[0], size); + input->clear(); + } + void put(SlotRecord* input, size_t size) { + CHECK(ins_chan_->WriteMove(size, input) == size); + } + void run(void) { + std::vector input; + while (ins_chan_->ReadOnce(input, OBJPOOL_BLOCK_SIZE)) { + if (input.empty()) { + continue; + } + // over max capacity + size_t n = input.size(); + count_ -= n; + if (disable_pool_ || n + capacity() > max_capacity_) { + for (auto& t : input) { + free_slotrecord(t); + } + } else { + for (auto& t : input) { + t->reset(); + } + mutex_.lock(); + for (auto& t : input) { + alloc_.release(t); + } + mutex_.unlock(); + } + input.clear(); + } + } + void clear(void) { + platform::Timer timeline; + timeline.Start(); + mutex_.lock(); + alloc_.clear(); + mutex_.unlock(); + // wait release channel data + if (FLAGS_enable_slotpool_wait_release) { + while (!ins_chan_->Empty()) { + sleep(1); + } + } + timeline.Pause(); + VLOG(3) << "clear slot pool data size=" << count_.load() + << ", span=" << timeline.ElapsedSec(); + } + size_t capacity(void) { + mutex_.lock(); + size_t total = alloc_.capacity(); + mutex_.unlock(); + return total; + } + + private: + size_t max_capacity_; + Channel ins_chan_; + std::vector threads_; + std::mutex mutex_; + SlotObjAllocator alloc_; + bool disable_pool_; + std::atomic count_; // NOLINT +}; + +inline SlotObjPool& SlotRecordPool() { + static SlotObjPool pool; + return pool; +} struct PvInstanceObject { std::vector ads; void merge_instance(Record* ins) { ads.push_back(ins); } @@ -129,7 +384,21 @@ class CustomParser { CustomParser() {} virtual ~CustomParser() {} virtual void Init(const std::vector& slots) = 0; + virtual bool Init(const std::vector& slots) = 0; virtual void ParseOneInstance(const char* str, Record* instance) = 0; + virtual bool ParseOneInstance( + const 
std::string& line, + std::function&, int)> + GetInsFunc) { // NOLINT + return true; + } + virtual bool ParseFileInstance( + std::function ReadBuffFunc, + std::function&, int, int)> + PullRecordsFunc, // NOLINT + int& lines) { // NOLINT + return false; + } }; typedef paddle::framework::CustomParser* (*CreateParserObjectFunc)(); @@ -194,6 +463,34 @@ class DLManager { return nullptr; } + paddle::framework::CustomParser* Load(const std::string& name, + const std::vector& conf) { +#ifdef _LINUX + std::lock_guard lock(mutex_); + DLHandle handle; + std::map::iterator it = handle_map_.find(name); + if (it != handle_map_.end()) { + return it->second.parser; + } + handle.module = dlopen(name.c_str(), RTLD_NOW); + if (handle.module == nullptr) { + VLOG(0) << "Create so of " << name << " fail"; + exit(-1); + return nullptr; + } + + CreateParserObjectFunc create_parser_func = + (CreateParserObjectFunc)dlsym(handle.module, "CreateParserObject"); + handle.parser = create_parser_func(); + handle.parser->Init(conf); + handle_map_.insert({name, handle}); + + return handle.parser; +#endif + VLOG(0) << "Not implement in windows"; + return nullptr; + } + paddle::framework::CustomParser* ReLoad(const std::string& name, const std::vector& conf) { Close(name); @@ -415,6 +712,11 @@ class InMemoryDataFeed : public DataFeed { virtual void SetCurrentPhase(int current_phase); virtual void LoadIntoMemory(); virtual void LoadIntoMemoryFromSo(); + virtual void SetRecord(T* records) { records_ = records; } + int GetDefaultBatchSize() { return default_batch_size_; } + void AddBatchOffset(const std::pair& offset) { + batch_offsets_.push_back(offset); + } protected: virtual bool ParseOneInstance(T* instance) = 0; @@ -424,6 +726,11 @@ class InMemoryDataFeed : public DataFeed { virtual void PutToFeedVec(const std::vector& ins_vec) = 0; virtual void PutToFeedVec(const T* ins_vec, int num) = 0; + std::vector> batch_float_feasigns_; + std::vector> batch_uint64_feasigns_; + std::vector> offset_; + std::vector visit_; + int thread_id_; int thread_num_; bool parse_ins_id_; @@ -783,11 +1090,7 @@ class MultiSlotInMemoryDataFeed : public InMemoryDataFeed { MultiSlotInMemoryDataFeed() {} virtual ~MultiSlotInMemoryDataFeed() {} virtual void Init(const DataFeedDesc& data_feed_desc); - void SetRecord(Record* records) { records_ = records; } - int GetDefaultBatchSize() { return default_batch_size_; } - void AddBatchOffset(const std::pair& offset) { - batch_offsets_.push_back(offset); - } + // void SetRecord(Record* records) { records_ = records; } protected: virtual bool ParseOneInstance(Record* instance); @@ -798,10 +1101,6 @@ class MultiSlotInMemoryDataFeed : public InMemoryDataFeed { virtual void GetMsgFromLogKey(const std::string& log_key, uint64_t* search_id, uint32_t* cmatch, uint32_t* rank); virtual void PutToFeedVec(const Record* ins_vec, int num); - std::vector> batch_float_feasigns_; - std::vector> batch_uint64_feasigns_; - std::vector> offset_; - std::vector visit_; }; class PaddleBoxDataFeed : public MultiSlotInMemoryDataFeed { diff --git a/paddle/fluid/framework/data_set.cc b/paddle/fluid/framework/data_set.cc index 08c42a93d1fcbf..82a39b206e6bd6 100644 --- a/paddle/fluid/framework/data_set.cc +++ b/paddle/fluid/framework/data_set.cc @@ -351,10 +351,8 @@ static int compute_thread_batch_nccl( return thread_avg_batch_num; } -template -void DatasetImpl::SetHeterPs(bool enable_heterps) { +void MultiSlotDataset::PrepareTrain() { #ifdef PADDLE_WITH_GLOO - enable_heterps_ = enable_heterps; if (enable_heterps_) { if 
(input_records_.size() == 0 && input_channel_ != nullptr && input_channel_->Size() != 0) { @@ -541,22 +539,21 @@ void DatasetImpl::LocalShuffle() { << timeline.ElapsedSec() << " seconds"; } -template -void DatasetImpl::GlobalShuffle(int thread_num) { +void MultiSlotDataset::GlobalShuffle(int thread_num) { #ifdef PADDLE_WITH_PSLIB - VLOG(3) << "DatasetImpl::GlobalShuffle() begin"; + VLOG(3) << "MultiSlotDataset::GlobalShuffle() begin"; platform::Timer timeline; timeline.Start(); auto fleet_ptr = FleetWrapper::GetInstance(); if (!input_channel_ || input_channel_->Size() == 0) { - VLOG(3) << "DatasetImpl::GlobalShuffle() end, no data to shuffle"; + VLOG(3) << "MultiSlotDataset::GlobalShuffle() end, no data to shuffle"; return; } // local shuffle input_channel_->Close(); - std::vector data; + std::vector data; input_channel_->ReadAll(data); std::shuffle(data.begin(), data.end(), fleet_ptr->LocalRandomEngine()); input_channel_->Open(); @@ -566,10 +563,10 @@ void DatasetImpl::GlobalShuffle(int thread_num) { input_channel_->Close(); input_channel_->SetBlockSize(fleet_send_batch_size_); - VLOG(3) << "DatasetImpl::GlobalShuffle() input_channel_ size " + VLOG(3) << "MultiSlotDataset::GlobalShuffle() input_channel_ size " << input_channel_->Size(); - auto get_client_id = [this, fleet_ptr](const T& data) -> size_t { + auto get_client_id = [this, fleet_ptr](const Record& data) -> size_t { if (!this->merge_by_insid_) { return fleet_ptr->LocalRandomEngine()() % this->trainer_num_; } else { @@ -580,7 +577,7 @@ void DatasetImpl::GlobalShuffle(int thread_num) { auto global_shuffle_func = [this, get_client_id]() { auto fleet_ptr = FleetWrapper::GetInstance(); - std::vector data; + std::vector data; while (this->input_channel_->Read(data)) { std::vector ars(this->trainer_num_); for (auto& t : data) { @@ -835,9 +832,6 @@ void DatasetImpl::CreateReaders() { channel_idx = 0; } } - if (enable_heterps_) { - SetHeterPs(true); - } VLOG(3) << "readers size: " << readers_.size(); } @@ -923,9 +917,8 @@ int64_t DatasetImpl::GetShuffleDataSize() { return sum; } -template -int DatasetImpl::ReceiveFromClient(int msg_type, int client_id, - const std::string& msg) { +int MultiSlotDataset::ReceiveFromClient(int msg_type, int client_id, + const std::string& msg) { #ifdef _LINUX VLOG(3) << "ReceiveFromClient msg_type=" << msg_type << ", client_id=" << client_id << ", msg length=" << msg.length(); @@ -937,9 +930,9 @@ int DatasetImpl::ReceiveFromClient(int msg_type, int client_id, if (ar.Cursor() == ar.Finish()) { return 0; } - std::vector data; + std::vector data; while (ar.Cursor() < ar.Finish()) { - data.push_back(ar.Get()); + data.push_back(ar.Get()); } CHECK(ar.Cursor() == ar.Finish()); @@ -966,6 +959,20 @@ int DatasetImpl::ReceiveFromClient(int msg_type, int client_id, // explicit instantiation template class DatasetImpl; +void MultiSlotDataset::DynamicAdjustReadersNum(int thread_num) { + if (thread_num_ == thread_num) { + VLOG(3) << "DatasetImpl::DynamicAdjustReadersNum thread_num_=" + << thread_num_ << ", thread_num_=thread_num, no need to adjust"; + return; + } + VLOG(3) << "adjust readers num from " << thread_num_ << " to " << thread_num; + thread_num_ = thread_num; + std::vector>().swap(readers_); + CreateReaders(); + VLOG(3) << "adjust readers num done"; + PrepareTrain(); +} + void MultiSlotDataset::PostprocessInstance() { // divide pv instance, and merge to input_channel_ if (enable_pv_merge_) { @@ -1503,5 +1510,126 @@ void MultiSlotDataset::SlotsShuffle( << ", cost time=" << timeline.ElapsedSec() << " seconds"; } 
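+// SlotRecordDataset overrides below manage pooled SlotRecord objects directly;
+// ReleaseMemory returns preloaded records to the shared SlotRecordPool().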
+template class DatasetImpl; +void SlotRecordDataset::CreateChannel() { + if (input_channel_ == nullptr) { + input_channel_ = paddle::framework::MakeChannel(); + } +} +void SlotRecordDataset::CreateReaders() { + VLOG(3) << "Calling CreateReaders()"; + VLOG(3) << "thread num in Dataset: " << thread_num_; + VLOG(3) << "Filelist size in Dataset: " << filelist_.size(); + VLOG(3) << "channel num in Dataset: " << channel_num_; + CHECK(thread_num_ > 0) << "thread num should > 0"; + CHECK(channel_num_ > 0) << "channel num should > 0"; + CHECK(channel_num_ <= thread_num_) << "channel num should <= thread num"; + VLOG(3) << "readers size: " << readers_.size(); + if (readers_.size() != 0) { + VLOG(3) << "readers_.size() = " << readers_.size() + << ", will not create again"; + return; + } + VLOG(3) << "data feed class name: " << data_feed_desc_.name(); + for (int i = 0; i < thread_num_; ++i) { + readers_.push_back(DataFeedFactory::CreateDataFeed(data_feed_desc_.name())); + readers_[i]->Init(data_feed_desc_); + readers_[i]->SetThreadId(i); + readers_[i]->SetThreadNum(thread_num_); + readers_[i]->SetFileListMutex(&mutex_for_pick_file_); + readers_[i]->SetFileListIndex(&file_idx_); + readers_[i]->SetFeaNumMutex(&mutex_for_fea_num_); + readers_[i]->SetFeaNum(&total_fea_num_); + readers_[i]->SetFileList(filelist_); + readers_[i]->SetParseInsId(parse_ins_id_); + readers_[i]->SetParseContent(parse_content_); + readers_[i]->SetParseLogKey(parse_logkey_); + readers_[i]->SetEnablePvMerge(enable_pv_merge_); + readers_[i]->SetCurrentPhase(current_phase_); + if (input_channel_ != nullptr) { + readers_[i]->SetInputChannel(input_channel_.get()); + } + } + VLOG(3) << "readers size: " << readers_.size(); +} + +void SlotRecordDataset::ReleaseMemory() { + VLOG(3) << "SlotRecordDataset::ReleaseMemory() begin"; + platform::Timer timeline; + timeline.Start(); + + if (input_channel_) { + input_channel_->Clear(); + input_channel_ = nullptr; + } + if (enable_heterps_) { + VLOG(3) << "put pool records size: " << input_records_.size(); + SlotRecordPool().put(&input_records_); + input_records_.clear(); + input_records_.shrink_to_fit(); + VLOG(3) << "release heterps input records records size: " + << input_records_.size(); + } + + readers_.clear(); + readers_.shrink_to_fit(); + + std::vector>().swap(readers_); + + VLOG(3) << "SlotRecordDataset::ReleaseMemory() end"; + VLOG(3) << "total_feasign_num_(" << STAT_GET(STAT_total_feasign_num_in_mem) + << ") - current_fea_num_(" << total_fea_num_ << ") = (" + << STAT_GET(STAT_total_feasign_num_in_mem) - total_fea_num_ << ")" + << " object pool size=" << SlotRecordPool().capacity(); // For Debug + STAT_SUB(STAT_total_feasign_num_in_mem, total_fea_num_); +} +void SlotRecordDataset::GlobalShuffle(int thread_num) { + // TODO(yaoxuefeng) + return; +} + +void SlotRecordDataset::DynamicAdjustChannelNum(int channel_num, + bool discard_remaining_ins) { + if (channel_num_ == channel_num) { + VLOG(3) << "DatasetImpl::DynamicAdjustChannelNum channel_num_=" + << channel_num_ << ", channel_num_=channel_num, no need to adjust"; + return; + } + VLOG(3) << "adjust channel num from " << channel_num_ << " to " + << channel_num; + channel_num_ = channel_num; + + if (static_cast(input_channel_->Size()) >= channel_num) { + input_channel_->SetBlockSize(input_channel_->Size() / channel_num + + (discard_remaining_ins ? 
0 : 1)); + } + + VLOG(3) << "adjust channel num done"; +} + +void SlotRecordDataset::PrepareTrain() { +#ifdef PADDLE_WITH_GLOO + return; +#else + PADDLE_THROW(platform::errors::Unavailable( + "dataset set heterps need compile with GLOO")); +#endif + return; +} + +void SlotRecordDataset::DynamicAdjustReadersNum(int thread_num) { + if (thread_num_ == thread_num) { + VLOG(3) << "DatasetImpl::DynamicAdjustReadersNum thread_num_=" + << thread_num_ << ", thread_num_=thread_num, no need to adjust"; + return; + } + VLOG(3) << "adjust readers num from " << thread_num_ << " to " << thread_num; + thread_num_ = thread_num; + std::vector>().swap(readers_); + CreateReaders(); + VLOG(3) << "adjust readers num done"; + PrepareTrain(); +} + } // end namespace framework } // end namespace paddle diff --git a/paddle/fluid/framework/data_set.h b/paddle/fluid/framework/data_set.h index f3ee96fab8297f..981fb694e0fec9 100644 --- a/paddle/fluid/framework/data_set.h +++ b/paddle/fluid/framework/data_set.h @@ -149,7 +149,6 @@ class Dataset { virtual void DynamicAdjustReadersNum(int thread_num) = 0; // set fleet send sleep seconds virtual void SetFleetSendSleepSeconds(int seconds) = 0; - virtual void SetHeterPs(bool enable_heterps) = 0; protected: virtual int ReceiveFromClient(int msg_type, int client_id, @@ -207,7 +206,7 @@ class DatasetImpl : public Dataset { virtual void WaitPreLoadDone(); virtual void ReleaseMemory(); virtual void LocalShuffle(); - virtual void GlobalShuffle(int thread_num = -1); + virtual void GlobalShuffle(int thread_num = -1) {} virtual void SlotsShuffle(const std::set& slots_to_replace) {} virtual const std::vector& GetSlotsOriginalData() { return slots_shuffle_original_data_; @@ -233,7 +232,11 @@ class DatasetImpl : public Dataset { bool discard_remaining_ins = false); virtual void DynamicAdjustReadersNum(int thread_num); virtual void SetFleetSendSleepSeconds(int seconds); - virtual void SetHeterPs(bool enable_heterps); + /* for enable_heterps_ + virtual void EnableHeterps(bool enable_heterps) { + enable_heterps_ = enable_heterps; + } + */ std::vector>& GetMultiOutputChannel() { return multi_output_channel_; @@ -251,7 +254,10 @@ class DatasetImpl : public Dataset { protected: virtual int ReceiveFromClient(int msg_type, int client_id, - const std::string& msg); + const std::string& msg) { + // TODO(yaoxuefeng) for SlotRecordDataset + return -1; + } std::vector> readers_; std::vector> preload_readers_; paddle::framework::Channel input_channel_; @@ -327,6 +333,32 @@ class MultiSlotDataset : public DatasetImpl { const std::unordered_set& slots_to_replace, std::vector* result); virtual ~MultiSlotDataset() {} + virtual void GlobalShuffle(int thread_num = -1); + virtual void DynamicAdjustReadersNum(int thread_num); + virtual void PrepareTrain(); + + protected: + virtual int ReceiveFromClient(int msg_type, int client_id, + const std::string& msg); +}; +class SlotRecordDataset : public DatasetImpl { + public: + SlotRecordDataset() { SlotRecordPool(); } + virtual ~SlotRecordDataset() {} + // create input channel + virtual void CreateChannel(); + // create readers + virtual void CreateReaders(); + // release memory + virtual void ReleaseMemory(); + virtual void GlobalShuffle(int thread_num = -1); + virtual void DynamicAdjustChannelNum(int channel_num, + bool discard_remaining_ins); + virtual void PrepareTrain(); + virtual void DynamicAdjustReadersNum(int thread_num); + + protected: + bool enable_heterps_ = true; }; } // end namespace framework diff --git a/paddle/fluid/framework/dataset_factory.cc 
b/paddle/fluid/framework/dataset_factory.cc index aeaf9611853238..38200927c5586f 100644 --- a/paddle/fluid/framework/dataset_factory.cc +++ b/paddle/fluid/framework/dataset_factory.cc @@ -53,7 +53,7 @@ std::unique_ptr DatasetFactory::CreateDataset( std::string dataset_class) { if (g_dataset_map.count(dataset_class) < 1) { LOG(WARNING) << "Your Dataset " << dataset_class - << "is not supported currently"; + << " is not supported currently"; LOG(WARNING) << "Supported Dataset: " << DatasetTypeList(); exit(-1); } @@ -61,5 +61,6 @@ std::unique_ptr DatasetFactory::CreateDataset( } REGISTER_DATASET_CLASS(MultiSlotDataset); +REGISTER_DATASET_CLASS(SlotRecordDataset); } // namespace framework } // namespace paddle diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc index 89a829f9490f9f..72b95dcc153464 100644 --- a/paddle/fluid/platform/flags.cc +++ b/paddle/fluid/platform/flags.cc @@ -680,3 +680,11 @@ PADDLE_DEFINE_EXPORTED_int32(get_host_by_name_time, 120, PADDLE_DEFINE_EXPORTED_bool( apply_pass_to_program, false, "It controls whether to apply IR pass to program when using Fleet APIs"); + +DEFINE_int32(record_pool_max_size, 2000000, + "SlotRecordDataset slot record pool max size"); +DEFINE_int32(slotpool_thread_num, 1, "SlotRecordDataset slot pool thread num"); +DEFINE_bool(enable_slotpool_wait_release, false, + "enable slotrecord obejct wait release, default false"); +DEFINE_bool(enable_slotrecord_reset_shrink, false, + "enable slotrecord obejct reset shrink memory, default false"); \ No newline at end of file diff --git a/paddle/fluid/pybind/data_set_py.cc b/paddle/fluid/pybind/data_set_py.cc index 41cf0189d3d9d0..7a32d8729fc6ca 100644 --- a/paddle/fluid/pybind/data_set_py.cc +++ b/paddle/fluid/pybind/data_set_py.cc @@ -309,8 +309,6 @@ void BindDataset(py::module *m) { &framework::Dataset::SetFleetSendSleepSeconds, py::call_guard()) .def("enable_pv_merge", &framework::Dataset::EnablePvMerge, - py::call_guard()) - .def("set_heter_ps", &framework::Dataset::SetHeterPs, py::call_guard()); py::class_(*m, "IterableDatasetWrapper") From a9ea41c5e251e2cf8b15d286e938a961d8c1cb28 Mon Sep 17 00:00:00 2001 From: liutiexing <74819124+liutiexing@users.noreply.github.com> Date: Wed, 29 Sep 2021 15:10:03 +0800 Subject: [PATCH 053/298] Spinlock (#36030) * add align for WorkQueue * add spinlock * merge spinlock --- .../fluid/framework/new_executor/run_queue.h | 10 +++-- .../fluid/framework/new_executor/workqueue.cc | 4 +- .../framework/new_executor/workqueue_utils.h | 1 + paddle/fluid/memory/allocation/spin_lock.h | 43 ++++++++++++------- 4 files changed, 36 insertions(+), 22 deletions(-) diff --git a/paddle/fluid/framework/new_executor/run_queue.h b/paddle/fluid/framework/new_executor/run_queue.h index 13035237ff8b48..e457b20a3c35d5 100644 --- a/paddle/fluid/framework/new_executor/run_queue.h +++ b/paddle/fluid/framework/new_executor/run_queue.h @@ -37,6 +37,8 @@ #include #include #include +#include "paddle/fluid/framework/new_executor/workqueue_utils.h" +#include "paddle/fluid/memory/allocation/spin_lock.h" namespace paddle { namespace framework { @@ -101,7 +103,7 @@ class RunQueue { // PushBack adds w at the end of the queue. // If queue is full returns w, otherwise returns default-constructed Work. 
Work PushBack(Work w) { - std::unique_lock lock(mutex_); + std::unique_lock lock(mutex_); unsigned back = back_.load(std::memory_order_relaxed); Elem* e = &array_[(back - 1) & kMask]; uint8_t s = e->state.load(std::memory_order_relaxed); @@ -123,7 +125,7 @@ class RunQueue { return Work(); } - std::unique_lock lock(mutex_); + std::unique_lock lock(mutex_); unsigned back = back_.load(std::memory_order_relaxed); Elem* e = &array_[back & kMask]; uint8_t s = e->state.load(std::memory_order_relaxed); @@ -145,7 +147,7 @@ class RunQueue { return 0; } - std::unique_lock lock(mutex_); + std::unique_lock lock(mutex_); unsigned back = back_.load(std::memory_order_relaxed); unsigned size = Size(); unsigned mid = back; @@ -213,7 +215,7 @@ class RunQueue { // modification counters. alignas(64) std::atomic front_; alignas(64) std::atomic back_; - std::mutex mutex_; + paddle::memory::SpinLock mutex_; Elem array_[kSize]; // SizeOrNotEmpty returns current queue size; if NeedSizeEstimate is false, diff --git a/paddle/fluid/framework/new_executor/workqueue.cc b/paddle/fluid/framework/new_executor/workqueue.cc index bc5a4e27dc528a..8c6eeab4d5c0a1 100644 --- a/paddle/fluid/framework/new_executor/workqueue.cc +++ b/paddle/fluid/framework/new_executor/workqueue.cc @@ -166,7 +166,7 @@ std::unique_ptr CreateMultiThreadedWorkQueue( "WorkQueueOptions.num_threads must be " "greater than 1.")); std::unique_ptr ptr(new WorkQueueImpl(options)); - return ptr; + return std::move(ptr); } std::unique_ptr CreateWorkQueueGroup( @@ -176,7 +176,7 @@ std::unique_ptr CreateWorkQueueGroup( "For a WorkQueueGroup, the number of WorkQueueOptions " "must be greater than 1.")); std::unique_ptr ptr(new WorkQueueGroupImpl(queues_options)); - return ptr; + return std::move(ptr); } } // namespace framework diff --git a/paddle/fluid/framework/new_executor/workqueue_utils.h b/paddle/fluid/framework/new_executor/workqueue_utils.h index 6907f2f17da0db..bb219fea36267a 100644 --- a/paddle/fluid/framework/new_executor/workqueue_utils.h +++ b/paddle/fluid/framework/new_executor/workqueue_utils.h @@ -14,6 +14,7 @@ #pragma once +#include #include #include #include diff --git a/paddle/fluid/memory/allocation/spin_lock.h b/paddle/fluid/memory/allocation/spin_lock.h index 42462fd74b4cd7..2bbe340e7c6912 100644 --- a/paddle/fluid/memory/allocation/spin_lock.h +++ b/paddle/fluid/memory/allocation/spin_lock.h @@ -15,37 +15,48 @@ #pragma once #include -#if !defined(_WIN32) -#include -#else -#include -#endif // !_WIN32 +#if defined(_M_X64) || defined(__x86_64__) || defined(_M_IX86) || \ + defined(__i386__) +#define __PADDLE_x86__ +#include +#endif +#include #include "paddle/fluid/platform/macros.h" namespace paddle { namespace memory { +static inline void CpuRelax() { +#if defined(__PADDLE_x86__) + _mm_pause(); +#endif +} class SpinLock { public: SpinLock() : mlock_(false) {} void lock() { - bool expect = false; - uint64_t spin_cnt = 0; - while (!mlock_.compare_exchange_weak(expect, true)) { - expect = false; - if ((++spin_cnt & 0xFF) == 0) { -#if defined(_WIN32) - SleepEx(50, FALSE); -#else - sched_yield(); -#endif + for (;;) { + if (!mlock_.exchange(true, std::memory_order_acquire)) { + break; + } + constexpr int kMaxLoop = 32; + for (int loop = 1; mlock_.load(std::memory_order_relaxed);) { + if (loop <= kMaxLoop) { + for (int i = 1; i <= loop; ++i) { + CpuRelax(); + } + loop *= 2; + } else { + std::this_thread::yield(); + } } } } - void unlock() { mlock_.store(false); } + void unlock() { mlock_.store(false, std::memory_order_release); } + 
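A note on the rewritten SpinLock above: lock() now takes the flag with an exchange using acquire ordering, then spins on a relaxed load while issuing an exponentially growing number of CpuRelax() pauses (1, 2, 4, ... up to 32; _mm_pause on x86) before falling back to std::this_thread::yield(), and unlock() publishes with release ordering. Because the class exposes lock()/unlock(), the standard RAII wrappers work unchanged, which is exactly how run_queue.h above swaps it in for std::mutex. A minimal usage sketch, illustrative only and assuming the header is on the include path:

#include <mutex>  // std::lock_guard

#include "paddle/fluid/memory/allocation/spin_lock.h"

static paddle::memory::SpinLock g_counter_lock;
static int g_counter = 0;

// SpinLock satisfies BasicLockable, so std::lock_guard / std::unique_lock
// protect a short critical section the same way they would with std::mutex,
// but contended threads spin (and eventually yield) instead of sleeping.
void IncrementCounter() {
  std::lock_guard<paddle::memory::SpinLock> guard(g_counter_lock);
  ++g_counter;  // keep the protected region short; spin locks busy-wait
}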
DISABLE_COPY_AND_ASSIGN(SpinLock); private: From 1f93582cd1f13a09971e2c03334d649d82238e5b Mon Sep 17 00:00:00 2001 From: levi131 <83750468+levi131@users.noreply.github.com> Date: Wed, 29 Sep 2021 16:24:59 +0800 Subject: [PATCH 054/298] Add functional autograd API:hessian (#36108) * init functional jacobian api * finish test with dtype float32 * add float64 test case * polish code * use atol=1e-5 with dtype float64 * fix for ci * set timeout for test_jacobian * init hessian API * save status * polish API docstring * modify docstring * add utils.py * save status * fix dygraph double grad dtype error when calling for high differential senario * reinvoke ci * test_hessian.py is ok * polish hessian API * init vhp * Revert "init vhp" This reverts commit cbd4d3b66abe82b0ac10721b9eddeb7d82e0a1c8. * add test for partial_engine.cc * modify numerical_delta with dtype float32 * merge fix for dtype float64 * spell fix * polish code * rm _stop_gradient_pre_process Co-authored-by: JiabinYang <360788950@qq.com> --- python/paddle/autograd/__init__.py | 2 +- python/paddle/autograd/functional.py | 152 +++++++++++++++--- python/paddle/autograd/utils.py | 49 ++++++ .../tests/unittests/autograd/CMakeLists.txt | 1 + .../tests/unittests/autograd/test_hessian.py | 140 ++++++++++++++++ .../tests/unittests/autograd/test_jacobian.py | 60 +------ .../fluid/tests/unittests/autograd/utils.py | 107 ++++++++++++ 7 files changed, 426 insertions(+), 85 deletions(-) create mode 100644 python/paddle/autograd/utils.py create mode 100644 python/paddle/fluid/tests/unittests/autograd/test_hessian.py create mode 100644 python/paddle/fluid/tests/unittests/autograd/utils.py diff --git a/python/paddle/autograd/__init__.py b/python/paddle/autograd/__init__.py index dfbb3cfb45f2be..f4a0122759dc5d 100644 --- a/python/paddle/autograd/__init__.py +++ b/python/paddle/autograd/__init__.py @@ -18,6 +18,6 @@ from .py_layer import PyLayer, PyLayerContext # noqa: F401 from ..framework import set_grad_enabled # noqa: F401 from ..fluid.dygraph.base import no_grad_ as no_grad # noqa: F401 -from .functional import jacobian # noqa: F401 +from .functional import jacobian, hessian # noqa: F401 __all__ = ['backward', 'PyLayer', 'PyLayerContext'] diff --git a/python/paddle/autograd/functional.py b/python/paddle/autograd/functional.py index c1b4dd9e3a2db8..a5665631c937f8 100644 --- a/python/paddle/autograd/functional.py +++ b/python/paddle/autograd/functional.py @@ -13,34 +13,10 @@ # limitations under the License. 
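The new hessian API added further down in this file rests on the identity that, for a scalar-valued f, the Hessian is the Jacobian of the gradient: the implementation wraps paddle.grad in jac_func (with create_graph=True so the outer differentiation can see the inner one, and with None gradients of unused inputs replaced by zero tensors) and hands that function to the existing jacobian. In symbols,

$$H_{pq}(f)(x) = \frac{\partial^2 f}{\partial x_p\,\partial x_q}(x) = \frac{\partial (\nabla f)_p}{\partial x_q}(x), \qquad\text{so}\qquad \operatorname{hessian}(f, x) = \operatorname{jacobian}\bigl(x \mapsto \nabla f(x),\ x\bigr).$$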
from paddle.fluid import framework +from .utils import _check_tensors, _stack_tensor_or_return_none, _replace_none_with_zero_tensor import paddle -def _check_tensors(in_out_list, name): - assert in_out_list is not None, "{} should not be None".format(name) - - if isinstance(in_out_list, (list, tuple)): - assert len(in_out_list) > 0, "{} connot be empyt".format(name) - for each_var in in_out_list: - assert isinstance( - each_var, - paddle.Tensor), "Elements of {} must be paddle.Tensor".format( - name) - return in_out_list - else: - assert isinstance( - in_out_list, - paddle.Tensor), "{} must be Tensor or list of Tensor".format(name) - return [in_out_list] - - -def _stack_tensor_or_return_none(origin_list): - assert len(origin_list) > 0, "Can't not stack an empty list" - return paddle.stack( - origin_list, axis=0) if isinstance(origin_list[0], - paddle.Tensor) else None - - @framework.dygraph_only def jacobian(func, inputs, create_graph=False, allow_unused=False): ''' @@ -183,3 +159,129 @@ def func(x, y): return jacobian[0] else: return jacobian + + +@framework.dygraph_only +def hessian(func, inputs, create_graph=False, allow_unused=False): + ''' + .. note:: + **This API is ONLY available in imperative mode.** + + This API computes the Hessian matrix of `func` with respect to `inputs`. + + Parameters: + func (function): a Python function that takes a Tensor or a Tensor + list/tuple as inputs and returns a Tensor with a single element. + inputs (Tensor|list(Tensor)|tuple(Tensor)): the input Tensor or + Tensor list/tuple of the function ``func``. + create_graph (bool, optional): whether to create the gradient graphs + of the computing process. When it is True, higher order derivatives + are supported to compute; when it is False, the gradient graphs of + the computing process would be discarded. Defaults to ``False``. + allow_unused (bool, optional): whether to raise error or return None if + some Tensors of `inputs` are unreachable in the graph. Error would + be raised if allow_unused=False, and None would be returned as + their gradients if allow_unused=True. Default False. + Returns: + Hessian (Tensor or a tuple of tuple of Tensors): if function ``func`` + takes a Tensor as ``inputs``, Hessian will be a single Tensor containing + the Hessian matrix for the linearized ``inputs`` Tensor. If function + ``func`` takes a Tensor list/tuple as ``inputs``, then the Hessian will + be a tuple of tuple of Tensors where ``Hessian[i][j]`` will contain the + Hessian matrix of the ``i``th input and ``j``th input with size ``m * n``. + Here ``m`` and ``n`` denote the number of elements of the ``i`` th input + and the ``j`` th input respectively. + + Examples 1: + .. code-block:: python + + import paddle + + def func(x): + return paddle.sum(paddle.matmul(x, x)) + + x = paddle.ones(shape=[2, 2], dtype='float32') + x.stop_gradient = False + hessian = paddle.autograd.hessian(func, x) + print(hessian) + # Tensor(shape=[4, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # [[2., 1., 1., 0.], + # [1., 0., 2., 1.], + # [1., 2., 0., 1.], + # [0., 1., 1., 2.]]) + + Examples 2: + .. 
code-block:: python + + import paddle + + def func(x, y): + return paddle.sum(paddle.matmul(x, y)) + + x = paddle.ones(shape=[2, 2], dtype='float32') + y = paddle.ones(shape=[2, 2], dtype='float32') + x.stop_gradient = False + y.stop_gradient = False + hessian = paddle.autograd.hessian(func, [x, y]) + print(hessian) + # ((Tensor(shape=[4, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # [[0., 0., 0., 0.], + # [0., 0., 0., 0.], + # [0., 0., 0., 0.], + # [0., 0., 0., 0.]]), + # Tensor(shape=[4, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # [[1., 1., 0., 0.], + # [0., 0., 1., 1.], + # [1., 1., 0., 0.], + # [0., 0., 1., 1.]])), + # (Tensor(shape=[4, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # [[1., 0., 1., 0.], + # [1., 0., 1., 0.], + # [0., 1., 0., 1.], + # [0., 1., 0., 1.]]), + # Tensor(shape=[4, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # [[0., 0., 0., 0.], + # [0., 0., 0., 0.], + # [0., 0., 0., 0.], + # [0., 0., 0., 0.]]))) + + Examples 3: + .. code-block:: python + + import paddle + + def func(x, y): + return paddle.sum(paddle.matmul(x, x)) + + x = paddle.ones(shape=[2, 2], dtype='float32') + y = paddle.ones(shape=[2, 2], dtype='float32') + x.stop_gradient = False + y.stop_gradient = False + hessian = paddle.autograd.hessian(func, [x, y], allow_unused=True) + print(hessian) + # ((Tensor(shape=[4, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # [[2., 1., 1., 0.], + # [1., 0., 2., 1.], + # [1., 2., 0., 1.], + # [0., 1., 1., 2.]]), None), (None, None)) + + ''' + inputs = _check_tensors(inputs, "inputs") + outputs = func(*inputs) + assert isinstance(outputs, paddle.Tensor) and outputs.shape == [ + 1 + ], "The function to compute Hessian matrix should return a Tensor with a single element" + + def jac_func(*ins): + grad_inputs = paddle.grad( + outputs, + ins, + create_graph=True, + retain_graph=True, + allow_unused=allow_unused) + return tuple( + _replace_none_with_zero_tensor(grad_inputs[i], inputs[i]) + for i in range(len(inputs))) + + return jacobian( + jac_func, inputs, create_graph=create_graph, allow_unused=allow_unused) diff --git a/python/paddle/autograd/utils.py b/python/paddle/autograd/utils.py new file mode 100644 index 00000000000000..d437f7d82d3611 --- /dev/null +++ b/python/paddle/autograd/utils.py @@ -0,0 +1,49 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import paddle + + +def _check_tensors(in_out_list, name): + assert in_out_list is not None, "{} should not be None".format(name) + + if isinstance(in_out_list, (list, tuple)): + assert len(in_out_list) > 0, "{} connot be empyt".format(name) + for each_var in in_out_list: + assert isinstance( + each_var, + paddle.Tensor), "Elements of {} must be paddle.Tensor".format( + name) + return list(in_out_list) + else: + assert isinstance( + in_out_list, + paddle.Tensor), "{} must be Tensor or list of Tensor".format(name) + return [in_out_list] + + +def _stack_tensor_or_return_none(origin_list): + assert len(origin_list) > 0, "Can't not stack an empty list" + return paddle.stack( + origin_list, axis=0) if isinstance(origin_list[0], + paddle.Tensor) else None + + +def _replace_none_with_zero_tensor(t, spec_t): + if t is None: + zero_t = paddle.zeros(shape=spec_t.shape, dtype=spec_t.dtype) + zero_t.stop_gradient = spec_t.stop_gradient + return zero_t + else: + return t diff --git a/python/paddle/fluid/tests/unittests/autograd/CMakeLists.txt b/python/paddle/fluid/tests/unittests/autograd/CMakeLists.txt index 7f7a232fcefa64..1e9d433ebce8e1 100644 --- a/python/paddle/fluid/tests/unittests/autograd/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/autograd/CMakeLists.txt @@ -7,3 +7,4 @@ foreach(TEST_OP ${TEST_OPS}) endforeach(TEST_OP) set_tests_properties(test_jacobian PROPERTIES TIMEOUT 20) +set_tests_properties(test_hessian PROPERTIES TIMEOUT 20) diff --git a/python/paddle/fluid/tests/unittests/autograd/test_hessian.py b/python/paddle/fluid/tests/unittests/autograd/test_hessian.py new file mode 100644 index 00000000000000..120a6c853e8d89 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/autograd/test_hessian.py @@ -0,0 +1,140 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import numpy as np +import paddle +import paddle.compat as cpt +from utils import _compute_numerical_hessian + + +class TestHessian(unittest.TestCase): + @classmethod + def setUpClass(self): + self.shape = (2, 2) + self.dtype = 'float32' + self.np_dtype = np.float32 + self.numerical_delta = 1e-2 + self.rtol = 1e-2 + self.atol = 1e-2 + self.x = paddle.rand(shape=self.shape, dtype=self.dtype) + self.y = paddle.rand(shape=self.shape, dtype=self.dtype) + + def test_single_input(self): + def func(x): + return paddle.sum(paddle.matmul(x, x)) + + numerical_hessian = _compute_numerical_hessian( + func, self.x, self.numerical_delta, self.np_dtype) + + self.x.stop_gradient = False + hessian = paddle.autograd.hessian(func, self.x) + assert np.allclose(hessian.numpy(), numerical_hessian[0][0], self.rtol, + self.atol) + + def test_multi_input(self): + def func(x, y): + return paddle.sum(paddle.matmul(x, y)) + + numerical_hessian = _compute_numerical_hessian( + func, [self.x, self.y], self.numerical_delta, self.np_dtype) + + self.x.stop_gradient = False + self.y.stop_gradient = False + hessian = paddle.autograd.hessian(func, [self.x, self.y]) + for i in range(len(hessian)): + for j in range(len(hessian[0])): + assert np.allclose(hessian[i][j].numpy(), + numerical_hessian[i][j], self.rtol, + self.atol) + + def test_allow_unused_false(self): + def func(x, y): + return paddle.sum(paddle.matmul(x, x)) + + try: + self.x.stop_gradient = False + self.y.stop_gradient = False + hessian = paddle.autograd.hessian(func, [self.x, self.y]) + except ValueError as e: + error_msg = cpt.get_exception_message(e) + assert error_msg.find("allow_unused") > 0 + + def test_allow_unused_true(self): + def func(x, y): + return paddle.sum(paddle.matmul(x, x)) + + numerical_hessian = _compute_numerical_hessian( + func, [self.x, self.y], self.numerical_delta, self.np_dtype) + self.x.stop_gradient = False + self.y.stop_gradient = False + hessian = paddle.autograd.hessian( + func, [self.x, self.y], allow_unused=True) + for i in range(len(hessian)): + for j in range(len(hessian[0])): + if i == j == 0: + assert np.allclose(hessian[i][j].numpy(), + numerical_hessian[i][j], self.rtol, + self.atol) + else: + assert hessian[i][j] is None + + def test_create_graph_false(self): + def func(x): + return paddle.sum(paddle.matmul(x, x)) + + numerical_hessian = _compute_numerical_hessian( + func, self.x, self.numerical_delta, self.np_dtype) + self.x.stop_gradient = False + hessian = paddle.autograd.hessian(func, self.x) + assert hessian.stop_gradient == True + assert np.allclose(hessian.numpy(), numerical_hessian[0][0], self.rtol, + self.atol) + try: + paddle.grad(hessian, self.x) + except RuntimeError as e: + error_msg = cpt.get_exception_message(e) + assert error_msg.find("has no gradient") > 0 + + # TODO(levi): enable this test case when matmul_grad_grad_grad is ok + def _test_create_graph_true(self): + def func(x): + return paddle.sum(paddle.matmul(x, x)) + + numerical_hessian = _compute_numerical_hessian( + func, self.x, self.numerical_delta, self.np_dtype) + self.x.stop_gradient = False + hessian = paddle.autograd.hessian(func, self.x, create_graph=True) + assert hessian.stop_gradient == False + assert np.allclose(hessian.numpy(), numerical_hessian[0][0], self.rtol, + self.atol) + triple_grad = paddle.grad(hessian, self.x) + assert triple_grad is not None + + +class TestHessianFloat64(TestHessian): + @classmethod + def setUpClass(self): + self.shape = (2, 2) + self.dtype = 'float64' + self.np_dtype = np.float64 + 
self.numerical_delta = 1e-5 + self.rtol = 1e-5 + self.atol = 1e-5 + self.x = paddle.rand(shape=self.shape, dtype=self.dtype) + self.y = paddle.rand(shape=self.shape, dtype=self.dtype) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/autograd/test_jacobian.py b/python/paddle/fluid/tests/unittests/autograd/test_jacobian.py index 2722d2c83b130e..2f0b8c7cad3e5e 100644 --- a/python/paddle/fluid/tests/unittests/autograd/test_jacobian.py +++ b/python/paddle/fluid/tests/unittests/autograd/test_jacobian.py @@ -16,65 +16,7 @@ import numpy as np import paddle import paddle.compat as cpt -from paddle.autograd.functional import _check_tensors - - -def _product(t): - if isinstance(t, int): - return t - else: - return np.product(t) - - -def _get_item(t, idx): - assert isinstance(t, paddle.Tensor), "The first argument t must be Tensor." - assert isinstance(idx, - int), "The second argument idx must be an int number." - flat_t = paddle.reshape(t, [-1]) - return flat_t.__getitem__(idx) - - -def _set_item(t, idx, value): - assert isinstance(t, paddle.Tensor), "The first argument t must be Tensor." - assert isinstance(idx, - int), "The second argument idx must be an int number." - flat_t = paddle.reshape(t, [-1]) - flat_t.__setitem__(idx, value) - return paddle.reshape(flat_t, t.shape) - - -def _compute_numerical_jacobian(func, xs, delta, np_dtype): - xs = _check_tensors(xs, "xs") - ys = _check_tensors(func(*xs), "ys") - fin_size = len(xs) - fout_size = len(ys) - jacobian = list([] for _ in range(fout_size)) - for i in range(fout_size): - jac_i = list([] for _ in range(fin_size)) - for j in range(fin_size): - jac_i[j] = np.zeros( - (_product(ys[i].shape), _product(xs[j].shape)), dtype=np_dtype) - jacobian[i] = jac_i - - for j in range(fin_size): - for q in range(_product(xs[j].shape)): - orig = _get_item(xs[j], q) - x_pos = orig + delta - xs[j] = _set_item(xs[j], q, x_pos) - ys_pos = _check_tensors(func(*xs), "ys_pos") - - x_neg = orig - delta - xs[j] = _set_item(xs[j], q, x_neg) - ys_neg = _check_tensors(func(*xs), "ys_neg") - - xs[j] = _set_item(xs[j], q, orig) - - for i in range(fout_size): - for p in range(_product(ys[i].shape)): - y_pos = _get_item(ys_pos[i], p) - y_neg = _get_item(ys_neg[i], p) - jacobian[i][j][p][q] = (y_pos - y_neg) / delta / 2. - return jacobian +from utils import _compute_numerical_jacobian class TestJacobian(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/autograd/utils.py b/python/paddle/fluid/tests/unittests/autograd/utils.py new file mode 100644 index 00000000000000..0aadef4a809f3f --- /dev/null +++ b/python/paddle/fluid/tests/unittests/autograd/utils.py @@ -0,0 +1,107 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
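For reference, the numerical helpers defined below implement standard second-order central differences: each entry of the numerical Jacobian perturbs one flattened input coordinate by plus or minus delta, and each entry of the numerical Hessian applies the same stencil to the numerical Jacobian. Both stencils carry an O(delta^2) truncation error, consistent with the looser delta and tolerances (1e-2 for float32, 1e-5 for float64) chosen in the tests above:

$$\frac{\partial y_p}{\partial x_q} \approx \frac{y_p(x_q + \delta) - y_p(x_q - \delta)}{2\delta}, \qquad \frac{\partial^2 f}{\partial x_p\,\partial x_q} \approx \frac{\hat{J}_p(x_q + \delta) - \hat{J}_p(x_q - \delta)}{2\delta},$$

where $\hat{J}_p$ denotes the numerically estimated first derivative with respect to the p-th flattened input coordinate.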
+ +import numpy as np +import paddle +from paddle.autograd.functional import _check_tensors + + +def _product(t): + if isinstance(t, int): + return t + else: + return np.product(t) + + +def _get_item(t, idx): + assert isinstance(t, paddle.Tensor), "The first argument t must be Tensor." + assert isinstance(idx, + int), "The second argument idx must be an int number." + flat_t = paddle.reshape(t, [-1]) + return flat_t.__getitem__(idx) + + +def _set_item(t, idx, value): + assert isinstance(t, paddle.Tensor), "The first argument t must be Tensor." + assert isinstance(idx, + int), "The second argument idx must be an int number." + flat_t = paddle.reshape(t, [-1]) + flat_t.__setitem__(idx, value) + return paddle.reshape(flat_t, t.shape) + + +def _compute_numerical_jacobian(func, xs, delta, np_dtype): + xs = _check_tensors(xs, "xs") + ys = _check_tensors(func(*xs), "ys") + fin_size = len(xs) + fout_size = len(ys) + jacobian = list([] for _ in range(fout_size)) + for i in range(fout_size): + jac_i = list([] for _ in range(fin_size)) + for j in range(fin_size): + jac_i[j] = np.zeros( + (_product(ys[i].shape), _product(xs[j].shape)), dtype=np_dtype) + jacobian[i] = jac_i + + for j in range(fin_size): + for q in range(_product(xs[j].shape)): + orig = _get_item(xs[j], q) + x_pos = orig + delta + xs[j] = _set_item(xs[j], q, x_pos) + ys_pos = _check_tensors(func(*xs), "ys_pos") + + x_neg = orig - delta + xs[j] = _set_item(xs[j], q, x_neg) + ys_neg = _check_tensors(func(*xs), "ys_neg") + + xs[j] = _set_item(xs[j], q, orig) + + for i in range(fout_size): + for p in range(_product(ys[i].shape)): + y_pos = _get_item(ys_pos[i], p) + y_neg = _get_item(ys_neg[i], p) + jacobian[i][j][p][q] = (y_pos - y_neg) / delta / 2. + return jacobian + + +def _compute_numerical_hessian(func, xs, delta, np_dtype): + xs = _check_tensors(xs, "xs") + ys = _check_tensors(func(*xs), "ys") + fin_size = len(xs) + hessian = list([] for _ in range(fin_size)) + for i in range(fin_size): + hessian_i = list([] for _ in range(fin_size)) + for j in range(fin_size): + hessian_i[j] = np.zeros( + (_product(xs[i].shape), _product(xs[j].shape)), dtype=np_dtype) + hessian[i] = hessian_i + + for i in range(fin_size): + for p in range(_product(xs[i].shape)): + for j in range(fin_size): + for q in range(_product(xs[j].shape)): + orig = _get_item(xs[j], q) + x_pos = orig + delta + xs[j] = _set_item(xs[j], q, x_pos) + jacobian_pos = _compute_numerical_jacobian(func, xs, delta, + np_dtype) + x_neg = orig - delta + xs[j] = _set_item(xs[j], q, x_neg) + jacobian_neg = _compute_numerical_jacobian(func, xs, delta, + np_dtype) + xs[j] = _set_item(xs[j], q, orig) + hessian[i][j][p][q] = ( + jacobian_pos[0][i][0][p] - jacobian_neg[0][i][0][p] + ) / delta / 2. 
+ return hessian From 3eb50715a53279c5df82c9d2c0c60802aef5387e Mon Sep 17 00:00:00 2001 From: Liu-xiandong <85323580+Liu-xiandong@users.noreply.github.com> Date: Wed, 29 Sep 2021 16:50:35 +0800 Subject: [PATCH 055/298] fix cusparse compile problem, test=develop (#36199) * fix cusparse compile problem, test=develop * Modify file permissions --- paddle/fluid/platform/dynload/cusparse.cc | 4 ++++ paddle/fluid/platform/dynload/cusparse.h | 20 +++++++++++++------ .../unittests/test_sparse_attention_op.py | 8 ++++---- 3 files changed, 22 insertions(+), 10 deletions(-) diff --git a/paddle/fluid/platform/dynload/cusparse.cc b/paddle/fluid/platform/dynload/cusparse.cc index 2b41da541d9ae0..2a1fe322dabcf7 100644 --- a/paddle/fluid/platform/dynload/cusparse.cc +++ b/paddle/fluid/platform/dynload/cusparse.cc @@ -26,6 +26,10 @@ void *cusparse_dso_handle; #ifdef CUSPARSE_ROUTINE_EACH CUSPARSE_ROUTINE_EACH(DEFINE_WRAP); #endif + +#ifdef CUBLAS_BLAS_ROUTINE_EACH_R2 +CUSPARSE_ROUTINE_EACH_R2(DEFINE_WRAP); +#endif } // namespace dynload } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/dynload/cusparse.h b/paddle/fluid/platform/dynload/cusparse.h index 98841949676e47..e5be003fadf066 100644 --- a/paddle/fluid/platform/dynload/cusparse.h +++ b/paddle/fluid/platform/dynload/cusparse.h @@ -41,8 +41,9 @@ extern void *cusparse_dso_handle; }; \ extern DynLoad__##__name __name -#ifndef _WIN32 -#if CUDA_VERSION >= 11020 +#if !defined(PADDLE_WITH_ARM) && !defined(_WIN32) +// APIs available after CUDA 11.0 +#if CUDA_VERSION >= 11000 #define CUSPARSE_ROUTINE_EACH(__macro) \ __macro(cusparseCreate); \ __macro(cusparseCreateCsr); \ @@ -51,12 +52,19 @@ extern void *cusparse_dso_handle; __macro(cusparseSpMM); \ __macro(cusparseDestroySpMat); \ __macro(cusparseDestroyDnMat); \ - __macro(cusparseDestroy); \ - __macro(cusparseSDDMM_bufferSize); \ - __macro(cusparseSDDMM_preprocess); \ - __macro(cusparseSDDMM); + __macro(cusparseDestroy); CUSPARSE_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP); + +// APIs available after CUDA 11.2 +#if CUDA_VERSION >= 11020 +#define CUSPARSE_ROUTINE_EACH_R2(__macro) \ + __macro(cusparseSDDMM_bufferSize); \ + __macro(cusparseSDDMM_preprocess); \ + __macro(cusparseSDDMM); + +CUSPARSE_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP) +#endif #endif #endif diff --git a/python/paddle/fluid/tests/unittests/test_sparse_attention_op.py b/python/paddle/fluid/tests/unittests/test_sparse_attention_op.py index ad618edd24d55b..48401fb55ef3f5 100644 --- a/python/paddle/fluid/tests/unittests/test_sparse_attention_op.py +++ b/python/paddle/fluid/tests/unittests/test_sparse_attention_op.py @@ -169,13 +169,13 @@ def setUp(self): 'Q': self.q, 'K': self.k, 'V': self.v, - 'offset': self.offset, - 'columns': self.columns + 'Offset': self.offset, + 'Columns': self.columns } self.outputs = { 'Out': result.astype(self.dtype), - 'ResultSdd': result_sdd.astype(self.dtype), - 'ResultSoftmax': result_softmax.astype(self.dtype) + 'SparseDotSdd': result_sdd.astype(self.dtype), + 'Softmax': result_softmax.astype(self.dtype) } def test_check_output(self): From 69eed34d1dd5b38e2810b0bafe0cac075fdd0d2e Mon Sep 17 00:00:00 2001 From: zhaoyingli <86812880+zhaoyinglia@users.noreply.github.com> Date: Wed, 29 Sep 2021 17:02:04 +0800 Subject: [PATCH 056/298] add optest for adamw (#36148) * update func name * skip cpu * update unittest * update unittest --- .../fluid/tests/unittests/test_adamw_op.py | 166 +++++++++++++++++- python/paddle/optimizer/adamw.py | 6 +- 2 files changed, 165 insertions(+), 7 
deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_adamw_op.py b/python/paddle/fluid/tests/unittests/test_adamw_op.py index 2a5dc76c6bb285..0a60f4cba09bc6 100644 --- a/python/paddle/fluid/tests/unittests/test_adamw_op.py +++ b/python/paddle/fluid/tests/unittests/test_adamw_op.py @@ -14,9 +14,153 @@ import unittest import paddle +import random import numpy as np import paddle.fluid as fluid +from op_test import OpTest from functools import partial +from paddle.framework import core + + +def adamw_step(inputs, attributes): + param = inputs['Param'] + grad = inputs['Grad'] + moment1 = inputs['Moment1'] + moment2 = inputs['Moment2'] + lr = inputs['LearningRate'] + beta1_pow = inputs['Beta1Pow'] + beta2_pow = inputs['Beta2Pow'] + + epsilon = attributes['epsilon'] + + if 'lr_ratio' in attributes: + lr = lr * attributes['lr_ratio'] + + if attributes["with_decay"]: + coeff = attributes["coeff"] + decay = 1.0 - lr * coeff + param2 = param * decay + param = param2.copy() + + if 'beta1' in attributes: + beta1 = attributes['beta1'] + else: + beta1 = inputs['Beta1Tensor'][0] + if 'beta2' in attributes: + beta2 = attributes['beta2'] + else: + beta2 = inputs['Beta2Tensor'][0] + + moment1_out = beta1 * moment1 + (1 - beta1) * grad + moment2_out = beta2 * moment2 + (1 - beta2) * np.square(grad) + lr_t = lr * np.sqrt(1 - beta2_pow) / (1 - beta1_pow) + param_out = param - lr_t * (moment1_out / (np.sqrt(moment2_out) + epsilon)) + return param_out, moment1_out, moment2_out + + +class TestAdamW(OpTest): + def setUp(self): + '''Test AdamW Op with supplied attributes + ''' + self.op_type = "adamw" + param = np.random.uniform(-1, 1, (102, 105)).astype("float32") + grad = np.random.uniform(-1, 1, (102, 105)).astype("float32") + moment1 = np.random.uniform(-1, 1, (102, 105)).astype("float32") + # The second moment is positive + moment2 = np.random.random((102, 105)).astype("float32") + + learning_rate = 0.004 + beta1 = 0.78 + beta2 = 0.836 + epsilon = 1e-4 + beta1_pow = beta1**10 + beta2_pow = beta2**10 + + self.inputs = { + 'Param': param, + 'Grad': grad, + 'Moment1': moment1, + 'Moment2': moment2, + 'LearningRate': np.array([learning_rate]).astype("float32"), + 'Beta1Pow': np.array([beta1_pow]).astype("float32"), + 'Beta2Pow': np.array([beta2_pow]).astype("float32") + } + + self.attrs = { + 'epsilon': epsilon, + 'beta1': beta1, + 'beta2': beta2, + "coeff": 0.5, + "with_decay": True + } + + param_out, moment1_out, \ + moment2_out = adamw_step(self.inputs, self.attrs) + + self.outputs = { + 'Moment1Out': moment1_out, + 'Moment2Out': moment2_out, + 'ParamOut': param_out, + 'Beta1PowOut': np.array([beta1_pow]).astype("float32") * beta1, + 'Beta2PowOut': np.array([beta2_pow]).astype("float32") * beta2 + } + + def test_check_output(self): + self.check_output() + + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestAdamW2(OpTest): + def setUp(self): + '''Test AdamW Op with supplied attributes + ''' + self.op_type = "adamw" + param = np.random.uniform(-1, 1, (2, 2)).astype("float32") + grad = np.random.uniform(-1, 1, (2, 2)).astype("float32") + moment1 = np.random.uniform(-1, 1, (2, 2)).astype("float32") + # The second moment is positive + moment2 = np.random.random((2, 2)).astype("float32") + + learning_rate = 0.004 + beta1 = 0.78 + beta2 = 0.836 + epsilon = 1e-4 + beta1_pow = beta1**10 + beta2_pow = beta2**10 + + self.inputs = { + 'Param': param, + 'Grad': grad, + 'Moment1': moment1, + 'Moment2': moment2, + 'LearningRate': 
np.array([learning_rate]).astype("float32"), + 'Beta1Pow': np.array([beta1_pow]).astype("float32"), + 'Beta2Pow': np.array([beta2_pow]).astype("float32") + } + + self.attrs = { + 'epsilon': epsilon, + 'beta1': beta1, + 'beta2': beta2, + "lr_ratio": 0.1, + "coeff": 0.5, + "with_decay": True + } + + param_out, moment1_out, moment2_out = adamw_step(self.inputs, + self.attrs) + + self.outputs = { + 'Moment1Out': moment1_out, + 'Moment2Out': moment2_out, + 'ParamOut': param_out, + 'Beta1PowOut': np.array([beta1_pow]).astype("float32") * beta1, + 'Beta2PowOut': np.array([beta2_pow]).astype("float32") * beta2 + } + + def test_check_output(self): + self.check_output_with_place(core.CUDAPlace(0)) class TestAdamWOp(unittest.TestCase): @@ -160,7 +304,14 @@ def simple_lr_setting(param, decay_rate, n_layers): return decay_rate**(n_layers + 2 - depth) +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") class TestAdamWOpLayerwiseLR(TestAdamWOp): + def setUp(self): + random.seed(2021) + np.random.seed(2021) + paddle.seed(2021) + def test_adamw_op_dygraph(self): paddle.disable_static() value = np.arange(26).reshape(2, 13).astype("float32") @@ -181,17 +332,20 @@ def test_adamw_op_dygraph(self): weight_decay=0.01, lr_ratio=simple_lr_fun) - for _ in range(2): + loss_ref = np.array( + [4.8383293, 3.0854003, 1.33299, -0.418993, -2.171043]) + for i in range(5): a1 = linear1(a) out = linear2(a1) + out = paddle.mean(out) out.backward() adam.step() adam.clear_gradients() + np.testing.assert_allclose(out[0].numpy(), loss_ref[i], rtol=1e-6) def test_adamw_op(self): paddle.enable_static() - place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda() \ - else fluid.CPUPlace() + place = fluid.CUDAPlace(0) train_prog = fluid.Program() startup = fluid.Program() with fluid.program_guard(train_prog, startup): @@ -223,7 +377,10 @@ def test_adamw_op(self): exe = fluid.Executor(place) exe.run(startup) - for _ in range(2): + + loss_ref = np.array( + [0.36120513, 0.2720821, 0.67208904, 0.14607805, 0.24098626]) + for i in range(5): inputs = np.random.random(size=[8, 10]).astype('float32') outputs = np.random.random(size=[8, 1]).astype('float32') rets = exe.run(train_prog, @@ -231,6 +388,7 @@ def test_adamw_op(self): "y": outputs}, fetch_list=[avg_cost]) assert rets[0] is not None + np.testing.assert_allclose(rets[0], loss_ref[i], rtol=1e-6) paddle.disable_static() diff --git a/python/paddle/optimizer/adamw.py b/python/paddle/optimizer/adamw.py index 34fb201d8ccaf7..f26ee80d0af607 100644 --- a/python/paddle/optimizer/adamw.py +++ b/python/paddle/optimizer/adamw.py @@ -171,9 +171,9 @@ def __init__(self, self._lr_to_coeff = dict() if lr_ratio is not None: assert isinstance(lr_ratio, Callable) - if core.is_compiled_with_xpu() or core.is_compiled_with_npu(): + if not core.is_compiled_with_cuda(): raise NotImplementedError( - "'lr_ratio' is unimplemented in XPU and NPU") + "'lr_ratio' is unimplemented in CPU, XPU and NPU") self._lr_ratio = lr_ratio super(AdamW, self).__init__( @@ -305,7 +305,7 @@ def _append_optimize_op(self, block, param_and_grad): 'epsilon', self._epsilon, 'lazy_mode', self._lazy_mode, 'min_row_size_to_use_multithread', 1000, 'beta1', _beta1, 'beta2', _beta2, 'coeff', self._coeff, 'multi_precision', - find_master, "lr_ratio", lr_ratio_) + find_master, 'lr_ratio', lr_ratio_) return None From 21b93c3dc68c616f12c360ebbbd9961fe379902f Mon Sep 17 00:00:00 2001 From: Zeng Jinle <32832641+sneaxiy@users.noreply.github.com> Date: Wed, 29 Sep 2021 17:12:17 +0800 Subject: [PATCH 057/298] Add 
basic support for CUDA Graph (#36190) * add basic support for CUDA Graph * fix ci compile error * fix LOG print, fix windows CI * follow comments and update * small fix for default ctor * fix rocm compile error * fix CPU compile error --- paddle/fluid/memory/allocation/CMakeLists.txt | 6 +- .../memory/allocation/allocator_facade.cc | 147 ++++++++++++++++-- .../memory/allocation/allocator_facade.h | 8 + .../auto_growth_best_fit_allocator.cc | 8 +- .../auto_growth_best_fit_allocator.h | 3 +- paddle/fluid/platform/CMakeLists.txt | 5 + paddle/fluid/platform/cuda_graph.cc | 92 +++++++++++ paddle/fluid/platform/cuda_graph.h | 136 ++++++++++++++++ .../platform/cuda_graph_with_memory_pool.cc | 43 +++++ .../platform/cuda_graph_with_memory_pool.h | 64 ++++++++ paddle/fluid/platform/gpu_info.cc | 2 + paddle/fluid/platform/type_defs.h | 1 + paddle/fluid/pybind/CMakeLists.txt | 2 +- paddle/fluid/pybind/pybind.cc | 15 ++ python/paddle/device/cuda/graphs.py | 57 +++++++ .../fluid/tests/unittests/test_cuda_graph.py | 60 +++++++ 16 files changed, 634 insertions(+), 15 deletions(-) create mode 100644 paddle/fluid/platform/cuda_graph.cc create mode 100644 paddle/fluid/platform/cuda_graph.h create mode 100644 paddle/fluid/platform/cuda_graph_with_memory_pool.cc create mode 100644 paddle/fluid/platform/cuda_graph_with_memory_pool.h create mode 100644 python/paddle/device/cuda/graphs.py create mode 100644 python/paddle/fluid/tests/unittests/test_cuda_graph.py diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt index 6b4afae9f8c752..4aa1900f53f5e3 100644 --- a/paddle/fluid/memory/allocation/CMakeLists.txt +++ b/paddle/fluid/memory/allocation/CMakeLists.txt @@ -82,7 +82,11 @@ endif() cc_library(aligned_allocator SRCS aligned_allocator.cc DEPS allocator) cc_test(test_aligned_allocator SRCS test_aligned_allocator.cc DEPS aligned_allocator) cc_library(allocator_strategy SRCS allocator_strategy.cc DEPS gflags ${AllocatorFacadeDeps}) -cc_library(allocator_facade SRCS allocator_facade.cc DEPS allocator_strategy ) +cc_library(allocator_facade SRCS allocator_facade.cc DEPS allocator_strategy) + +if (WITH_GPU) + target_link_libraries(allocator_facade cuda_graph) +endif() cc_test(retry_allocator_test SRCS retry_allocator_test.cc DEPS retry_allocator locked_allocator cpu_allocator) if (WITH_TESTING) diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 78bce53b6f4ffb..0388e2d13afb0d 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -32,6 +32,9 @@ #include "paddle/fluid/memory/allocation/thread_local_allocator.h" #include "paddle/fluid/platform/gpu_info.h" #endif +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/cuda_graph.h" +#endif #ifdef PADDLE_WITH_XPU #include "paddle/fluid/platform/xpu/xpu_info.h" #endif @@ -47,17 +50,64 @@ PADDLE_DEFINE_EXPORTED_bool( "Whether to use system allocator to allocate CPU and GPU memory. 
" "Only used for unittests."); +DECLARE_string(allocator_strategy); + namespace paddle { namespace memory { namespace allocation { +#ifdef PADDLE_WITH_CUDA +class CUDAGraphAllocator + : public Allocator, + public std::enable_shared_from_this { + private: + class PrivateAllocation : public Allocation { + public: + PrivateAllocation(CUDAGraphAllocator* allocator, + AllocationPtr underlying_allocation) + : Allocation(underlying_allocation->ptr(), + underlying_allocation->size(), + underlying_allocation->place()), + allocator_(allocator->shared_from_this()), + underlying_allocation_(std::move(underlying_allocation)) {} + + private: + std::shared_ptr allocator_; + AllocationPtr underlying_allocation_; + }; + + explicit CUDAGraphAllocator(const std::shared_ptr& allocator) + : underlying_allocator_(allocator) {} + + public: + static std::shared_ptr Create( + const std::shared_ptr& allocator) { + return std::shared_ptr(new CUDAGraphAllocator(allocator)); + } + + protected: + Allocation* AllocateImpl(size_t size) { + VLOG(10) << "Allocate " << size << " for CUDA Graph"; + return new PrivateAllocation(this, underlying_allocator_->Allocate(size)); + } + + void FreeImpl(Allocation* allocation) { + VLOG(10) << "delete for CUDA Graph"; + delete allocation; + } + + private: + std::shared_ptr underlying_allocator_; +}; +#endif + class AllocatorFacadePrivate { public: using AllocatorMap = std::map>; - AllocatorFacadePrivate() { - auto strategy = GetAllocatorStrategy(); - switch (strategy) { + explicit AllocatorFacadePrivate(bool allow_free_idle_chunk = true) { + strategy_ = GetAllocatorStrategy(); + switch (strategy_) { case AllocatorStrategy::kNaiveBestFit: { InitNaiveBestFitCPUAllocator(); #ifdef PADDLE_WITH_XPU @@ -91,7 +141,8 @@ class AllocatorFacadePrivate { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount(); ++dev_id) { - InitAutoGrowthCUDAAllocator(platform::CUDAPlace(dev_id)); + InitAutoGrowthCUDAAllocator(platform::CUDAPlace(dev_id), + allow_free_idle_chunk); } InitNaiveBestFitCUDAPinnedAllocator(); #endif @@ -117,7 +168,7 @@ class AllocatorFacadePrivate { default: { PADDLE_THROW(platform::errors::InvalidArgument( - "Unsupported allocator strategy: %d", static_cast(strategy))); + "Unsupported allocator strategy: %d", static_cast(strategy_))); } } InitZeroSizeAllocators(); @@ -130,11 +181,29 @@ class AllocatorFacadePrivate { CheckAllocThreadSafe(); } + inline const AllocatorMap& GetAllocatorMap() { +#ifdef PADDLE_WITH_CUDA + if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { + auto id = platform::CUDAGraph::CapturingID(); + auto iter = cuda_graph_allocator_map_.find(id); + PADDLE_ENFORCE_NE( + iter, cuda_graph_allocator_map_.end(), + platform::errors::PermissionDenied( + "No memory pool is prepared for CUDA Graph capturing.")); + return iter->second->allocators_; + } else { + return allocators_; + } +#else + return allocators_; +#endif + } + inline const std::shared_ptr& GetAllocator( const platform::Place& place, size_t size) { const auto& allocators = (size > 0 ? (UNLIKELY(FLAGS_use_system_allocator) ? 
system_allocators_ - : allocators_) + : GetAllocatorMap()) : zero_size_allocators_); auto iter = allocators.find(place); PADDLE_ENFORCE_NE(iter, allocators.end(), @@ -145,6 +214,7 @@ class AllocatorFacadePrivate { private: void InitSystemAllocators() { + if (!system_allocators_.empty()) return; system_allocators_[platform::CPUPlace()] = std::make_shared(); #ifdef PADDLE_WITH_XPU int device_count = platform::GetXPUDeviceCount(); @@ -183,10 +253,11 @@ class AllocatorFacadePrivate { allocators_[p] = std::make_shared(p); } - void InitAutoGrowthCUDAAllocator(platform::CUDAPlace p) { + void InitAutoGrowthCUDAAllocator(platform::CUDAPlace p, + bool allow_free_idle_chunk) { auto cuda_allocator = std::make_shared(p); allocators_[p] = std::make_shared( - cuda_allocator, platform::GpuMinChunkSize()); + cuda_allocator, platform::GpuMinChunkSize(), allow_free_idle_chunk); } #endif @@ -226,6 +297,7 @@ class AllocatorFacadePrivate { }; void InitZeroSizeAllocators() { + if (!zero_size_allocators_.empty()) return; std::vector places; places.emplace_back(platform::CPUPlace()); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) @@ -279,12 +351,57 @@ class AllocatorFacadePrivate { } } +#ifdef PADDLE_WITH_CUDA + + public: + void PrepareMemoryPoolForCUDAGraph(CUDAGraphID id) { + PADDLE_ENFORCE_EQ(strategy_, AllocatorStrategy::kAutoGrowth, + platform::errors::InvalidArgument( + "CUDA Graph is only supported when the " + "FLAGS_allocator_strategy=\"auto_growth\", but got " + "FLAGS_allocator_strategy=\"%s\"", + FLAGS_allocator_strategy)); + auto& allocator = cuda_graph_allocator_map_[id]; + PADDLE_ENFORCE_EQ( + allocator.get(), nullptr, + platform::errors::InvalidArgument( + "The memory pool of the CUDA Graph with ID %d have been prepared.", + id)); + allocator.reset( + new AllocatorFacadePrivate(/*allow_free_idle_chunk=*/false)); + for (auto& item : allocator->allocators_) { + auto& old_allocator = item.second; + old_allocator = CUDAGraphAllocator::Create(old_allocator); + } + VLOG(10) << "Prepare memory pool for CUDA Graph with ID " << id; + } + + void RemoveMemoryPoolOfCUDAGraph(CUDAGraphID id) { + auto iter = cuda_graph_allocator_map_.find(id); + PADDLE_ENFORCE_NE(iter, cuda_graph_allocator_map_.end(), + platform::errors::InvalidArgument( + "Cannot find CUDA Graph with ID = %d", id)); + cuda_graph_allocator_map_.erase(iter); + VLOG(10) << "Remove memory pool of CUDA Graph with ID " << id; + } +#endif + private: AllocatorMap allocators_; - AllocatorMap zero_size_allocators_; - AllocatorMap system_allocators_; +#ifdef PADDLE_WITH_CUDA + std::unordered_map> + cuda_graph_allocator_map_; +#endif + AllocatorStrategy strategy_; + + static AllocatorMap zero_size_allocators_; + static AllocatorMap system_allocators_; }; +AllocatorFacadePrivate::AllocatorMap + AllocatorFacadePrivate::zero_size_allocators_; +AllocatorFacadePrivate::AllocatorMap AllocatorFacadePrivate::system_allocators_; + // Pimpl. Make interface clean. 
AllocatorFacade::AllocatorFacade() : m_(new AllocatorFacadePrivate()) {} // delete m_ may cause core dump when the destructor of python in conflict with @@ -316,6 +433,16 @@ const std::shared_ptr& AllocatorFacade::GetAllocator( return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1); } +#ifdef PADDLE_WITH_CUDA +void AllocatorFacade::PrepareMemoryPoolForCUDAGraph(CUDAGraphID id) { + return m_->PrepareMemoryPoolForCUDAGraph(id); +} + +void AllocatorFacade::RemoveMemoryPoolOfCUDAGraph(CUDAGraphID id) { + return m_->RemoveMemoryPoolOfCUDAGraph(id); +} +#endif + } // namespace allocation } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/allocation/allocator_facade.h b/paddle/fluid/memory/allocation/allocator_facade.h index 7f6ad561aa931b..8d889ec38eed7e 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.h +++ b/paddle/fluid/memory/allocation/allocator_facade.h @@ -18,6 +18,9 @@ #ifdef PADDLE_WITH_ASCEND_CL #include "paddle/fluid/memory/allocation/npu_pinned_allocator.h" #endif +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/gpu_info.h" +#endif #include "paddle/fluid/platform/place.h" namespace paddle { @@ -54,6 +57,11 @@ class AllocatorFacade { uint64_t Release(const platform::Place& place); const std::shared_ptr& GetAllocator(const platform::Place& place); +#ifdef PADDLE_WITH_CUDA + void PrepareMemoryPoolForCUDAGraph(CUDAGraphID id); + void RemoveMemoryPoolOfCUDAGraph(CUDAGraphID id); +#endif + // TODO(yy): Allocate a Copy-On-Write allocation? private: AllocatorFacade(); diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc index a35d8a73f7edae..f36d589f907fb4 100644 --- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc @@ -39,11 +39,12 @@ namespace allocation { AutoGrowthBestFitAllocator::AutoGrowthBestFitAllocator( const std::shared_ptr &underlying_allocator, size_t alignment, - size_t chunk_size) + size_t chunk_size, bool allow_free_idle_chunk) : underlying_allocator_( std::make_shared(underlying_allocator, alignment)), alignment_(alignment), - chunk_size_(std::max(AlignedSize(chunk_size, alignment), alignment)) {} + chunk_size_(std::max(AlignedSize(chunk_size, alignment), alignment)), + allow_free_idle_chunk_(allow_free_idle_chunk) {} Allocation *AutoGrowthBestFitAllocator::AllocateImpl(size_t size) { size = AlignedSize(size, alignment_); @@ -139,6 +140,9 @@ void AutoGrowthBestFitAllocator::FreeImpl(Allocation *allocation) { } uint64_t AutoGrowthBestFitAllocator::FreeIdleChunks() { + if (!allow_free_idle_chunk_) { + return 0; + } uint64_t bytes = 0; for (auto chunk_it = chunks_.begin(); chunk_it != chunks_.end();) { auto &blocks = chunk_it->blocks_; diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h index 5ed6eb94f158fe..d1fa6cce0164f6 100644 --- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h +++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h @@ -31,7 +31,7 @@ class AutoGrowthBestFitAllocator : public Allocator { public: AutoGrowthBestFitAllocator( const std::shared_ptr &underlying_allocator, size_t alignment, - size_t chunk_size = 0); + size_t chunk_size = 0, bool allow_free_idle_chunk = true); bool IsAllocThreadSafe() const override { return true; } @@ -86,6 +86,7 @@ class AutoGrowthBestFitAllocator : public Allocator 
{ std::list chunks_; size_t alignment_; size_t chunk_size_; + bool allow_free_idle_chunk_; SpinLock spinlock_; }; diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 2540170ed54fb5..21213f9e6ff21f 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -59,9 +59,14 @@ cc_library(cpu_info SRCS cpu_info.cc DEPS ${CPU_INFO_DEPS}) cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info) IF(WITH_GPU) + nv_library(cuda_graph SRCS cuda_graph.cc DEPS enforce allocator_facade) nv_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce monitor dynload_cuda) nv_library(cuda_profiler SRCS cuda_profiler.cc DEPS enforce) + nv_library(cuda_graph_with_memory_pool SRCS cuda_graph_with_memory_pool.cc DEPS device_context allocator_facade cuda_graph) +ELSE() + cc_library(cuda_graph_with_memory_pool SRCS cuda_graph_with_memory_pool.cc DEPS device_context allocator_facade) ENDIF() + IF(WITH_ROCM) hip_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce monitor dynload_cuda) ENDIF() diff --git a/paddle/fluid/platform/cuda_graph.cc b/paddle/fluid/platform/cuda_graph.cc new file mode 100644 index 00000000000000..6e518d779e9cd4 --- /dev/null +++ b/paddle/fluid/platform/cuda_graph.cc @@ -0,0 +1,92 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/platform/cuda_graph.h" + +namespace paddle { +namespace platform { + +std::unique_ptr CUDAGraph::capturing_graph_{nullptr}; + +void CUDAGraph::Reset() { + if (is_reset_) return; +#if CUDA_VERSION >= 10010 + if (graph_) { + PADDLE_ENFORCE_CUDA_SUCCESS(cudaGraphDestroy(graph_)); + graph_ = nullptr; + } + if (exec_graph_) { + PADDLE_ENFORCE_CUDA_SUCCESS(cudaGraphExecDestroy(exec_graph_)); + exec_graph_ = nullptr; + } +#endif + // callback should be called in reverse order because the latter added + // callback may rely on the former added callback. 
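+ // (One example of such a callback later in this patch: BeginCUDAGraphCapture()
+ // in cuda_graph_with_memory_pool.cc registers a callback that removes the
+ // per-graph memory pool when the graph is reset.)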
+ for (auto iter = callbacks_.rbegin(); iter != callbacks_.rend(); ++iter) { + (*iter)(); + } + callbacks_.clear(); + is_reset_ = true; +} + +void CUDAGraph::Replay() { +#if CUDA_VERSION >= 10010 + PADDLE_ENFORCE_EQ(is_reset_, false, + errors::PermissionDenied( + "Cannot replay the CUDA Graph after reset is called.")); + PADDLE_ENFORCE_NOT_NULL(exec_graph_, + errors::PermissionDenied( + "CUDA Graph must be captured before replaying.")); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaGraphLaunch(exec_graph_, stream_)); +#endif +} + +void CUDAGraph::BeginCapture(platform::CUDAPlace place, cudaStream_t stream, + cudaStreamCaptureMode mode) { + ThrowErrorIfNotSupportCUDAGraph(); + PADDLE_ENFORCE_EQ( + IsCapturing(), false, + errors::PermissionDenied("CUDA Graph can only captured one by one.")); + PADDLE_ENFORCE_NOT_NULL( + stream, errors::PermissionDenied( + "CUDA Graph cannot be captured in default CUDA stream 0.")); + capturing_graph_.reset(new CUDAGraph()); + capturing_graph_->place_ = place; + capturing_graph_->stream_ = stream; + + PADDLE_ENFORCE_CUDA_SUCCESS( + cudaStreamBeginCapture(capturing_graph_->stream_, mode)); + cudaStreamCaptureStatus status; + PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamGetCaptureInfo( + capturing_graph_->stream_, &status, &(capturing_graph_->id_))); + VLOG(10) << "Begin to capture CUDA Graph with ID " << capturing_graph_->id_; +} + +std::unique_ptr CUDAGraph::EndCapture() { + ThrowErrorIfNotSupportCUDAGraph(); +#if CUDA_VERSION >= 10010 + PADDLE_ENFORCE_EQ(IsCapturing(), true, + errors::PermissionDenied("No CUDA Graph is capturing.")); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamEndCapture( + capturing_graph_->stream_, &(capturing_graph_->graph_))); + PADDLE_ENFORCE_CUDA_SUCCESS( + cudaGraphInstantiate(&(capturing_graph_->exec_graph_), + capturing_graph_->graph_, nullptr, nullptr, 0)); + VLOG(10) << "End to capture CUDA Graph with ID " << capturing_graph_->id_; + return std::move(capturing_graph_); +#endif +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/cuda_graph.h b/paddle/fluid/platform/cuda_graph.h new file mode 100644 index 00000000000000..41e36049aa1a01 --- /dev/null +++ b/paddle/fluid/platform/cuda_graph.h @@ -0,0 +1,136 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
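As a hedged illustration (not part of the patch itself), the low-level capture sequence implied by cuda_graph.cc above would look roughly like the following, assuming the caller already owns a CUDAPlace `place` and a dedicated non-default stream `stream`:

    platform::CUDAGraph::BeginCapture(place, stream,
                                      cudaStreamCaptureModeThreadLocal);
    // ... enqueue the kernels to be recorded on `stream` ...
    auto graph = platform::CUDAGraph::EndCapture();
    graph->Replay();  // re-launch the recorded kernels on `stream`
    graph->Reset();   // destroy graph handles and run reset callbacks

Note that capture must happen on a non-default stream (BeginCapture enforces `stream != nullptr`) and only one graph may be captured at a time.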
+ +#pragma once + +#include +#include +#include +#include "cuda.h" // NOLINT +#include "cuda_runtime.h" // NOLINT +#include "paddle/fluid/platform/type_defs.h" + +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/macros.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace platform { + +#if CUDA_VERSION >= 10010 +static void ThrowErrorIfNotSupportCUDAGraph() {} +#else +enum cudaStreamCaptureMode { + cudaStreamCaptureModeGlobal = 0, + cudaStreamCaptureModeThreadLocal = 1, + cudaStreamCaptureModeRelaxed = 2 +}; +static void ThrowErrorIfNotSupportCUDAGraph() { + PADDLE_THROW(platform::errors::Unimplemented( + "CUDA Graph is only supported when CUDA version >= 10.1")); +} +#endif + +// NOTE: Currently, we do not support to capture CUDA graph in parallel +// NOTE: Do not use this class directly because it should be used with +// the memory pool. +class CUDAGraph { + DISABLE_COPY_AND_ASSIGN(CUDAGraph); + + // Since the constructor would throw error is CUDA_VERSION < 10010. + // The non-static method of CUDAGraph need not check CUDA_VERSION + // again. + CUDAGraph() { ThrowErrorIfNotSupportCUDAGraph(); } + + public: + ~CUDAGraph() { Reset(); } + + CUDAGraphID ID() const { return id_; } + + void Replay(); + + void Reset(); + + void AddResetCallback(std::function callback) { + std::lock_guard guard(mtx_); + callbacks_.push_back(std::move(callback)); + } + + static void BeginCapture(platform::CUDAPlace place, cudaStream_t stream, + cudaStreamCaptureMode mode); + static std::unique_ptr EndCapture(); + static void AddResetCallbackDuringCapturing(std::function callback) { + capturing_graph_->AddResetCallback(std::move(callback)); + } + + // No need to add CUDA_VERSION macro because capturing_graph_ would + // always be nullptr (constructor throws error) + static bool IsCapturing() { return capturing_graph_ != nullptr; } + + static CUDAGraphID CapturingID() { return capturing_graph_->id_; } + + static platform::CUDAPlace CapturingPlace() { + return capturing_graph_->place_; + } + + private: +#if CUDA_VERSION >= 10010 + cudaGraph_t graph_{nullptr}; + cudaGraphExec_t exec_graph_{nullptr}; +#endif + cudaStream_t stream_{nullptr}; + platform::CUDAPlace place_; + CUDAGraphID id_{0}; + std::vector> callbacks_; + bool is_reset_{false}; + std::mutex mtx_; + + static std::unique_ptr capturing_graph_; +}; + +#if CUDA_VERSION >= 10010 +class CUDAGraphCaptureModeGuard { + DISABLE_COPY_AND_ASSIGN(CUDAGraphCaptureModeGuard); + + public: + explicit CUDAGraphCaptureModeGuard(cudaStreamCaptureMode mode) { + if (UNLIKELY(CUDAGraph::IsCapturing())) { + PADDLE_ENFORCE_CUDA_SUCCESS(cudaThreadExchangeStreamCaptureMode(&mode)); + // After cudaThreadExchangeStreamCaptureMode is called, + // the variable "mode" would be set to the old capturing mode. 
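// (That is, the guard swaps the current thread's capture mode to `mode`,
// remembers the previous mode, and the destructor below swaps it back; this
// is what allows RecordedCudaMallocHelper to call cudaMalloc in relaxed mode
// while a graph is being captured, see the gpu_info.cc change later in this
// patch.)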
+ old_mode_ = mode; + } + } + + ~CUDAGraphCaptureModeGuard() PADDLE_MAY_THROW { + if (UNLIKELY(CUDAGraph::IsCapturing())) { + PADDLE_ENFORCE_CUDA_SUCCESS( + cudaThreadExchangeStreamCaptureMode(&old_mode_)); + } + } + + private: + cudaStreamCaptureMode old_mode_; +}; +#else +class CUDAGraphCaptureModeGuard { + DISABLE_COPY_AND_ASSIGN(CUDAGraphCaptureModeGuard); + + public: + explicit CUDAGraphCaptureModeGuard(cudaStreamCaptureMode) {} +}; +#endif + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/cuda_graph_with_memory_pool.cc b/paddle/fluid/platform/cuda_graph_with_memory_pool.cc new file mode 100644 index 00000000000000..1f0d39e2abe236 --- /dev/null +++ b/paddle/fluid/platform/cuda_graph_with_memory_pool.cc @@ -0,0 +1,43 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" +#include "paddle/fluid/memory/allocation/allocator_facade.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace platform { + +#ifdef PADDLE_WITH_CUDA +void BeginCUDAGraphCapture(platform::CUDAPlace place, + cudaStreamCaptureMode mode) { + auto stream = + platform::DeviceContextPool::Instance().GetByPlace(place)->stream(); + CUDAGraph::BeginCapture(place, stream, mode); + auto id = CUDAGraph::CapturingID(); + memory::allocation::AllocatorFacade::Instance().PrepareMemoryPoolForCUDAGraph( + id); + AddResetCallbackIfCapturingCUDAGraph([id] { + memory::allocation::AllocatorFacade::Instance().RemoveMemoryPoolOfCUDAGraph( + id); + }); +} + +std::unique_ptr EndCUDAGraphCapture() { + return CUDAGraph::EndCapture(); +} +#endif + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/cuda_graph_with_memory_pool.h b/paddle/fluid/platform/cuda_graph_with_memory_pool.h new file mode 100644 index 00000000000000..f9f0248e5153b2 --- /dev/null +++ b/paddle/fluid/platform/cuda_graph_with_memory_pool.h @@ -0,0 +1,64 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/place.h" +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/cuda_graph.h" +#endif + +namespace paddle { +namespace platform { + +// NOTE: These APIs are not thread-safe. 
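A minimal usage sketch for these helpers (an illustration only, not part of the patch; it assumes a CUDAPlace `place` whose DeviceContext stream runs the work to be captured):

    platform::BeginCUDAGraphCapture(place, cudaStreamCaptureModeThreadLocal);
    // run the target ops once; their allocations go to the dedicated
    // per-graph memory pool prepared by the allocator facade
    std::unique_ptr<platform::CUDAGraph> graph =
        platform::EndCUDAGraphCapture();
    graph->Replay();  // re-execute the captured kernels
    graph->Reset();   // also drops the per-graph memory pool through the
                      // reset callback registered in BeginCUDAGraphCapture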
+#ifdef PADDLE_WITH_CUDA +void BeginCUDAGraphCapture(platform::CUDAPlace place, + cudaStreamCaptureMode mode); +std::unique_ptr EndCUDAGraphCapture(); +#endif + +inline bool IsCUDAGraphCapturing() { +#ifdef PADDLE_WITH_CUDA + return CUDAGraph::IsCapturing(); +#else + return false; +#endif +} + +inline platform::CUDAPlace CUDAGraphCapturingPlace() { +#ifdef PADDLE_WITH_CUDA + return CUDAGraph::CapturingPlace(); +#else + PADDLE_THROW(platform::errors::Unimplemented( + "CUDA Graph is only supported on NVIDIA GPU device.")); +#endif +} + +// Add reset callback if CUDA Graph is capturing. +// Otherwise, invoke callback directly. +template +inline void AddResetCallbackIfCapturingCUDAGraph(Callback &&callback) { +#ifdef PADDLE_WITH_CUDA + if (UNLIKELY(IsCUDAGraphCapturing())) { + return CUDAGraph::AddResetCallbackDuringCapturing( + std::forward(callback)); + } +#endif + callback(); +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc index c4ac5aa3046a9c..59e4404ffe535c 100644 --- a/paddle/fluid/platform/gpu_info.cc +++ b/paddle/fluid/platform/gpu_info.cc @@ -22,6 +22,7 @@ limitations under the License. */ #ifdef PADDLE_WITH_HIP #include "paddle/fluid/platform/dynload/miopen.h" #else +#include "paddle/fluid/platform/cuda_graph.h" #include "paddle/fluid/platform/dynload/cudnn.h" #endif #include "paddle/fluid/memory/malloc.h" @@ -557,6 +558,7 @@ class RecordedCudaMallocHelper { #ifdef PADDLE_WITH_HIP auto result = hipMalloc(ptr, size); #else + CUDAGraphCaptureModeGuard capture_mode_guard{cudaStreamCaptureModeRelaxed}; auto result = cudaMalloc(ptr, size); #endif if (result == gpuSuccess) { diff --git a/paddle/fluid/platform/type_defs.h b/paddle/fluid/platform/type_defs.h index f46bd1a0bdfa4a..88a2d16472fa70 100644 --- a/paddle/fluid/platform/type_defs.h +++ b/paddle/fluid/platform/type_defs.h @@ -36,4 +36,5 @@ using gpuEvent_t = cudaEvent_t; using gpuDeviceProp = cudaDeviceProp; #endif +using CUDAGraphID = unsigned long long; // NOLINT } // namespace paddle diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 22778013f2390b..875e6af9652a25 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -7,7 +7,7 @@ set(PYBIND_DEPS pybind python proto_desc memory executor fleet_wrapper box_wrapp feed_fetch_method pass generate_pass pass_builder parallel_executor profiler layer tracer engine scope_pool analysis_predictor imperative_profiler imperative_flag save_load_util dlpack_tensor device_context gloo_wrapper infer_io_utils heter_wrapper generator op_version_registry ps_gpu_wrapper custom_operator - cost_model) + cost_model cuda_graph_with_memory_pool) if (WITH_PSCORE) set(PYBIND_DEPS ${PYBIND_DEPS} ps_service) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index a16916ab33f831..6b24c644925815 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -125,6 +125,8 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/xpu/xpu_info.h" #endif +#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" + #ifdef PADDLE_WITH_CRYPTO #include "paddle/fluid/pybind/crypto.h" #endif @@ -520,6 +522,19 @@ PYBIND11_MODULE(core_noavx, m) { m.def("nccl_version", &GetNCCLVersion); #endif + m.def("is_cuda_graph_capturing", &platform::IsCUDAGraphCapturing); +#ifdef PADDLE_WITH_CUDA + py::class_(m, "CUDAGraph") + .def_static("begin_capture", + [](platform::CUDAPlace place, int mode) { + platform::BeginCUDAGraphCapture( + place, static_cast(mode)); + }) + .def_static("end_capture", &platform::EndCUDAGraphCapture) + .def("replay", &platform::CUDAGraph::Replay) + .def("reset", &platform::CUDAGraph::Reset); +#endif + m.def("wait_device", [](const platform::Place &place) { platform::DeviceContextPool::Instance().Get(place)->Wait(); }); diff --git a/python/paddle/device/cuda/graphs.py b/python/paddle/device/cuda/graphs.py new file mode 100644 index 00000000000000..612f4d2c8cebd1 --- /dev/null +++ b/python/paddle/device/cuda/graphs.py @@ -0,0 +1,57 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.fluid.core import is_compiled_with_cuda, is_compiled_with_rocm, CUDAPlace + +if is_compiled_with_cuda() and not is_compiled_with_rocm(): + from paddle.fluid.core import CUDAGraph as CoreCUDAGraph + + class CUDAGraph: + def __init__(self, place=None, mode="thread_local"): + ALL_MODES = ["global", "thread_local", "relaxed"] + self._graph = None + if place is None: + place = CUDAPlace(0) + self._place = place + assert mode in ALL_MODES + self._mode = ALL_MODES.index(mode) + + def capture_begin(self): + CoreCUDAGraph.begin_capture(self._place, self._mode) + + def capture_end(self): + self._graph = CoreCUDAGraph.end_capture() + + def replay(self): + self._graph.replay() + + def reset(self): + self._graph.reset() +else: + + class CUDAGraph: + def __init__(self, place=None, mode="thread_local"): + raise NotImplementedError() + + def capture_begin(self): + raise NotImplementedError() + + def capture_end(self): + raise NotImplementedError() + + def replay(self): + raise NotImplementedError() + + def reset(self): + raise NotImplementedError() diff --git a/python/paddle/fluid/tests/unittests/test_cuda_graph.py b/python/paddle/fluid/tests/unittests/test_cuda_graph.py new file mode 100644 index 00000000000000..272d68e17fcc4d --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_cuda_graph.py @@ -0,0 +1,60 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.fluid as fluid +from paddle.device.cuda.graphs import CUDAGraph +import unittest +import numpy as np + + +class TestCUDAGraph(unittest.TestCase): + def setUp(self): + fluid.set_flags({'FLAGS_allocator_strategy': 'auto_growth'}) + + def random_tensor(self, shape): + return paddle.to_tensor( + np.random.randint( + low=0, high=10, size=shape).astype("float32")) + + def test_cuda_graph(self): + if not paddle.is_compiled_with_cuda() or paddle.is_compiled_with_rocm(): + return + + shape = [2, 3] + x = self.random_tensor(shape) + z = self.random_tensor(shape) + + g = CUDAGraph() + g.capture_begin() + y = x + 10 + z.add_(x) + g.capture_end() + + for _ in range(10): + z_np_init = z.numpy() + x_new = self.random_tensor(shape) + x.copy_(x_new, False) + g.replay() + x_np = x_new.numpy() + y_np = y.numpy() + z_np = z.numpy() + self.assertTrue((y_np - x_np == 10).all()) + self.assertTrue((z_np - z_np_init == x_np).all()) + + g.reset() + + +if __name__ == "__main__": + unittest.main() From 8af939f16abf8a03fc4e30ffac267f9d75af7d13 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=8E=E5=AD=A3?= <2042519524@qq.com> Date: Thu, 30 Sep 2021 10:13:23 +0800 Subject: [PATCH 058/298] fix the undefined variable bug in dist_transformer file (#36211) --- python/paddle/fluid/tests/unittests/dist_transformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/dist_transformer.py b/python/paddle/fluid/tests/unittests/dist_transformer.py index 6546bb5549df8c..db321f9417880f 100644 --- a/python/paddle/fluid/tests/unittests/dist_transformer.py +++ b/python/paddle/fluid/tests/unittests/dist_transformer.py @@ -1450,7 +1450,7 @@ def wrap_decoder(trg_vocab_size, # This is used to implement independent decoder program in inference. 
trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, \ enc_output = make_all_inputs( - decoder_data_input_fields + decoder_util_input_fields) + decoder_data_input_fields) else: trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias = dec_inputs From 5e0f199ab02e1f1458e49a9318f40fede2c0439e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=8E=E5=AD=A3?= <2042519524@qq.com> Date: Thu, 30 Sep 2021 10:15:40 +0800 Subject: [PATCH 059/298] Fix raw optim (#36176) * fix raw optim * pre-commit test file Co-authored-by: sneaxiy --- .../meta_optimizers/raw_program_optimizer.py | 2 + .../fluid/tests/unittests/CMakeLists.txt | 2 + .../fluid/tests/unittests/test_rnn_dp.py | 157 ++++++++++++++++++ 3 files changed, 161 insertions(+) create mode 100644 python/paddle/fluid/tests/unittests/test_rnn_dp.py diff --git a/python/paddle/distributed/fleet/meta_optimizers/raw_program_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/raw_program_optimizer.py index 7d899cff418710..c8eaa54f9cda1c 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/raw_program_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/raw_program_optimizer.py @@ -460,6 +460,8 @@ def __get_ouputs_name_to_idx(self, first_backward_idx, block): if is_optimizer_op(op): break for name in op.output_arg_names: + if name == core.kEmptyVarName(): + continue var = block.var(name) if not outputs_name_to_idx.get(var): # if the grad only be generated by one op diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 129fbb9ac3328d..cd1c4363879bb6 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -21,6 +21,7 @@ list(APPEND DIST_TEST_OPS test_parallel_dygraph_transformer) list(APPEND DIST_TEST_OPS test_fleet_pipeline_meta_optimizer) list(APPEND DIST_TEST_OPS test_fleet_pipeline_meta_optimizer_with_recompute) list(APPEND DIST_TEST_OPS test_fleet_raw_program_meta_optimizer) +list(APPEND DIST_TEST_OPS test_rnn_dp) list(APPEND DIST_TEST_OPS test_fleet_graph_execution_meta_optimizer) list(APPEND DIST_TEST_OPS test_gen_nccl_id_op) list(APPEND DIST_TEST_OPS test_parallel_dygraph_unused_variables) @@ -66,6 +67,7 @@ list(APPEND MIXED_DIST_TEST_OPS test_fleet_recompute_meta_optimizer) list(APPEND MIXED_DIST_TEST_OPS test_fleet_pipeline_meta_optimizer) list(APPEND MIXED_DIST_TEST_OPS test_fleet_pipeline_meta_optimizer_with_recompute) list(APPEND MIXED_DIST_TEST_OPS test_fleet_raw_program_meta_optimizer) +list(APPEND MIXED_DIST_TEST_OPS test_rnn_dp) list(APPEND MIXED_DIST_TEST_OPS test_fleet_amp_meta_optimizer) list(APPEND MIXED_DIST_TEST_OPS test_fleet_amp_init) list(APPEND MIXED_DIST_TEST_OPS test_fleet_gradient_merge_meta_optimizer) diff --git a/python/paddle/fluid/tests/unittests/test_rnn_dp.py b/python/paddle/fluid/tests/unittests/test_rnn_dp.py new file mode 100644 index 00000000000000..8d7e86fcdb9c7e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_rnn_dp.py @@ -0,0 +1,157 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import paddle +import os + +import numpy as np +import paddle +import paddle.static as static +import paddle.distributed.fleet as fleet +import paddle.nn as nn +import paddle.nn.functional as F + +paddle.enable_static() + + +class RNNEncoder(nn.Layer): + def __init__(self, + input_size, + hidden_size, + num_layers=1, + direction="forward", + dropout=0.0, + pooling_type=None, + **kwargs): + super().__init__() + self._input_size = input_size + self._hidden_size = hidden_size + self._direction = direction + self._pooling_type = pooling_type + + self.rnn_layer = nn.SimpleRNN( + input_size=input_size, + hidden_size=hidden_size, + num_layers=num_layers, + direction=direction, + dropout=dropout, + **kwargs) + + def get_input_dim(self): + return self._input_size + + def get_output_dim(self): + if self._direction == "bidirect": + return self._hidden_size * 2 + else: + return self._hidden_size + + def forward(self, inputs, sequence_length): + encoded_text, last_hidden = self.rnn_layer( + inputs, sequence_length=sequence_length) + output = paddle.max(encoded_text, axis=1) + return output + + +class RNNModel(nn.Layer): + def __init__(self, + vocab_size, + num_classes, + emb_dim=128, + padding_idx=0, + rnn_hidden_size=198, + direction='forward', + rnn_layers=1, + dropout_rate=0.0, + pooling_type=None, + fc_hidden_size=96): + super().__init__() + self.embedder = nn.Embedding( + num_embeddings=vocab_size, + embedding_dim=emb_dim, + padding_idx=padding_idx) + self.rnn_encoder = RNNEncoder( + emb_dim, + rnn_hidden_size, + num_layers=rnn_layers, + direction=direction, + dropout=dropout_rate, + pooling_type=pooling_type) + self.fc = nn.Linear(self.rnn_encoder.get_output_dim(), fc_hidden_size) + self.output_layer = nn.Linear(fc_hidden_size, num_classes) + + def forward(self, text, seq_len): + embedded_text = self.embedder(text) + text_repr = self.rnn_encoder(embedded_text, sequence_length=seq_len) + fc_out = paddle.tanh(self.fc(text_repr)) + logits = self.output_layer(fc_out) + return logits + + +def rnn_pretrain_forward(train_program, start_program, topo=None): + with static.program_guard(train_program, + start_program), paddle.utils.unique_name.guard(): + batch_size = 1 + tokens = static.data( + name="tokens", shape=[batch_size, -1], dtype="int64") + seq_len = static.data(name="ids", shape=[batch_size], dtype="int64") + labels = static.data(name="labels", shape=[batch_size], dtype="int64") + data_holders = [tokens, seq_len, labels] + vocab_size = 10 + num_classes = 2 + pad_token_id = 0 + model = RNNModel( + vocab_size, + num_classes, + direction='forward', + padding_idx=pad_token_id, + pooling_type='max') + + optimizer = paddle.optimizer.Adam( + parameters=model.parameters(), learning_rate=0.001) + criterion = paddle.nn.CrossEntropyLoss() + preds = model(tokens, seq_len) + loss = criterion(preds, labels) + + return train_program, start_program, loss, optimizer, data_holders + + +class TestFleetMetaOptimizer(unittest.TestCase): + def setUp(self): + os.environ["PADDLE_TRAINER_ID"] = "1" + os.environ[ + "PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001,127.0.0.1:36002" + + def 
test_rnn_raw_optimizer(self): + import paddle.distributed.fleet as fleet + import paddle.distributed.fleet.base.role_maker as role_maker + role = role_maker.PaddleCloudRoleMaker(is_collective=True) + fleet.init(role) + train_program = static.Program() + start_program = static.Program() + train_program, start_program, loss, optimizer, data_holders = \ + rnn_pretrain_forward(train_program, start_program) + with paddle.static.program_guard( + train_program, start_program), paddle.utils.unique_name.guard(): + strategy = fleet.DistributedStrategy() + strategy.without_graph_optimization = True + strategy.fuse_all_reduce_ops = True + fleet.init(is_collective=True, strategy=strategy) + optimizer = fleet.distributed_optimizer(optimizer) + optimizer.minimize(loss) + + +if __name__ == "__main__": + unittest.main() From a66b9fba3b5ada77ef5c3cc1b8e398395676a730 Mon Sep 17 00:00:00 2001 From: Aganlengzi Date: Thu, 30 Sep 2021 14:18:24 +0800 Subject: [PATCH 060/298] [NPU] modify transpose2 and index_select_grad kernels for model xlnet (#36214) * [NPU] modify transpose2 and index_select_grad kernels for model xlnet * add transpose2 int64_t unit test * add more transpose2 unit tests * update test_transpose_op_npu.py --- paddle/fluid/operators/index_select_op_npu.cc | 17 ++-- paddle/fluid/operators/transpose_op_npu.cc | 21 +++- .../unittests/npu/test_transpose_op_npu.py | 98 +++++++++++++++---- 3 files changed, 107 insertions(+), 29 deletions(-) diff --git a/paddle/fluid/operators/index_select_op_npu.cc b/paddle/fluid/operators/index_select_op_npu.cc index b624d03cc85559..825229282f3dac 100644 --- a/paddle/fluid/operators/index_select_op_npu.cc +++ b/paddle/fluid/operators/index_select_op_npu.cc @@ -99,10 +99,11 @@ class IndexSelectGradNPUKernel : public framework::OpKernel { transed_out_dims[i] = out_dims[in_trans_perm[i]]; } transed_out_grad.mutable_data(transed_out_dims, ctx.GetPlace()); - framework::NPUAttributeMap in_trans_attr = {{"perm", in_trans_perm}}; - - const auto& in_trans_runner = NpuOpRunner( - "TransposeD", {*out_grad}, {transed_out_grad}, in_trans_attr); + NpuOpRunner in_trans_runner; + in_trans_runner.SetType("Transpose") + .AddInput(*out_grad) + .AddInput(std::move(in_trans_perm)) + .AddOutput(transed_out_grad); in_trans_runner.Run(stream); Tensor sum_out; @@ -133,10 +134,12 @@ class IndexSelectGradNPUKernel : public framework::OpKernel { for (int i = 1 + dim; i < x_dims.size(); ++i) { out_trans_perm.push_back(i); } - framework::NPUAttributeMap out_trans_attr = {{"perm", out_trans_perm}}; x_grad->mutable_data(ctx.GetPlace()); - const auto& out_trans_runner = - NpuOpRunner("TransposeD", {sum_out}, {*x_grad}, out_trans_attr); + NpuOpRunner out_trans_runner; + out_trans_runner.SetType("Transpose") + .AddInput(sum_out) + .AddInput(std::move(out_trans_perm)) + .AddOutput(*x_grad); out_trans_runner.Run(stream); } } diff --git a/paddle/fluid/operators/transpose_op_npu.cc b/paddle/fluid/operators/transpose_op_npu.cc index 035ad5f3f314aa..7cc68e93c5d620 100644 --- a/paddle/fluid/operators/transpose_op_npu.cc +++ b/paddle/fluid/operators/transpose_op_npu.cc @@ -27,9 +27,12 @@ class TransposeNPUKernel : public framework::OpKernel { auto* x = ctx.Input("X"); auto* out = ctx.Output("Out"); std::vector axis = ctx.Attr>("axis"); - framework::NPUAttributeMap attr_input = {{"perm", axis}}; out->mutable_data(ctx.device_context().GetPlace()); - const auto& runner = NpuOpRunner("TransposeD", {*x}, {*out}, attr_input); + NpuOpRunner runner; + runner.SetType("Transpose") + .AddInput(*x) + 
.AddInput(std::move(axis)) + .AddOutput(*out); auto stream = ctx.template device_context() .stream(); @@ -51,9 +54,11 @@ class TransposeGradNPUKernel : public framework::OpKernel { reversed_axis[axis[i]] = i; } x_grad->mutable_data(ctx.GetPlace()); - framework::NPUAttributeMap attr_input = {{"perm", reversed_axis}}; - const auto& runner = - NpuOpRunner("TransposeD", {*out_grad}, {*x_grad}, attr_input); + NpuOpRunner runner; + runner.SetType("Transpose") + .AddInput(*out_grad) + .AddInput(std::move(reversed_axis)) + .AddOutput(*x_grad); auto stream = ctx.template device_context() .stream(); @@ -72,11 +77,17 @@ REGISTER_OP_NPU_KERNEL( ops::TransposeNPUKernel, ops::TransposeNPUKernel, +#ifdef PADDLE_WITH_ASCEND_INT64 + ops::TransposeNPUKernel, +#endif ops::TransposeNPUKernel, ops::TransposeNPUKernel); REGISTER_OP_NPU_KERNEL(transpose2_grad, ops::TransposeGradNPUKernel, ops::TransposeGradNPUKernel, ops::TransposeGradNPUKernel, +#ifdef PADDLE_WITH_ASCEND_INT64 + ops::TransposeGradNPUKernel, +#endif ops::TransposeGradNPUKernel, ops::TransposeGradNPUKernel); diff --git a/python/paddle/fluid/tests/unittests/npu/test_transpose_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_transpose_op_npu.py index e95f3cc83cfb31..b1a6bfcdaaadca 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_transpose_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_transpose_op_npu.py @@ -31,40 +31,104 @@ def setUp(self): self.op_type = "transpose2" self.place = paddle.NPUPlace(0) self.init_dtype() - self.init_input_output() - self.init_kernel_type() - self.init_axis() + self.init_shape_axis() - self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(self.x)} - self.attrs = {'axis': [0, 2, 1, 3], 'data_format': 'AnyLayout'} - self.outputs = {'Out': self.out} + self.inputs = {'X': np.random.random(self.shape).astype(self.dtype)} + self.attrs = {'axis': self.axis, 'data_format': 'AnyLayout'} + self.outputs = {'Out': self.inputs['X'].transpose(self.axis)} def set_npu(self): self.__class__.use_npu = True - def init_kernel_type(self): - self.use_mkldnn = False - - def init_input_output(self): - self.x = np.random.uniform(0.1, 1, [8, 512, 12, 64]).astype(self.dtype) - self.out = np.transpose(self.x, [0, 2, 1, 3]) - def init_dtype(self): self.dtype = np.float32 - def init_axis(self): - self.axis = -1 + def init_shape_axis(self): + self.shape = (3, 40) + self.axis = (1, 0) def test_check_output(self): self.check_output_with_place(self.place) + def test_check_grad(self): + self.check_grad_with_place(self.place, ['X'], 'Out') + + +class TestCase0(TestTransposeOp): + def init_shape_axis(self): + self.shape = (100, ) + self.axis = (0, ) + + +class TestCase1(TestTransposeOp): + def init_shape_axis(self): + self.shape = (3, 4, 10) + self.axis = (0, 2, 1) + + +class TestCase2(TestTransposeOp): + def init_shape_axis(self): + self.shape = (2, 3, 4, 5) + self.axis = (0, 2, 3, 1) + + +class TestCase3(TestTransposeOp): + def init_shape_axis(self): + self.shape = (2, 3, 4, 5, 6) + self.axis = (4, 2, 3, 1, 0) + + +class TestCase4(TestTransposeOp): + def init_shape_axis(self): + self.shape = (2, 3, 4, 5, 6, 1) + self.axis = (4, 2, 3, 1, 0, 5) + + +class TestCase5(TestTransposeOp): + def init_shape_axis(self): + self.shape = (2, 16, 96) + self.axis = (0, 2, 1) -class TestTransposeOpFP16(TestTransposeOp): - no_need_check_grad = True +class TestCase6(TestTransposeOp): + def init_shape_axis(self): + self.shape = (2, 10, 12, 16) + self.axis = (3, 1, 2, 0) + + +class TestCase7(TestTransposeOp): + def init_shape_axis(self): + 
self.shape = (2, 10, 2, 16) + self.axis = (0, 1, 3, 2) + + +class TestCase8(TestTransposeOp): + def init_shape_axis(self): + self.shape = (2, 3, 2, 3, 2, 4, 3, 3) + self.axis = (0, 1, 3, 2, 4, 5, 6, 7) + + +class TestCase9(TestTransposeOp): + def init_shape_axis(self): + self.shape = (2, 3, 2, 3, 2, 4, 3, 3) + self.axis = (6, 1, 3, 5, 0, 2, 4, 7) + + +class TestTransposeOpFP16(TestTransposeOp): def init_dtype(self): self.dtype = np.float16 + def test_check_grad(self): + pass + + +class TestTransposeOpInt64(TestTransposeOp): + def init_dtype(self): + self.dtype = np.int64 + + def test_check_grad(self): + pass + if __name__ == '__main__': unittest.main() From 56b04bc19fa68f6767dc83cd26b8b4a35ad69d5e Mon Sep 17 00:00:00 2001 From: levi131 <83750468+levi131@users.noreply.github.com> Date: Thu, 30 Sep 2021 16:48:01 +0800 Subject: [PATCH 061/298] add test_hessian time out (#36234) --- python/paddle/fluid/tests/unittests/autograd/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/autograd/CMakeLists.txt b/python/paddle/fluid/tests/unittests/autograd/CMakeLists.txt index 1e9d433ebce8e1..369134c8989a0e 100644 --- a/python/paddle/fluid/tests/unittests/autograd/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/autograd/CMakeLists.txt @@ -7,4 +7,4 @@ foreach(TEST_OP ${TEST_OPS}) endforeach(TEST_OP) set_tests_properties(test_jacobian PROPERTIES TIMEOUT 20) -set_tests_properties(test_hessian PROPERTIES TIMEOUT 20) +set_tests_properties(test_hessian PROPERTIES TIMEOUT 50) From c12176e88566a97ca0f3efec071eaaebade9cd9a Mon Sep 17 00:00:00 2001 From: wenbin Date: Thu, 30 Sep 2021 17:30:34 +0800 Subject: [PATCH 062/298] fix yolo (#36240) --- paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.cu index ee1709f57e2598..10123cd4fa0e1b 100644 --- a/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.cu @@ -119,10 +119,10 @@ __device__ inline void GetYoloBox(float* box, const T* x, const int* anchors, int img_height, int img_width, float scale, float bias) { box[0] = static_cast( - (i + sigmoid(static_cast(x[index]) * scale + bias)) * img_width / + (i + sigmoid(static_cast(x[index])) * scale + bias) * img_width / grid_size_w); box[1] = static_cast( - (j + sigmoid(static_cast(x[index + stride]) * scale + bias)) * + (j + sigmoid(static_cast(x[index + stride])) * scale + bias) * img_height / grid_size_h); box[2] = static_cast(expf(static_cast(x[index + 2 * stride])) * anchors[2 * an_idx] * img_width / input_size_w); From 0a3dbe8a26ae592623002a3eb2d17978c77b919f Mon Sep 17 00:00:00 2001 From: yaoxuefeng Date: Thu, 30 Sep 2021 18:16:01 +0800 Subject: [PATCH 063/298] add slotrecord datafeed (#36099) --- paddle/fluid/framework/data_feed.cc | 642 ++++++++++++++++++ paddle/fluid/framework/data_feed.h | 38 +- paddle/fluid/framework/data_feed_factory.cc | 5 +- paddle/fluid/framework/data_set.cc | 30 +- .../fluid/framework/fleet/ps_gpu_wrapper.cc | 100 ++- paddle/fluid/platform/flags.cc | 4 +- 6 files changed, 787 insertions(+), 32 deletions(-) diff --git a/paddle/fluid/framework/data_feed.cc b/paddle/fluid/framework/data_feed.cc index 4463fd9fd53409..2d089b4721b82c 100644 --- a/paddle/fluid/framework/data_feed.cc +++ b/paddle/fluid/framework/data_feed.cc @@ -28,6 +28,7 @@ limitations 
under the License. */ #include "paddle/fluid/platform/timer.h" USE_INT_STAT(STAT_total_feasign_num_in_mem); +DECLARE_bool(enable_ins_parser_file); namespace paddle { namespace framework { @@ -1929,5 +1930,646 @@ void PaddleBoxDataFeed::PutToFeedVec(const std::vector& ins_vec) { #endif } +template class InMemoryDataFeed; +void SlotRecordInMemoryDataFeed::Init(const DataFeedDesc& data_feed_desc) { + finish_init_ = false; + finish_set_filelist_ = false; + finish_start_ = false; + PADDLE_ENFORCE(data_feed_desc.has_multi_slot_desc(), + platform::errors::PreconditionNotMet( + "Multi_slot_desc has not been set in data_feed_desc")); + paddle::framework::MultiSlotDesc multi_slot_desc = + data_feed_desc.multi_slot_desc(); + SetBatchSize(data_feed_desc.batch_size()); + size_t all_slot_num = multi_slot_desc.slots_size(); + + all_slots_.resize(all_slot_num); + all_slots_info_.resize(all_slot_num); + used_slots_info_.resize(all_slot_num); + use_slot_size_ = 0; + use_slots_.clear(); + + float_total_dims_size_ = 0; + float_total_dims_without_inductives_.clear(); + for (size_t i = 0; i < all_slot_num; ++i) { + const auto& slot = multi_slot_desc.slots(i); + all_slots_[i] = slot.name(); + + AllSlotInfo& all_slot = all_slots_info_[i]; + all_slot.slot = slot.name(); + all_slot.type = slot.type(); + all_slot.used_idx = slot.is_used() ? use_slot_size_ : -1; + all_slot.slot_value_idx = -1; + + if (slot.is_used()) { + UsedSlotInfo& info = used_slots_info_[use_slot_size_]; + info.idx = i; + info.slot = slot.name(); + info.type = slot.type(); + info.dense = slot.is_dense(); + info.total_dims_without_inductive = 1; + info.inductive_shape_index = -1; + + // record float value and uint64_t value pos + if (info.type[0] == 'u') { + info.slot_value_idx = uint64_use_slot_size_; + all_slot.slot_value_idx = uint64_use_slot_size_; + ++uint64_use_slot_size_; + } else if (info.type[0] == 'f') { + info.slot_value_idx = float_use_slot_size_; + all_slot.slot_value_idx = float_use_slot_size_; + ++float_use_slot_size_; + } + + use_slots_.push_back(slot.name()); + + if (slot.is_dense()) { + for (int j = 0; j < slot.shape_size(); ++j) { + if (slot.shape(j) > 0) { + info.total_dims_without_inductive *= slot.shape(j); + } + if (slot.shape(j) == -1) { + info.inductive_shape_index = j; + } + } + } + if (info.type[0] == 'f') { + float_total_dims_without_inductives_.push_back( + info.total_dims_without_inductive); + float_total_dims_size_ += info.total_dims_without_inductive; + } + info.local_shape.clear(); + for (int j = 0; j < slot.shape_size(); ++j) { + info.local_shape.push_back(slot.shape(j)); + } + ++use_slot_size_; + } + } + used_slots_info_.resize(use_slot_size_); + + feed_vec_.resize(used_slots_info_.size()); + const int kEstimatedFeasignNumPerSlot = 5; // Magic Number + for (size_t i = 0; i < all_slot_num; i++) { + batch_float_feasigns_.push_back(std::vector()); + batch_uint64_feasigns_.push_back(std::vector()); + batch_float_feasigns_[i].reserve(default_batch_size_ * + kEstimatedFeasignNumPerSlot); + batch_uint64_feasigns_[i].reserve(default_batch_size_ * + kEstimatedFeasignNumPerSlot); + offset_.push_back(std::vector()); + offset_[i].reserve(default_batch_size_ + + 1); // Each lod info will prepend a zero + } + visit_.resize(all_slot_num, false); + pipe_command_ = data_feed_desc.pipe_command(); + finish_init_ = true; + input_type_ = data_feed_desc.input_type(); + size_t pos = pipe_command_.find(".so"); + if (pos != std::string::npos) { + pos = pipe_command_.rfind('|'); + if (pos == std::string::npos) { + so_parser_name_ = 
pipe_command_; + pipe_command_.clear(); + } else { + so_parser_name_ = pipe_command_.substr(pos + 1); + pipe_command_ = pipe_command_.substr(0, pos); + } + so_parser_name_ = paddle::string::erase_spaces(so_parser_name_); + } else { + so_parser_name_.clear(); + } +} + +void SlotRecordInMemoryDataFeed::LoadIntoMemory() { + VLOG(3) << "SlotRecord LoadIntoMemory() begin, thread_id=" << thread_id_; + if (!so_parser_name_.empty()) { + LoadIntoMemoryByLib(); + } else { + LoadIntoMemoryByCommand(); + } +} +void SlotRecordInMemoryDataFeed::LoadIntoMemoryByLib(void) { + if (true) { + // user defined file format analysis + LoadIntoMemoryByFile(); + } else { + LoadIntoMemoryByLine(); + } +} + +void SlotRecordInMemoryDataFeed::LoadIntoMemoryByFile(void) { +#ifdef _LINUX + paddle::framework::CustomParser* parser = + global_dlmanager_pool().Load(so_parser_name_, all_slots_info_); + CHECK(parser != nullptr); + // get slotrecord object + auto pull_record_func = [this](std::vector& record_vec, + int max_fetch_num, int offset) { + if (offset > 0) { + input_channel_->WriteMove(offset, &record_vec[0]); + if (max_fetch_num > 0) { + SlotRecordPool().get(&record_vec[0], offset); + } else { // free all + max_fetch_num = static_cast(record_vec.size()); + if (max_fetch_num > offset) { + SlotRecordPool().put(&record_vec[offset], (max_fetch_num - offset)); + } + } + } else if (max_fetch_num > 0) { + SlotRecordPool().get(&record_vec, max_fetch_num); + } else { + SlotRecordPool().put(&record_vec); + } + }; + + std::string filename; + while (this->PickOneFile(&filename)) { + VLOG(3) << "PickOneFile, filename=" << filename + << ", thread_id=" << thread_id_; + platform::Timer timeline; + timeline.Start(); + + int lines = 0; + bool is_ok = true; + do { + int err_no = 0; + this->fp_ = fs_open_read(filename, &err_no, this->pipe_command_); + + CHECK(this->fp_ != nullptr); + __fsetlocking(&*(this->fp_), FSETLOCKING_BYCALLER); + is_ok = parser->ParseFileInstance( + [this](char* buf, int len) { + return fread(buf, sizeof(char), len, this->fp_.get()); + }, + pull_record_func, lines); + + if (!is_ok) { + LOG(WARNING) << "parser error, filename=" << filename + << ", lines=" << lines; + } + } while (!is_ok); + timeline.Pause(); + VLOG(3) << "LoadIntoMemoryByLib() read all file, file=" << filename + << ", cost time=" << timeline.ElapsedSec() + << " seconds, thread_id=" << thread_id_ << ", lines=" << lines; + } +#endif +} + +void SlotRecordInMemoryDataFeed::LoadIntoMemoryByLine(void) { +#ifdef _LINUX + paddle::framework::CustomParser* parser = + global_dlmanager_pool().Load(so_parser_name_, all_slots_info_); + std::string filename; + BufferedLineFileReader line_reader; + line_reader.set_sample_rate(sample_rate_); + BufferedLineFileReader::LineFunc line_func = nullptr; + + while (this->PickOneFile(&filename)) { + VLOG(3) << "PickOneFile, filename=" << filename + << ", thread_id=" << thread_id_; + std::vector record_vec; + platform::Timer timeline; + timeline.Start(); + int offset = 0; + int old_offset = 0; + + SlotRecordPool().get(&record_vec, OBJPOOL_BLOCK_SIZE); + // get slotrecord object function + auto record_func = [this, &offset, &record_vec, &old_offset]( + std::vector& vec, int num) { + vec.resize(num); + if (offset + num > OBJPOOL_BLOCK_SIZE) { + input_channel_->WriteMove(offset, &record_vec[0]); + SlotRecordPool().get(&record_vec[0], offset); + record_vec.resize(OBJPOOL_BLOCK_SIZE); + offset = 0; + old_offset = 0; + } + for (int i = 0; i < num; ++i) { + auto& ins = record_vec[offset + i]; + ins->reset(); + vec[i] = ins; + } + 
offset = offset + num; + }; + + line_func = [this, &parser, &record_vec, &offset, &filename, &record_func, + &old_offset](const std::string& line) { + old_offset = offset; + if (!parser->ParseOneInstance(line, record_func)) { + offset = old_offset; + LOG(WARNING) << "read file:[" << filename << "] item error, line:[" + << line << "]"; + return false; + } + if (offset >= OBJPOOL_BLOCK_SIZE) { + input_channel_->Write(std::move(record_vec)); + record_vec.clear(); + SlotRecordPool().get(&record_vec, OBJPOOL_BLOCK_SIZE); + offset = 0; + } + return true; + }; + + int lines = 0; + + do { + int err_no = 0; + this->fp_ = fs_open_read(filename, &err_no, this->pipe_command_); + CHECK(this->fp_ != nullptr); + __fsetlocking(&*(this->fp_), FSETLOCKING_BYCALLER); + lines = line_reader.read_file(this->fp_.get(), line_func, lines); + } while (line_reader.is_error()); + + if (offset > 0) { + input_channel_->WriteMove(offset, &record_vec[0]); + if (offset < OBJPOOL_BLOCK_SIZE) { + SlotRecordPool().put(&record_vec[offset], + (OBJPOOL_BLOCK_SIZE - offset)); + } + } else { + SlotRecordPool().put(&record_vec); + } + record_vec.clear(); + record_vec.shrink_to_fit(); + timeline.Pause(); + VLOG(3) << "LoadIntoMemoryByLib() read all lines, file=" << filename + << ", cost time=" << timeline.ElapsedSec() + << " seconds, thread_id=" << thread_id_ << ", lines=" << lines + << ", sample lines=" << line_reader.get_sample_line() + << ", filesize=" << line_reader.file_size() / 1024.0 / 1024.0 + << "MB"; + } + + VLOG(3) << "LoadIntoMemoryByLib() end, thread_id=" << thread_id_ + << ", total size: " << line_reader.file_size(); +#endif +} + +void SlotRecordInMemoryDataFeed::LoadIntoMemoryByCommand(void) { +#ifdef _LINUX + std::string filename; + BufferedLineFileReader line_reader; + line_reader.set_sample_rate(sample_rate_); + + while (this->PickOneFile(&filename)) { + VLOG(3) << "PickOneFile, filename=" << filename + << ", thread_id=" << thread_id_; + int lines = 0; + std::vector record_vec; + platform::Timer timeline; + timeline.Start(); + SlotRecordPool().get(&record_vec, OBJPOOL_BLOCK_SIZE); + int offset = 0; + + do { + int err_no = 0; + this->fp_ = fs_open_read(filename, &err_no, this->pipe_command_); + CHECK(this->fp_ != nullptr); + __fsetlocking(&*(this->fp_), FSETLOCKING_BYCALLER); + + lines = line_reader.read_file( + this->fp_.get(), + [this, &record_vec, &offset, &filename](const std::string& line) { + if (ParseOneInstance(line, &record_vec[offset])) { + ++offset; + } else { + LOG(WARNING) << "read file:[" << filename + << "] item error, line:[" << line << "]"; + return false; + } + if (offset >= OBJPOOL_BLOCK_SIZE) { + input_channel_->Write(std::move(record_vec)); + record_vec.clear(); + SlotRecordPool().get(&record_vec, OBJPOOL_BLOCK_SIZE); + offset = 0; + } + return true; + }, + lines); + } while (line_reader.is_error()); + if (offset > 0) { + input_channel_->WriteMove(offset, &record_vec[0]); + if (offset < OBJPOOL_BLOCK_SIZE) { + SlotRecordPool().put(&record_vec[offset], + (OBJPOOL_BLOCK_SIZE - offset)); + } + } else { + SlotRecordPool().put(&record_vec); + } + record_vec.clear(); + record_vec.shrink_to_fit(); + timeline.Pause(); + VLOG(3) << "LoadIntoMemory() read all lines, file=" << filename + << ", lines=" << lines + << ", sample lines=" << line_reader.get_sample_line() + << ", cost time=" << timeline.ElapsedSec() + << " seconds, thread_id=" << thread_id_; + } + VLOG(3) << "LoadIntoMemory() end, thread_id=" << thread_id_ + << ", total size: " << line_reader.file_size(); +#endif +} + +static void 
parser_log_key(const std::string& log_key, uint64_t* search_id, + uint32_t* cmatch, uint32_t* rank) { + std::string searchid_str = log_key.substr(16, 16); + *search_id = static_cast(strtoull(searchid_str.c_str(), NULL, 16)); + std::string cmatch_str = log_key.substr(11, 3); + *cmatch = static_cast(strtoul(cmatch_str.c_str(), NULL, 16)); + std::string rank_str = log_key.substr(14, 2); + *rank = static_cast(strtoul(rank_str.c_str(), NULL, 16)); +} + +bool SlotRecordInMemoryDataFeed::ParseOneInstance(const std::string& line, + SlotRecord* ins) { + SlotRecord& rec = (*ins); + // parse line + const char* str = line.c_str(); + char* endptr = const_cast(str); + int pos = 0; + + thread_local std::vector> slot_float_feasigns; + thread_local std::vector> slot_uint64_feasigns; + slot_float_feasigns.resize(float_use_slot_size_); + slot_uint64_feasigns.resize(uint64_use_slot_size_); + + if (parse_ins_id_) { + int num = strtol(&str[pos], &endptr, 10); + CHECK(num == 1); // NOLINT + pos = endptr - str + 1; + size_t len = 0; + while (str[pos + len] != ' ') { + ++len; + } + rec->ins_id_ = std::string(str + pos, len); + pos += len + 1; + } + if (parse_logkey_) { + int num = strtol(&str[pos], &endptr, 10); + CHECK(num == 1); // NOLINT + pos = endptr - str + 1; + size_t len = 0; + while (str[pos + len] != ' ') { + ++len; + } + // parse_logkey + std::string log_key = std::string(str + pos, len); + uint64_t search_id; + uint32_t cmatch; + uint32_t rank; + parser_log_key(log_key, &search_id, &cmatch, &rank); + + rec->ins_id_ = log_key; + rec->search_id = search_id; + rec->cmatch = cmatch; + rec->rank = rank; + pos += len + 1; + } + + int float_total_slot_num = 0; + int uint64_total_slot_num = 0; + + for (size_t i = 0; i < all_slots_info_.size(); ++i) { + auto& info = all_slots_info_[i]; + int num = strtol(&str[pos], &endptr, 10); + PADDLE_ENFORCE(num, + "The number of ids can not be zero, you need padding " + "it in data generator; or if there is something wrong with " + "the data, please check if the data contains unresolvable " + "characters.\nplease check this error line: %s", + str); + if (info.used_idx != -1) { + if (info.type[0] == 'f') { // float + auto& slot_fea = slot_float_feasigns[info.slot_value_idx]; + slot_fea.clear(); + for (int j = 0; j < num; ++j) { + float feasign = strtof(endptr, &endptr); + if (fabs(feasign) < 1e-6 && !used_slots_info_[info.used_idx].dense) { + continue; + } + slot_fea.push_back(feasign); + ++float_total_slot_num; + } + } else if (info.type[0] == 'u') { // uint64 + auto& slot_fea = slot_uint64_feasigns[info.slot_value_idx]; + slot_fea.clear(); + for (int j = 0; j < num; ++j) { + uint64_t feasign = + static_cast(strtoull(endptr, &endptr, 10)); + if (feasign == 0 && !used_slots_info_[info.used_idx].dense) { + continue; + } + slot_fea.push_back(feasign); + ++uint64_total_slot_num; + } + } + pos = endptr - str; + } else { + for (int j = 0; j <= num; ++j) { + // pos = line.find_first_of(' ', pos + 1); + while (line[pos + 1] != ' ') { + pos++; + } + } + } + } + rec->slot_float_feasigns_.add_slot_feasigns(slot_float_feasigns, + float_total_slot_num); + rec->slot_uint64_feasigns_.add_slot_feasigns(slot_uint64_feasigns, + uint64_total_slot_num); + + return (uint64_total_slot_num > 0); +} + +void SlotRecordInMemoryDataFeed::PutToFeedVec(const SlotRecord* ins_vec, + int num) { + for (int j = 0; j < use_slot_size_; ++j) { + auto& feed = feed_vec_[j]; + if (feed == nullptr) { + continue; + } + + auto& slot_offset = offset_[j]; + slot_offset.clear(); + slot_offset.reserve(num + 1); + 
slot_offset.push_back(0); + + int total_instance = 0; + auto& info = used_slots_info_[j]; + // fill slot value with default value 0 + if (info.type[0] == 'f') { // float + auto& batch_fea = batch_float_feasigns_[j]; + batch_fea.clear(); + + for (int i = 0; i < num; ++i) { + auto r = ins_vec[i]; + size_t fea_num = 0; + float* slot_values = + r->slot_float_feasigns_.get_values(info.slot_value_idx, &fea_num); + batch_fea.resize(total_instance + fea_num); + memcpy(&batch_fea[total_instance], slot_values, + sizeof(float) * fea_num); + total_instance += fea_num; + slot_offset.push_back(total_instance); + } + + float* feasign = batch_fea.data(); + float* tensor_ptr = + feed->mutable_data({total_instance, 1}, this->place_); + CopyToFeedTensor(tensor_ptr, feasign, total_instance * sizeof(float)); + + } else if (info.type[0] == 'u') { // uint64 + auto& batch_fea = batch_uint64_feasigns_[j]; + batch_fea.clear(); + + for (int i = 0; i < num; ++i) { + auto r = ins_vec[i]; + size_t fea_num = 0; + uint64_t* slot_values = + r->slot_uint64_feasigns_.get_values(info.slot_value_idx, &fea_num); + if (fea_num > 0) { + batch_fea.resize(total_instance + fea_num); + memcpy(&batch_fea[total_instance], slot_values, + sizeof(uint64_t) * fea_num); + total_instance += fea_num; + } + if (fea_num == 0) { + batch_fea.resize(total_instance + fea_num); + batch_fea[total_instance] = 0; + total_instance += 1; + } + slot_offset.push_back(total_instance); + } + + // no uint64_t type in paddlepaddle + uint64_t* feasign = batch_fea.data(); + int64_t* tensor_ptr = + feed->mutable_data({total_instance, 1}, this->place_); + CopyToFeedTensor(tensor_ptr, feasign, total_instance * sizeof(int64_t)); + } + + if (info.dense) { + if (info.inductive_shape_index != -1) { + info.local_shape[info.inductive_shape_index] = + total_instance / info.total_dims_without_inductive; + } + feed->Resize(framework::make_ddim(info.local_shape)); + } else { + LoD data_lod{slot_offset}; + feed_vec_[j]->set_lod(data_lod); + } + } +} + +void SlotRecordInMemoryDataFeed::ExpandSlotRecord(SlotRecord* rec) { + SlotRecord& ins = (*rec); + if (ins->slot_float_feasigns_.slot_offsets.empty()) { + return; + } + size_t total_value_size = ins->slot_float_feasigns_.slot_values.size(); + if (float_total_dims_size_ == total_value_size) { + return; + } + int float_slot_num = + static_cast(float_total_dims_without_inductives_.size()); + CHECK(float_slot_num == float_use_slot_size_); + std::vector old_values; + std::vector old_offsets; + old_values.swap(ins->slot_float_feasigns_.slot_values); + old_offsets.swap(ins->slot_float_feasigns_.slot_offsets); + + ins->slot_float_feasigns_.slot_values.resize(float_total_dims_size_); + ins->slot_float_feasigns_.slot_offsets.assign(float_slot_num + 1, 0); + + auto& slot_offsets = ins->slot_float_feasigns_.slot_offsets; + auto& slot_values = ins->slot_float_feasigns_.slot_values; + + uint32_t offset = 0; + int num = 0; + uint32_t old_off = 0; + int dim = 0; + + for (int i = 0; i < float_slot_num; ++i) { + dim = float_total_dims_without_inductives_[i]; + old_off = old_offsets[i]; + num = static_cast(old_offsets[i + 1] - old_off); + if (num == 0) { + // fill slot value with default value 0 + for (int k = 0; k < dim; ++k) { + slot_values[k + offset] = 0.0; + } + } else { + if (num == dim) { + memcpy(&slot_values[offset], &old_values[old_off], dim * sizeof(float)); + } else { + // position fea + // record position index need fix values + int pos_idx = static_cast(old_values[old_off]); + for (int k = 0; k < dim; ++k) { + if (k == pos_idx) { + 
slot_values[k + offset] = 1.0; + } else { + slot_values[k + offset] = 0.0; + } + } + } + } + slot_offsets[i] = offset; + offset += dim; + } + slot_offsets[float_slot_num] = offset; + CHECK(float_total_dims_size_ == static_cast(offset)); +} + +bool SlotRecordInMemoryDataFeed::Start() { +#ifdef _LINUX + this->CheckSetFileList(); + if (input_channel_->Size() != 0) { + std::vector data; + input_channel_->Read(data); + } +#endif + if (batch_offsets_.size() > 0) { + VLOG(3) << "batch_size offsets: " << batch_offsets_.size(); + enable_heterps_ = true; + this->offset_index_ = 0; + } + this->finish_start_ = true; + return true; +} + +int SlotRecordInMemoryDataFeed::Next() { +#ifdef _LINUX + this->CheckStart(); + + VLOG(3) << "enable heter next: " << offset_index_ + << " batch_offsets: " << batch_offsets_.size(); + if (offset_index_ >= batch_offsets_.size()) { + VLOG(3) << "offset_index: " << offset_index_ + << " batch_offsets: " << batch_offsets_.size(); + return 0; + } + auto& batch = batch_offsets_[offset_index_++]; + this->batch_size_ = batch.second; + VLOG(3) << "batch_size_=" << this->batch_size_ + << ", thread_id=" << thread_id_; + if (this->batch_size_ != 0) { + PutToFeedVec(&records_[batch.first], this->batch_size_); + } else { + VLOG(3) << "finish reading for heterps, batch size zero, thread_id=" + << thread_id_; + } + VLOG(3) << "enable heter next: " << offset_index_ + << " batch_offsets: " << batch_offsets_.size() + << " baych_size: " << this->batch_size_; + + return this->batch_size_; +#else + return 0; +#endif +} + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/data_feed.h b/paddle/fluid/framework/data_feed.h index 5527eaf1f6fa4d..a4100e66e72850 100644 --- a/paddle/fluid/framework/data_feed.h +++ b/paddle/fluid/framework/data_feed.h @@ -384,7 +384,7 @@ class CustomParser { CustomParser() {} virtual ~CustomParser() {} virtual void Init(const std::vector& slots) = 0; - virtual bool Init(const std::vector& slots) = 0; + virtual bool Init(const std::vector& slots); virtual void ParseOneInstance(const char* str, Record* instance) = 0; virtual bool ParseOneInstance( const std::string& line, @@ -1103,6 +1103,42 @@ class MultiSlotInMemoryDataFeed : public InMemoryDataFeed { virtual void PutToFeedVec(const Record* ins_vec, int num); }; +class SlotRecordInMemoryDataFeed : public InMemoryDataFeed { + public: + SlotRecordInMemoryDataFeed() {} + virtual ~SlotRecordInMemoryDataFeed() {} + virtual void Init(const DataFeedDesc& data_feed_desc); + virtual void LoadIntoMemory(); + void ExpandSlotRecord(SlotRecord* ins); + + protected: + virtual bool Start(); + virtual int Next(); + virtual bool ParseOneInstance(SlotRecord* instance) { return false; } + virtual bool ParseOneInstanceFromPipe(SlotRecord* instance) { return false; } + // virtual void ParseOneInstanceFromSo(const char* str, T* instance, + // CustomParser* parser) {} + virtual void PutToFeedVec(const std::vector& ins_vec) {} + + virtual void LoadIntoMemoryByCommand(void); + virtual void LoadIntoMemoryByLib(void); + virtual void LoadIntoMemoryByLine(void); + virtual void LoadIntoMemoryByFile(void); + virtual void SetInputChannel(void* channel) { + input_channel_ = static_cast*>(channel); + } + bool ParseOneInstance(const std::string& line, SlotRecord* rec); + virtual void PutToFeedVec(const SlotRecord* ins_vec, int num); + float sample_rate_ = 1.0f; + int use_slot_size_ = 0; + int float_use_slot_size_ = 0; + int uint64_use_slot_size_ = 0; + std::vector all_slots_info_; + std::vector used_slots_info_; + 
size_t float_total_dims_size_ = 0; + std::vector float_total_dims_without_inductives_; +}; + class PaddleBoxDataFeed : public MultiSlotInMemoryDataFeed { public: PaddleBoxDataFeed() {} diff --git a/paddle/fluid/framework/data_feed_factory.cc b/paddle/fluid/framework/data_feed_factory.cc index ec1b8ec773fa64..e46e4aeb0124c2 100644 --- a/paddle/fluid/framework/data_feed_factory.cc +++ b/paddle/fluid/framework/data_feed_factory.cc @@ -58,8 +58,8 @@ std::shared_ptr DataFeedFactory::CreateDataFeed( std::string data_feed_class) { if (g_data_feed_map.count(data_feed_class) < 1) { LOG(WARNING) << "Your DataFeed " << data_feed_class - << "is not supported currently"; - LOG(WARNING) << "Supported DataFeed: " << DataFeedTypeList(); + << " is not supported currently"; + LOG(WARNING) << " Supported DataFeed: " << DataFeedTypeList(); exit(-1); } return g_data_feed_map[data_feed_class](); @@ -68,6 +68,7 @@ std::shared_ptr DataFeedFactory::CreateDataFeed( REGISTER_DATAFEED_CLASS(MultiSlotDataFeed); REGISTER_DATAFEED_CLASS(MultiSlotInMemoryDataFeed); REGISTER_DATAFEED_CLASS(PaddleBoxDataFeed); +REGISTER_DATAFEED_CLASS(SlotRecordInMemoryDataFeed); #if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32) REGISTER_DATAFEED_CLASS(MultiSlotFileInstantDataFeed); #endif diff --git a/paddle/fluid/framework/data_set.cc b/paddle/fluid/framework/data_set.cc index 82a39b206e6bd6..2a071665b263c6 100644 --- a/paddle/fluid/framework/data_set.cc +++ b/paddle/fluid/framework/data_set.cc @@ -1609,7 +1609,35 @@ void SlotRecordDataset::DynamicAdjustChannelNum(int channel_num, void SlotRecordDataset::PrepareTrain() { #ifdef PADDLE_WITH_GLOO - return; + if (enable_heterps_) { + if (input_records_.size() == 0 && input_channel_ != nullptr && + input_channel_->Size() != 0) { + input_channel_->ReadAll(input_records_); + VLOG(3) << "read from channel to records with records size: " + << input_records_.size(); + } + VLOG(3) << "input records size: " << input_records_.size(); + int64_t total_ins_num = input_records_.size(); + std::vector> offset; + int default_batch_size = + reinterpret_cast(readers_[0].get()) + ->GetDefaultBatchSize(); + VLOG(3) << "thread_num: " << thread_num_ + << " memory size: " << total_ins_num + << " default batch_size: " << default_batch_size; + compute_thread_batch_nccl(thread_num_, total_ins_num, default_batch_size, + &offset); + VLOG(3) << "offset size: " << offset.size(); + for (int i = 0; i < thread_num_; i++) { + reinterpret_cast(readers_[i].get()) + ->SetRecord(&input_records_[0]); + } + for (size_t i = 0; i < offset.size(); i++) { + reinterpret_cast( + readers_[i % thread_num_].get()) + ->AddBatchOffset(offset[i]); + } + } #else PADDLE_THROW(platform::errors::Unavailable( "dataset set heterps need compile with GLOO")); diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc index 784cbc3d90b865..d1e98a711dc9dd 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc @@ -45,9 +45,7 @@ void PSGPUWrapper::BuildTask(std::shared_ptr gpu_task) { platform::Timer timeline; timeline.Start(); int device_num = heter_devices_.size(); - MultiSlotDataset* dataset = dynamic_cast(dataset_); gpu_task->init(thread_keys_shard_num_, device_num); - auto input_channel = dataset->GetInputChannel(); auto& local_keys = gpu_task->feature_keys_; auto& local_ptr = gpu_task->value_ptr_; @@ -68,35 +66,83 @@ void PSGPUWrapper::BuildTask(std::shared_ptr gpu_task) { for (int i = 0; i < 
thread_keys_thread_num_; i++) { thread_keys_[i].resize(thread_keys_shard_num_); } - const std::deque& vec_data = input_channel->GetData(); - size_t total_len = vec_data.size(); - size_t len_per_thread = total_len / thread_keys_thread_num_; - int remain = total_len % thread_keys_thread_num_; + + size_t total_len = 0; + size_t len_per_thread = 0; + int remain = 0; size_t begin = 0; - auto gen_func = [this](const std::deque& total_data, int begin_index, - int end_index, int i) { - for (auto iter = total_data.begin() + begin_index; - iter != total_data.begin() + end_index; iter++) { - const auto& ins = *iter; - const auto& feasign_v = ins.uint64_feasigns_; - for (const auto feasign : feasign_v) { - uint64_t cur_key = feasign.sign().uint64_feasign_; - int shard_id = cur_key % thread_keys_shard_num_; - this->thread_keys_[i][shard_id].insert(cur_key); + + std::string data_set_name = std::string(typeid(*dataset_).name()); + + if (data_set_name.find("SlotRecordDataset") != std::string::npos) { + VLOG(0) << "ps_gpu_wrapper use SlotRecordDataset"; + SlotRecordDataset* dataset = dynamic_cast(dataset_); + auto input_channel = dataset->GetInputChannel(); + VLOG(0) << "yxf::buildtask::inputslotchannle size: " + << input_channel->Size(); + const std::deque& vec_data = input_channel->GetData(); + total_len = vec_data.size(); + len_per_thread = total_len / thread_keys_thread_num_; + remain = total_len % thread_keys_thread_num_; + VLOG(0) << "total len: " << total_len; + auto gen_func = [this](const std::deque& total_data, + int begin_index, int end_index, int i) { + for (auto iter = total_data.begin() + begin_index; + iter != total_data.begin() + end_index; iter++) { + const auto& ins = *iter; + const auto& feasign_v = ins->slot_uint64_feasigns_.slot_values; + for (const auto feasign : feasign_v) { + int shard_id = feasign % thread_keys_shard_num_; + this->thread_keys_[i][shard_id].insert(feasign); + } } + }; + for (int i = 0; i < thread_keys_thread_num_; i++) { + threads.push_back( + std::thread(gen_func, std::ref(vec_data), begin, + begin + len_per_thread + (i < remain ? 1 : 0), i)); + begin += len_per_thread + (i < remain ? 1 : 0); } - }; - for (int i = 0; i < thread_keys_thread_num_; i++) { - threads.push_back(std::thread(gen_func, std::ref(vec_data), begin, - begin + len_per_thread + (i < remain ? 1 : 0), - i)); - begin += len_per_thread + (i < remain ? 
1 : 0); - } - for (std::thread& t : threads) { - t.join(); + for (std::thread& t : threads) { + t.join(); + } + timeline.Pause(); + VLOG(1) << "GpuPs build task cost " << timeline.ElapsedSec() << " seconds."; + } else { + CHECK(data_set_name.find("MultiSlotDataset") != std::string::npos); + VLOG(0) << "ps_gpu_wrapper use MultiSlotDataset"; + MultiSlotDataset* dataset = dynamic_cast(dataset_); + auto input_channel = dataset->GetInputChannel(); + + const std::deque& vec_data = input_channel->GetData(); + total_len = vec_data.size(); + len_per_thread = total_len / thread_keys_thread_num_; + remain = total_len % thread_keys_thread_num_; + auto gen_func = [this](const std::deque& total_data, + int begin_index, int end_index, int i) { + for (auto iter = total_data.begin() + begin_index; + iter != total_data.begin() + end_index; iter++) { + const auto& ins = *iter; + const auto& feasign_v = ins.uint64_feasigns_; + for (const auto feasign : feasign_v) { + uint64_t cur_key = feasign.sign().uint64_feasign_; + int shard_id = cur_key % thread_keys_shard_num_; + this->thread_keys_[i][shard_id].insert(cur_key); + } + } + }; + for (int i = 0; i < thread_keys_thread_num_; i++) { + threads.push_back( + std::thread(gen_func, std::ref(vec_data), begin, + begin + len_per_thread + (i < remain ? 1 : 0), i)); + begin += len_per_thread + (i < remain ? 1 : 0); + } + for (std::thread& t : threads) { + t.join(); + } + timeline.Pause(); + VLOG(1) << "GpuPs build task cost " << timeline.ElapsedSec() << " seconds."; } - timeline.Pause(); - VLOG(1) << "GpuPs build task cost " << timeline.ElapsedSec() << " seconds."; timeline.Start(); diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc index 72b95dcc153464..7a7666665511fa 100644 --- a/paddle/fluid/platform/flags.cc +++ b/paddle/fluid/platform/flags.cc @@ -687,4 +687,6 @@ DEFINE_int32(slotpool_thread_num, 1, "SlotRecordDataset slot pool thread num"); DEFINE_bool(enable_slotpool_wait_release, false, "enable slotrecord obejct wait release, default false"); DEFINE_bool(enable_slotrecord_reset_shrink, false, - "enable slotrecord obejct reset shrink memory, default false"); \ No newline at end of file + "enable slotrecord obejct reset shrink memory, default false"); +DEFINE_bool(enable_ins_parser_file, false, + "enable parser ins file , default false"); From 2cee0ea7b26cb71fc4d06f5074d57f457a7db1f1 Mon Sep 17 00:00:00 2001 From: jakpiase <62569058+jakpiase@users.noreply.github.com> Date: Mon, 4 Oct 2021 09:49:48 +0200 Subject: [PATCH 064/298] added Piotr to authors.md and updated Intel-related paddle authors image (#36254) --- AUTHORS.md | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/AUTHORS.md b/AUTHORS.md index 1eaaff29771436..60f5b424abb7ae 100644 --- a/AUTHORS.md +++ b/AUTHORS.md @@ -3,7 +3,7 @@ | abhinavarora | Abhinav Arora | | andreazanetti | Andrea Zanetti | | arlesniak | Artur Lesniak | -| arogowie-intel | Adam Osewski | +| [arogowie-intel](https://raw.githubusercontent.com/jakpiase/Paddle/new_paddle_intel_authors/img/img.jpg) | Adam Osewski | | backyes | Yan-Fei Wang | | baiyfbupt | Yi-Fan Bai | | beckett1124 | Bin Qi | @@ -25,8 +25,8 @@ | hedaoyuan | Dao-Yuan He | | helinwang | He-Lin Wang | | jacquesqiao | Long-Fei Qiao | -| jakpiase | Jakub Piasecki | -| [jczaja](https://raw.githubusercontent.com/jczaja/Paddle/paddle-poland-team/doc/images/paddle_poland_team.jpg) | Jacek Czaja | +| [jakpiase](https://raw.githubusercontent.com/jakpiase/Paddle/new_paddle_intel_authors/img/img.jpg) | Jakub Piasecki | +| 
[jczaja](https://raw.githubusercontent.com/jakpiase/Paddle/new_paddle_intel_authors/img/img.jpg) | Jacek Czaja | | JiayiFeng | Jia-Yi Feng | | kbinias | Krzysztof Binias | | kexinzhao | Ke-Xin Zhao | @@ -47,7 +47,8 @@ | pakchoi | Chuan-Jiang Song | | panyx0718 | Xin Pan | | pengli09 | Peng Li | -| pmajchrzak |Piotr Majchrzak | +| [piotrekobiIntel](https://raw.githubusercontent.com/jakpiase/Paddle/new_paddle_intel_authors/img/img.jpg) | Piotr Paturej | +| [pmajchrzak](https://raw.githubusercontent.com/jakpiase/Paddle/new_paddle_intel_authors/img/img.jpg) | Piotr Majchrzak | | pkuyym | Ya-Ming Yang | | pzelazko-intel | Pawel Zelazko | | [pawelpiotrowicz](https://raw.githubusercontent.com/jczaja/Paddle/paddle-poland-team/doc/images/paddle_poland_team.jpg) | Pawel Piotrowicz | @@ -55,12 +56,13 @@ | qingqing01 | Qing-Qing Dang | | reyoung | Yang Yu | | [Sand3r-](https://raw.githubusercontent.com/jczaja/Paddle/paddle-poland-team/doc/images/paddle_poland_team.jpg)| Michal Gallus | -| [sfraczek](https://raw.githubusercontent.com/jczaja/Paddle/paddle-poland-team/doc/images/paddle_poland_team.jpg)| Sylwester Fraczek | +| [sfraczek](https://raw.githubusercontent.com/jakpiase/Paddle/new_paddle_intel_authors/img/img.jpg)| Sylwester Fraczek | | sneaxiy | Jin-Le Zeng | | Superjom | Chun-Wei Yan | | tensor-tang | Jian Tang | | tianbingsz | Tian-Bing Xu | | tpatejko | Tomasz Patejko | +| [tsocha](https://raw.githubusercontent.com/jakpiase/Paddle/new_paddle_intel_authors/img/img.jpg) | Tomasz Socha | | typhoonzero | Yi Wu | | velconia | Qi-Yang Min | | wanghaoshuang | Hao-Shuang Wang | @@ -68,7 +70,7 @@ | wangzhen-nlp | Zhen Wang | | wen-bo-yang | Wen-Bo Yang | | wojtuss | Wojciech Uss | -| wozna | Joanna Wozna | +| [wozna](https://raw.githubusercontent.com/jakpiase/Paddle/new_paddle_intel_authors/img/img.jpg)| Joanna Wozna | | wwhu | Wei-Wei Hu | | xinghai-sun | Xing-Hai Sun | | Xreki | Yi-Qun Liu | From dc4d5719060aac5aaaec868c1c055cd27f8e812a Mon Sep 17 00:00:00 2001 From: jakpiase <62569058+jakpiase@users.noreply.github.com> Date: Tue, 5 Oct 2021 13:38:19 +0200 Subject: [PATCH 065/298] Added concat BF16/FP32 BWD OneDNN kernel (#35889) * tmp * added concat BF16/FP32 BWD oneDNN kernel * minor change * minor change * fix for CI * added formatting * Reverted deleting static keyword * added reviewers suggestions * reverted deleting concat bf16 test file * fixed concat tests --- paddle/fluid/operators/concat_op.cc | 18 ++- .../operators/mkldnn/concat_mkldnn_op.cc | 71 +++++++++++ .../mkldnn/test_concat_bf16_mkldnn_op.py | 27 ++++- .../unittests/mkldnn/test_concat_mkldnn_op.py | 114 ++++++++++-------- .../fluid/tests/unittests/test_concat_op.py | 2 +- 5 files changed, 171 insertions(+), 61 deletions(-) diff --git a/paddle/fluid/operators/concat_op.cc b/paddle/fluid/operators/concat_op.cc index a400d27b798e37..e6b1f6a1c18c38 100644 --- a/paddle/fluid/operators/concat_op.cc +++ b/paddle/fluid/operators/concat_op.cc @@ -169,9 +169,21 @@ class ConcatOpGrad : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType( - ctx, framework::GradVarName("Out")), - ctx.GetPlace()); + auto input_data_type = OperatorWithKernel::IndicateVarDataType( + ctx, framework::GradVarName("Out")); + +#ifdef PADDLE_WITH_MKLDNN + // extra checking if attr "use_mkldnn" exist is needed because + // test_reverse_op is calling concat_grad kernel without setting + // 
"use_mkldnn" to any value + if (ctx.HasAttr("use_mkldnn") && + this->CanMKLDNNBeUsed(ctx, input_data_type)) { + return framework::OpKernelType(input_data_type, ctx.GetPlace(), + framework::DataLayout::kMKLDNN, + framework::LibraryType::kMKLDNN); + } +#endif + return framework::OpKernelType(input_data_type, ctx.GetPlace()); } framework::OpKernelType GetKernelTypeForVar( diff --git a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc index 57a56776736ff9..4cc96a48bd26f4 100644 --- a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc @@ -23,6 +23,7 @@ namespace operators { using framework::DataLayout; using framework::Tensor; +using framework::LoDTensor; using mkldnn::memory; using mkldnn::primitive; using mkldnn::concat; @@ -149,6 +150,72 @@ class ConcatMKLDNNOpKernel : public paddle::framework::OpKernel { output->set_format(platform::GetMKLDNNFormat(*dst_mem)); } }; + +template +class ConcatGradMKLDNNOpKernel : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const override { + const auto& dev_ctx = + ctx.template device_context(); + const auto& onednn_engine = dev_ctx.GetEngine(); + + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); + + auto out_var_names = ctx.OutputNames(framework::GradVarName("X")); + + const auto x = ctx.MultiInput("X"); + const auto* dout = ctx.Input(framework::GradVarName("Out")); + auto dx = ctx.MultiOutput(framework::GradVarName("X")); + + for (size_t i = 0; i < dx.size(); ++i) { + if (dx[i] != nullptr) { + dx[i]->set_lod(x[i]->lod()); + } + } + + int axis = ctx.Attr("axis"); + if (ctx.HasInput("AxisTensor")) { + auto* axis_tensor = ctx.Input("AxisTensor"); + axis = GetDataFromTensor(axis_tensor)[0]; + } + + auto dout_vec_dims = framework::vectorize(dout->dims()); + + axis = ComputeAxis(axis, dout_vec_dims.size()); + + std::vector offset(dout_vec_dims.size(), 0); + + mkldnn::memory::data_type dout_type = + framework::ToMKLDNNDataType(dout->type()); + platform::ReorderMKLDNNHandler reorder_handler(dout_vec_dims, dout->type(), + dout_type, onednn_engine); + auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory( + dout->format(), platform::to_void_cast(dout->data())); + + for (size_t i = 0; i < dx.size(); ++i) { + if (out_var_names[i] != framework::kEmptyVarName && + dx[i]->numel() != 0UL) { + auto dx_vec_dims = framework::vectorize(dx[i]->dims()); + auto slice_mem_p = reorder_handler.AcquireSubmemory( + dx_vec_dims, offset, reorder_src_memory_p); + + auto reorder_dst_memory_p = reorder_handler.AcquireDstMemory( + dx[i], dx_vec_dims, dout->format(), ctx.GetPlace()); + auto reorder_p = + reorder_handler.AcquireReorder(reorder_dst_memory_p, slice_mem_p); + + reorder_p->execute(astream, *slice_mem_p, *reorder_dst_memory_p); + + offset[axis] += dx[i]->dims()[axis]; + + dx[i]->set_layout(framework::DataLayout::kMKLDNN); + dx[i]->set_format(platform::GetMKLDNNFormat(*reorder_dst_memory_p)); + } + } + astream.wait(); + } +}; + } // namespace operators } // namespace paddle @@ -159,3 +226,7 @@ REGISTER_OP_KERNEL(concat, MKLDNN, ::paddle::platform::CPUPlace, ops::ConcatMKLDNNOpKernel, ops::ConcatMKLDNNOpKernel, ops::ConcatMKLDNNOpKernel); + +REGISTER_OP_KERNEL(concat_grad, MKLDNN, ::paddle::platform::CPUPlace, + ops::ConcatGradMKLDNNOpKernel, + ops::ConcatGradMKLDNNOpKernel); diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_concat_bf16_mkldnn_op.py 
b/python/paddle/fluid/tests/unittests/mkldnn/test_concat_bf16_mkldnn_op.py index 2b7b2b36afa4fb..e53afaa57be1c8 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_concat_bf16_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_concat_bf16_mkldnn_op.py @@ -40,13 +40,28 @@ def setUp(self): 'mkldnn_data_type': self.mkldnn_data_type } + self.sections = [self.x0.shape[self.axis]] * 2 + self.sections[1] += self.x1.shape[self.axis] + self.output = np.concatenate( (self.x0, self.x1, self.x2), axis=self.axis).astype(np.uint16) self.outputs = {'Out': self.output} + def calculate_grads(self): + self.dout = self.outputs['Out'] + self.dxs = np.split(self.dout, self.sections, self.axis) + def test_check_output(self): self.check_output_with_place(core.CPUPlace()) + def test_check_grad(self): + self.calculate_grads() + self.check_grad_with_place( + core.CPUPlace(), ["x0", "x1", "x2"], + "Out", + user_defined_grads=[self.dxs[0], self.dxs[1], self.dxs[2]], + user_defined_grad_outputs=[self.dout]) + # --------------------test concat bf16 in with axis 0-------------------- def init_test_data(self): @@ -61,9 +76,9 @@ def init_axis(self): self.axis = 0 def init_shape(self): - self.x0_shape = [2, 2, 1, 2] - self.x1_shape = [1, 2, 1, 2] - self.x2_shape = [3, 2, 1, 2] + self.x0_shape = [6, 2, 4, 3] + self.x1_shape = [7, 2, 4, 3] + self.x2_shape = [8, 2, 4, 3] # --------------------test concat bf16 in with axis 1-------------------- @@ -74,9 +89,9 @@ def init_axis(self): self.axis = 1 def init_shape(self): - self.x0_shape = [1, 1, 5, 5] - self.x1_shape = [1, 2, 5, 5] - self.x2_shape = [1, 3, 5, 5] + self.x0_shape = [1, 4, 5, 5] + self.x1_shape = [1, 8, 5, 5] + self.x2_shape = [1, 6, 5, 5] # --------------------test concat bf16 in with axis 2-------------------- diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_concat_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_concat_mkldnn_op.py index 4900b42d3618d1..7fc8f1d30802cd 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_concat_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_concat_mkldnn_op.py @@ -15,78 +15,90 @@ from __future__ import print_function import unittest -from paddle.fluid.tests.unittests.test_concat_op import TestConcatOp, TestConcatOp2, TestConcatOp3, TestConcatOp4 +import numpy as np +import struct - -class TestMKLDNNConcatOp(TestConcatOp): - def setUp(self): - super(TestMKLDNNConcatOp, self).setUp() - self.attrs["use_mkldnn"] = True - self._cpu_only = True - - def test_check_output(self): - # TODO(wangzhongpu): support mkldnn op in dygraph mode - self.check_output(check_dygraph=(self.attrs["use_mkldnn"] == False)) - - def test_check_grad(self): - pass - - def init_kernel_type(self): - self.use_mkldnn = True +import paddle.fluid.core as core +from paddle.fluid.tests.unittests.op_test import OpTest, convert_float_to_uint16 +from paddle import enable_static -class TestMKLDNNConcatOp2(TestConcatOp2): +class TestConcatAxis0OneDNNOp(OpTest): def setUp(self): - super(TestMKLDNNConcatOp2, self).setUp() - self.attrs["use_mkldnn"] = True - self._cpu_only = True + self.op_type = "concat" + self.mkldnn_data_type = "float32" + self.init_axis() + self.init_shape() + self.init_test_data() + self.configure_datatype() + self.inputs = {'X': [('x0', self.x0), ('x1', self.x1), ('x2', self.x2)]} + self.attrs = { + 'axis': self.axis, + 'use_mkldnn': True, + 'mkldnn_data_type': self.mkldnn_data_type + } + + self.output = np.concatenate( + (self.x0, self.x1, self.x2), 
axis=self.axis).astype(self.dtype) + + self.outputs = {'Out': self.output} + + def configure_datatype(self): + self.mkldnn_data_type = "float32" + self.dtype = np.float32 def test_check_output(self): - # TODO(wangzhongpu): support mkldnn op in dygraph mode - self.check_output(check_dygraph=(self.attrs["use_mkldnn"] == False)) + self.check_output_with_place(core.CPUPlace()) def test_check_grad(self): - pass + self.check_grad(['x0'], 'Out') + self.check_grad(['x1'], 'Out') + self.check_grad(['x2'], 'Out') - def init_kernel_type(self): - self.use_mkldnn = True + def init_test_data(self): + self.x0 = np.random.random(self.x0_shape).astype(np.float32) + self.x1 = np.random.random(self.x1_shape).astype(np.float32) + self.x2 = np.random.random(self.x2_shape).astype(np.float32) + def init_axis(self): + self.axis = 0 -class TestMKLDNNConcatOp3(TestConcatOp3): - def setUp(self): - super(TestMKLDNNConcatOp3, self).setUp() - self.attrs["use_mkldnn"] = True - self._cpu_only = True + def init_shape(self): + self.x0_shape = [2, 2, 1, 50] + self.x1_shape = [1, 2, 1, 50] + self.x2_shape = [3, 2, 1, 50] - def test_check_output(self): - # TODO(wangzhongpu): support mkldnn op in dygraph mode - self.check_output(check_dygraph=(self.attrs["use_mkldnn"] == False)) - def test_check_grad(self): - pass +class TestConcatAxis1OneDNNOp(TestConcatAxis0OneDNNOp): + def init_axis(self): + self.axis = 1 - def init_kernel_type(self): - self.use_mkldnn = True + def init_shape(self): + self.x0_shape = [1, 1, 5, 50] + self.x1_shape = [1, 2, 5, 50] + self.x2_shape = [1, 3, 5, 50] -class TestMKLDNNConcatOp4(TestConcatOp4): - def setUp(self): - super(TestMKLDNNConcatOp4, self).setUp() - self.attrs["use_mkldnn"] = True - self._cpu_only = True +class TestConcatAxis2OneDNNOp(TestConcatAxis0OneDNNOp): + def init_axis(self): + self.axis = 2 - def test_check_output(self): - # TODO(wangzhongpu): support mkldnn op in dygraph mode - self.check_output(check_dygraph=(self.attrs["use_mkldnn"] == False)) + def init_shape(self): + self.x0_shape = [2, 3, 4, 50] + self.x1_shape = [2, 3, 5, 50] + self.x2_shape = [2, 3, 6, 50] - def test_check_grad(self): - pass - def init_kernel_type(self): - self.use_mkldnn = True +class TestConcatAxis3OneDNNOp(TestConcatAxis0OneDNNOp): + def init_axis(self): + self.axis = 3 + + def init_shape(self): + self.x0_shape = [5, 3, 5, 5] + self.x1_shape = [5, 3, 5, 6] + self.x2_shape = [5, 3, 5, 7] if __name__ == '__main__': - from paddle import enable_static enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_concat_op.py b/python/paddle/fluid/tests/unittests/test_concat_op.py index 10cd774ce04bec..5f936e577a06fd 100644 --- a/python/paddle/fluid/tests/unittests/test_concat_op.py +++ b/python/paddle/fluid/tests/unittests/test_concat_op.py @@ -16,7 +16,7 @@ import unittest import numpy as np -from op_test import OpTest, skip_check_grad_ci +from paddle.fluid.tests.unittests.op_test import OpTest, skip_check_grad_ci import paddle.fluid as fluid from paddle.fluid import compiler, Program, program_guard, core import paddle From e928834040fdb606fe56ba74769856b492cd9b79 Mon Sep 17 00:00:00 2001 From: Adam Osewski Date: Thu, 7 Oct 2021 11:43:43 +0200 Subject: [PATCH 066/298] [OneDNN] Conv op refactor. (#36252) * Remove unused header. * Use ConvMKLDNNHandlerT for conv2d INT8. * Use absolute module path to import. 
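For context on the INT8 part of this refactor: before building the convolution primitive, the handler folds the input, weight and output quantization scales into per-output-channel requantization factors. A minimal standalone sketch of that arithmetic follows (plain C++; the function name OutputShiftScale and its parameters are illustrative only, mirroring the Scale_in / Scale_weights / Scale_out attributes referenced in the diff below, and are not part of this commit):

#include <vector>

// Per-channel requantization factor for an INT8 conv:
//   scale_out / (scale_in * scale_weights[i]).
// A zero weight scale (an all-zero channel) falls back to scale_out,
// matching the guard in get_int8_scales() in the diff below.
std::vector<float> OutputShiftScale(float scale_in,
                                    const std::vector<float>& scale_weights,
                                    float scale_out) {
  std::vector<float> result(scale_weights.size());
  for (size_t i = 0; i < scale_weights.size(); ++i) {
    result[i] = (scale_weights[i] == 0.0f)
                    ? scale_out
                    : scale_out / (scale_in * scale_weights[i]);
  }
  return result;
}

For example, scale_in = 0.5f, scale_weights = {2.0f} and scale_out = 4.0f give a factor of 4.0 for that channel; when a residual input is fused, the analogous sum_scale is scale_out / scale_in_eltwise.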
--- paddle/fluid/operators/mkldnn/axpy_handler.cc | 1 - .../fluid/operators/mkldnn/conv_mkldnn_op.cc | 668 ++++++------------ paddle/fluid/platform/mkldnn_helper.h | 6 + paddle/fluid/platform/mkldnn_reuse.h | 568 +-------------- .../fluid/tests/unittests/test_conv2d_op.py | 3 +- .../unittests/test_conv2d_transpose_op.py | 2 +- 6 files changed, 251 insertions(+), 997 deletions(-) diff --git a/paddle/fluid/operators/mkldnn/axpy_handler.cc b/paddle/fluid/operators/mkldnn/axpy_handler.cc index ed265edf003e01..db1127b055c31e 100644 --- a/paddle/fluid/operators/mkldnn/axpy_handler.cc +++ b/paddle/fluid/operators/mkldnn/axpy_handler.cc @@ -23,7 +23,6 @@ limitations under the License. */ #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/mkldnn_helper.h" #include "paddle/fluid/platform/place.h" -#include "paddle/fluid/platform/profiler.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc index 1b69dd7ea00c7c..c663ba2f886809 100644 --- a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -12,27 +12,16 @@ See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/framework/data_layout_transform.h" +#include + #include "paddle/fluid/operators/conv_op.h" #include "paddle/fluid/platform/cpu_info.h" +#include "paddle/fluid/platform/mkldnn_helper.h" #include "paddle/fluid/platform/mkldnn_reuse.h" -namespace paddle { -namespace platform { -class MKLDNNDeviceContext; -} // namespace platform -} // namespace paddle - namespace paddle { namespace operators { - -using framework::DataLayout; -using mkldnn::memory; -using mkldnn::primitive; -using mkldnn::reorder; -using mkldnn::stream; -using platform::GetMKLDNNFormat; -using platform::to_void_cast; +namespace { inline MKLDNNMemoryFormat GetWeightsFormat(const MKLDNNMemoryFormat format, const int groups, @@ -78,7 +67,7 @@ class ConvMKLDNNHandlerT mkldnn::convolution_backward_data, mkldnn::convolution_backward_weights> { public: - ConvMKLDNNHandlerT(const paddle::framework::ExecutionContext& ctx, + ConvMKLDNNHandlerT(const framework::ExecutionContext& ctx, const platform::MKLDNNDeviceContext& dev_ctx, const mkldnn::engine mkldnn_engine, platform::Place cpu_place, const Tensor* input, @@ -92,19 +81,19 @@ class ConvMKLDNNHandlerT unique_name)) { if (!this->isCached()) { PADDLE_ENFORCE_EQ( - input->layout(), DataLayout::kMKLDNN, + input->layout(), framework::DataLayout::kMKLDNN, platform::errors::InvalidArgument( "The input tensor's layout should be %d, but got %d.", - DataLayout::kMKLDNN, input->layout())); + framework::DataLayout::kMKLDNN, input->layout())); PADDLE_ENFORCE_NE(input->format(), MKLDNNMemoryFormat::undef, platform::errors::InvalidArgument( "Wrong format set for Input tensor")); PADDLE_ENFORCE_EQ( - filter->layout(), DataLayout::kMKLDNN, + filter->layout(), framework::DataLayout::kMKLDNN, platform::errors::InvalidArgument( "The Filter tensor's layout should be %d, but got %d.", - DataLayout::kMKLDNN, filter->layout())); + framework::DataLayout::kMKLDNN, filter->layout())); PADDLE_ENFORCE_NE(filter->format(), 
MKLDNNMemoryFormat::undef, platform::errors::InvalidArgument( "Wrong format set for Filter tensor")); @@ -137,10 +126,10 @@ class ConvMKLDNNHandlerT if (bias) { PADDLE_ENFORCE_EQ( - bias->layout(), DataLayout::kMKLDNN, + bias->layout(), framework::DataLayout::kMKLDNN, platform::errors::InvalidArgument( "The Bias tensor's layout should be %d, but got %d.", - DataLayout::kMKLDNN, bias->layout())); + framework::DataLayout::kMKLDNN, bias->layout())); PADDLE_ENFORCE_NE(bias->format(), MKLDNNMemoryFormat::undef, platform::errors::InvalidArgument( "Got wrong format for Bias tensor.")); @@ -188,12 +177,12 @@ class ConvMKLDNNHandlerT std::transform(dilations.begin(), dilations.end(), dilations.begin(), [](int64_t i) { return i - 1; }); - const auto src_tz = paddle::framework::vectorize(input->dims()); + const auto src_tz = framework::vectorize(input->dims()); - auto weights_tz = paddle::framework::vectorize(filter->dims()); + auto weights_tz = framework::vectorize(filter->dims()); platform::GetGroupConvWeightsTz(weights_tz, groups); - const auto dst_tz = paddle::framework::vectorize(output->dims()); + const auto dst_tz = framework::vectorize(output->dims()); const mkldnn::memory::dims stride_dims = strides; const auto mkldnn_paddings = platform::ToMkldnnPadding(paddings); @@ -204,29 +193,48 @@ class ConvMKLDNNHandlerT * the memory format preferred for best performance */ auto chosen_memory_format = MKLDNNMemoryFormat::any; - auto data_type = mkldnn::memory::data_type::f32; if (ctx.Attr("mkldnn_data_type") == "bfloat16" || std::is_same::value) data_type = mkldnn::memory::data_type::bf16; - const auto src_md = - platform::MKLDNNMemDesc(src_tz, data_type, chosen_memory_format); - const auto weights_md = platform::MKLDNNMemDesc(weights_tz, data_type, - MKLDNNMemoryFormat::any); + mkldnn::memory::desc src_md, weights_md; + if (platform::is_int8()) { + src_md = platform::MKLDNNMemDesc( + src_tz, framework::ToMKLDNNDataType(input->type()), + chosen_memory_format); + weights_md = platform::MKLDNNMemDesc( + weights_tz, mkldnn::memory::data_type::s8, chosen_memory_format); + } else { + src_md = + platform::MKLDNNMemDesc(src_tz, data_type, chosen_memory_format); + weights_md = platform::MKLDNNMemDesc(weights_tz, data_type, + MKLDNNMemoryFormat::any); + } + const auto dst_md = platform::MKLDNNMemDesc( dst_tz, platform::MKLDNNGetDataType(), chosen_memory_format); - const auto fwd_prop_kind = is_test ? mkldnn::prop_kind::forward_inference : mkldnn::prop_kind::forward_training; + float sum_scale; + std::vector output_shift_scale; + std::tie(sum_scale, output_shift_scale) = get_int8_scales(ctx); + const mkldnn::primitive_attr conv_attr = CreatePostOps( - fuse_activation, fuse_alpha, fuse_beta, fuse_residual_conn); + fuse_activation, fuse_alpha, fuse_beta, fuse_residual_conn, + output_shift_scale, sum_scale); // for INT8 only! 
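+      // For INT8 these attributes carry the requantization factors computed in
+      // get_int8_scales(): output_shift_scale[i] = scale_out / (scale_in *
+      // scale_weights[i]) per output channel, and sum_scale rescales the fused
+      // residual input by scale_out / scale_in_eltwise.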
if (bias) { auto bias_tz = framework::vectorize(bias->dims()); - auto bias_md = - platform::MKLDNNMemDesc(bias_tz, data_type, MKLDNNMemoryFormat::x); + mkldnn::memory::desc bias_md; + if (platform::is_int8()) { + bias_md = platform::MKLDNNMemDesc( + bias_tz, mkldnn::memory::data_type::s32, MKLDNNMemoryFormat::x); + } else { + bias_md = platform::MKLDNNMemDesc(bias_tz, data_type, + MKLDNNMemoryFormat::x); + } this->AcquireForwardPrimitiveDescriptor( conv_attr, fwd_prop_kind, dnnl::algorithm::convolution_direct, @@ -255,28 +263,28 @@ class ConvMKLDNNHandlerT unique_name)) { if (!this->isBwdCached()) { PADDLE_ENFORCE_EQ( - in->layout(), DataLayout::kMKLDNN, + in->layout(), framework::DataLayout::kMKLDNN, platform::errors::InvalidArgument( "The input tensor's layout should be %d, but got %d.", - DataLayout::kMKLDNN, in->layout())); + framework::DataLayout::kMKLDNN, in->layout())); PADDLE_ENFORCE_NE(in->format(), MKLDNNMemoryFormat::undef, platform::errors::InvalidArgument( "Got wrong format for Input tensor.")); PADDLE_ENFORCE_EQ( - filter->layout(), DataLayout::kMKLDNN, + filter->layout(), framework::DataLayout::kMKLDNN, platform::errors::InvalidArgument( "The filter tensor's layout should be %d, but got %d.", - DataLayout::kMKLDNN, filter->layout())); + framework::DataLayout::kMKLDNN, filter->layout())); PADDLE_ENFORCE_NE(filter->format(), MKLDNNMemoryFormat::undef, platform::errors::InvalidArgument( "Got wrong format for Filter tensor.")); PADDLE_ENFORCE_EQ( - out_grad->layout(), DataLayout::kMKLDNN, + out_grad->layout(), framework::DataLayout::kMKLDNN, platform::errors::InvalidArgument( "The output_grad tensor's layout should be %d, but got %d.", - DataLayout::kMKLDNN, out_grad->layout())); + framework::DataLayout::kMKLDNN, out_grad->layout())); PADDLE_ENFORCE_NE(out_grad->format(), MKLDNNMemoryFormat::undef, platform::errors::InvalidArgument( "Wrong format set for output_grad tensor")); @@ -296,28 +304,25 @@ class ConvMKLDNNHandlerT std::vector dilations(begin(dilations_temp), end(dilations_temp)); - std::string padding_algorithm = - ctx.Attr("padding_algorithm"); - - int groups = ctx.Attr("groups"); - auto input_dims = in->dims(); auto data_dims = framework::slice_ddim(input_dims, 2, input_dims.size()); auto filter_dims = filter->dims(); auto filter_data_dims = framework::slice_ddim(filter_dims, 2, filter_dims.size()); - auto ksize = framework::vectorize(filter_data_dims); + std::string padding_algorithm = + ctx.Attr("padding_algorithm"); UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, data_dims, strides, ksize); auto src_tz = framework::vectorize(in->dims()); auto weights_tz = framework::vectorize(filter->dims()); + int groups = ctx.Attr("groups"); int g = std::max(groups, 1); platform::GetGroupConvWeightsTz(weights_tz, g); - auto dst_tz = paddle::framework::vectorize(out_grad->dims()); + auto dst_tz = framework::vectorize(out_grad->dims()); /* create memory descriptor for conv backward without specified format * ('any') which lets a primitive (conv backward in this case) choose @@ -349,8 +354,14 @@ class ConvMKLDNNHandlerT mkldnn::primitive_attr conv_attr; if (bias) { auto bias_tz = framework::vectorize(bias->dims()); - auto bias_md = platform::MKLDNNMemDesc( - bias_tz, mkldnn::memory::data_type::f32, MKLDNNMemoryFormat::x); + mkldnn::memory::desc bias_md; + if (platform::is_int8()) { + bias_md = platform::MKLDNNMemDesc( + bias_tz, mkldnn::memory::data_type::s32, MKLDNNMemoryFormat::x); + } else { + bias_md = platform::MKLDNNMemDesc( + bias_tz, 
mkldnn::memory::data_type::f32, MKLDNNMemoryFormat::x); + } this->AcquireForwardPrimitiveDescriptor( conv_attr, mkldnn::prop_kind::forward_training, @@ -377,6 +388,71 @@ class ConvMKLDNNHandlerT } } + std::tuple> get_int8_scales( + const framework::ExecutionContext& ctx) const { + const auto* filter = ctx.Input("Filter"); + const auto& weights_tz = framework::vectorize(filter->dims()); + + const bool& force_fp32_output = ctx.Attr("force_fp32_output"); + const bool& fuse_residual_conn = ctx.Attr("fuse_residual_connection"); + const int groups = std::max(ctx.Attr("groups"), 1); + + const auto& scale_in_data = ctx.Attr("Scale_in"); + const auto& scale_in_eltwise_data = ctx.Attr("Scale_in_eltwise"); + auto scale_weights_data = ctx.Attr>("Scale_weights"); + bool is_multi_channel = scale_weights_data.size() > 1; + auto scale_out_data = + force_fp32_output ? 1.0f : ctx.Attr("Scale_out"); + float sum_scale = + fuse_residual_conn ? scale_out_data / scale_in_eltwise_data : 1.0f; + int count = + is_multi_channel + ? (groups > 1 ? (weights_tz)[1] * (weights_tz)[0] : (weights_tz)[0]) + : 1; + std::vector output_shift_scale(count); + +#pragma omp parallel for if (count > 50) + for (int i = 0; i < count; i++) { + if (scale_weights_data[i] == 0.0) + // weights data will contain 0 in some models, then weights + // scale couldn't be calculated + output_shift_scale[i] = scale_out_data; + else + output_shift_scale[i] = + static_cast(static_cast(scale_out_data) / + (static_cast(scale_in_data) * + static_cast(scale_weights_data[i]))); + } + + return std::make_tuple(sum_scale, output_shift_scale); + } + + std::tuple> get_int8_bias_scales( + const framework::ExecutionContext& ctx) const { + const auto* filter = ctx.Input("Filter"); + const auto& weights_tz = framework::vectorize(filter->dims()); + const int groups = std::max(ctx.Attr("groups"), 1); + + const auto& scale_weights_data = + ctx.Attr>("Scale_weights"); + const auto& scale_in_data = ctx.Attr("Scale_in"); + + bool is_multi_channel = scale_weights_data.size() > 1; + int mask_reorder = is_multi_channel ? 1 << 0 : 1; + int count = + is_multi_channel + ? (groups > 1 ? 
(weights_tz)[1] * (weights_tz)[0] : (weights_tz)[0]) + : 1; + std::vector scale_bias_data(count); + +#pragma omp parallel for if (count > 50) + for (int i = 0; i < count; i++) { + scale_bias_data[i] = scale_in_data * scale_weights_data[i]; + } + + return std::make_tuple(mask_reorder, scale_bias_data); + } + mkldnn::primitive_attr CreatePostOps( std::string fuse_activation, float fuse_alpha, float fuse_beta, bool fuse_residual_conn, const std::vector output_shift_scale = {}, @@ -433,7 +509,7 @@ class ConvMKLDNNHandlerT return this->AcquireMemoryWithReorder( user_src_md, this->bwd_pd_->weights_desc(), - to_void_cast(filter_data), "@weights_mem_d_p", false); + platform::to_void_cast(filter_data), "@weights_mem_d_p", false); } std::shared_ptr AcquireSrcMemoryWithReorder( @@ -480,11 +556,11 @@ class ConvMKLDNNHandlerT framework::vectorize(in_mem->dims()), platform::MKLDNNGetDataType(), in_mem->format()); return this->AcquireMemoryWithReorder( - user_mem_md, mem_md, to_void_cast(in_mem_data), key_mem); + user_mem_md, mem_md, platform::to_void_cast(in_mem_data), key_mem); } else { const std::string target_key_suffix{key_mem_target}; const auto target_mem_p = this->AcquireMemory(target_key_suffix); - user_mem_p->set_data_handle(to_void_cast(in_mem_data)); + user_mem_p->set_data_handle(platform::to_void_cast(in_mem_data)); if (user_mem_p != target_mem_p) { this->AcquireReorder(user_mem_p, target_mem_p, key_mem); } @@ -494,7 +570,8 @@ class ConvMKLDNNHandlerT std::shared_ptr AcquireWeightsMemoryWithReorder( const framework::Tensor* filter, const int groups, const bool is_conv3d, - const bool is_test) { + const bool is_test, const std::vector& scale_data = {1.0f}, + int mask = 0) { // This is workaround to make execution faster, delete // if statement after including md inside Tensor auto weights_mem_p = this->AcquireMemory("@weights_mem_p_target"); @@ -511,12 +588,14 @@ class ConvMKLDNNHandlerT return this->AcquireMemoryWithReorder( user_src_md, this->fwd_pd_->weights_desc(), - to_void_cast(filter_data), "@weights_mem_p", is_test); + platform::to_void_cast(filter_data), "@weights_mem_p", is_test, {}, + scale_data, mask); } } std::shared_ptr AcquireBiasMemoryWithReorder( - const framework::Tensor* bias, const bool is_test) { + const framework::Tensor* bias, const bool is_test, + const std::vector& scale_data = {1.0f}, int mask = 0) { auto bias_mem_p = this->AcquireMemory("@bias_mem_p_target"); if (is_test && bias_mem_p) { return bias_mem_p; @@ -527,8 +606,9 @@ class ConvMKLDNNHandlerT MKLDNNMemoryFormat::x); return this->AcquireMemoryWithReorder( - user_bias_md, this->fwd_pd_->bias_desc(), to_void_cast(bias_data), - "@bias_mem_p", is_test); + user_bias_md, this->fwd_pd_->bias_desc(), + platform::to_void_cast(bias_data), "@bias_mem_p", is_test, {}, + scale_data, mask); } } @@ -536,8 +616,8 @@ class ConvMKLDNNHandlerT const framework::Tensor* residual_param) { void* residual_data = residual_param->type() == framework::DataTypeTrait::DataType() - ? to_void_cast(residual_param->data()) - : to_void_cast(residual_param->data()); + ? 
platform::to_void_cast(residual_param->data()) + : platform::to_void_cast(residual_param->data()); auto residual_mem_p = this->AcquireMemory("@user_residual_data_mem_p"); if (residual_mem_p) { residual_mem_p->set_data_handle(residual_data); @@ -572,12 +652,14 @@ class ConvMKLDNNHandlerT } }; +} // anonymous namespace + template -class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { +class ConvMKLDNNOpKernel : public framework::OpKernel { public: - void Compute(const paddle::framework::ExecutionContext& ctx) const override { + void Compute(const framework::ExecutionContext& ctx) const override { PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), true, - paddle::platform::errors::PreconditionNotMet( + platform::errors::PreconditionNotMet( "Operator DNNL Conv must use CPUPlace")); bool is_INT8 = std::is_same::value || std::is_same::value; @@ -607,9 +689,9 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { } template - void ComputeFP32(const paddle::framework::ExecutionContext& ctx) const { + void ComputeFP32(const framework::ExecutionContext& ctx) const { auto& dev_ctx = - ctx.template device_context(); + ctx.template device_context(); const auto& mkldnn_engine = dev_ctx.GetEngine(); const bool is_test = ctx.Attr("is_test"); @@ -656,407 +738,112 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { conv_p->execute(astream, args); astream.wait(); - output->set_layout(DataLayout::kMKLDNN); - output->set_format(GetMKLDNNFormat(*dst_memory_p)); + output->set_layout(framework::DataLayout::kMKLDNN); + output->set_format(platform::GetMKLDNNFormat(*dst_memory_p)); } template - void ComputeINT8(const paddle::framework::ExecutionContext& ctx) const { - const bool is_test = ctx.Attr("is_test"); - + void ComputeINT8(const framework::ExecutionContext& ctx) const { auto& dev_ctx = - ctx.template device_context(); + ctx.template device_context(); const auto& mkldnn_engine = dev_ctx.GetEngine(); - auto* input = ctx.Input("Input"); - auto* output = ctx.Output("Output"); - - PADDLE_ENFORCE_EQ(input->layout(), DataLayout::kMKLDNN, - platform::errors::InvalidArgument( - "The input tensor's layout should be %d, but got %d.", - DataLayout::kMKLDNN, input->layout())); - PADDLE_ENFORCE_NE(input->format(), MKLDNNMemoryFormat::undef, - platform::errors::InvalidArgument( - "Got wrong format for Input tensor.")); - - PADDLE_ENFORCE_GE(input->dims().size(), 4, - platform::errors::InvalidArgument( - "Input must be with 4 or 5 dimensions, i.e. NCHW or " - "NCDHW, but got dimension = %d .", - input->dims().size())); - PADDLE_ENFORCE_LE(input->dims().size(), 5, - platform::errors::InvalidArgument( - "Input must be with 4 or 5 dimensions, i.e. 
NCHW or " - "NCDHW, but got dimension = %d .", - input->dims().size())); + const std::string& fuse_activation = + ctx.Attr("fuse_activation"); + const bool& fuse_residual_conn = ctx.Attr("fuse_residual_connection"); + const bool& force_fp32_output = ctx.Attr("force_fp32_output"); + const bool is_conv3d = ctx.Attr>("strides").size() == 3U; - std::string fuse_activation = ctx.Attr("fuse_activation"); - bool fuse_residual_conn = ctx.Attr("fuse_residual_connection"); bool unsigned_output = (fuse_activation == "relu" || fuse_activation == "relu6"); - - const T* input_data = input->data(); - - auto src_tz = paddle::framework::vectorize(input->dims()); - - mkldnn::memory::data_type src_dt = - paddle::framework::ToMKLDNNDataType(input->type()); - - std::string key = - platform::CreateKey(dev_ctx, src_tz, src_dt, - ctx.InputName("Input") + ctx.InputName("Filter")); - bool need_s8_to_u8 = false; - std::shared_ptr conv_p; - std::shared_ptr src_memory_p; - std::shared_ptr user_src_memory_p; - std::shared_ptr dst_memory_p; - std::vector pipeline; - std::shared_ptr conv_pd; - std::shared_ptr handler; - - // This is workaround for hacky implementation - // of conv int8 mkl-dnn. Once conv fp32 and conv int8 - // are merged/unified, this will disappear - auto key_tid = platform::ExtendKeyWithThreadInfoIfNeeded(dev_ctx, key); - - const std::string key_conv_pd = key_tid + "@conv_pd"; - auto prim_key = key_tid + "@conv_p"; - auto dst_key = key_tid + "@dst_mem_p"; - auto src_key = key_tid + "@src_mem_p"; - auto weights_key = key_tid + "@weights_mem_p"; - auto bias_key = key_tid + "@bias_mem_p"; - auto user_src_key = key_tid + "@user_src_mem_p"; - auto user_residual_key = key_tid + "@user_residual_data_mem_p"; - auto src_reorder_key = key_tid + "@src_mem_preorder_p"; - auto residual_reorder_key = key_tid + "@residual_data_mem_preorder_p"; - - conv_pd = - std::static_pointer_cast( - dev_ctx.GetBlob(key_conv_pd)); - auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); + PADDLE_ENFORCE_NE( + is_conv3d, true, + platform::errors::Unimplemented( + "OneDNN int8 convolution does not support 3D inputs currently")); + PADDLE_ENFORCE_EQ( + fuse_residual_conn && force_fp32_output, false, + platform::errors::Unimplemented( + "residual fusion does not support force output with fp32")); - if (conv_pd == nullptr || !is_test) { - float fuse_alpha = ctx.Attr("fuse_alpha"); - float fuse_beta = ctx.Attr("fuse_beta"); - bool force_fp32_output = ctx.Attr("force_fp32_output"); + auto* input = ctx.Input("Input"); + auto* filter = ctx.Input("Filter"); + auto* bias = ctx.HasInput("Bias") ? ctx.Input("Bias") : nullptr; + auto* output = ctx.Output("Output"); - auto* filter = ctx.Input("Filter"); + ConvMKLDNNHandlerT handler( + ctx, dev_ctx, mkldnn_engine, ctx.GetPlace(), input, filter, bias, + output, ctx.InputName("Input") + ctx.InputName("Filter")); - PADDLE_ENFORCE_EQ( - filter->layout(), DataLayout::kMKLDNN, - platform::errors::InvalidArgument( - "The filter tensor's layout should be %d, but got %d.", - DataLayout::kMKLDNN, filter->layout())); - PADDLE_ENFORCE_NE(filter->format(), MKLDNNMemoryFormat::undef, - platform::errors::InvalidArgument( - "Got wrong format for Filter tensor.")); + auto src_memory_p = handler.AcquireSrcMemoryWithReorder(input); - PADDLE_ENFORCE_GE(filter->dims().size(), 4, - platform::errors::InvalidArgument( - "Filter must be with 4 or 5 dimensions, i.e. 
OIHW " - "or OIDHW, but got dimensions = %d .", - filter->dims().size())); - PADDLE_ENFORCE_LE(filter->dims().size(), 5, - platform::errors::InvalidArgument( - "Filter must be with 4 or 5 dimensions, i.e. OIHW " - "or OIDHW, but got dimensions = %d .", - filter->dims().size())); + const auto& scale_weights_data = + ctx.Attr>("Scale_weights"); + const bool is_multi_channel = scale_weights_data.size() > 1; + const int& groups = ctx.Attr("groups"); + const bool& is_test = ctx.Attr("is_test"); + int mask_reorder = + is_multi_channel ? ((groups != 1) ? (1 << 1) + (1 << 0) : 1 << 0) : 0; + auto weights_memory_p = handler.AcquireWeightsMemoryWithReorder( + filter, groups, false, is_test, scale_weights_data, mask_reorder); + std::shared_ptr dst_memory_p; + if (fuse_residual_conn) { + auto* residual_param = ctx.Input("ResidualData"); PADDLE_ENFORCE_EQ( - !fuse_residual_conn || !force_fp32_output, true, - platform::errors::Unimplemented( - "residual fusion does not support force output with fp32")); - - auto* bias = ctx.HasInput("Bias") ? ctx.Input("Bias") : nullptr; - - if (bias) { - PADDLE_ENFORCE_EQ( - bias->layout(), DataLayout::kMKLDNN, - platform::errors::InvalidArgument( - "The bias tensor's layout should be %d, but got %d.", - DataLayout::kMKLDNN, bias->layout())); - PADDLE_ENFORCE_NE(bias->format(), MKLDNNMemoryFormat::undef, - platform::errors::InvalidArgument( - "Got wrong format for Bias tensor.")); - - PADDLE_ENFORCE_EQ(bias->dims().size(), 1, - platform::errors::InvalidArgument( - "Bias must only have 1 dimension, i.e. X, but " - "got dimension = %d .", - bias->dims().size())); - } - - std::vector strides_temp = ctx.Attr>("strides"); - std::vector strides(begin(strides_temp), end(strides_temp)); - - std::vector paddings_temp = ctx.Attr>("paddings"); - std::vector paddings(begin(paddings_temp), end(paddings_temp)); - - std::vector dilations_temp = ctx.Attr>("dilations"); - std::vector dilations(begin(dilations_temp), - end(dilations_temp)); - - std::string padding_algorithm = - ctx.Attr("padding_algorithm"); - - bool is_conv3d = strides.size() == 3U; - - PADDLE_ENFORCE_NE(is_conv3d, true, - platform::errors::Unimplemented( - "int8 does not support conv3d currently")); - - auto input_dims = input->dims(); - auto data_dims = framework::slice_ddim(input_dims, 2, input_dims.size()); - auto filter_dims = filter->dims(); - auto filter_data_dims = - framework::slice_ddim(filter_dims, 2, filter_dims.size()); - - auto ksize = framework::vectorize(filter_data_dims); - - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - data_dims, strides, ksize); - - int groups = ctx.Attr("groups"); - auto weights_tz = paddle::framework::vectorize(filter->dims()); - int g = std::max(groups, 1); - - platform::GetGroupConvWeightsTz(weights_tz, g); - auto dst_tz = paddle::framework::vectorize(output->dims()); - - std::transform(dilations.begin(), dilations.end(), dilations.begin(), - [](int64_t i) { return i - 1; }); - - const K* filter_data = filter->data(); - auto scale_in_data = ctx.Attr("Scale_in"); - auto scale_in_eltwise_data = ctx.Attr("Scale_in_eltwise"); - auto scale_weights_data = ctx.Attr>("Scale_weights"); - auto scale_out_data = - force_fp32_output ? 1.0f : ctx.Attr("Scale_out"); - float sum_scale = - fuse_residual_conn ? scale_out_data / scale_in_eltwise_data : 1.0f; - - bool is_multi_channel = scale_weights_data.size() > 1; - - int count = is_multi_channel ? (g > 1 ? 
(weights_tz)[1] * (weights_tz)[0] - : (weights_tz)[0]) - : 1; - std::vector output_shift_scale(count); -#pragma omp parallel for if (count > 1) - for (int i = 0; i < count; i++) { - if (scale_weights_data[i] == 0.0) - output_shift_scale[i] = - scale_out_data; // weights data will contain 0 - // in some models, then weights - // scale couldn't be calculated - else - output_shift_scale[i] = - static_cast(static_cast(scale_out_data) / - (static_cast(scale_in_data) * - static_cast(scale_weights_data[i]))); - } - - auto user_src_md = - platform::MKLDNNMemDesc({src_tz}, src_dt, input->format()); - auto user_weights_md = platform::MKLDNNMemDesc( - {weights_tz}, platform::MKLDNNGetDataType(), - ((g) == 1) ? MKLDNNMemoryFormat::oihw : MKLDNNMemoryFormat::goihw); - - /* create memory descriptor for convolution without specified format - * ('any') which lets a primitive (convolution in this case) choose - * the memory format preferred for best performance - */ - auto chosen_memory_format = MKLDNNMemoryFormat::any; - - std::vector bias_tz; - - auto src_md = - platform::MKLDNNMemDesc(src_tz, src_dt, chosen_memory_format); - auto weights_md = platform::MKLDNNMemDesc( - weights_tz, memory::data_type::s8, chosen_memory_format); - auto dst_md = platform::MKLDNNMemDesc( - dst_tz, platform::MKLDNNGetDataType(), chosen_memory_format); - - handler.reset( - new platform::ConvMKLDNNHandler(dev_ctx, mkldnn_engine, key)); - // create a conv primitive descriptor and save it for usage in backward - auto propagation = is_test ? mkldnn::prop_kind::forward_scoring - : mkldnn::prop_kind::forward_training; - - if (bias) { - bias_tz = paddle::framework::vectorize(bias->dims()); - auto bias_md = platform::MKLDNNMemDesc(bias_tz, memory::data_type::s32, - MKLDNNMemoryFormat::x); - conv_pd = handler->AcquireConvolutionPrimitiveDescriptor( - src_md, weights_md, bias_md, dst_md, strides, dilations, paddings, - mkldnn_engine, fuse_activation, fuse_alpha, fuse_beta, - fuse_residual_conn, propagation, output_shift_scale, sum_scale); - } else { - conv_pd = handler->AcquireConvolutionPrimitiveDescriptor( - src_md, weights_md, paddle::none, dst_md, strides, dilations, - paddings, mkldnn_engine, fuse_activation, fuse_alpha, fuse_beta, - fuse_residual_conn, propagation, output_shift_scale, sum_scale); - } - - // create mkldnn memory from input tensors (data/weights) - user_src_memory_p = - handler->AcquireSrcMemory(user_src_md, to_void_cast(input_data)); - auto user_weights_memory_p = handler->AcquireWeightsMemory( - user_weights_md, to_void_cast(filter_data)); - - // create reorder primitive if the input format is not the preferred one - src_memory_p = - handler->AcquireSrcMemoryFromPrimitive(user_src_memory_p, pipeline); - - std::shared_ptr weights_memory_p; - int mask_reorder = - is_multi_channel ? ((g != 1) ? 
(1 << 1) + (1 << 0) : 1 << 0) : 0; - weights_memory_p = handler->AcquireWeightsMemoryFromPrimitive( - user_weights_memory_p, pipeline, is_test, true, scale_weights_data, - mask_reorder); - - if (fuse_residual_conn) { - auto residual_param = ctx.Input("ResidualData"); - PADDLE_ENFORCE_EQ( - output->dims(), residual_param->dims(), - platform::errors::InvalidArgument( - "Output and elementwise parameter need to have the " - "same dimension sizes, but got output's dimension = %d" - " and residual param's dimension =%d .", - output->dims().size(), residual_param->dims().size())); - auto residual_dt = - paddle::framework::ToMKLDNNDataType(residual_param->type()); - if (residual_param->format() != handler->GetDstFormat()) { - auto residual_data_tz = - paddle::framework::vectorize(residual_param->dims()); - auto user_residual_md = platform::MKLDNNMemDesc( - residual_data_tz, residual_dt, residual_param->format()); - dst_memory_p = platform::SetDstMemory( - ctx, output, residual_param, user_residual_md, handler, - &pipeline); - } else { - output->ShareDataWith(*residual_param); - dst_memory_p = platform::SetDstMemory(ctx, output, handler); - } - need_s8_to_u8 = - (platform::MKLDNNGetDataType() == memory::data_type::s8) && - unsigned_output; - } else { - dst_memory_p = platform::SetDstMemory(ctx, output, handler); - } - - // create convolution op primitive - conv_p = handler->AcquireConvolution(); - if (bias) { - const K* bias_data = bias->data(); - auto user_bias_md = platform::MKLDNNMemDesc( - {bias_tz}, platform::MKLDNNGetDataType(), MKLDNNMemoryFormat::x); - auto user_bias_memory_p = handler->AcquireBiasMemory( - user_bias_md, to_void_cast(bias_data)); - std::shared_ptr bias_memory_p; - int mask_reorder = is_multi_channel ? 1 << 0 : 1; - int count = - is_multi_channel - ? (g > 1 ? 
(weights_tz)[1] * (weights_tz)[0] : (weights_tz)[0]) - : 1; - std::vector scale_bias_data(count); -#pragma omp parallel for if (count > 1) - for (int i = 0; i < count; i++) { - scale_bias_data[i] = scale_in_data * scale_weights_data[i]; - } - bias_memory_p = handler->AcquireBiasMemoryFromPrimitive( - user_bias_memory_p, pipeline, is_test, true, scale_bias_data, - mask_reorder); - conv_p->execute(astream, {{MKLDNN_ARG_SRC, *src_memory_p}, - {MKLDNN_ARG_WEIGHTS, *weights_memory_p}, - {MKLDNN_ARG_BIAS, *bias_memory_p}, - {MKLDNN_ARG_DST, *dst_memory_p}}); - } else { - conv_p->execute(astream, {{MKLDNN_ARG_SRC, *src_memory_p}, - {MKLDNN_ARG_WEIGHTS, *weights_memory_p}, - {MKLDNN_ARG_DST, *dst_memory_p}}); - } - } else { - auto src_memory_reorder_p = std::static_pointer_cast( - dev_ctx.GetBlob(src_reorder_key)); - src_memory_p = - std::static_pointer_cast(dev_ctx.GetBlob(src_key)); - if (src_memory_reorder_p) { - user_src_memory_p = std::static_pointer_cast( - dev_ctx.GetBlob(user_src_key)); - user_src_memory_p->set_data_handle(to_void_cast(input_data)); - { - platform::RecordEvent record_reorder("int_reorder", - platform::EventRole::kUniqueOp); - src_memory_reorder_p->execute(astream, *user_src_memory_p, - *src_memory_p); - astream.wait(); - } - } else if (src_memory_p) { - src_memory_p->set_data_handle(to_void_cast(input_data)); - } - auto weights_memory_p = std::static_pointer_cast( - dev_ctx.GetBlob(weights_key)); + output->dims(), residual_param->dims(), + platform::errors::InvalidArgument( + "Output and elementwise parameter need to have the " + "same dimension sizes, but got output's dimension = %d" + " and residual param's dimension =%d .", + output->dims().size(), residual_param->dims().size())); dst_memory_p = - std::static_pointer_cast(dev_ctx.GetBlob(dst_key)); - conv_p = std::static_pointer_cast( - dev_ctx.GetBlob(prim_key)); - handler.reset(new platform::ConvMKLDNNHandler(conv_pd, dev_ctx, - mkldnn_engine, key)); - - if (fuse_residual_conn) { - auto residual_param = ctx.Input("ResidualData"); - output->ShareDataWith(*residual_param); - need_s8_to_u8 = - (platform::MKLDNNGetDataType() == memory::data_type::s8) && - unsigned_output; - } - platform::SetDstMemoryHandler(ctx, output, handler, dst_memory_p); + handler.AcquireDstMemoryWithResidual(output, residual_param); + need_s8_to_u8 = (platform::MKLDNNGetDataType() == + mkldnn::memory::data_type::s8) && + unsigned_output; + } else { + dst_memory_p = handler.template AcquireDstMemory(output); + } - auto residual_reorder_p = std::static_pointer_cast( - dev_ctx.GetBlob(residual_reorder_key)); - if (residual_reorder_p) { - auto user_residual_data_p = std::static_pointer_cast( - dev_ctx.GetBlob(user_residual_key)); - { - platform::RecordEvent record_reorder("int_reorder", - platform::EventRole::kUniqueOp); - residual_reorder_p->execute(astream, *user_residual_data_p, - *dst_memory_p); - astream.wait(); - } - } + auto conv_p = handler.AcquireForwardPrimitive(); + + std::unordered_map args = { + {MKLDNN_ARG_SRC, *src_memory_p}, + {MKLDNN_ARG_WEIGHTS, *weights_memory_p}, + {MKLDNN_ARG_DST, *dst_memory_p}}; - auto bias_memory_p = - std::static_pointer_cast(dev_ctx.GetBlob(bias_key)); + if (bias) { + float mask_reorder; + std::vector scale_bias_data; + std::tie(mask_reorder, scale_bias_data) = + handler.get_int8_bias_scales(ctx); - if (bias_memory_p) { - conv_p->execute(astream, {{MKLDNN_ARG_SRC, *src_memory_p}, - {MKLDNN_ARG_WEIGHTS, *weights_memory_p}, - {MKLDNN_ARG_BIAS, *bias_memory_p}, - {MKLDNN_ARG_DST, *dst_memory_p}}); - } else { - 
conv_p->execute(astream, {{MKLDNN_ARG_SRC, *src_memory_p}, - {MKLDNN_ARG_WEIGHTS, *weights_memory_p}, - {MKLDNN_ARG_DST, *dst_memory_p}}); - } + auto bias_memory_p = handler.AcquireBiasMemoryWithReorder( + bias, is_test, scale_bias_data, mask_reorder); + args.insert({MKLDNN_ARG_BIAS, *bias_memory_p}); } + + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); + conv_p->execute(astream, args); astream.wait(); + if (need_s8_to_u8) { output->mutable_data(ctx.GetPlace()); } - output->set_layout(DataLayout::kMKLDNN); - output->set_format(GetMKLDNNFormat(*dst_memory_p)); + + output->set_layout(framework::DataLayout::kMKLDNN); + output->set_format(platform::GetMKLDNNFormat(*dst_memory_p)); } }; template -class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { +class ConvMKLDNNGradOpKernel : public framework::OpKernel { public: - void Compute(const paddle::framework::ExecutionContext& ctx) const override { + void Compute(const framework::ExecutionContext& ctx) const override { PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), true, - paddle::platform::errors::PreconditionNotMet( + platform::errors::PreconditionNotMet( "Operator DNNL ConvGrad must use CPUPlace")); auto& dev_ctx = ctx.template device_context(); @@ -1105,18 +892,19 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { {MKLDNN_ARG_DIFF_WEIGHTS, *diff_weights_memory_p}}); astream.wait(); - filter_grad->set_layout(DataLayout::kMKLDNN); + filter_grad->set_layout(framework::DataLayout::kMKLDNN); // in OneDNN groups in convolution are treated as separate dimension // which is not the case in paddlepaddle - auto filter_fmt = GetMKLDNNFormat(*diff_weights_memory_p); + auto filter_fmt = platform::GetMKLDNNFormat(*diff_weights_memory_p); // For convolution with groups convert from blocked to NCHW // otherwise there will be problems in next operators working on this data if (g > 1) { - memory::data_type in_type = framework::ToMKLDNNDataType(filter->type()); + mkldnn::memory::data_type in_type = + framework::ToMKLDNNDataType(filter->type()); // for 3d conv with groups (six dimensional data reorder to goidhw) // for 2d conv with groups (five dimensional data reorder to goihw) - // auto weights_tz = paddle::framework::vectorize(filter->dims()); + // auto weights_tz = framework::vectorize(filter->dims()); auto weights_tz = diff_weights_memory_p->get_desc().dims(); mkldnn::memory::format_tag out_format = @@ -1168,8 +956,8 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { {MKLDNN_ARG_DIFF_SRC, *diff_src_memory_p}}); astream.wait(); - input_grad->set_layout(DataLayout::kMKLDNN); - input_grad->set_format(GetMKLDNNFormat(*diff_src_memory_p)); + input_grad->set_layout(framework::DataLayout::kMKLDNN); + input_grad->set_format(platform::GetMKLDNNFormat(*diff_src_memory_p)); } } }; diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h index f14f92cb51fdb1..37fa58e423db77 100644 --- a/paddle/fluid/platform/mkldnn_helper.h +++ b/paddle/fluid/platform/mkldnn_helper.h @@ -531,7 +531,13 @@ inline bool HasOpBFLOAT16DataType(const paddle::framework::OpDesc* op) { inline bool HasOpFLOAT32DataType(const paddle::framework::OpDesc* op) { return op->GetAttrIfExists("mkldnn_data_type") == "float32"; } + enum class RNNReorderType { PP_NTC, PP_TNC, NTC_PP, TNC_PP }; +template +bool constexpr is_int8() { + return std::is_same::value || std::is_same::value; +} + } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/mkldnn_reuse.h 
b/paddle/fluid/platform/mkldnn_reuse.h index 1aa8c0cdb57f97..084b47bb3c7a3b 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -527,7 +527,8 @@ class MKLDNNHandlerT { const mkldnn::memory::desc& user_md, const mkldnn::memory::desc& target_md, void* ptr, const std::string& suffix, bool is_persistent = false, - std::function(const F*)> custom_reorder_func = {}) { + std::function(const F*)> custom_reorder_func = {}, + const std::vector& scale_data = {1.0f}, int mask = 0) { const auto target_key = key_ + suffix + "_target"; const auto key_reorder_p = key_ + suffix + "reorder_p"; const auto user_key = key_ + suffix + "_user"; @@ -546,8 +547,17 @@ class MKLDNNHandlerT { std::make_shared(user_md, engine_, ptr); if (user_md != target_md) { target_memory_p = std::make_shared(target_md, engine_); - auto reorder_p = - std::make_shared(*user_memory_p, *target_memory_p); + dnnl::reorder::primitive_desc reorder_pdesc; + if (is_int8()) { + dnnl::primitive_attr attr; + attr.set_output_scales(mask, scale_data); + reorder_pdesc = dnnl::reorder::primitive_desc(*user_memory_p, + *target_memory_p, attr); + } else { + reorder_pdesc = + dnnl::reorder::primitive_desc(*user_memory_p, *target_memory_p); + } + auto reorder_p = std::make_shared(reorder_pdesc); dev_ctx_.SetBlob(key_reorder_p, reorder_p); auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); @@ -597,201 +607,6 @@ class MKLDNNHandlerT { std::shared_ptr bwd_w_pd_; }; -// TODO(grygielski) this class will be deleted later. -class MKLDNNHandler { - public: - MKLDNNHandler(const MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine, - const std::string& base_key) - : dev_ctx_(dev_ctx), - engine_(engine), - key_(platform::ExtendKeyWithThreadInfoIfNeeded(dev_ctx, base_key)) { - platform::MKLDNNDeviceContext::tls().log_lib_version(); - } - - std::shared_ptr AcquireSrcMemory( - const mkldnn::memory::desc& md, void* ptr) { - return this->AcquireMemory(md, ptr, "@user_src_mem_p"); - } - - std::shared_ptr AcquireDstMemory( - const mkldnn::memory::desc& md, void* ptr) { - return this->AcquireMemory(md, ptr, "@user_dst_mem_p"); - } - - std::shared_ptr AcquireDiffSrcMemory( - const mkldnn::memory::desc& md, void* ptr) { - return this->AcquireMemory(md, ptr, "@user_diff_src_mem_p"); - } - - std::shared_ptr AcquireDiffDstMemory( - const mkldnn::memory::desc& md, void* ptr) { - return this->AcquireMemory(md, ptr, "@user_diff_dst_mem_p"); - } - - std::shared_ptr AcquireMemoryFromPrimitive( - mkldnn::memory::desc md, void* ptr, const std::string& suffix) { - auto local_key = key_ + suffix; - auto mem_p = - std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); - if (mem_p == nullptr) { - mem_p = std::make_shared(md, engine_, ptr); - dev_ctx_.SetBlob(local_key, mem_p); - } else { - mem_p->set_data_handle(ptr); - } - return mem_p; - } - - std::shared_ptr AcquireMemoryFromPrimitive( - mkldnn::memory::desc md, const std::string& suffix) { - const auto local_key = key_ + suffix; - auto mem_p = - std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); - if (mem_p == nullptr) { - mem_p = std::make_shared(md, engine_); - dev_ctx_.SetBlob(local_key, mem_p); - } - return mem_p; - } - - // This incarnation of AcquireMemory can call user function eg. 
custom reorder - // or preprocessing routine if needed - std::shared_ptr AcquireMemory( - const mkldnn::memory::desc& md, void* ptr, const std::string& suffix, - user_function custom_func = {}) { - /*Generate key*/ - auto local_key = key_ + suffix; - auto mem_p = - std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); - if (mem_p == nullptr) { - // Call custom reorder/preprocessing func if available - if (custom_func) { - auto reordered_data = custom_func(reinterpret_cast(ptr)); - dev_ctx_.SetBlob(local_key + "-custom_reorder", reordered_data); - ptr = reinterpret_cast(reordered_data.get()); - } - - mem_p = std::make_shared(md, engine_, ptr); - dev_ctx_.SetBlob(local_key, mem_p); - } else { - mem_p->set_data_handle(ptr); - } - return mem_p; - } - - std::shared_ptr AcquireMemory( - const std::vector& dims, const mkldnn::memory::data_type dtype, - const MKLDNNMemoryFormat& fmt, void* ptr, const std::string& suffix) { - /*Generate key*/ - auto local_key = key_ + suffix; - auto mem_p = - std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); - if (mem_p == nullptr) { - auto md = mkldnn::memory::desc(dims, dtype, fmt); - - mem_p = std::make_shared(md, engine_, ptr); - dev_ctx_.SetBlob(local_key, mem_p); - } else { - mem_p->set_data_handle(ptr); - } - return mem_p; - } - - std::shared_ptr AcquireMemory( - const std::shared_ptr& user_memory_p, - const std::shared_ptr& target_memory_p, - const std::string& suffix, - std::vector& pipeline) { // NOLINT - auto local_key = key_ + suffix; - auto key_reorder_p = key_ + suffix + "reorder_p"; - - auto stored_reorder_p = std::static_pointer_cast( - dev_ctx_.GetBlob(key_reorder_p)); - - if (stored_reorder_p) { - pipeline.push_back(*stored_reorder_p); - } else { - auto reorder_p = - std::make_shared(*user_memory_p, *target_memory_p); - dev_ctx_.SetBlob(key_reorder_p, reorder_p); - auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - platform::RecordEvent record_reorder("int_reorder", - platform::EventRole::kUniqueOp); - reorder_p->execute(astream, {{MKLDNN_ARG_FROM, *user_memory_p}, - {MKLDNN_ARG_TO, *target_memory_p}}); - astream.wait(); - } - - return target_memory_p; - } - - std::shared_ptr AcquireMemory( - mkldnn::memory::desc& md, // NOLINT - mkldnn::memory::desc& user_md, // NOLINT - const std::shared_ptr user_memory_p, - const std::string& suffix, - std::vector& pipeline, // NOLINT - bool is_persistent = false, bool is_INT8 = false, - std::vector scale_data = {1.0f}, int mask = 0) { - // create reorder primitive if the input format is not the preferred one - auto local_key = key_ + suffix; - auto key_reorder_p = key_ + suffix + "reorder_p"; - - auto target_memory_p = - std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); - - auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - - if (target_memory_p == nullptr) { - target_memory_p = user_memory_p; - if (md != user_md) { - target_memory_p = std::make_shared(md, engine_); - std::shared_ptr reorder_pd; - if (is_INT8) { - mkldnn::primitive_attr - attri; // attribute for int8 weights and bias data reorder. 
- attri.set_output_scales(mask, scale_data); - - reorder_pd = std::shared_ptr( - new mkldnn::reorder::primitive_desc(*user_memory_p, - *target_memory_p, attri)); - } else { - reorder_pd = std::shared_ptr( - new mkldnn::reorder::primitive_desc(*user_memory_p, - *target_memory_p)); - } - auto reorder_p = - std::shared_ptr(new mkldnn::reorder(*reorder_pd)); - dev_ctx_.SetBlob(key_reorder_p, reorder_p); - - platform::RecordEvent record_reorder("int_reorder", - platform::EventRole::kUniqueOp); - reorder_p->execute(astream, {{MKLDNN_ARG_FROM, *user_memory_p}, - {MKLDNN_ARG_TO, *target_memory_p}}); - astream.wait(); - } - dev_ctx_.SetBlob(local_key, target_memory_p); - } else if (!is_persistent) { - // Make reorder if needed - auto reorder_p = std::static_pointer_cast( - dev_ctx_.GetBlob(key_reorder_p)); - if (reorder_p != nullptr) { - platform::RecordEvent record_reorder("int_reorder", - platform::EventRole::kUniqueOp); - reorder_p->execute(astream, {{MKLDNN_ARG_FROM, *user_memory_p}, - {MKLDNN_ARG_TO, *target_memory_p}}); - astream.wait(); - } - } - return target_memory_p; - } - - protected: - const MKLDNNDeviceContext& dev_ctx_; - mkldnn::engine engine_; - std::string key_; -}; - template class BinaryMKLDNNHandler : public platform::MKLDNNHandlerNoCachingT { @@ -1143,362 +958,6 @@ class ReorderMKLDNNHandler { mkldnn::engine engine_; }; -template -struct convolutional_algorithm; - -template <> -struct convolutional_algorithm { - static constexpr mkldnn::algorithm T = mkldnn::algorithm::convolution_direct; -}; - -template <> -struct convolutional_algorithm { - static constexpr mkldnn::algorithm T = - mkldnn::algorithm::deconvolution_direct; -}; - -template -class ConvMKLDNNTemplateHandler : public MKLDNNHandler { - public: - ConvMKLDNNTemplateHandler(const platform::MKLDNNDeviceContext& dev_ctx, - mkldnn::engine engine, const std::string& base_key) - : platform::MKLDNNHandler(dev_ctx, engine, base_key) {} - - // TODO(jczaja): remove after conv int8 is adapted - ConvMKLDNNTemplateHandler( - std::shared_ptr conv_pd, - const platform::MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine, - const std::string& base_key) - : platform::MKLDNNHandler(dev_ctx, engine, base_key) { - conv_pd_ = conv_pd; - } - - ConvMKLDNNTemplateHandler( - std::shared_ptr conv_pd, - std::shared_ptr - conv_bwd_data_pd, - std::shared_ptr - conv_bwd_weights_pd, - const platform::MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine, - const std::string& base_key) - : platform::MKLDNNHandler(dev_ctx, engine, base_key), - conv_pd_(conv_pd), - conv_bwd_weights_pd_(conv_bwd_weights_pd), - conv_bwd_data_pd_(conv_bwd_data_pd) { - // If we are in Grad operatgor then update a key with BWD suffix to - // distinguish from FWD memory primitives - key_ += "-BWD"; - } - - size_t GetDstMemorySize() const { return conv_pd_->dst_desc().get_size(); } - - MKLDNNMemoryFormat GetDstFormat() const { - return paddle::platform::GetMKLDNNFormat(conv_pd_->dst_desc()); - } - - size_t GetDiffWeightsMemorySize() const { - return conv_bwd_weights_pd_->diff_weights_desc().get_size(); - } - - size_t GetDiffSourceMemorySize() const { - return conv_bwd_data_pd_->diff_src_desc().get_size(); - } - - std::shared_ptr AcquireSrcMemoryFromWeightsPrimitive( - const std::shared_ptr user_memory_p, - std::vector& pipeline) { // NOLINT - auto src_pd = conv_bwd_weights_pd_->src_desc(); - auto user_pd = user_memory_p->get_desc(); - return this->AcquireMemory(src_pd, user_pd, user_memory_p, - "@weights-src_mem_p", pipeline); - } - - std::shared_ptr 
AcquireDiffDstMemoryFromWeightsPrimitive( - const std::shared_ptr user_memory_p, - std::vector& pipeline) { // NOLINT - auto diff_dst_pd = conv_bwd_weights_pd_->diff_dst_desc(); - auto user_pd = user_memory_p->get_desc(); - return this->AcquireMemory(diff_dst_pd, user_pd, user_memory_p, - "@weights-diff_dst_mem_p", pipeline); - } - - std::shared_ptr AcquireDiffWeightsMemoryFromWeightsPrimitive( - void* ptr) { - return this->AcquireMemoryFromPrimitive( - conv_bwd_weights_pd_->diff_weights_desc(), ptr, "@diff_weights_mem_p"); - } - - std::shared_ptr AcquireDiffWeightsMemoryFromWeightsPrimitive( - void) { - return this->AcquireMemoryFromPrimitive( - conv_bwd_weights_pd_->diff_weights_desc(), "@diff_weights_mem_p"); - } - - std::shared_ptr AcquireDiffDstMemoryFromDataPrimitive( - const std::shared_ptr user_memory_p, - std::vector& pipeline) { // NOLINT - auto diff_dst_pd = conv_bwd_data_pd_->diff_dst_desc(); - auto user_pd = user_memory_p->get_desc(); - return this->AcquireMemory(diff_dst_pd, user_pd, user_memory_p, - "@data-diff_dst_mem_p", pipeline); - } - - std::shared_ptr AcquireWeightsMemoryFromDataPrimitive( - const std::shared_ptr user_weights_memory_p, - std::vector& pipeline) { // NOLINT - auto weights_pd = conv_bwd_data_pd_->weights_desc(); - auto user_pd = user_weights_memory_p->get_desc(); - return this->AcquireMemory(weights_pd, user_pd, user_weights_memory_p, - "@data-weights_mem_p", pipeline); - } - - std::shared_ptr AcquireResidualDataMemory( - const mkldnn::memory::desc& md, void* ptr) { - return this->AcquireMemory(md, ptr, "@user_residual_data_mem_p"); - } - - std::shared_ptr AcquireDstMemoryFromResidualDataMemory( - const std::shared_ptr& user_residual_memory_p, - void* dst_ptr, - std::vector& pipeline) { // NOLINT - return this->AcquireMemory(user_residual_memory_p, - this->AcquireDstMemoryFromPrimitive(dst_ptr), - "@residual_data_mem_p", pipeline); - } - - std::shared_ptr AcquireDiffSrcMemoryFromDataPrimitive( - void* ptr) { - return this->AcquireMemoryFromPrimitive(conv_bwd_data_pd_->diff_src_desc(), - ptr, "@diff_src_mem_p"); - } - - std::shared_ptr AcquireDstMemoryFromPrimitive(void* ptr) { - return this->AcquireMemoryFromPrimitive(conv_pd_->dst_desc(), ptr, - "@dst_mem_p"); - } - - std::shared_ptr AcquireSrcMemoryFromPrimitive( - const std::shared_ptr user_memory_p, - std::vector& pipeline) { // NOLINT - auto src_pd = conv_pd_->src_desc(); - auto user_pd = user_memory_p->get_desc(); - return this->AcquireMemory(src_pd, user_pd, user_memory_p, "@src_mem_p", - pipeline); - } - - std::shared_ptr AcquireWeightsMemory( - const mkldnn::memory::desc& md, void* ptr, - user_function custom_func = {}) { - return this->AcquireMemory(md, ptr, "@user_weights_mem_p", custom_func); - } - - std::shared_ptr AcquireBiasMemory( - const mkldnn::memory::desc& md, void* ptr) { - return this->AcquireMemory(md, ptr, "@user_bias_mem_p"); - } - - std::shared_ptr AcquireWeightsMemoryFromPrimitive( - const std::shared_ptr user_weights_memory_p, - std::vector& pipeline, // NOLINT - bool is_persistent = false, bool is_INT8 = false, - std::vector scale_data = {1.0f}, int mask = 0) { - auto user_weights_pd = user_weights_memory_p->get_desc(); - auto weights_pd = conv_pd_->weights_desc(); - return this->AcquireMemory( - weights_pd, user_weights_pd, user_weights_memory_p, "@weights_mem_p", - pipeline, is_persistent, is_INT8, scale_data, mask); - } - - std::shared_ptr AcquireBiasMemoryFromPrimitive( - const std::shared_ptr user_bias_memory_p, - std::vector& pipeline, // NOLINT - bool is_persistent = 
false, bool is_INT8 = false, - std::vector scale_data = {1.0f}, - int mask = 0) { // NOLINT - auto user_bias_pd = user_bias_memory_p->get_desc(); - auto bias_pd = conv_pd_->bias_desc(); - return this->AcquireMemory(bias_pd, user_bias_pd, user_bias_memory_p, - "@bias_mem_p", pipeline, is_persistent, is_INT8, - scale_data, mask); - } - - mkldnn::primitive_attr CreatePostOps( - std::string fuse_activation, float fuse_alpha, float fuse_beta, - bool fuse_residual_conn, const std::vector output_shift_scale = {}, - float sum_scale = 1.0f) const { - mkldnn::primitive_attr conv_attr; - mkldnn::post_ops post_operations; - if (output_shift_scale.size() > 0) { - int mask = output_shift_scale.size() > 1 ? 1 << 1 : 0; - conv_attr.set_output_scales(mask, output_shift_scale); - } - // Fusion with Elementwise layer relies on adding a sum post-operation with - // the scale parameter. It is assumed that when fuse_residual_connection is - // true, the output tensor contains the data coming from residual - // connection. The result of this post_op is: - // Output = scale * Output + Conv_Out. - if (fuse_residual_conn) { - post_operations.append_sum(sum_scale); - } - // Fusion with ReLU layer is executed through the PostOps feature. Create a - // PostOps object and configure it to execute an eltwise relu operation. - if (fuse_activation == "relu" || fuse_activation == "leaky_relu") { - constexpr float scale = 1.0f; - post_operations.append_eltwise(scale, mkldnn::algorithm::eltwise_relu, - fuse_alpha, fuse_beta); - } else if (fuse_activation == "relu6") { - constexpr float scale = 1.0f; - post_operations.append_eltwise(scale, - mkldnn::algorithm::eltwise_bounded_relu, - fuse_alpha, fuse_beta); - } else if (fuse_activation == "swish") { - constexpr float scale = 1.0f; - post_operations.append_eltwise(scale, mkldnn::algorithm::eltwise_swish, - fuse_alpha, fuse_beta); - } - conv_attr.set_post_ops(post_operations); - return conv_attr; - } - - std::shared_ptr - AcquireConvolutionPrimitiveDescriptor( - const mkldnn::memory::desc& src, const mkldnn::memory::desc& weights, - paddle::optional bias, - const mkldnn::memory::desc& dst, const std::vector& strides, - const std::vector& dilations, - const std::vector& paddings, const mkldnn::engine& engine, - const std::string& fuse_activation, float fuse_alpha, float fuse_beta, - const bool fuse_residual_conn, mkldnn::prop_kind fwd_prop_kind, - const std::vector output_shift_scale = {}, - const float sum_scale = 1.0f) { - // Conv PD has to be passed to Grad op that - // may be exxecuted by diffrent thread, hence - // for that one we use key that does not contain TID - const std::string key_conv_pd = key_ + "@conv_pd"; - - conv_pd_ = std::static_pointer_cast( - dev_ctx_.GetBlob(key_conv_pd)); - - if (conv_pd_ == nullptr) { - mkldnn::memory::dims stride_dims = strides; - mkldnn::memory::dims dilations_dims = dilations; - auto mkldnn_paddings = ToMkldnnPadding(paddings); - - auto conv_desc = - bias ? 
typename forward_t::desc( - fwd_prop_kind, convolutional_algorithm::T, src, - weights, *bias, dst, stride_dims, dilations_dims, - mkldnn_paddings[0], mkldnn_paddings[1]) - : typename forward_t::desc( - fwd_prop_kind, convolutional_algorithm::T, src, - weights, dst, stride_dims, dilations_dims, - mkldnn_paddings[0], mkldnn_paddings[1]); - - mkldnn::primitive_attr conv_attr = - CreatePostOps(fuse_activation, fuse_alpha, fuse_beta, - fuse_residual_conn, output_shift_scale, sum_scale); - - conv_pd_.reset( - new typename forward_t::primitive_desc(conv_desc, conv_attr, engine)); - // Save conv_pd/src_memory/weights_memory for backward pass - dev_ctx_.SetBlob(key_conv_pd, conv_pd_); - } - - return conv_pd_; - } - - std::shared_ptr AcquireConvolution() { - auto prim_key = key_ + "@conv_p"; - auto conv_p = - std::static_pointer_cast(dev_ctx_.GetBlob(prim_key)); - if (conv_p == nullptr) { - conv_p = std::make_shared(*conv_pd_); - - dev_ctx_.SetBlob(prim_key, conv_p); - } - return conv_p; - } - - std::shared_ptr AcquireConvolutionBackwardWeights() { - auto prim_key = key_ + "@conv_bwd_weights_p"; - auto conv_bwd_weights_p = std::static_pointer_cast( - dev_ctx_.GetBlob(prim_key)); - if (conv_bwd_weights_p == nullptr) { - // create backward conv primitive for weights - conv_bwd_weights_p = - std::make_shared(*conv_bwd_weights_pd_); - dev_ctx_.SetBlob(prim_key, conv_bwd_weights_p); - } - return conv_bwd_weights_p; - } - - std::shared_ptr AcquireConvolutionBackwardData() { - auto prim_key = key_ + "@conv_bwd_data_p"; - auto conv_bwd_data_p = - std::static_pointer_cast(dev_ctx_.GetBlob(prim_key)); - if (conv_bwd_data_p == nullptr) { - conv_bwd_data_p = std::make_shared(*conv_bwd_data_pd_); - dev_ctx_.SetBlob(prim_key, conv_bwd_data_p); - } - return conv_bwd_data_p; - } - - private: - std::shared_ptr conv_pd_; - std::shared_ptr - conv_bwd_weights_pd_; - std::shared_ptr conv_bwd_data_pd_; -}; - -using ConvMKLDNNHandler = - ConvMKLDNNTemplateHandler; - -template -static std::shared_ptr SetDstMemory( - const framework::ExecutionContext& ctx, framework::Tensor* output, - const std::shared_ptr& handler) { - T* output_data = - output->mutable_data(ctx.GetPlace(), handler->GetDstMemorySize()); - std::shared_ptr dst_memory_p = - handler->AcquireDstMemoryFromPrimitive(to_void_cast(output_data)); - return dst_memory_p; -} - -template -static std::shared_ptr SetDstMemory( - const framework::ExecutionContext& ctx, framework::Tensor* output, - const framework::Tensor* residual_param, - const mkldnn::memory::desc& user_residual_md, - const std::shared_ptr& handler, - std::vector* pipeline) { - const T* residual_param_data = residual_param->data(); - PADDLE_ENFORCE_NOT_NULL( - residual_param_data, - platform::errors::PreconditionNotMet("Residual parameter is required for " - "the DNNL conv+elementwise_add " - "fusion, but now it is missing.")); - std::shared_ptr user_residual_memory_p = - handler->AcquireResidualDataMemory(user_residual_md, - to_void_cast(residual_param_data)); - T* output_data = output->mutable_data(ctx.GetPlace()); - std::shared_ptr dst_memory_p = - handler->AcquireDstMemoryFromResidualDataMemory( - user_residual_memory_p, to_void_cast(output_data), *pipeline); - return dst_memory_p; -} - -template -static void SetDstMemoryHandler( - const framework::ExecutionContext& ctx, framework::Tensor* output, - const std::shared_ptr& handler, - std::shared_ptr dst_memory_p) { - T* output_data = - output->mutable_data(ctx.GetPlace(), handler->GetDstMemorySize()); - 
dst_memory_p->set_data_handle(to_void_cast(output_data)); -} - template static void SetDstMemoryQuantized( const framework::ExecutionContext& ctx, framework::Tensor* output, @@ -1524,5 +983,6 @@ static void SetDstMemoryQuantized( dst_memory.reset( new mkldnn::memory(*dst_md, engine, to_void_cast(output_data))); } + } // namespace platform } // namespace paddle diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_op.py index db05801c7227b0..8ea4e369d32361 100644 --- a/python/paddle/fluid/tests/unittests/test_conv2d_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv2d_op.py @@ -20,7 +20,8 @@ import paddle import paddle.fluid.core as core import paddle.fluid as fluid -from op_test import OpTest, convert_float_to_uint16, get_numeric_gradient +from paddle.fluid.tests.unittests.op_test import ( + OpTest, convert_float_to_uint16, get_numeric_gradient) from paddle.fluid.tests.unittests.testsuite import create_op from paddle.fluid import Program, program_guard diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py index 027c806fc02e90..89125dc326d15b 100644 --- a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py @@ -22,7 +22,7 @@ paddle.enable_static() import paddle.fluid.core as core import paddle.fluid as fluid -from op_test import OpTest +from paddle.fluid.tests.unittests.op_test import OpTest def conv2dtranspose_forward_naive(input_, filter_, attrs): From 730dcaf48f6b1e0e561860eb503ceef9a9498b59 Mon Sep 17 00:00:00 2001 From: Haohongxiang <86215757+haohongxiang@users.noreply.github.com> Date: Thu, 7 Oct 2021 22:06:21 +0800 Subject: [PATCH 067/298] fix bugs in HybridParallelClipGrad of hybrid_parallel_optimizer (#36237) * fix bugs in HybridParallelClipGrad of hybrid_parallel_optimizer * update * update --- .../hybrid_parallel_optimizer.py | 38 +++++++++++++------ 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py index 581fbc5153ad49..b00ef2cdcb0e10 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py @@ -50,7 +50,8 @@ def __init__(self, clip, hcg): @imperative_base.no_grad def _dygraph_clip(self, params_grads): params_and_grads = [] - sum_square_list = [] + sum_square_list_dist = [] + sum_square_list_not_dist = [] for p, g in params_grads: if g is None: continue @@ -62,18 +63,33 @@ def _dygraph_clip(self, params_grads): merge_grad = layers.get_tensor_from_selected_rows(merge_grad) square = layers.square(merge_grad) sum_square = layers.reduce_sum(square) - sum_square_list.append(sum_square) + + if p.is_distributed: + sum_square_list_dist.append(sum_square) + else: + sum_square_list_not_dist.append(sum_square) # all parameters have been filterd out - if len(sum_square_list) == 0: + if len(sum_square_list_dist) + len(sum_square_list_not_dist) == 0: return params_grads - global_norm_var = layers.concat(sum_square_list) - global_norm_var = layers.reduce_sum(global_norm_var) - # add all reduce to get global norm in world size - paddle.distributed.all_reduce(global_norm_var, - 
self._hcg.get_check_parallel_group()) - global_norm_var = layers.sqrt(global_norm_var) + global_norm_var_dist = layers.concat(sum_square_list_dist) if len( + sum_square_list_dist) != 0 else layers.concat( + [paddle.to_tensor([0.])]) + global_norm_var_dist = layers.reduce_sum(global_norm_var_dist) + global_norm_var_not_dist = layers.concat( + sum_square_list_not_dist) if len( + sum_square_list_not_dist) != 0 else layers.concat( + [paddle.to_tensor([0.])]) + global_norm_var_not_dist = layers.reduce_sum(global_norm_var_not_dist) + + # add all reduce to get global norm of distributed params_and_grads in world size + # all reduce is not needed while getting global norm of non-distributed params_and_grads + paddle.distributed.all_reduce( + global_norm_var_dist, group=self._hcg.get_check_parallel_group()) + + global_norm_var = layers.sqrt(global_norm_var_dist + + global_norm_var_not_dist) max_global_norm = layers.fill_constant( shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm) @@ -96,7 +112,7 @@ def __getattr__(self, item): return getattr(self._clip, item) def __call__(self, params_grads): - return self._clip(params_grads) + return self._dygraph_clip(params_grads) class HybridParallelOptimizer: @@ -112,7 +128,7 @@ def __init__(self, optimizer, hcg, strategy): self._need_dp = (self._hcg.get_data_parallel_world_size() > 1) # NOTE(shenliang03): Because of the pure DataParallel mode, the gradient synchronization - # is achieved through reducer, so there is no need to call fuse_allreduce in oprimizer. + # is achieved through reducer, so there is no need to call fuse_allreduce in optimizer. self._dp_enable = not self._use_dp_mode and self._need_dp self._sharding_enable = ( From 9814f89551e2133c6733352f6445d4d668da6f63 Mon Sep 17 00:00:00 2001 From: Zeng Jinle <32832641+sneaxiy@users.noreply.github.com> Date: Fri, 8 Oct 2021 10:47:13 +0800 Subject: [PATCH 068/298] fix cast cuda implementation (#36266) --- paddle/fluid/operators/cast_op.cu | 64 ++++++++++++++++--------------- 1 file changed, 33 insertions(+), 31 deletions(-) diff --git a/paddle/fluid/operators/cast_op.cu b/paddle/fluid/operators/cast_op.cu index 06300817e0a128..601735c2f148ad 100644 --- a/paddle/fluid/operators/cast_op.cu +++ b/paddle/fluid/operators/cast_op.cu @@ -47,12 +47,12 @@ __global__ void CastCUDAKernel(const InT* in, const int64_t N, OutT* out) { } template -struct CastOpFunctor { +struct CastCUDAOpFunctor { const framework::Tensor* in_; framework::Tensor* out_; const platform::CUDADeviceContext& ctx_; - CastOpFunctor(const framework::Tensor* in, framework::Tensor* out, - const platform::CUDADeviceContext& ctx) + CastCUDAOpFunctor(const framework::Tensor* in, framework::Tensor* out, + const platform::CUDADeviceContext& ctx) : in_(in), out_(out), ctx_(ctx) {} template @@ -75,6 +75,21 @@ struct CastOpFunctor { } }; +template +class CastCUDAOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* in = context.Input("X"); + auto* out = context.Output("Out"); + framework::VisitDataType( + static_cast( + context.Attr("out_dtype")), + CastCUDAOpFunctor( + in, out, + context.template device_context())); + } +}; + } // namespace operators } // namespace paddle @@ -82,34 +97,21 @@ namespace ops = paddle::operators; #ifdef PADDLE_WITH_HIP REGISTER_OP_CUDA_KERNEL( - cast, ops::CastOpKernel, - ops::CastOpKernel, - ops::CastOpKernel, - ops::CastOpKernel, - ops::CastOpKernel, - ops::CastOpKernel, - ops::CastOpKernel, - ops::CastOpKernel, - 
ops::CastOpKernel>, - ops::CastOpKernel>); + cast, ops::CastCUDAOpKernel, ops::CastCUDAOpKernel, + ops::CastCUDAOpKernel, ops::CastCUDAOpKernel, + ops::CastCUDAOpKernel, ops::CastCUDAOpKernel, + ops::CastCUDAOpKernel, + ops::CastCUDAOpKernel, + ops::CastCUDAOpKernel>, + ops::CastCUDAOpKernel>); #else REGISTER_OP_CUDA_KERNEL( - cast, ops::CastOpKernel, - ops::CastOpKernel, - ops::CastOpKernel, - ops::CastOpKernel, - ops::CastOpKernel, - ops::CastOpKernel, - ops::CastOpKernel, - ops::CastOpKernel, - ops::CastOpKernel, - ops::CastOpKernel>, - ops::CastOpKernel>); + cast, ops::CastCUDAOpKernel, ops::CastCUDAOpKernel, + ops::CastCUDAOpKernel, ops::CastCUDAOpKernel, + ops::CastCUDAOpKernel, ops::CastCUDAOpKernel, + ops::CastCUDAOpKernel, + ops::CastCUDAOpKernel, + ops::CastCUDAOpKernel, + ops::CastCUDAOpKernel>, + ops::CastCUDAOpKernel>); #endif From 1bd9cfef4e27baa84fd40ed1e65e80017d0cf232 Mon Sep 17 00:00:00 2001 From: arlesniak Date: Fri, 8 Oct 2021 05:33:09 +0200 Subject: [PATCH 069/298] Added oneDNN BF16 relu (#36265) * Added oneDNN BF16 relu * fixed typo * refactored test, review fixes --- .../operators/mkldnn/activation_mkldnn_op.cc | 3 +- .../mkldnn/test_activation_bf16_mkldnn_op.py | 44 ++++++++++++++++--- 2 files changed, 40 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc index d992890adeec3e..603a70458b0ceb 100644 --- a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc @@ -257,7 +257,6 @@ namespace ops = paddle::operators; ops::grad_functor>); #define FOR_EACH_MKLDNN_KERNEL_FUNCTOR(__macro) \ - __macro(relu, ReluMKLDNNFunctor, ReluMKLDNNGradFunctor); \ __macro(relu6, Relu6MKLDNNFunctor, Relu6MKLDNNGradFunctor); \ __macro(leaky_relu, ReluMKLDNNFunctor, ReluMKLDNNGradFunctor); \ __macro(swish, SwishMKLDNNFunctor, SwishMKLDNNGradFunctor); \ @@ -267,6 +266,8 @@ namespace ops = paddle::operators; __macro(abs, AbsMKLDNNFunctor, AbsMKLDNNGradFunctor); FOR_EACH_MKLDNN_KERNEL_FUNCTOR(REGISTER_ACTIVATION_MKLDNN_KERNEL); +REGISTER_ACTIVATION_MKLDNN_BF16_KERNEL(relu, ReluMKLDNNFunctor, + ReluMKLDNNGradFunctor); REGISTER_ACTIVATION_MKLDNN_BF16_KERNEL(gelu, GeluMKLDNNFunctor, GeluMKLDNNGradFunctor); REGISTER_ACTIVATION_MKLDNN_BF16_KERNEL(sigmoid, SigmoidMKLDNNFunctor, diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_activation_bf16_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_activation_bf16_mkldnn_op.py index 3d5a0139158337..cd9987b3c8e824 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_activation_bf16_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_activation_bf16_mkldnn_op.py @@ -14,6 +14,8 @@ from __future__ import print_function +import six +import abc import unittest import numpy as np from scipy.special import expit, erf @@ -24,15 +26,19 @@ @OpTestTool.skip_if_not_cpu_bf16() -class TestMKLDNNSigmoidBF16Op(TestActivation): +@six.add_metaclass(abc.ABCMeta) +class MKLDNNBF16ActivationOp(object): + @abc.abstractmethod def config(self): - self.op_type = "sigmoid" + pass + @abc.abstractmethod def op_forward(self, x): - return 1 / (1 + np.exp(-x)) + pass + @abc.abstractmethod def op_grad(self, dout, x): - return dout * self.op_forward(x) * (1 - self.op_forward(x)) + pass def set_attrs(self): self.attrs = {"use_mkldnn": True} @@ -65,7 +71,18 @@ def test_check_grad(self): user_defined_grad_outputs=[convert_float_to_uint16(self.out)]) -class 
TestMKLDNNGeluErfBF16Op(TestMKLDNNSigmoidBF16Op): +class TestMKLDNNSigmoidBF16Op(MKLDNNBF16ActivationOp, TestActivation): + def config(self): + self.op_type = "sigmoid" + + def op_forward(self, x): + return 1 / (1 + np.exp(-x)) + + def op_grad(self, dout, x): + return dout * self.op_forward(x) * (1 - self.op_forward(x)) + + +class TestMKLDNNGeluErfBF16Op(MKLDNNBF16ActivationOp, TestActivation): def config(self): self.op_type = "gelu" @@ -83,7 +100,7 @@ def init_data(self): self.x = np.random.uniform(-1, 1, [11, 17]).astype(np.float32) -class TestMKLDNNGeluTanhBF16Op(TestMKLDNNSigmoidBF16Op): +class TestMKLDNNGeluTanhBF16Op(MKLDNNBF16ActivationOp, TestActivation): def config(self): self.op_type = "gelu" @@ -104,3 +121,18 @@ def set_attrs(self): class TestMKLDNNGeluTanhDim2BF16Op(TestMKLDNNGeluTanhBF16Op): def init_data(self): self.x = np.random.uniform(-1, 1, [11, 17]).astype(np.float32) + + +class TestMKLDNNReluBF16Op(MKLDNNBF16ActivationOp, TestActivation): + def config(self): + self.op_type = "relu" + + def op_forward(self, x): + return np.maximum(x, 0) + + def op_grad(self, dout, x): + return dout + + +if __name__ == '__main__': + unittest.main() From a29ff4c77a658f1265b56b3cb9b3a7ad7f296f73 Mon Sep 17 00:00:00 2001 From: huangxu96 <46740794+huangxu96@users.noreply.github.com> Date: Fri, 8 Oct 2021 16:19:16 +0800 Subject: [PATCH 070/298] add python interface of sub_graph (#36120) Add python interface of subgraph: 1. all_sub_graphs() 2. get_sub_graph(idx) --- paddle/fluid/pybind/ir.cc | 10 +- python/paddle/fluid/framework.py | 26 ++++- .../ir/test_ir_subgraph_python_interface.py | 96 +++++++++++++++++++ 3 files changed, 128 insertions(+), 4 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/ir/test_ir_subgraph_python_interface.py diff --git a/paddle/fluid/pybind/ir.cc b/paddle/fluid/pybind/ir.cc index e27e3674eeeb5b..050bfc967daa10 100644 --- a/paddle/fluid/pybind/ir.cc +++ b/paddle/fluid/pybind/ir.cc @@ -125,7 +125,15 @@ void BindGraph(py::module *m) { return_value_policy::reference) .def("resolve_hazard", &Graph::ResolveHazard) .def("origin_program_desc", &Graph::OriginProgram, - return_value_policy::reference); + return_value_policy::reference) + .def("sub_graph_size", &Graph::SubGraphsSize) + .def("get_sub_graph", [](Graph &self, int i) { + /* Here we use a lambda function as an empty deleter to avoid the double + free of smart pointer. + Otherwise, this shared pointer will be free both in python and + cpp scope, which will lead a core dumped. */ + return std::shared_ptr(self.GetSubGraph(i), [](Graph *) {}); + }); } void BindNode(py::module *m) { diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index b6241f6e5299df..7f2937b9af7643 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -3956,6 +3956,23 @@ def all_op_nodes(self): """ return {IrOpNode(node) for node in self.graph.nodes() if node.is_op()} + def all_sub_graphs(self, for_test=False): + """ + Return all sub_graphs included in the main graph as a set. + """ + + return [ + IrGraph( + self.graph.get_sub_graph(i), for_test=for_test) + for i in range(self.graph.sub_graph_size()) + ] + + def get_sub_graph(self, i, for_test=False): + """ + Return i-th sub_graph in the main graph. + """ + return IrGraph(self.graph.get_sub_graph(i), for_test=for_test) + def create_persistable_node(self, name, var_type, shape, var_dtype): """ Create a persistable variable node in the graph. 
In IrGraph, @@ -4102,8 +4119,10 @@ def link_to(self, node_in, node_out): node_in(IrNode): the input node. node_out(IrNode): the output node. """ - assert node_in.node in self.graph.nodes() and node_out.node in self.graph.nodes(), \ - 'The two arguments(node_in&node_out) must be in the graph nodes.' + assert node_in.node in self.graph.nodes(), ( + 'node_in(%s) must be in the graph nodes.' % node_in.node.name()) + assert node_out.node in self.graph.nodes(), ( + 'node_out(%s) must be in the graph nodes.' % node_out.node.name()) node_in.append_output(node_out) node_out.append_input(node_in) @@ -4265,7 +4284,8 @@ def _find_node_by_name(self, nodes, node_name): for n in nodes: if n.name() == node_name: target_node = n - assert target_node is not None, "Cannot find the target node in the giving set." + assert target_node is not None, ( + "Cannot find the target node (%s)in the giving set." % node_name) return target_node def _update_desc_attr(self, desc, name, val): diff --git a/python/paddle/fluid/tests/unittests/ir/test_ir_subgraph_python_interface.py b/python/paddle/fluid/tests/unittests/ir/test_ir_subgraph_python_interface.py new file mode 100644 index 00000000000000..49ca89a35f4ac7 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/test_ir_subgraph_python_interface.py @@ -0,0 +1,96 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import paddle +import paddle.fluid as fluid +import six + +from paddle.fluid.framework import IrGraph +from paddle.fluid.framework import IrNode +from paddle.fluid.tests.unittests.op_test import OpTestTool +from paddle.fluid import core +import paddle.fluid.layers as layers +from paddle.fluid.framework import Program, program_guard, default_startup_program +from paddle.fluid.contrib.slim.quantization import QuantizationTransformPass + +paddle.enable_static() + + +class TestQuantizationSubGraph(unittest.TestCase): + def build_graph_with_sub_graph(self): + def linear_fc(num): + data = fluid.layers.data( + name='image', shape=[1, 32, 32], dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + hidden = data + for _ in six.moves.xrange(num): + hidden = fluid.layers.fc(hidden, size=128, act='relu') + loss = fluid.layers.cross_entropy(input=hidden, label=label) + loss = fluid.layers.mean(loss) + return loss + + main_program = Program() + startup_program = Program() + + def true_func(): + return linear_fc(3) + + def false_func(): + return linear_fc(5) + + with program_guard(main_program, startup_program): + x = layers.fill_constant(shape=[1], dtype='float32', value=0.1) + y = layers.fill_constant(shape=[1], dtype='float32', value=0.23) + pred = layers.less_than(y, x) + out = layers.cond(pred, true_func, false_func) + + core_graph = core.Graph(main_program.desc) + # We should create graph for test, otherwise it will throw a + # error that it cannot find the node of "STEP_COUNTER" + graph = IrGraph(core_graph, for_test=True) + sub_graph = graph.get_sub_graph(0) + all_sub_graphs = graph.all_sub_graphs( + for_test=True) # same reason for subgraph + # Should return graph and sub_graphs at the same time. If only return sub_graph, the graph will + # be destructed and the sub_graphs will be empty. 
+ return graph, all_sub_graphs + + def test_quant_sub_graphs(self, use_cuda=False): + graph, sub_graphs = self.build_graph_with_sub_graph() + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + transform_pass = QuantizationTransformPass( + scope=fluid.global_scope(), + place=place, + activation_quantize_type='abs_max', + weight_quantize_type='range_abs_max') + Find_inserted_quant_op = False + for sub_graph in sub_graphs: + transform_pass.apply(sub_graph) + for op in sub_graph.all_op_nodes(): + if 'quantize' in op.name(): + Find_inserted_quant_op = True + self.assertTrue(Find_inserted_quant_op) + + def test_quant_sub_graphs_cpu(self): + self.test_quant_sub_graphs(use_cuda=False) + + @OpTestTool.skip_if(not paddle.is_compiled_with_cuda(), + "Not GPU version paddle") + def test_quant_sub_graphs_gpu(self): + self.test_quant_sub_graphs(use_cuda=True) + + +if __name__ == '__main__': + unittest.main() From 7cb19f575f8ff7e8f4d03fd70a5fc33c76360a36 Mon Sep 17 00:00:00 2001 From: Qi Li Date: Fri, 8 Oct 2021 16:44:01 +0800 Subject: [PATCH 071/298] [NPU] BatchNorm support layout of NCL and NLC, test=develop (#35668) * [NPU] support NCL and NCL for BatchNorm, test=develop * [NPU] remove debug files, test=develop * update, test=develop --- paddle/fluid/operators/batch_norm_op_npu.cc | 62 ++++++++++++++----- paddle/fluid/operators/conv_op_npu.cc | 5 -- .../unittests/npu/test_batch_norm_op_npu.py | 54 +++++++++++++++- .../tests/unittests/test_batch_norm_op.py | 37 ++++++++++- 4 files changed, 133 insertions(+), 25 deletions(-) diff --git a/paddle/fluid/operators/batch_norm_op_npu.cc b/paddle/fluid/operators/batch_norm_op_npu.cc index dfb620a4e96bdb..791c3656791da4 100644 --- a/paddle/fluid/operators/batch_norm_op_npu.cc +++ b/paddle/fluid/operators/batch_norm_op_npu.cc @@ -38,11 +38,13 @@ class NPUBatchNormOpKernel : public framework::OpKernel { const auto *x = ctx.Input("X"); const auto &x_dims = x->dims(); - PADDLE_ENFORCE_EQ(x_dims.size(), 4, - platform::errors::InvalidArgument( - "The input tensor X's dimension must equal to 4. But " - "received X's shape = [%s], X's dimension = [%d].", - x_dims, x_dims.size())); + PADDLE_ENFORCE_EQ( + (x_dims.size() == 4UL || x_dims.size() == 3UL), true, + platform::errors::InvalidArgument( + "The input tensor X's dimension must equal to 3 or 4. 
" + " But got X's shape = [%s], X's dimension = [%d].", + x_dims.to_str(), x_dims.size())); + const auto *running_mean = ctx.Input("Mean"); const auto *running_var = ctx.Input("Variance"); const auto *scale = ctx.Input("Scale"); @@ -51,8 +53,11 @@ class NPUBatchNormOpKernel : public framework::OpKernel { auto *y = ctx.Output("Y"); y->mutable_data(ctx.GetPlace()); - Tensor x_tensor(x->type()); - Tensor y_tesnor(y->type()); + auto &dev_ctx = ctx.template device_context(); + auto x_tensor = + ctx.AllocateTmpTensor(x->dims(), dev_ctx); + auto y_tesnor = + ctx.AllocateTmpTensor(y->dims(), dev_ctx); x_tensor.ShareDataWith(*x); y_tesnor.ShareDataWith(*y); if (data_layout == DataLayout::kNHWC) { @@ -89,6 +94,18 @@ class NPUBatchNormOpKernel : public framework::OpKernel { sum.mutable_data(running_mean->dims(), ctx.GetPlace()); square_sum.mutable_data(running_mean->dims(), ctx.GetPlace()); + // BNTrainingReduce ONLY support rank = 4 + if (x->dims().size() == 3) { + auto x_shape_vec = framework::vectorize(x->dims()); + if (data_layout == DataLayout::kNCHW) { + x_shape_vec.push_back(1); // expand NCL -> NCL1 + } else { + x_shape_vec.insert(x_shape_vec.begin() + 2, 1); // expand NLC -> NL1C + } + auto x_new_shape = framework::make_ddim(x_shape_vec); + x_tensor.Resize(x_new_shape); + x_tensor.Resize(x_new_shape); + } const auto &runner_reduce = NpuOpRunner("BNTrainingReduce", {x_tensor}, {sum, square_sum}, {{"epsilon", epsilon}}); @@ -127,8 +144,11 @@ class NPUBatchNormGradOpKernel : public framework::OpKernel { use_global_stats = is_test || use_global_stats; - Tensor x_tensor(x->type()); - Tensor dy_tensor(d_y->type()); + auto &dev_ctx = ctx.template device_context(); + auto x_tensor = + ctx.AllocateTmpTensor(x->dims(), dev_ctx); + auto dy_tensor = + ctx.AllocateTmpTensor(d_y->dims(), dev_ctx); x_tensor.ShareDataWith(*x); dy_tensor.ShareDataWith(*d_y); if (data_layout == DataLayout::kNHWC) { @@ -136,14 +156,14 @@ class NPUBatchNormGradOpKernel : public framework::OpKernel { dy_tensor.set_layout(DataLayout::kNHWC); } - Tensor scale_grad_tmp(scale->type()); - Tensor bias_grad_tmp(bias->type()); + auto scale_grad_tmp = + ctx.AllocateTmpTensor(scale->dims(), dev_ctx); + auto bias_grad_tmp = + ctx.AllocateTmpTensor(bias->dims(), dev_ctx); if (d_scale == nullptr) { - scale_grad_tmp.Resize(scale->dims()); d_scale = &scale_grad_tmp; } if (d_bias == nullptr) { - bias_grad_tmp.Resize(bias->dims()); d_bias = &bias_grad_tmp; } @@ -169,9 +189,23 @@ class NPUBatchNormGradOpKernel : public framework::OpKernel { } if (d_x) { d_x->mutable_data(ctx.GetPlace()); - Tensor dx_tensor(d_x->type()); + auto dx_tensor = + ctx.AllocateTmpTensor(d_x->dims(), dev_ctx); dx_tensor.ShareDataWith(*d_x); if (use_global_stats) { + if (x->dims().size() == 3) { + // BNInferGrad only support x rank = 4, + auto x_shape_vec = framework::vectorize(d_x->dims()); + if (data_layout == DataLayout::kNCHW) { + x_shape_vec.push_back(1); // expand NCL -> NCL1 + } else { + x_shape_vec.insert(x_shape_vec.begin() + 2, + 1); // expand NLC -> NL1C + } + auto x_new_shape = framework::make_ddim(x_shape_vec); + dx_tensor.Resize(x_new_shape); + dy_tensor.Resize(x_new_shape); + } const auto *running_var = ctx.Input("Variance"); const auto &runner_infer = NpuOpRunner("BNInferGrad", {dy_tensor, *scale, *running_var}, diff --git a/paddle/fluid/operators/conv_op_npu.cc b/paddle/fluid/operators/conv_op_npu.cc index 86724e06975ed4..47de843d1ac6f6 100644 --- a/paddle/fluid/operators/conv_op_npu.cc +++ b/paddle/fluid/operators/conv_op_npu.cc @@ -186,11 +186,6 @@ class 
DepthwiseConvGradNPUKernel : public framework::OpKernel { dilations[3] = dilation[1]; } - // LOG(INFO) << "strides = " << framework::make_ddim(strides).to_str(); - // LOG(INFO) << "dilations = " << framework::make_ddim(dilations).to_str(); - // LOG(INFO) << "padding = " << framework::make_ddim(padding).to_str(); - // LOG(INFO) << "data_format = " << data_format; - if (filter_grad) { filter_grad->mutable_data(ctx.GetPlace()); diff --git a/python/paddle/fluid/tests/unittests/npu/test_batch_norm_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_batch_norm_op_npu.py index 1b8b13a0d27eac..877f9904f3407c 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_batch_norm_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_batch_norm_op_npu.py @@ -45,6 +45,14 @@ def check_with_place(self, place, data_layout, dtype, shape): if len(shape) == 2: x_shape = shape c = x_shape[1] + if len(shape) == 3: + n, l, c = shape[0], shape[1], shape[2] + if data_layout == "NHWC": # NLC + x_shape = [n, l, c] + elif data_layout == "NCHW": # NCL + x_shape = [n, c, l] + else: + raise ValueError("Unknown data layout.") else: n, h, w, c = shape[0], shape[1], shape[2], shape[3] if data_layout == "NHWC": @@ -117,6 +125,7 @@ def test_check_output(self): place = core.NPUPlace(0) for data_format in self.data_formats: self.check_with_place(place, data_format, self.dtype, [2, 3, 4, 5]) + self.check_with_place(place, data_format, self.dtype, [3, 8, 5]) def init_kernel_type(self): pass @@ -185,10 +194,19 @@ def test_with_place(place, data_layout, shape): # attr epsilon = self.epsilon momentum = self.momentum - if data_layout == "NCHW": - n, c, h, w = shape[0], shape[1], shape[2], shape[3] + + if len(shape) == 3: + if data_layout == "NHWC": # NLC + n, l, c = shape[0], shape[1], shape[2] + elif data_layout == "NCHW": # NCL + n, c, l = shape[0], shape[1], shape[2] + else: + raise ValueError("Unknown data layout.") else: - n, h, w, c = shape[0], shape[1], shape[2], shape[3] + if data_layout == "NCHW": + n, c, h, w = shape[0], shape[1], shape[2], shape[3] + else: + n, h, w, c = shape[0], shape[1], shape[2], shape[3] scale_shape = [c] np.random.seed(123) @@ -296,6 +314,7 @@ def test_with_place(place, data_layout, shape): for data_format in self.data_formats: test_with_place(core.NPUPlace(0), data_format, [2, 3, 4, 5]) + test_with_place(core.NPUPlace(0), data_format, [3, 8, 5]) def init_kernel_type(self): pass @@ -328,6 +347,17 @@ def init_test_case(self): ] def reference_grad(self, x, y_grad, scale, mean, var, epsilon, data_format): + x_shape = x.shape + if len(x_shape) == 3: + if data_format == "NCHW": # NCL -> NCL1 + x = np.reshape(x, (x_shape[0], x_shape[1], x_shape[2], 1)) + y_grad = np.reshape(y_grad, + (x_shape[0], x_shape[1], x_shape[2], 1)) + else: # NLC -> NL1C + x = np.reshape(x, (x_shape[0], x_shape[1], 1, x_shape[2])) + y_grad = np.reshape(y_grad, + (x_shape[0], x_shape[1], 1, x_shape[2])) + if data_format == "NCHW": x = np.transpose(x, (0, 2, 3, 1)) y_grad = np.transpose(y_grad, (0, 2, 3, 1)) @@ -343,6 +373,9 @@ def reference_grad(self, x, y_grad, scale, mean, var, epsilon, data_format): x = np.transpose(x, (0, 3, 1, 2)) y_grad = np.transpose(y_grad, (0, 3, 1, 2)) + if len(x_shape) == 3: + x_grad = np.reshape(x_grad, x_shape) + return x_grad, grad_scale, grad_offset def ref_forward_backward(self, x, y_grad, scale, bias, mean, variance, @@ -350,6 +383,17 @@ def ref_forward_backward(self, x, y_grad, scale, bias, mean, variance, if data_layout != "NCHW" and data_layout != "NHWC": raise ValueError("Unknown 
data order.") + x_shape = x.shape + if len(x_shape) == 3: + if data_layout == "NCHW": # NCL -> NCL1 + x = np.reshape(x, (x_shape[0], x_shape[1], x_shape[2], 1)) + y_grad = np.reshape(y_grad, + (x_shape[0], x_shape[1], x_shape[2], 1)) + else: # NLC -> NL1C + x = np.reshape(x, (x_shape[0], x_shape[1], 1, x_shape[2])) + y_grad = np.reshape(y_grad, + (x_shape[0], x_shape[1], 1, x_shape[2])) + if data_layout == "NCHW": x = np.transpose(x, (0, 2, 3, 1)) @@ -369,6 +413,10 @@ def ref_forward_backward(self, x, y_grad, scale, bias, mean, variance, x_grad, scale_grad, bias_grad = self.reference_grad( x, y_grad, scale, mean, variance, epsilon, data_layout) + if len(x_shape) == 3: + y = np.reshape(y, x_shape) + x_grad = np.reshape(x_grad, x_shape) + return y, mean_out, variance_out, mean, saved_variance, x_grad, scale_grad, bias_grad diff --git a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py index 9eaa69ce644285..cce13a8bf3b74a 100644 --- a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py @@ -36,6 +36,11 @@ def _reference_testing(x, scale, offset, mean, var, epsilon, data_format): x = np.reshape(x, (x.shape[0], x.shape[1], 1, 1)) else: x = np.reshape(x, (x.shape[0], 1, 1, x.shape[1])) + if len(x_shape) == 3: + if data_format == "NCHW": # NCL -> NCL1 + x = np.reshape(x, (x_shape[0], x_shape[1], x_shape[2], 1)) + else: # NLC -> NL1C + x = np.reshape(x, (x_shape[0], x_shape[1], 1, x_shape[2])) if data_format == "NCHW": n, c, h, w = x.shape @@ -55,13 +60,19 @@ def _reference_testing(x, scale, offset, mean, var, epsilon, data_format): else: raise ValueError("Unknown data order.") - if len(x_shape) == 2: + if len(x_shape) == 2 or len(x_shape) == 3: y = np.reshape(y, x_shape) return y def _cal_mean_variance(x, epsilon, data_format): assert data_format in ['NCHW', 'NHWC'] + x_shape = x.shape + if len(x_shape) == 3: + if data_format == "NCHW": # NCL -> NCL1 + x = np.reshape(x, (x_shape[0], x_shape[1], x_shape[2], 1)) + else: # NLC -> NL1C + x = np.reshape(x, (x_shape[0], x_shape[1], 1, x_shape[2])) x_square = x * x axis = (0, 2, 3) if data_format == 'NCHW' else (0, 1, 2) C = x.shape[1] if data_format == 'NCHW' else x.shape[-1] @@ -76,6 +87,12 @@ def _cal_mean_variance(x, epsilon, data_format): def _reference_training(x, scale, offset, epsilon, data_format): x_shape = x.shape + if len(x_shape) == 3: + if data_format == "NCHW": # NCL -> NCL1 + x = np.reshape(x, (x_shape[0], x_shape[1], x_shape[2], 1)) + else: # NLC -> NL1C + x = np.reshape(x, (x_shape[0], x_shape[1], 1, x_shape[2])) + if data_format == "NCHW": n, c, h, w = x.shape x_square = x * x @@ -94,7 +111,6 @@ def _reference_training(x, scale, offset, epsilon, data_format): offset_tile = np.reshape(offset, (1, c, 1, 1)) offset_tile = np.reshape(offset_tile, (1, c, 1, 1)) y = normalized * scale_tile + offset_tile - return y, mean, var elif data_format == "NHWC": x_square = x * x x_square_sum = np.sum(x_square, (0, 1, 2)) @@ -104,10 +120,13 @@ def _reference_training(x, scale, offset, epsilon, data_format): var = x_square_sum / element_count - mean * mean normalized = (x - mean) / np.sqrt(var + epsilon) y = normalized * scale + offset - return y, mean, var else: raise ValueError("Unknown data order.") + if len(x_shape) == 3: + y = np.reshape(y, x_shape) + return y, mean, var + def _reference_grad(x, y_grad, scale, mean, var, epsilon, data_format): # Use the following formulas to calculate gradients: @@ -124,6 +143,15 
@@ def _reference_grad(x, y_grad, scale, mean, var, epsilon, data_format): if data_format != "NCHW" and data_format != "NHWC": raise ValueError("Unknown data order.") + x_shape = x.shape + if len(x_shape) == 3: + if data_format == "NCHW": # NCL -> NCL1 + x = np.reshape(x, (x_shape[0], x_shape[1], x_shape[2], 1)) + y_grad = np.reshape(y_grad, (x_shape[0], x_shape[1], x_shape[2], 1)) + else: # NLC -> NL1C + x = np.reshape(x, (x_shape[0], x_shape[1], 1, x_shape[2])) + y_grad = np.reshape(y_grad, (x_shape[0], x_shape[1], 1, x_shape[2])) + if data_format == "NCHW": x = np.transpose(x, (0, 2, 3, 1)) y_grad = np.transpose(y_grad, (0, 2, 3, 1)) @@ -142,6 +170,9 @@ def _reference_grad(x, y_grad, scale, mean, var, epsilon, data_format): x = np.transpose(x, (0, 3, 1, 2)) y_grad = np.transpose(y_grad, (0, 3, 1, 2)) + if len(x_shape) == 3: + x_grad = np.reshape(x_grad, x_shape) + return x_grad, grad_scale, grad_offset From ca16e8fd7bd1bf27abb9b2cea053b9f98eddea76 Mon Sep 17 00:00:00 2001 From: yaoxuefeng Date: Fri, 8 Oct 2021 16:52:05 +0800 Subject: [PATCH 072/298] add fs list_files_info (#36224) --- python/paddle/distributed/fleet/utils/fs.py | 32 +++++++++++++++++++ .../fluid/tests/unittests/hdfs_test_utils.py | 9 ++++++ .../fluid/tests/unittests/test_hdfs2.py | 1 + 3 files changed, 42 insertions(+) diff --git a/python/paddle/distributed/fleet/utils/fs.py b/python/paddle/distributed/fleet/utils/fs.py index fb518f62a1269e..d3f84d50ac8f9f 100644 --- a/python/paddle/distributed/fleet/utils/fs.py +++ b/python/paddle/distributed/fleet/utils/fs.py @@ -1106,3 +1106,35 @@ def _split_files(self, files, trainer_id, trainers): begin += blocks[i] return trainer_files[trainer_id] + + def list_files_info(self, path_list): + """ + list_files return file path and size + Args: + path_list(list): file list + Returns: + fileist(list): file list with file path and size + """ + if len(path_list) <= 0: + return [] + + file_list = [] + + #concat filelist can speed up 'hadoop ls' + str_concat = "" + for path in path_list: + str_concat += path + " " + cmd = "ls " + str_concat + " | awk '{if ($8 != \"\") {print $5\" \"$8 }}'" + ret, lines = self._run_cmd(cmd) + if (len(lines) == 0): + logger.warning("list_files empty, path[%s]" % path_list) + return [] + for line in lines: + arr = line.split(' ') + if len(arr) < 2: + continue + file_path = arr[1] + file_size = int(arr[0]) + file_list.append({'path': file_path, 'size': file_size}) + + return file_list diff --git a/python/paddle/fluid/tests/unittests/hdfs_test_utils.py b/python/paddle/fluid/tests/unittests/hdfs_test_utils.py index 1535fac499ec61..6b49049073948f 100644 --- a/python/paddle/fluid/tests/unittests/hdfs_test_utils.py +++ b/python/paddle/fluid/tests/unittests/hdfs_test_utils.py @@ -245,6 +245,15 @@ def _test_touch(self, fs): self.assertFalse(fs.is_dir(path)) fs.delete(path) + def _test_list_files_info(self, fs): + path = [] + fs.list_files_info(path) + path = ["./list_files_info.flag"] + fs.list_files_info(path) + fs.touch(path, exist_ok=True) + fs.list_files_info(path) + fs.delete(path) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_hdfs2.py b/python/paddle/fluid/tests/unittests/test_hdfs2.py index 1fa019bb9cd02c..a74fc558382fe3 100644 --- a/python/paddle/fluid/tests/unittests/test_hdfs2.py +++ b/python/paddle/fluid/tests/unittests/test_hdfs2.py @@ -35,6 +35,7 @@ def test_hdfs(self): self._test_rm(fs) self._test_touch(fs) self._test_dirs(fs) + self._test_list_files_info(fs) def test_local(self): fs = LocalFS() From 
f9591bb172e7274a77bfdcb6493579824aec8b47 Mon Sep 17 00:00:00 2001 From: Zeng Jinle <32832641+sneaxiy@users.noreply.github.com> Date: Fri, 8 Oct 2021 18:06:26 +0800 Subject: [PATCH 073/298] Support CUDA Graph on ParallelExecutor (#36250) * support CUDA Graph on PE * add ut, fix CI compile * reduce memory consumption * fix CUDA 10 CI * improve coverage * improve python coverage --- .../fluid/framework/details/build_strategy.h | 2 + .../details/scale_loss_grad_op_handle.cc | 19 ++- .../details/scale_loss_grad_op_handle.h | 6 + .../scope_buffered_ssa_graph_executor.cc | 53 ++++--- .../scope_buffered_ssa_graph_executor.h | 2 +- .../framework/distributed_strategy.proto | 1 + .../multi_devices_graph_pass/CMakeLists.txt | 2 +- .../modify_op_lock_and_record_event_pass.cc | 14 +- paddle/fluid/framework/parallel_executor.cc | 143 ++++++++++++++++++ paddle/fluid/framework/parallel_executor.h | 2 + paddle/fluid/operators/conv_cudnn_helper.h | 3 + paddle/fluid/platform/cuda_graph.cc | 12 ++ paddle/fluid/platform/cuda_graph.h | 10 +- .../platform/cuda_graph_with_memory_pool.cc | 9 +- paddle/fluid/platform/gpu_info.cc | 2 +- paddle/fluid/pybind/pybind.cc | 27 +++- python/paddle/fluid/executor.py | 12 +- .../fluid/tests/unittests/test_cuda_graph.py | 91 ++++++++++- 18 files changed, 368 insertions(+), 42 deletions(-) diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index 0629f1b91504a2..25110fe24f5871 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -143,6 +143,8 @@ struct BuildStrategy { // Turn off inplace addto by default. bool enable_addto_{false}; + bool allow_cuda_graph_capture_{false}; + // FIXME(zcd): is_distribution_ is a temporary field, because in pserver mode, // num_trainers is 1, so the current fields of build_strategy doesn't tell if // it's distributed model. 
diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc index c0c3e14c8bf231..1e3cd4f0aa77c9 100644 --- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc +++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc @@ -86,19 +86,28 @@ struct ScaleLossGradFunctor { } }; +std::string ScaleLossGradOpHandle::LossGradName() const { + return static_cast(this->outputs_[0])->name(); +} + void ScaleLossGradOpHandle::RunImpl() { platform::RecordEvent record_event(Name()); - // Doesn't wait any event - std::string var_name = static_cast(this->outputs_[0])->name(); + RunOnVar(local_exec_scopes_[0]->FindVar(LossGradName()), true); +} - auto *tensor = - local_exec_scopes_[0]->FindVar(var_name)->GetMutable(); +void ScaleLossGradOpHandle::RunOnVar(Variable *var, bool record_event) { + auto *tensor = var->GetMutable(); tensor->Resize(make_ddim({1})); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) ScaleLossGradFunctor func(coeff_, tensor, place_, out_dtype_, this->dev_ctxes_.at(place_)); - this->RunAndRecordEvent([&] { framework::VisitDataType(out_dtype_, func); }); + if (record_event) { + this->RunAndRecordEvent( + [&] { framework::VisitDataType(out_dtype_, func); }); + } else { + framework::VisitDataType(out_dtype_, func); + } #else ScaleLossGradFunctor func(coeff_, tensor, place_, out_dtype_, nullptr); framework::VisitDataType(out_dtype_, func); diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.h b/paddle/fluid/framework/details/scale_loss_grad_op_handle.h index 02e5aa88443df1..88fe02a749fe4b 100644 --- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.h +++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.h @@ -46,6 +46,12 @@ struct ScaleLossGradOpHandle : public OpHandleBase { std::string Name() const override; + platform::Place GetPlace() const { return place_; } + + void RunOnVar(Variable *var, bool record_event = false); + + std::string LossGradName() const; + protected: void RunImpl() override; diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc index ad47846c59a05b..5d271d06b6922f 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc @@ -22,7 +22,9 @@ #include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/variable_helper.h" +#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" #include "paddle/fluid/platform/profiler.h" + namespace paddle { namespace framework { namespace details { @@ -49,8 +51,29 @@ ScopeBufferedSSAGraphExecutor::ScopeBufferedSSAGraphExecutor( PrepareLocalExeScopes(); } +static void RunProgramDescs(const ProgramDescs &programs, + const std::vector &local_exec_scopes, + const std::vector &places) { + for (auto &program : programs) { + for (auto &op_desc : program.Block(0).AllOps()) { + for (size_t i = 0; i < local_exec_scopes.size(); ++i) { + auto op = OpRegistry::CreateOp(*op_desc); + op->Run(*local_exec_scopes[i], places[i]); + } + } + } +} + FetchResultType ScopeBufferedSSAGraphExecutor::Run( const std::vector &fetch_tensors, bool return_merged) { +#ifdef PADDLE_WITH_CUDA + if (platform::IsCUDAGraphCapturing()) { + strategy_.num_iteration_per_drop_scope_ = + std::numeric_limits::max(); + DropLocalExeScopes(/*need_wait=*/false); + } +#endif 
+ if (drop_scope_counter_ == 0) { platform::RecordEvent e("InitLocalVars"); InitVariables(); @@ -84,7 +107,7 @@ FetchResultType ScopeBufferedSSAGraphExecutor::Run( ++drop_scope_counter_; if (drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_ || DropScopeOrNot()) { - DropLocalExeScopes(); + DropLocalExeScopes(!platform::IsCUDAGraphCapturing()); } if (VLOG_IS_ON(5)) { @@ -128,15 +151,7 @@ void ScopeBufferedSSAGraphExecutor::InitVariables() { if (graph.Has(details::kStartupProgramDescs)) { auto &program_descs = graph.Get(details::kStartupProgramDescs); - - for (auto &program_desc : program_descs) { - for (auto &op_desc : program_desc.Block(0).AllOps()) { - for (size_t i = 0; i < local_exec_scopes_.size(); ++i) { - auto op = OpRegistry::CreateOp(*op_desc); - op->Run(*local_exec_scopes_[i], places_[i]); - } - } - } + RunProgramDescs(program_descs, local_exec_scopes_, places_); } is_initialized_ = true; } @@ -144,23 +159,17 @@ void ScopeBufferedSSAGraphExecutor::InitVariables() { if (graph.Has(details::kProgramDescs)) { auto &program_descs = graph.Get(details::kProgramDescs); - - for (auto &program_desc : program_descs) { - for (auto &op_desc : program_desc.Block(0).AllOps()) { - for (size_t i = 0; i < local_exec_scopes_.size(); ++i) { - auto op = OpRegistry::CreateOp(*op_desc); - op->Run(*local_exec_scopes_[i], places_[i]); - } - } - } + RunProgramDescs(program_descs, local_exec_scopes_, places_); } } -void ScopeBufferedSSAGraphExecutor::DropLocalExeScopes() { +void ScopeBufferedSSAGraphExecutor::DropLocalExeScopes(bool need_wait) { platform::RecordEvent drop_scope_event("DropLocalExeScopes"); drop_scope_counter_ = 0; - for (auto &p : places_) { - platform::DeviceContextPool::Instance().Get(p)->Wait(); + if (need_wait) { + for (auto &p : places_) { + platform::DeviceContextPool::Instance().Get(p)->Wait(); + } } scope_monitor_.ClearHistoryLocalExecScopes(); for (size_t i = 0; i < local_exec_scopes_.size(); ++i) { diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h index aa2b113c960a38..ea5a3c07957bfd 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h @@ -53,7 +53,7 @@ class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor { FetchResultType Run(const std::vector& fetch_tensors, bool return_merged) override; - void DropLocalExeScopes(); + void DropLocalExeScopes(bool need_wait = true); bool NeedCreateLocalExeScope(); diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto index 17d15a94c7287b..e7a25de96a9471 100644 --- a/paddle/fluid/framework/distributed_strategy.proto +++ b/paddle/fluid/framework/distributed_strategy.proto @@ -115,6 +115,7 @@ message BuildStrategy { optional bool enable_auto_fusion = 11 [ default = false ]; optional bool enable_addto = 12 [ default = false ]; optional bool fix_op_run_order = 13 [ default = false ]; + optional bool allow_cuda_graph_capture = 14 [ default = false ]; } message ExecutionStrategy { diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/CMakeLists.txt b/paddle/fluid/framework/ir/multi_devices_graph_pass/CMakeLists.txt index 6764799d828661..fea12baf0651fa 100644 --- a/paddle/fluid/framework/ir/multi_devices_graph_pass/CMakeLists.txt +++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/CMakeLists.txt @@ -1,4 +1,4 @@ -cc_library(modify_op_lock_and_record_event_pass SRCS 
modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle op_graph_view multi_devices_helper) +cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle scale_loss_grad_op_handle op_graph_view multi_devices_helper) cc_library(multi_devices_graph_print_pass SRCS multi_devices_graph_print_pass.cc DEPS multi_devices_helper) cc_library(multi_devices_graph_check_pass SRCS multi_devices_graph_check_pass.cc DEPS multi_devices_helper) diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/modify_op_lock_and_record_event_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/modify_op_lock_and_record_event_pass.cc index 70b95c9154fd30..afd80e45cf65e5 100644 --- a/paddle/fluid/framework/ir/multi_devices_graph_pass/modify_op_lock_and_record_event_pass.cc +++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/modify_op_lock_and_record_event_pass.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/framework/details/computation_op_handle.h" #include "paddle/fluid/framework/details/multi_devices_helper.h" +#include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h" #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.h" @@ -21,14 +22,23 @@ namespace paddle { namespace framework { namespace ir { +template +static bool IsMatchedPlaceSingleDeviceOp(details::OpHandleBase *op_base, + const platform::Place &place) { + auto *op = dynamic_cast(op_base); + return op && op->GetPlace() == place; +} + static bool IsLockAndRecordEventFreeComputationOpHandle( details::ComputationOpHandle *op, const OpGraphView &graph_view) { if (!platform::is_gpu_place(op->GetPlace()) && !platform::is_xpu_place(op->GetPlace())) return false; for (auto &pending_op : graph_view.PendingOps(op)) { - auto *tmp = dynamic_cast(pending_op); - if (tmp == nullptr || !(tmp->GetPlace() == op->GetPlace())) { + if (!IsMatchedPlaceSingleDeviceOp( + pending_op, op->GetPlace()) && + !IsMatchedPlaceSingleDeviceOp( + pending_op, op->GetPlace())) { return false; } } diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index adbbfb380bc45f..d19ac0b65f4d1e 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -27,6 +27,7 @@ limitations under the License. */ #include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/details/op_handle_base.h" #include "paddle/fluid/framework/details/parallel_ssa_graph_executor.h" +#include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h" #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph_helper.h" @@ -34,6 +35,7 @@ limitations under the License. */ #include "paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h" #include "paddle/fluid/framework/ir/multi_devices_graph_pass/set_reader_device_info_utils.h" #include "paddle/fluid/framework/variable_helper.h" +#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" #include "paddle/fluid/platform/event.h" #include "paddle/fluid/platform/profiler.h" @@ -43,6 +45,10 @@ limitations under the License. 
*/ DECLARE_double(eager_delete_tensor_gb); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +DECLARE_bool(sync_nccl_allreduce); +#endif + #ifdef WITH_GPERFTOOLS #include "gperftools/profiler.h" #endif @@ -669,6 +675,7 @@ ParallelExecutor::ParallelExecutor(const std::vector &places, // ncclOp std::vector async_graphs = CompileGraphWithBuildStrategy(graph, &graphs, loss_var_name); + PrepareForCUDAGraphCapture(graph); graph = member_->ApplyMemoryOptimizePass(graph); async_graphs[0] = graph; @@ -882,6 +889,23 @@ void ParallelExecutor::BCastParamsToDevices( FetchResultType ParallelExecutor::Run( const std::vector &fetch_tensors, bool return_merged) { VLOG(3) << "enter ParallelExecutor Run"; +#ifdef PADDLE_WITH_CUDA + if (platform::IsCUDAGraphCapturing()) { + PADDLE_ENFORCE_EQ(fetch_tensors.empty(), true, + platform::errors::InvalidArgument( + "Cannot fetch data when using CUDA Graph.")); + PADDLE_ENFORCE_EQ( + member_->build_strategy_.allow_cuda_graph_capture_, true, + platform::errors::InvalidArgument( + "You must turn on build_strategy.allow_cuda_graph_capture = True " + "to enable CUDA Graph capturing.")); + PADDLE_ENFORCE_EQ( + member_->places_[0], platform::CUDAGraphCapturingPlace(), + platform::errors::InvalidArgument("The place to capture CUDAGraph is " + "not the same as the place to run.")); + } +#endif + #ifdef WITH_GPERFTOOLS if (gProfileStarted) { ProfilerFlush(); @@ -932,6 +956,16 @@ void ParallelExecutor::SkipMemoryReuse( void ParallelExecutor::FeedTensorsIntoLocalScopes( const std::vector> &tensors) { + if (platform::IsCUDAGraphCapturing()) { + for (auto &tensor : tensors) { + PADDLE_ENFORCE_EQ( + tensor.empty(), true, + platform::errors::PermissionDenied( + "Feeding data is not permitted when capturing CUDA Graph.")); + } + return; + } + if (!member_->AllowPartialFeed()) { PADDLE_ENFORCE_EQ(tensors.size(), member_->local_scopes_.size(), platform::errors::Unimplemented( @@ -987,6 +1021,14 @@ void ParallelExecutor::FeedTensorsIntoLocalScopes( void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes( const std::unordered_map &tensors) { + if (platform::IsCUDAGraphCapturing()) { + PADDLE_ENFORCE_EQ( + tensors.empty(), true, + platform::errors::PermissionDenied( + "Feeding data is not permitted when capturing CUDA Graph.")); + return; + } + size_t num_places = member_->places_.size(); bool allow_partial_feed = member_->AllowPartialFeed(); @@ -1568,6 +1610,107 @@ const ir::Graph &ParallelExecutor::Graph() const { return member_->executor_->Graph(); } +void ParallelExecutor::PrepareForCUDAGraphCapture(ir::Graph *graph) { + const auto &build_strategy = member_->build_strategy_; + if (!build_strategy.allow_cuda_graph_capture_) return; +#ifdef PADDLE_WITH_CUDA + PADDLE_ENFORCE_EQ( + build_strategy.async_mode_, false, + platform::errors::InvalidArgument( + "Async Executor does not support CUDA Graph capturing.")); + PADDLE_ENFORCE_EQ( + platform::IsCUDAGraphCapturing(), false, + platform::errors::PermissionDenied("CUDA Graph is not allowed to capture " + "when running the first batch.")); + PADDLE_ENFORCE_EQ( + member_->places_.size(), 1, + platform::errors::InvalidArgument( + "CUDA Graph is only supported when one GPU device is running.")); + PADDLE_ENFORCE_EQ(platform::is_gpu_place(member_->places_[0]), true, + platform::errors::InvalidArgument( + "CUDA Graph is only supported on NVIDIA GPU device.")); + PADDLE_ENFORCE_EQ(FLAGS_sync_nccl_allreduce, false, + platform::errors::InvalidArgument( + "FLAGS_sync_nccl_allreduce must be False to support " + "CUDA Graph capturing.")); + 
+ std::unordered_map> all_vars; + for (auto &node : graph->Nodes()) { + if (node->IsVar() && !node->IsCtrlVar() && node->Var()) { + auto *var_desc = node->Var(); + all_vars[var_desc->Name()].emplace_back(var_desc); + } + } + + auto mark_var_as_persistable = [&all_vars](const std::string &name) { + auto iter = all_vars.find(name); + if (iter != all_vars.end()) { + for (auto *var_desc : iter->second) { + var_desc->SetPersistable(true); + } + } + }; + + // Step 1: All fused vars must be persistable. + if (graph->Has(details::kFusedVars)) { + auto &fused_vars = graph->Get(details::kFusedVars); + for (auto &fused_var : fused_vars) { + fused_var.second.persistable_ = true; + mark_var_as_persistable(fused_var.first); + } + } + + // Step 2: All pinned vars must be persistable. + if (graph->Has(details::kPinnedVars)) { + auto &pinned_vars = graph->Get(details::kPinnedVars); + for (auto &pinned_var : pinned_vars) { + mark_var_as_persistable(pinned_var); + } + } + + // Step 3: Move all main programs to startup programs to make sure that + // the main programs would only be run once. + if (graph->Has(details::kProgramDescs)) { + auto &startup_programs = + graph->GetOrInit(details::kStartupProgramDescs); + auto &main_programs = + graph->Get(details::kProgramDescs); + for (auto &main_program : main_programs) { + startup_programs.emplace_back(main_program); + } + graph->Erase(details::kProgramDescs); + } + + // Step 4: Mark all vars in startup programs to be persistable. + if (graph->Has(details::kStartupProgramDescs)) { + auto &startup_programs = + graph->GetOrInit(details::kStartupProgramDescs); + for (auto &startup_program : startup_programs) { + for (auto &op_desc : startup_program.Block(0).AllOps()) { + for (auto &output : op_desc->OutputArgumentNames()) { + mark_var_as_persistable(output); + } + } + } + } + + // Step 5: ScaleLossGrad must be run beforehand to avoid H2D copy. + auto ops = ir::FilterByNodeWrapper(*graph); + auto *scope = member_->local_scopes_[0]; + for (auto *op : ops) { + auto *loss_grad_op = dynamic_cast(op); + if (loss_grad_op == nullptr) continue; + auto loss_grad_name = loss_grad_op->LossGradName(); + mark_var_as_persistable(loss_grad_name); + loss_grad_op->RunOnVar(scope->Var(loss_grad_name)); + loss_grad_op->SetSkipRunning(true); + } +#else + PADDLE_THROW(platform::errors::Unimplemented( + "CUDA Graph is only supported on NVIDIA GPU device.")); +#endif +} + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index 6c871a8d858156..78774f04896389 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -144,6 +144,8 @@ class ParallelExecutor { void SetReaderOpDeviceInfoOfGraphs( const std::vector &final_graphs); + void PrepareForCUDAGraphCapture(ir::Graph *graph); + ParallelExecutorPrivate *member_; std::vector> async_graphs_; std::vector var_infos_; diff --git a/paddle/fluid/operators/conv_cudnn_helper.h b/paddle/fluid/operators/conv_cudnn_helper.h index 4c0ef02074e2ed..f4183bf570926d 100644 --- a/paddle/fluid/operators/conv_cudnn_helper.h +++ b/paddle/fluid/operators/conv_cudnn_helper.h @@ -24,6 +24,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/operator_kernel_configs.h" #include "paddle/fluid/operators/conv_cudnn_op_cache.h" #include "paddle/fluid/operators/eigen/eigen_function.h" +#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" #include "paddle/fluid/platform/cudnn_desc.h" namespace paddle { namespace operators { @@ -480,6 +481,7 @@ struct SearchAlgorithm { static algo_t Find(const ConvArgs& args, bool exhaustive_search, bool deterministic, const framework::ExecutionContext& ctx) { + platform::CUDAGraphCaptureModeGuard guard; auto dtype = platform::CudnnDataType::type; size_t workspace_size_limit = FLAGS_conv_workspace_size_limit * 1024 * 1024; size_t workspace_size = 0; @@ -601,6 +603,7 @@ struct SearchAlgorithm { } static size_t GetWorkspaceSize(const ConvArgs& args, algo_t algo) { + platform::CUDAGraphCaptureModeGuard guard; size_t workspace_size = 0; PADDLE_ENFORCE_CUDA_SUCCESS( platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize( diff --git a/paddle/fluid/platform/cuda_graph.cc b/paddle/fluid/platform/cuda_graph.cc index 6e518d779e9cd4..693a5927990271 100644 --- a/paddle/fluid/platform/cuda_graph.cc +++ b/paddle/fluid/platform/cuda_graph.cc @@ -70,6 +70,9 @@ void CUDAGraph::BeginCapture(platform::CUDAPlace place, cudaStream_t stream, cudaStreamCaptureStatus status; PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamGetCaptureInfo( capturing_graph_->stream_, &status, &(capturing_graph_->id_))); + PADDLE_ENFORCE_EQ(IsValidCapturing(), true, + platform::errors::PermissionDenied( + "CUDA Graph should not be invalidated.")); VLOG(10) << "Begin to capture CUDA Graph with ID " << capturing_graph_->id_; } @@ -88,5 +91,14 @@ std::unique_ptr CUDAGraph::EndCapture() { #endif } +bool CUDAGraph::IsValidCapturing() { + if (!IsCapturing()) return false; + cudaStreamCaptureStatus status; + CUDAGraphID id; + PADDLE_ENFORCE_CUDA_SUCCESS( + cudaStreamGetCaptureInfo(capturing_graph_->stream_, &status, &id)); + return status == cudaStreamCaptureStatusActive; +} + } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/cuda_graph.h b/paddle/fluid/platform/cuda_graph.h index 41e36049aa1a01..55ec463556b452 100644 --- a/paddle/fluid/platform/cuda_graph.h +++ b/paddle/fluid/platform/cuda_graph.h @@ -84,6 +84,10 @@ class CUDAGraph { return capturing_graph_->place_; } + // This API can be used to debug which GPU operation is not + // supported during capturing CUDA Graph. 
+ static bool IsValidCapturing(); + private: #if CUDA_VERSION >= 10010 cudaGraph_t graph_{nullptr}; @@ -104,7 +108,8 @@ class CUDAGraphCaptureModeGuard { DISABLE_COPY_AND_ASSIGN(CUDAGraphCaptureModeGuard); public: - explicit CUDAGraphCaptureModeGuard(cudaStreamCaptureMode mode) { + explicit CUDAGraphCaptureModeGuard( + cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed) { if (UNLIKELY(CUDAGraph::IsCapturing())) { PADDLE_ENFORCE_CUDA_SUCCESS(cudaThreadExchangeStreamCaptureMode(&mode)); // After cudaThreadExchangeStreamCaptureMode is called, @@ -128,7 +133,8 @@ class CUDAGraphCaptureModeGuard { DISABLE_COPY_AND_ASSIGN(CUDAGraphCaptureModeGuard); public: - explicit CUDAGraphCaptureModeGuard(cudaStreamCaptureMode) {} + explicit CUDAGraphCaptureModeGuard( + cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed) {} }; #endif diff --git a/paddle/fluid/platform/cuda_graph_with_memory_pool.cc b/paddle/fluid/platform/cuda_graph_with_memory_pool.cc index 1f0d39e2abe236..4804d3f6ed3016 100644 --- a/paddle/fluid/platform/cuda_graph_with_memory_pool.cc +++ b/paddle/fluid/platform/cuda_graph_with_memory_pool.cc @@ -22,8 +22,10 @@ namespace platform { #ifdef PADDLE_WITH_CUDA void BeginCUDAGraphCapture(platform::CUDAPlace place, cudaStreamCaptureMode mode) { - auto stream = - platform::DeviceContextPool::Instance().GetByPlace(place)->stream(); + auto *dev_ctx = platform::DeviceContextPool::Instance().GetByPlace(place); + dev_ctx->cudnn_workspace_handle().ResetWorkspace(); + + auto stream = dev_ctx->stream(); CUDAGraph::BeginCapture(place, stream, mode); auto id = CUDAGraph::CapturingID(); memory::allocation::AllocatorFacade::Instance().PrepareMemoryPoolForCUDAGraph( @@ -35,6 +37,9 @@ void BeginCUDAGraphCapture(platform::CUDAPlace place, } std::unique_ptr EndCUDAGraphCapture() { + auto place = CUDAGraph::CapturingPlace(); + auto *dev_ctx = platform::DeviceContextPool::Instance().GetByPlace(place); + dev_ctx->cudnn_workspace_handle().ResetWorkspace(); return CUDAGraph::EndCapture(); } #endif diff --git a/paddle/fluid/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc index 59e4404ffe535c..c624ba94b74a3e 100644 --- a/paddle/fluid/platform/gpu_info.cc +++ b/paddle/fluid/platform/gpu_info.cc @@ -558,7 +558,7 @@ class RecordedCudaMallocHelper { #ifdef PADDLE_WITH_HIP auto result = hipMalloc(ptr, size); #else - CUDAGraphCaptureModeGuard capture_mode_guard{cudaStreamCaptureModeRelaxed}; + CUDAGraphCaptureModeGuard capture_mode_guard; auto result = cudaMalloc(ptr, size); #endif if (result == gpuSuccess) { diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 6b24c644925815..f58c2a5db381c7 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -736,6 +736,17 @@ PYBIND11_MODULE(core_noavx, m) { paddle::framework::proto::VarType::Type type) { return reinterpret_cast(self.mutable_data(place, type)); }) + .def("_copy_from", + [](framework::Tensor &self, const framework::Tensor &other, + const platform::Place &place, int64_t batch_size) { + if (batch_size < 0) { + framework::TensorCopy(other, place, &self); + } else { + auto sliced = other.Slice(0, batch_size); + framework::TensorCopy(sliced, place, &self); + } + }, + py::arg("tensor"), py::arg("place"), py::arg("batch_size") = -1) .def("set", SetTensorFromPyArray, py::arg("array"), py::arg("place"), py::arg("zero_copy") = false) .def("set", SetTensorFromPyArray, @@ -2299,7 +2310,14 @@ All parameter, weight, gradient are variables in Paddle. 
m.def("op_support_gpu", OpSupportGPU); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) m.def("get_cuda_device_count", platform::GetCUDADeviceCount); - m.def("cuda_empty_cache", platform::EmptyCache); + m.def("cuda_empty_cache", [] { + for (int dev_id : platform::GetSelectedDevices()) { + auto *dev_ctx = platform::DeviceContextPool::Instance().GetByPlace( + platform::CUDAPlace(dev_id)); + dev_ctx->cudnn_workspace_handle().ResetWorkspace(); + } + platform::EmptyCache(); + }); m.def("get_device_properties", [](int id) -> const gpuDeviceProp & { return platform::GetDeviceProperties(id); @@ -3211,6 +3229,13 @@ All parameter, weight, gradient are variables in Paddle. [](BuildStrategy &self, bool fix_op_run_order) { self.fix_op_run_order_ = fix_op_run_order; }) + .def_property("allow_cuda_graph_capture", + [](const BuildStrategy &self) { + return self.allow_cuda_graph_capture_; + }, + [](BuildStrategy &self, bool allow_cuda_graph_capture) { + self.allow_cuda_graph_capture_ = allow_cuda_graph_capture; + }) .def("_copy", [](const BuildStrategy &self) { auto new_bs = self; diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index 4c7537d8d5c8eb..8c118f31cbe87a 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -1044,9 +1044,15 @@ def _run_parallel(self, program, scope, feed, fetch_list, fetch_var_name, lr_value = lr_sheduler() lr_var = program._program.global_block().vars[lr_sheduler._var_name] lr_tensor = _as_lodtensor(lr_value, core.CPUPlace(), lr_var.dtype) - exe.feed_and_split_tensor_into_local_scopes({ - lr_sheduler._var_name: lr_tensor - }) + if core.is_cuda_graph_capturing(): + warnings.warn( + "Caution!!! When capturing CUDA Graph, the learning rate scheduler would not " + "take any effect! Please set the learning rate manually before each batch!" 
+ ) + else: + exe.feed_and_split_tensor_into_local_scopes({ + lr_sheduler._var_name: lr_tensor + }) fetch_var_names = list(map(_to_name_str, fetch_list)) tensors = exe.run(fetch_var_names, return_merged)._move_to_list() diff --git a/python/paddle/fluid/tests/unittests/test_cuda_graph.py b/python/paddle/fluid/tests/unittests/test_cuda_graph.py index 272d68e17fcc4d..7d1317473531e4 100644 --- a/python/paddle/fluid/tests/unittests/test_cuda_graph.py +++ b/python/paddle/fluid/tests/unittests/test_cuda_graph.py @@ -17,18 +17,105 @@ from paddle.device.cuda.graphs import CUDAGraph import unittest import numpy as np +from paddle.fluid.dygraph.base import switch_to_static_graph +from simple_nets import simple_fc_net_with_inputs class TestCUDAGraph(unittest.TestCase): def setUp(self): - fluid.set_flags({'FLAGS_allocator_strategy': 'auto_growth'}) + if paddle.is_compiled_with_cuda() and not paddle.is_compiled_with_rocm( + ): + fluid.set_flags({ + 'FLAGS_allocator_strategy': 'auto_growth', + 'FLAGS_sync_nccl_allreduce': False, + 'FLAGS_cudnn_deterministic': True + }) def random_tensor(self, shape): return paddle.to_tensor( np.random.randint( low=0, high=10, size=shape).astype("float32")) - def test_cuda_graph(self): + @switch_to_static_graph + def test_cuda_graph_static_graph(self): + if not paddle.is_compiled_with_cuda() or paddle.is_compiled_with_rocm(): + return + + seed = 100 + loss_cuda_graph = self.cuda_graph_static_graph_main( + seed, use_cuda_graph=True) + loss_no_cuda_graph = self.cuda_graph_static_graph_main( + seed, use_cuda_graph=False) + self.assertEqual(loss_cuda_graph, loss_no_cuda_graph) + + def cuda_graph_static_graph_main(self, seed, use_cuda_graph): + batch_size = 1 + class_num = 10 + image_shape = [batch_size, 784] + label_shape = [batch_size, 1] + + paddle.seed(seed) + np.random.seed(seed) + startup = paddle.static.Program() + main = paddle.static.Program() + with paddle.static.program_guard(main, startup): + image = paddle.static.data( + name="image", shape=image_shape, dtype='float32') + label = paddle.static.data( + name="label", shape=label_shape, dtype='int64') + image.persistable = True + label.persistable = True + loss = simple_fc_net_with_inputs(image, label, class_num) + loss.persistable = True + lr = paddle.optimizer.lr.PiecewiseDecay( + boundaries=[2, 3, 4], values=[0.01, 0.02, 0.03, 0.04]) + optimizer = paddle.optimizer.SGD(learning_rate=lr) + optimizer.minimize(loss) + place = paddle.CUDAPlace(0) + exe = paddle.static.Executor(place) + scope = paddle.static.Scope() + with paddle.static.scope_guard(scope): + exe.run(startup) + build_strategy = paddle.static.BuildStrategy() + build_strategy.allow_cuda_graph_capture = True + build_strategy.fix_op_run_order = True + build_strategy.fuse_all_optimizer_ops = True + compiled_program = paddle.static.CompiledProgram( + main).with_data_parallel( + loss_name=loss.name, + build_strategy=build_strategy, + places=place) + image_t = scope.var(image.name).get_tensor() + label_t = scope.var(label.name).get_tensor() + loss_t = scope.var(loss.name).get_tensor() + lr_var = main.global_block().var(lr._var_name) + self.assertTrue(lr_var.persistable) + lr_t = scope.var(lr_var.name).get_tensor() + cuda_graph = None + for batch_id in range(20): + image_t.set( + np.random.rand(*image_shape).astype('float32'), place) + label_t.set(np.random.randint( + low=0, high=class_num, size=label_shape, dtype='int64'), + place) + + if batch_id == 1 and use_cuda_graph: + cuda_graph = CUDAGraph(place, mode="global") + cuda_graph.capture_begin() + 
exe.run(compiled_program) + cuda_graph.capture_end() + + if cuda_graph: + lr_t.set(np.array([lr()], dtype='float32'), place) + cuda_graph.replay() + else: + exe.run(compiled_program) + lr.step() + if cuda_graph: + cuda_graph.reset() + return np.array(loss_t) + + def test_cuda_graph_dynamic_graph(self): if not paddle.is_compiled_with_cuda() or paddle.is_compiled_with_rocm(): return From 57e8cbecaf06a54686f9aa28f2a8a84d32dcae6f Mon Sep 17 00:00:00 2001 From: jakpiase <62569058+jakpiase@users.noreply.github.com> Date: Fri, 8 Oct 2021 17:29:51 +0200 Subject: [PATCH 074/298] Fix for oneDNN conv op (#36284) * fix for conv op * Minor change --- paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc index c663ba2f886809..cce835e6bc0354 100644 --- a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc @@ -217,9 +217,10 @@ class ConvMKLDNNHandlerT const auto fwd_prop_kind = is_test ? mkldnn::prop_kind::forward_inference : mkldnn::prop_kind::forward_training; - float sum_scale; + float sum_scale = 1.0f; std::vector output_shift_scale; - std::tie(sum_scale, output_shift_scale) = get_int8_scales(ctx); + if (platform::is_int8()) + std::tie(sum_scale, output_shift_scale) = get_int8_scales(ctx); const mkldnn::primitive_attr conv_attr = CreatePostOps( fuse_activation, fuse_alpha, fuse_beta, fuse_residual_conn, From d8887afaf0d4ae9bb30831f58cd5eb62e3f63e0a Mon Sep 17 00:00:00 2001 From: wuhuanzhou Date: Sat, 9 Oct 2021 10:08:52 +0800 Subject: [PATCH 075/298] fix hasattr(paddle.fluid.ir.PassDesc.OP, '__name__') error (#36229) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 对于__getattr__重载后不满足条件的参数,全部抛出AttributeError异常,达到与未重载版本一致。 --- python/paddle/fluid/ir.py | 10 ++++++---- .../fluid/tests/unittests/ir/test_ir_generate_pass.py | 3 +++ 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/python/paddle/fluid/ir.py b/python/paddle/fluid/ir.py index 17b7ea1122ab75..7e2d3df1ce1e43 100644 --- a/python/paddle/fluid/ir.py +++ b/python/paddle/fluid/ir.py @@ -230,9 +230,6 @@ def __init__(self, type=None): self._type = type def __getattr__(self, name): - if self._type is not None: - raise AttributeError( - "type object 'OpHelper' has no attribute '{}'".format(name)) op = PassDesc.OpHelper(name) op.Init() return op @@ -261,7 +258,12 @@ def Init(self): self._op_idx = len(block.ops) self._op_desc = block.desc.append_op() self._op_desc.set_type(self._type) - self._op_proto = OpProtoHolder.instance().get_op_proto(self._type) + self._op_proto = OpProtoHolder.instance().op_proto_map.get( + self._type) + if self._op_proto is None: + raise AttributeError( + "type object 'OpHelper' has no attribute '{}'".format( + self._type)) block.ops.append(self) def Attr(self, name): diff --git a/python/paddle/fluid/tests/unittests/ir/test_ir_generate_pass.py b/python/paddle/fluid/tests/unittests/ir/test_ir_generate_pass.py index c8b9d5e5739ddd..851ae21c38378f 100644 --- a/python/paddle/fluid/tests/unittests/ir/test_ir_generate_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/test_ir_generate_pass.py @@ -123,6 +123,9 @@ def convert_ops_to_op_dicts(self, ops): op_dicts[op.type] = [op] return op_dicts + def test_has_attr(self): + self.assertFalse(hasattr(ir.PassDesc.OP, '__name__')) + def test_generate_fc_fuse(self): def _check_fc_fuse_pass(pass_desc, with_relu): pattern_op_dicts = 
self.convert_ops_to_op_dicts( From 2fd8deea8d6dedd567000fb092f4c1292e6dbdc8 Mon Sep 17 00:00:00 2001 From: wuhuanzhou Date: Sat, 9 Oct 2021 10:09:10 +0800 Subject: [PATCH 076/298] C++ support register pass via PassDesc (#36095) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 支持C++开发注册GeneratePass,简化针对fusion等子图优化场景开发方式。 --- paddle/fluid/framework/ir/generate_pass.cc | 110 ++++++++ paddle/fluid/framework/ir/generate_pass.h | 153 +++++++++- .../framework/ir/generate_pass_tester.cc | 267 ++++-------------- 3 files changed, 314 insertions(+), 216 deletions(-) diff --git a/paddle/fluid/framework/ir/generate_pass.cc b/paddle/fluid/framework/ir/generate_pass.cc index 9eba6fc89a2e96..085298314ea3ff 100644 --- a/paddle/fluid/framework/ir/generate_pass.cc +++ b/paddle/fluid/framework/ir/generate_pass.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/ir/generate_pass.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" namespace paddle { namespace framework { @@ -224,6 +225,115 @@ bool GeneratePass::VerifyGraph(const Graph& graph) { return true; } +namespace generate_pass { + +VarHelper::VarHelper(const char* name) : name_(name), type_(Type::kInput) {} +VarHelper::VarHelper(const std::string& name, Type type) + : name_(name), type_(type) {} + +OpHelper::OpHelper(const char* type, SubgraphHelper* subgraph_helper) + : type_(type), subgraph_helper_(subgraph_helper) { + op_desc_ = subgraph_helper_->ProgramDesc()->mutable_blocks(0)->add_ops(); + op_desc_->set_type(type_); +} + +OpHelper::Arguments::Arguments(const char* parameter, + const VarHelper& var_helper) + : parameter_(parameter) { + var_helpers_.push_back(var_helper); +} + +OpHelper::Arguments::Arguments(const char* parameter, + std::initializer_list var_helpers) + : parameter_(parameter), var_helpers_(var_helpers) {} + +OpHelper& OpHelper::operator()(const Arguments& input) { + proto::OpDesc::Var* var = op_desc_->add_inputs(); + var->set_parameter(input.parameter_); + for (const VarHelper& var_helper : input.var_helpers_) { + var->add_arguments()->assign(var_helper.name_); + if (VarHelper::Type::kInput == var_helper.type_) { + subgraph_helper_->AddInputVar(var_helper.name_); + } + } + return *this; +} + +OpHelper& OpHelper::operator()(std::initializer_list inputs) { + for (const auto& input : inputs) { + operator()(input); + } + return *this; +} + +VarHelper OpHelper::Out(const char* name) { + std::string argument = patterns::UniqueKey(type_); + proto::OpDesc::Var* var = op_desc_->add_outputs(); + var->set_parameter(name); + var->add_arguments()->assign(argument); + return VarHelper(argument, VarHelper::Type::kOutput); +} + +proto::ProgramDesc* SubgraphHelper::ProgramDesc() { return &program_desc_; } + +const proto::ProgramDesc& SubgraphHelper::ProgramDesc() const { + return program_desc_; +} + +const std::vector& SubgraphHelper::InputVars() const { + return input_vars_; +} + +const std::vector& SubgraphHelper::OutputVars() const { + return output_vars_; +} + +void SubgraphHelper::AddInputVar(const std::string& name) { + auto iter = std::find(input_vars_.begin(), input_vars_.end(), name); + if (input_vars_.end() == iter) { + input_vars_.push_back(name); + } +} + +void SubgraphHelper::AddOutputVars(const VarHelper& var_helper) { + output_vars_.push_back(var_helper.name_); +} + +} // namespace generate_pass + +PassPairs::PassPairs(const SubgraphType& pattern, const SubgraphType& replace) { + AddPassDesc(pattern, replace); +} + +void 
PassPairs::AddPassDesc(const SubgraphType& pattern, + const SubgraphType& replace) { + proto::PassDesc* pass_desc = multi_pass_desc_.add_pass_descs(); + pass_desc->mutable_pattern()->CopyFrom(pattern.ProgramDesc()); + pass_desc->mutable_replace()->CopyFrom(replace.ProgramDesc()); + PADDLE_ENFORCE_EQ(pattern.InputVars().size(), replace.InputVars().size(), + platform::errors::InvalidArgument( + "Size of lambda expression arguments is not equal " + "between pattern/replace subgraph.")); + for (size_t i = 0; i < pattern.InputVars().size(); i++) { + proto::PassDesc::VarMap* var_map = pass_desc->add_var_maps(); + var_map->set_pattern_var(pattern.InputVars()[i]); + var_map->set_replace_var(replace.InputVars()[i]); + } + PADDLE_ENFORCE_EQ(pattern.OutputVars().size(), replace.OutputVars().size(), + platform::errors::InvalidArgument( + "Size of lambda expression returns is not equal " + "between pattern/replace subgraph.")); + for (size_t i = 0; i < pattern.OutputVars().size(); i++) { + proto::PassDesc::VarMap* var_map = pass_desc->add_var_maps(); + var_map->set_pattern_var(pattern.OutputVars()[i]); + var_map->set_replace_var(replace.OutputVars()[i]); + } +} + +const proto::MultiPassDesc& PassPairs::MultiPassDesc() const { + return multi_pass_desc_; +} + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/generate_pass.h b/paddle/fluid/framework/ir/generate_pass.h index f73173233aed32..26e5231fbc16e7 100644 --- a/paddle/fluid/framework/ir/generate_pass.h +++ b/paddle/fluid/framework/ir/generate_pass.h @@ -13,7 +13,6 @@ // limitations under the License. #pragma once -#include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/framework/pass_desc.pb.h" @@ -43,6 +42,158 @@ class GeneratePass : public Pass { proto::MultiPassDesc multi_pass_desc_; }; +namespace generate_pass { + +class VarHelper; +class OpHelper; +class SubgraphHelper; + +// VarHelper is used to represent a variable node. +struct VarHelper { + enum class Type { kInput, kOutput }; + + explicit VarHelper(const char* name); + VarHelper(const std::string& name, Type type); + + std::string name_; + Type type_; +}; + +// OpHelper is used to represent a operator node. +class OpHelper { + public: + // Convert multiple inputs. + struct Arguments { + Arguments(const char* parameter, const VarHelper& var_helper); + Arguments(const char* parameter, + std::initializer_list var_helpers); + + std::string parameter_; + std::vector var_helpers_; + }; + + OpHelper(const char* type, SubgraphHelper* subgraph_helper); + + OpHelper& operator()(const Arguments& input); + OpHelper& operator()(std::initializer_list inputs); + + VarHelper Out(const char* name); + + private: + OpHelper() = delete; + DISABLE_COPY_AND_ASSIGN(OpHelper); + + const char* type_; + proto::OpDesc* op_desc_; + SubgraphHelper* subgraph_helper_; +}; + +/* + * SubgraphHelper is used to define pattern/replace subgraphs. + * + * Use lambda expression to define subgraph like Python. SubgraphHelper + * converts lambda expression to ProgramDesc. + * + * In order to define a subgraph, user need to use VarHelper and OpHelper. + * Use the macros instead of class names, so user can develop better and + * don't need to know too much about underlying implementation. 
+ * + * An example of defining a subgraph as follows: + * + * SUBGRAPH_(subgraph)([subgraph=&subgraph](VAR_(x), VAR_(y), VAR_(z)) { + * auto ewadd1 = OP_(elementwise_add)({{"X", x}, {"Y", y}}).Out("Out"); + * auto ewadd2 = OP_(elementwise_add)({{"X", ewadd1}, {"Y", z}}).Out("Out"); + * return ewadd2; + * }); + * + */ +class SubgraphHelper { + public: + SubgraphHelper() = default; + // The lambda expression is a prvalue expression. + template + SubgraphHelper& operator=(const T&& f) { + proto::BlockDesc* block = program_desc_.add_blocks(); + block->set_idx(0); + block->set_parent_idx(0); + AddOutputVars(f()); + return *this; + } + + proto::ProgramDesc* ProgramDesc(); + const proto::ProgramDesc& ProgramDesc() const; + const std::vector& InputVars() const; + const std::vector& OutputVars() const; + + void AddInputVar(const std::string& name); + + void AddOutputVars(const VarHelper& var_helper); + + template * = nullptr> + void AddOutputVars(const std::tuple& outputs) { + AddOutputVars(std::get(outputs)); + AddOutputVars(outputs); + } + + template * = nullptr> + void AddOutputVars(const std::tuple& outputs) { + AddOutputVars(std::get(outputs)); + } + + template + void AddOutputVars(const std::tuple& outputs) { + AddOutputVars<0>(outputs); + } + + private: + DISABLE_COPY_AND_ASSIGN(SubgraphHelper); + std::vector input_vars_; + std::vector output_vars_; + proto::ProgramDesc program_desc_; +}; + +} // namespace generate_pass + +class PassPairs { + public: + using SubgraphType = generate_pass::SubgraphHelper; + + PassPairs() = default; + PassPairs(const SubgraphType& pattern, const SubgraphType& replace); + + void AddPassDesc(const SubgraphType& pattern, const SubgraphType& replace); + + const proto::MultiPassDesc& MultiPassDesc() const; + + private: + proto::MultiPassDesc multi_pass_desc_; +}; + +// Use function to register in CC. 
+template +class MacroPassHelper : public GeneratePass { + public: + MacroPassHelper() : GeneratePass(Functor().MultiPassDesc()) {} +}; + +#define VAR_(name) \ + ::paddle::framework::ir::generate_pass::VarHelper name = \ + ::paddle::framework::ir::generate_pass::VarHelper(#name) +#define OP_(type) \ + ::paddle::framework::ir::generate_pass::OpHelper(#type, subgraph) +#define SUBGRAPH_(name) \ + ::paddle::framework::ir::generate_pass::SubgraphHelper name; \ + name + +#define REGISTER_GENERATE_PASS(pass_type) \ + paddle::framework::ir::PassPairs register_##pass_type(); \ + REGISTER_PASS( \ + pass_type, \ + ::paddle::framework::ir::MacroPassHelper<®ister_##pass_type>); \ + paddle::framework::ir::PassPairs register_##pass_type() + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/generate_pass_tester.cc b/paddle/fluid/framework/ir/generate_pass_tester.cc index c3852d29c308ff..6876dde50c157c 100644 --- a/paddle/fluid/framework/ir/generate_pass_tester.cc +++ b/paddle/fluid/framework/ir/generate_pass_tester.cc @@ -16,234 +16,71 @@ #include "gtest/gtest.h" #include "paddle/fluid/framework/ir/pass_tester_helper.h" -namespace paddle { -namespace framework { -namespace ir { - -template -class CXXGeneratePass : public GeneratePass { - public: - CXXGeneratePass() : GeneratePass(Functor()) {} -}; - -#define REGISTER_GENERATE_PASS(pass_type, function) \ - REGISTER_PASS(pass_type, ::paddle::framework::ir::CXXGeneratePass<&function>) - -proto::MultiPassDesc generate_fc_fuse() { - proto::MultiPassDesc multi_pass_desc; +REGISTER_GENERATE_PASS(generate_fc_fuse) { + paddle::framework::ir::PassPairs pass_pairs; for (bool with_relu : {true, false}) { - proto::PassDesc* pass_desc = multi_pass_desc.add_pass_descs(); - proto::BlockDesc* pattern = pass_desc->mutable_pattern()->add_blocks(); - pattern->set_idx(0); - pattern->set_parent_idx(0); - proto::OpDesc* mul = pattern->add_ops(); - mul->set_type("mul"); - proto::OpDesc::Var* mul_x = mul->add_inputs(); - mul_x->set_parameter("X"); - mul_x->add_arguments()->assign("x"); - proto::OpDesc::Var* mul_y = mul->add_inputs(); - mul_y->set_parameter("Y"); - mul_y->add_arguments()->assign("w"); - proto::OpDesc::Var* mul_out = mul->add_outputs(); - mul_out->set_parameter("Out"); - mul_out->add_arguments()->assign("mul_out"); - proto::OpDesc* ewadd = pattern->add_ops(); - ewadd->set_type("elementwise_add"); - proto::OpDesc::Var* ewadd_x = ewadd->add_inputs(); - ewadd_x->set_parameter("X"); - ewadd_x->add_arguments()->assign("mul_out"); - proto::OpDesc::Var* ewadd_y = ewadd->add_inputs(); - ewadd_y->set_parameter("Y"); - ewadd_y->add_arguments()->assign("b"); - proto::OpDesc::Var* ewadd_out = ewadd->add_outputs(); - ewadd_out->set_parameter("Out"); - ewadd_out->add_arguments()->assign("ewadd_out"); - proto::OpDesc* relu = nullptr; - proto::BlockDesc* replace = pass_desc->mutable_replace()->add_blocks(); - replace->set_idx(0); - replace->set_parent_idx(0); - proto::OpDesc* fc = replace->add_ops(); - fc->set_type("fc"); - proto::OpDesc::Var* fc_x = fc->add_inputs(); - fc_x->set_parameter("Input"); - fc_x->add_arguments()->assign("x"); - proto::OpDesc::Var* fc_w = fc->add_inputs(); - fc_w->set_parameter("W"); - fc_w->add_arguments()->assign("w"); - proto::OpDesc::Var* fc_b = fc->add_inputs(); - fc_b->set_parameter("Bias"); - fc_b->add_arguments()->assign("b"); - proto::OpDesc::Var* fc_out = fc->add_outputs(); - fc_out->set_parameter("Out"); - fc_out->add_arguments()->assign("fc_out"); - for (const char* var : {"x", "w", "b", 
"fc_out"}) { - proto::PassDesc::VarMap* var_map = pass_desc->add_var_maps(); - var_map->set_pattern_var(var); - var_map->set_replace_var(var); - } - proto::PassDesc::AttrMap* attr_map = pass_desc->add_attr_maps(); - attr_map->set_pattern_op_idx(0); - attr_map->set_pattern_name("x_num_col_dims"); - attr_map->set_replace_op_idx(0); - attr_map->set_replace_name("in_num_col_dims"); - if (with_relu) { - relu = pattern->add_ops(); - relu->set_type("relu"); - proto::OpDesc::Var* relu_x = relu->add_inputs(); - relu_x->set_parameter("X"); - relu_x->add_arguments()->assign("ewadd_out"); - proto::OpDesc::Var* relu_out = relu->add_outputs(); - relu_out->set_parameter("Out"); - relu_out->add_arguments()->assign("relu_out"); - pass_desc->mutable_var_maps(3)->set_pattern_var("relu_out"); - proto::OpDesc::Attr* attr = fc->add_attrs(); - attr->set_name("activation_type"); - attr->set_type(proto::AttrType::STRING); - attr->set_s("relu"); - } else { - pass_desc->mutable_var_maps(3)->set_pattern_var("ewadd_out"); - } + // pattern + SUBGRAPH_(pattern) = + [ subgraph = &pattern, with_relu ](VAR_(x), VAR_(y), VAR_(z)) { + VLOG(3) << "exec lambda func."; + auto mul = OP_(mul)({{"X", x}, {"Y", y}}).Out("Out"); + auto ewadd = OP_(elementwise_add)({{"X", mul}, {"Y", z}}).Out("Out"); + if (with_relu) { + return OP_(relu)({"X", ewadd}).Out("Out"); + } else { + return ewadd; + } + }; + // replace + SUBGRAPH_(replace) = + [ subgraph = &replace, with_relu ](VAR_(x), VAR_(y), VAR_(z)) { + auto& fc = OP_(fc)({{"Input", x}, {"W", y}, {"Bias", z}}); + return fc.Out("Out"); + }; + pass_pairs.AddPassDesc(pattern, replace); } - return multi_pass_desc; + return pass_pairs; } -proto::MultiPassDesc generate_multi_add_to_addn() { - proto::MultiPassDesc multi_pass_desc; - proto::PassDesc* pass_desc = multi_pass_desc.add_pass_descs(); - proto::BlockDesc* pattern = pass_desc->mutable_pattern()->add_blocks(); - proto::OpDesc* ewadd_0 = pattern->add_ops(); - ewadd_0->set_type("elementwise_add"); - proto::OpDesc::Var* ewadd_0_x = ewadd_0->add_inputs(); - ewadd_0_x->set_parameter("X"); - ewadd_0_x->add_arguments()->assign("a"); - proto::OpDesc::Var* ewadd_0_y = ewadd_0->add_inputs(); - ewadd_0_y->set_parameter("Y"); - ewadd_0_y->add_arguments()->assign("b"); - proto::OpDesc::Var* ewadd_0_out = ewadd_0->add_outputs(); - ewadd_0_out->set_parameter("Out"); - ewadd_0_out->add_arguments()->assign("ewadd_out_0"); - proto::OpDesc* ewadd_1 = pattern->add_ops(); - ewadd_1->set_type("elementwise_add"); - proto::OpDesc::Var* ewadd_1_x = ewadd_1->add_inputs(); - ewadd_1_x->set_parameter("X"); - ewadd_1_x->add_arguments()->assign("ewadd_out_0"); - proto::OpDesc::Var* ewadd_1_y = ewadd_1->add_inputs(); - ewadd_1_y->set_parameter("Y"); - ewadd_1_y->add_arguments()->assign("c"); - proto::OpDesc::Var* ewadd_1_out = ewadd_1->add_outputs(); - ewadd_1_out->set_parameter("Out"); - ewadd_1_out->add_arguments()->assign("ewadd_out_1"); - proto::BlockDesc* replace = pass_desc->mutable_replace()->add_blocks(); - proto::OpDesc* addn = replace->add_ops(); - addn->set_type("add_n"); - proto::OpDesc::Var* addn_x = addn->add_inputs(); - addn_x->set_parameter("X"); - addn_x->add_arguments()->assign("a"); - addn_x->add_arguments()->assign("b"); - addn_x->add_arguments()->assign("c"); - proto::OpDesc::Var* addn_out = addn->add_outputs(); - addn_out->set_parameter("Out"); - addn_out->add_arguments()->assign("addn_out"); - for (const char* var : {"a", "b", "c", "ewadd_out_1"}) { - proto::PassDesc::VarMap* var_map = pass_desc->add_var_maps(); - var_map->set_pattern_var(var); 
- var_map->set_replace_var(var); - } - pass_desc->mutable_var_maps(3)->set_replace_var("addn_out"); - return multi_pass_desc; +REGISTER_GENERATE_PASS(generate_multi_add_to_addn) { + // pattern + SUBGRAPH_(pattern) = [subgraph = &pattern](VAR_(x), VAR_(y), VAR_(z)) { + auto ewadd1 = OP_(elementwise_add)({{"X", x}, {"Y", y}}).Out("Out"); + auto ewadd2 = OP_(elementwise_add)({{"X", ewadd1}, {"Y", z}}).Out("Out"); + return ewadd2; + }; + // replace + SUBGRAPH_(replace) = [subgraph = &replace](VAR_(x), VAR_(y), VAR_(z)) { + return OP_(sum)({"X", {x, y, z}}).Out("Out"); + }; + return {pattern, replace}; } -proto::MultiPassDesc generate_combine_matmul() { - proto::MultiPassDesc multi_pass_desc; - proto::PassDesc* pass_desc = multi_pass_desc.add_pass_descs(); - proto::BlockDesc* pattern = pass_desc->mutable_pattern()->add_blocks(); - proto::OpDesc* matmul_0 = pattern->add_ops(); - matmul_0->set_type("matmul"); - proto::OpDesc::Var* matmul_0_x = matmul_0->add_inputs(); - matmul_0_x->set_parameter("X"); - matmul_0_x->add_arguments()->assign("a"); - proto::OpDesc::Var* matmul_0_y = matmul_0->add_inputs(); - matmul_0_y->set_parameter("Y"); - matmul_0_y->add_arguments()->assign("b"); - proto::OpDesc::Var* matmul_0_out = matmul_0->add_outputs(); - matmul_0_out->set_parameter("Out"); - matmul_0_out->add_arguments()->assign("matmul_out_0"); - proto::OpDesc* matmul_1 = pattern->add_ops(); - matmul_1->set_type("matmul"); - proto::OpDesc::Var* matmul_1_x = matmul_1->add_inputs(); - matmul_1_x->set_parameter("X"); - matmul_1_x->add_arguments()->assign("a"); - proto::OpDesc::Var* matmul_1_y = matmul_1->add_inputs(); - matmul_1_y->set_parameter("Y"); - matmul_1_y->add_arguments()->assign("c"); - proto::OpDesc::Var* matmul_1_out = matmul_1->add_outputs(); - matmul_1_out->set_parameter("Out"); - matmul_1_out->add_arguments()->assign("matmul_out_1"); - proto::BlockDesc* replace = pass_desc->mutable_replace()->add_blocks(); - proto::OpDesc* concat = replace->add_ops(); - concat->set_type("concat"); - proto::OpDesc::Var* concat_x = concat->add_inputs(); - concat_x->set_parameter("X"); - concat_x->add_arguments()->assign("b"); - concat_x->add_arguments()->assign("c"); - proto::OpDesc::Var* concat_out = concat->add_outputs(); - concat_out->set_parameter("Out"); - concat_out->add_arguments()->assign("concat_out"); - proto::OpDesc* matmul = replace->add_ops(); - matmul->set_type("matmul"); - proto::OpDesc::Var* matmul_x = matmul->add_inputs(); - matmul_x->set_parameter("X"); - matmul_x->add_arguments()->assign("a"); - proto::OpDesc::Var* matmul_y = matmul->add_inputs(); - matmul_y->set_parameter("Y"); - matmul_y->add_arguments()->assign("concat_out"); - proto::OpDesc::Var* matmul_out = matmul->add_outputs(); - matmul_out->set_parameter("Out"); - matmul_out->add_arguments()->assign("matmul_out"); - proto::OpDesc* slice_0 = replace->add_ops(); - slice_0->set_type("slice"); - proto::OpDesc::Var* slice_0_x = slice_0->add_inputs(); - slice_0_x->set_parameter("X"); - slice_0_x->add_arguments()->assign("matmul_out"); - proto::OpDesc::Var* slice_0_out = slice_0->add_outputs(); - slice_0_out->set_parameter("Out"); - slice_0_out->add_arguments()->assign("slice_out_0"); - proto::OpDesc* slice_1 = replace->add_ops(); - slice_1->set_type("slice"); - proto::OpDesc::Var* slice_1_x = slice_1->add_inputs(); - slice_1_x->set_parameter("X"); - slice_1_x->add_arguments()->assign("matmul_out"); - proto::OpDesc::Var* slice_1_out = slice_1->add_outputs(); - slice_1_out->set_parameter("Out"); - 
slice_1_out->add_arguments()->assign("slice_out_1"); - for (const char* var : {"a", "b", "c", "matmul_out_0", "matmul_out_1"}) { - proto::PassDesc::VarMap* var_map = pass_desc->add_var_maps(); - var_map->set_pattern_var(var); - var_map->set_replace_var(var); - } - pass_desc->mutable_var_maps(3)->set_replace_var("slice_out_0"); - pass_desc->mutable_var_maps(4)->set_replace_var("slice_out_1"); - return multi_pass_desc; +REGISTER_GENERATE_PASS(generate_combine_matmul) { + // pattern + SUBGRAPH_(pattern) = [subgraph = &pattern](VAR_(x), VAR_(y), VAR_(z)) { + auto matmul1 = OP_(matmul)({{"X", x}, {"Y", y}}).Out("Out"); + auto matmul2 = OP_(matmul)({{"X", x}, {"Y", z}}).Out("Out"); + return std::make_tuple(matmul1, matmul2); + }; + // replace + SUBGRAPH_(replace) = [subgraph = &replace](VAR_(x), VAR_(y), VAR_(z)) { + auto concat = OP_(concat)({"X", {y, z}}).Out("Out"); + auto matmul = OP_(matmul)({{"X", x}, {"Y", concat}}).Out("Out"); + auto slice1 = OP_(slice)({"X", matmul}).Out("Out"); + auto slice2 = OP_(slice)({"X", matmul}).Out("Out"); + return std::make_tuple(slice1, slice2); + }; + return {pattern, replace}; } -} // namespace ir -} // namespace framework -} // namespace paddle - -REGISTER_GENERATE_PASS(generate_fc_fuse, - paddle::framework::ir::generate_fc_fuse); -REGISTER_GENERATE_PASS(generate_multi_add_to_addn, - paddle::framework::ir::generate_multi_add_to_addn); -REGISTER_GENERATE_PASS(generate_combine_matmul, - paddle::framework::ir::generate_combine_matmul); - namespace paddle { namespace framework { namespace ir { TEST(GeneratePass, construct_with_string) { std::string binary_str; - generate_fc_fuse().SerializeToString(&binary_str); + register_generate_fc_fuse().MultiPassDesc().SerializeToString(&binary_str); GeneratePass generate_pass(binary_str); } @@ -318,7 +155,7 @@ TEST(GeneratePass, generate_multi_add_to_addn) { graph.reset(pass->Apply(graph.release())); int num_nodes_after = graph->Nodes().size(); - int num_addn_nodes_after = GetNumOpNodes(graph, "add_n"); + int num_addn_nodes_after = GetNumOpNodes(graph, "sum"); VLOG(3) << DebugString(graph); PADDLE_ENFORCE_EQ(num_nodes_before, num_nodes_after + 2, From 623df4293f1c7e08386f8786d8e6338c043fde25 Mon Sep 17 00:00:00 2001 From: zhaoyingli <86812880+zhaoyinglia@users.noreply.github.com> Date: Sat, 9 Oct 2021 12:00:35 +0800 Subject: [PATCH 077/298] support ClipGradByGlobalNorm in sharding (#36012) * support ClipGradByGlobalNorm in sharding * support ClipGradByGlobalNorm in sharding * test=allcase --- .../dygraph_optimizer/__init__.py | 1 + .../hybrid_parallel_optimizer.py | 16 ++++++++++++++-- .../hybrid_parallel_sharding_model.py | 19 ++++++++++++------- 3 files changed, 27 insertions(+), 9 deletions(-) diff --git a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/__init__.py b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/__init__.py index f0f26bd2e0d060..28260d7aa18635 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/__init__.py +++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/__init__.py @@ -12,5 +12,6 @@ # See the License for the specific language governing permissions and from .hybrid_parallel_optimizer import HybridParallelOptimizer from .hybrid_parallel_gradscaler import HybridParallelGradScaler +from .dygraph_sharding_optimizer import DygraphShardingOptimizer __all__ = [] diff --git a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py 
b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py index b00ef2cdcb0e10..76e326ce20d7cb 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py @@ -88,6 +88,13 @@ def _dygraph_clip(self, params_grads): paddle.distributed.all_reduce( global_norm_var_dist, group=self._hcg.get_check_parallel_group()) + # In Sharding mode, param and grad is mapping different rank in optimizer. + # ClipGradByGlobalNorm need allreduce to get globol norm + if self._hcg.get_sharding_parallel_world_size() > 1: + paddle.distributed.all_reduce( + global_norm_var_not_dist, + group=self._hcg.get_sharding_parallel_group()) + global_norm_var = layers.sqrt(global_norm_var_dist + global_norm_var_not_dist) @@ -139,8 +146,13 @@ def __init__(self, optimizer, hcg, strategy): logger.warning("using ClipGradByGlobalNorm in TensorParallel, the origin " \ "optmizer'grad clip will be changed.") - self._inner_opt._grad_clip = HybridParallelClipGrad( - self._inner_opt._grad_clip, hcg) + if self._sharding_enable: + # change sharding inner_optimizer's _grad_clip + self._inner_opt._inner_optimizer._grad_clip = HybridParallelClipGrad( + self._inner_opt._grad_clip, hcg) + else: + self._inner_opt._grad_clip = HybridParallelClipGrad( + self._inner_opt._grad_clip, hcg) @imperative_base.no_grad @framework.dygraph_only diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_sharding_model.py b/python/paddle/fluid/tests/unittests/hybrid_parallel_sharding_model.py index 2995e4dbf84018..8cb1166cd0d832 100644 --- a/python/paddle/fluid/tests/unittests/hybrid_parallel_sharding_model.py +++ b/python/paddle/fluid/tests/unittests/hybrid_parallel_sharding_model.py @@ -183,21 +183,23 @@ def build_optimizer(self, strategy=None, is_sharding=True, Optimizer="adam"): - + clip = paddle.nn.ClipGradByGlobalNorm(0.5) if Optimizer == "adam": if is_sharding: optimizer = DygraphShardingOptimizer( hcg=fleet.get_hybrid_communicate_group(), user_defined_strategy=strategy, params=model.parameters(), - inner_optimizer_class=paddle.optimizer.Adam, + inner_optimizer_class=paddle.optimizer.AdamW, learning_rate=0.001, - weight_decay=0.00001, ) + weight_decay=0.00001, + grad_clip=clip) else: - optimizer = paddle.optimizer.Adam( + optimizer = paddle.optimizer.AdamW( parameters=model.parameters(), learning_rate=0.001, - weight_decay=0.00001, ) + weight_decay=0.00001, + grad_clip=clip) else: if is_sharding: optimizer = DygraphShardingOptimizer( @@ -205,10 +207,13 @@ def build_optimizer(self, user_defined_strategy=strategy, params=model.parameters(), inner_optimizer_class=paddle.optimizer.Momentum, - learning_rate=0.001, ) + learning_rate=0.001, + grad_clip=clip) else: optimizer = paddle.optimizer.Momentum( - learning_rate=0.001, parameters=model.parameters()) + learning_rate=0.001, + parameters=model.parameters(), + grad_clip=clip) return optimizer def build_model_optimizer(self, Optimizer="adam"): From c8a01010e84bf8566a417060f50a43e100a10172 Mon Sep 17 00:00:00 2001 From: zhiboniu <31800336+zhiboniu@users.noreply.github.com> Date: Sat, 9 Oct 2021 16:21:39 +0800 Subject: [PATCH 078/298] update fft api path (#36219) * update fft api path * add sample code for ihfft2 Co-authored-by: chenfeiyu --- python/paddle/__init__.py | 2 +- python/paddle/fft.py | 61 +++++++++++++++++++++++++++++++++++++ python/paddle/tensor/fft.py | 44 ++++++++++++-------------- 3 files changed, 81 
insertions(+), 26 deletions(-) create mode 100644 python/paddle/fft.py diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index ad8640f6f55848..decffa66f4174f 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -64,7 +64,6 @@ import paddle.static # noqa: F401 import paddle.vision # noqa: F401 -from .tensor import fft from .tensor.random import bernoulli # noqa: F401 from .tensor.attribute import rank # noqa: F401 @@ -294,6 +293,7 @@ from .hapi import flops # noqa: F401 from . import hub # noqa: F401 from . import linalg # noqa: F401 +from . import fft # noqa: F401 import paddle.text # noqa: F401 import paddle.vision # noqa: F401 diff --git a/python/paddle/fft.py b/python/paddle/fft.py new file mode 100644 index 00000000000000..3ac02c9c8dc18a --- /dev/null +++ b/python/paddle/fft.py @@ -0,0 +1,61 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .tensor.fft import fft # noqa: F401 +from .tensor.fft import fft2 # noqa: F401 +from .tensor.fft import fftn # noqa: F401 +from .tensor.fft import ifft # noqa: F401 +from .tensor.fft import ifft2 # noqa: F401 +from .tensor.fft import ifftn # noqa: F401 +from .tensor.fft import rfft # noqa: F401 +from .tensor.fft import rfft2 # noqa: F401 +from .tensor.fft import rfftn # noqa: F401 +from .tensor.fft import irfft # noqa: F401 +from .tensor.fft import irfft2 # noqa: F401 +from .tensor.fft import irfftn # noqa: F401 +from .tensor.fft import hfft # noqa: F401 +from .tensor.fft import hfft2 # noqa: F401 +from .tensor.fft import hfftn # noqa: F401 +from .tensor.fft import ihfft # noqa: F401 +from .tensor.fft import ihfft2 # noqa: F401 +from .tensor.fft import ihfftn # noqa: F401 +from .tensor.fft import fftfreq # noqa: F401 +from .tensor.fft import rfftfreq # noqa: F401 +from .tensor.fft import fftshift # noqa: F401 +from .tensor.fft import ifftshift # noqa: F401 + +__all__ = [ # noqa + 'fft', + 'fft2', + 'fftn', + 'ifft', + 'ifft2', + 'ifftn', + 'rfft', + 'rfft2', + 'rfftn', + 'irfft', + 'irfft2', + 'irfftn', + 'hfft', + 'hfft2', + 'hfftn', + 'ihfft', + 'ihfft2', + 'ihfftn', + 'fftfreq', + 'rfftfreq', + 'fftshift', + 'ifftshift' +] diff --git a/python/paddle/tensor/fft.py b/python/paddle/tensor/fft.py index 98ca858c0eb85a..829399d14eaa08 100644 --- a/python/paddle/tensor/fft.py +++ b/python/paddle/tensor/fft.py @@ -21,30 +21,7 @@ from ..fluid.data_feeder import check_variable_and_dtype from ..fluid.layer_helper import LayerHelper -__all__ = [ - 'fft', - 'fft2', - 'fftn', - 'ifft', - 'ifft2', - 'ifftn', - 'rfft', - 'rfft2', - 'rfftn', - 'irfft', - 'irfft2', - 'irfftn', - 'hfft', - 'hfft2', - 'hfftn', - 'ihfft', - 'ihfft2', - 'ihfftn', - 'fftfreq', - 'rfftfreq', - 'fftshift', - 'ifftshift', -] +__all__ = [] def _check_normalization(norm): @@ -1135,7 +1112,24 @@ def ihfft2(x, s=None, axes=(-2, -1), norm="backward", name=None): refer to :ref:`api_guide_Name` . Returns: - out(Tensor) : The result of the inverse real 2-D FFT. 
+ out(Tensor) : The result of the inverse hermitian 2-D FFT. + + Examples: + + .. code-block:: python + + import numpy as np + import paddle + + x = np.mgrid[:5, :5][0].astype(np.float64) + xp = paddle.to_tensor(x) + ihfft2_xp = paddle.fft.ihfft2(xp).numpy() + print(ihfft2_xp) + # [[ 2. +0.j 0. +0.j 0. +0.j ] + # [-0.5-0.68819096j 0. +0.j 0. +0.j ] + # [-0.5-0.16245985j 0. +0.j 0. +0.j ] + # [-0.5+0.16245985j 0. +0.j 0. +0.j ] + # [-0.5+0.68819096j 0. +0.j 0. +0.j ]] """ _check_at_least_ndim(x, 2) if s is not None: From 62e411508f31814e9b9f71f78769d3ce2101e35b Mon Sep 17 00:00:00 2001 From: zhiboniu <31800336+zhiboniu@users.noreply.github.com> Date: Sat, 9 Oct 2021 16:35:17 +0800 Subject: [PATCH 079/298] fill_diagonal op fix border cross caused by offset (#36212) --- paddle/fluid/operators/fill_diagonal_op.cc | 18 ++++++++--- paddle/fluid/operators/fill_diagonal_op.cu | 16 +++++++--- .../unittests/test_tensor_fill_diagonal_.py | 30 +++++++++++++++++++ 3 files changed, 56 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/operators/fill_diagonal_op.cc b/paddle/fluid/operators/fill_diagonal_op.cc index db55c3e99693ae..be3239d5048442 100644 --- a/paddle/fluid/operators/fill_diagonal_op.cc +++ b/paddle/fluid/operators/fill_diagonal_op.cc @@ -108,8 +108,15 @@ class FillIDiagonalKernel : public framework::OpKernel { size = std::min(size, out_dims[1] * out_dims[1]); } - for (int64_t i = offset; i < size; i += strides) { - out_data[i] = temp_var; + for (int64_t i = 0; i < size; i += strides) { + // to check if the new position with offset is still in the same line; + // this modify should not affect across lines. + // out_dims[1] is also work for tensor with dim>2, for which the dims must + // be the same number + if (i % out_dims[1] + offset >= 0 && + i % out_dims[1] + offset < out_dims[1]) { + out_data[i + offset] = temp_var; + } } } }; @@ -176,8 +183,11 @@ class FillIDiagonalGradKernel : public framework::OpKernel { wrapsize = size; } - for (int64_t i = offset; i < wrapsize; i += strides) { - data[i] = T(0); + for (int64_t i = 0; i < wrapsize; i += strides) { + if (i % dx_dims[1] + offset >= 0 && + i % dx_dims[1] + offset < dx_dims[1]) { + data[i + offset] = T(0); + } } } } diff --git a/paddle/fluid/operators/fill_diagonal_op.cu b/paddle/fluid/operators/fill_diagonal_op.cu index 5047059fb364d3..15eabd4216d0bb 100644 --- a/paddle/fluid/operators/fill_diagonal_op.cu +++ b/paddle/fluid/operators/fill_diagonal_op.cu @@ -22,11 +22,19 @@ using CUDADeviceContext = paddle::platform::CUDADeviceContext; template __global__ void fill_constant_kernel(const int64_t featuresize, T* in_data, - int64_t strides, int offset, T fillvar) { + int64_t strides, int offset, T fillvar, + int dims) { for (int64_t idx = blockIdx.x * featuresize + threadIdx.x; idx * strides + offset < (blockIdx.x + 1) * featuresize; idx += blockDim.x) { - in_data[idx * strides + offset] = fillvar; + // to check if the new position with offset is still in the same line; + // this modify should not affect across lines. 
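Both the CPU and CUDA kernels now guard each write with ``i % out_dims[1] + offset``, so an offset can shift a diagonal element within its row but never spill into a neighbouring row. A stand-alone NumPy sketch of the same index arithmetic for the plain 2-D case (the helper name is ours, for illustration only); the 3x3 result matches the expectation used in the new ``test_offset`` case further down:

.. code-block:: python

    import numpy as np

    def fill_diagonal_with_offset(mat, value, offset):
        cols = mat.shape[1]
        out = mat.copy()
        stride = cols + 1                      # flat stride between diagonal elements
        for i in range(0, out.size, stride):
            # i % cols is the column of the un-offset diagonal element; the write
            # only happens if adding the offset keeps it inside the same row.
            if 0 <= i % cols + offset < cols:
                out.flat[i + offset] = value
        return out

    y = np.full((3, 3), 2.0)
    print(fill_diagonal_with_offset(y, 1.0, offset=2))
    # [[2. 2. 1.]
    #  [2. 2. 2.]
    #  [2. 2. 2.]]
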
+ // out_dims[1] is also work for tensor with dim>2, for which the dims must + // be the same number + if ((idx * strides) % dims + offset < dims && + (idx * strides) % dims + offset >= 0) { + in_data[idx * strides + offset] = fillvar; + } } } @@ -62,7 +70,7 @@ class FillIDiagonalCUDAKernel : public framework::OpKernel { int64_t kBlockDim = std::min(int64_t(size / strides), kMaxBlockDim); fill_constant_kernel<<<1, kBlockDim, 0>>>(size, out_data, strides, - offset, temp_var); + offset, temp_var, out_dims[1]); } }; @@ -96,7 +104,7 @@ class FillIDiagonalGradCUDAKernel : public framework::OpKernel { int64_t kBlockDim = std::min(int64_t(size), kMaxBlockDim); fill_constant_kernel<<<1, kBlockDim, 0>>>(wrapsize, in_data, strides, - offset, T(0)); + offset, T(0), out_dims[1]); } }; diff --git a/python/paddle/fluid/tests/unittests/test_tensor_fill_diagonal_.py b/python/paddle/fluid/tests/unittests/test_tensor_fill_diagonal_.py index 41a8a9750cb64c..3beb6a537eca07 100644 --- a/python/paddle/fluid/tests/unittests/test_tensor_fill_diagonal_.py +++ b/python/paddle/fluid/tests/unittests/test_tensor_fill_diagonal_.py @@ -50,6 +50,36 @@ def test_dim2_normal(self): (y.grad.numpy().astype('float32') == expected_grad).all(), True) + def test_offset(self): + expected_np = np.array( + [[2, 2, 1], [2, 2, 2], [2, 2, 2]]).astype('float32') + expected_grad = np.array( + [[1, 1, 0], [1, 1, 1], [1, 1, 1]]).astype('float32') + + typelist = ['float32', 'float64', 'int32', 'int64'] + places = [fluid.CPUPlace()] + if fluid.core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + + for idx, p in enumerate(places): + if idx == 0: + paddle.set_device('cpu') + else: + paddle.set_device('gpu') + for dtype in typelist: + x = paddle.ones((3, 3), dtype=dtype) + x.stop_gradient = False + y = x * 2 + y.fill_diagonal_(1, offset=2, wrap=True) + loss = y.sum() + loss.backward() + + self.assertEqual( + (y.numpy().astype('float32') == expected_np).all(), True) + self.assertEqual( + (y.grad.numpy().astype('float32') == expected_grad).all(), + True) + def test_bool(self): expected_np = np.array( [[False, True, True], [True, False, True], [True, True, False]]) From 21dc7f40e14a09528711054e8bc329e3d9b15ee2 Mon Sep 17 00:00:00 2001 From: From00 Date: Sat, 9 Oct 2021 19:06:18 +0800 Subject: [PATCH 080/298] Add new API 'tensordot' (#36273) * Add new API tensordot * Set timeout value 400 for UT; Fix format for EN docs * Set timeout value 1000 for UT; Fix format for EN docs * Remove some input check * Coding style improve: don't compare boolean values to True or False using == --- python/paddle/__init__.py | 2 + .../fluid/tests/unittests/CMakeLists.txt | 1 + .../fluid/tests/unittests/test_tensordot.py | 238 ++++++++++++++++++ python/paddle/tensor/__init__.py | 2 + python/paddle/tensor/manipulation.py | 208 +++++++++++++++ 5 files changed, 451 insertions(+) create mode 100644 python/paddle/fluid/tests/unittests/test_tensordot.py diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index decffa66f4174f..2051a4f6fcd50d 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -151,6 +151,7 @@ from .tensor.manipulation import roll # noqa: F401 from .tensor.manipulation import chunk # noqa: F401 from .tensor.manipulation import tolist # noqa: F401 +from .tensor.manipulation import tensordot # noqa: F401 from .tensor.math import abs # noqa: F401 from .tensor.math import acos # noqa: F401 from .tensor.math import asin # noqa: F401 @@ -470,6 +471,7 @@ 'bmm', 'chunk', 'tolist', + 'tensordot', 'greater_than', 
'shard_index', 'argsort', diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index cd1c4363879bb6..61a43aeb44e848 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -1042,3 +1042,4 @@ if(WITH_GPU OR WITH_ROCM) endif() set_tests_properties(test_inplace_addto_strategy PROPERTIES TIMEOUT 120) set_tests_properties(test_eigvals_op PROPERTIES TIMEOUT 400) +set_tests_properties(test_tensordot PROPERTIES TIMEOUT 1000) diff --git a/python/paddle/fluid/tests/unittests/test_tensordot.py b/python/paddle/fluid/tests/unittests/test_tensordot.py new file mode 100644 index 00000000000000..29f3308988f6d3 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_tensordot.py @@ -0,0 +1,238 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import unittest +import paddle.fluid.core as core +import numpy as np +import itertools as it + +np.set_printoptions(threshold=np.inf) + + +def tensordot_np(x, y, axes): + if isinstance(axes, paddle.fluid.framework.Variable): + axes = axes.tolist() + + # np.tensordot does not support empty axes + if not axes: + axes = 0 + if (isinstance(axes, (tuple, list))): + if all(np.issubdtype(type(i), np.integer) for i in axes): + axes = [axes, axes] + else: + axes_x = axes[0] + if len(axes) > 1: + axes_y = axes[1] + else: + axes_y = axes_x + len_axes_x, len_axes_y = len(axes_x), len(axes_y) + if len_axes_x < len_axes_y: + axes_x = axes_x + axes_y[len_axes_x:] + elif len_axes_y < len_axes_x: + axes_y = axes_y + axes_x[len_axes_y:] + axes = [axes_x, axes_y] + + # np.tensordot does not support broadcast + if (isinstance(axes, (tuple, list))): + axes_x, axes_y = axes + else: + axes_x = list(range(x.ndim - axes, x.ndim)) + axes_y = list(range(axes)) + shape_x, shape_y = list(np.shape(x)), list(np.shape(y)) + for i in range(len(axes_x)): + dim_x, dim_y = axes_x[i], axes_y[i] + sx, sy = shape_x[dim_x], shape_y[dim_y] + if sx == 1: + shape_y[dim_y] = 1 + y = np.sum(y, dim_y) + y = np.reshape(y, shape_y) + elif sy == 1: + shape_x[dim_x] = 1 + x = np.sum(x, dim_x) + x = np.reshape(x, shape_x) + + return np.tensordot(x, y, axes) + + +class TestTensordotAPI(unittest.TestCase): + def setUp(self): + self.set_dtype() + self.set_input_shape() + self.set_input_data() + + def set_dtype(self): + self.dtype = np.float32 + + def set_input_shape(self): + self.x_shape = [5, 5, 5, 5] + self.y_shape = [5, 5, 5, 5] + + def set_input_data(self): + self.x = np.random.random(self.x_shape).astype(self.dtype) + self.y = np.random.random(self.y_shape).astype(self.dtype) + self.all_axes = [2] + + def run_dygraph(self, place): + paddle.disable_static() + x = paddle.to_tensor(self.x, place=place) + y = paddle.to_tensor(self.y, place=place) + paddle_res = paddle.tensordot(x, y, self.axes) + np_res = tensordot_np(self.x, self.y, self.axes) + np.testing.assert_allclose(paddle_res, np_res, rtol=1e-6) + 
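The helper above reproduces ``paddle.tensordot``'s axes-expansion rule before handing off to ``np.tensordot``: when the axis lists for ``x`` and ``y`` have different lengths, the shorter list is padded with the tail of the longer one. A small stand-alone illustration (shapes picked arbitrarily):

.. code-block:: python

    import numpy as np

    x = np.random.rand(2, 3, 4)
    y = np.random.rand(3, 5, 4)

    # axes=[[1, 2], [0]]: x's list has two entries, y's only one.
    axes_x, axes_y = [1, 2], [0]
    if len(axes_y) < len(axes_x):
        axes_y = axes_y + axes_x[len(axes_y):]   # padded to [0, 2]

    out = np.tensordot(x, y, axes=(axes_x, axes_y))
    print(out.shape)   # (2, 5): dims 1, 2 of x contracted against dims 0, 2 of y
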
+ def run_static(self, place): + paddle.enable_static() + with paddle.static.program_guard(paddle.static.Program(), + paddle.static.Program()): + x = paddle.static.data( + name='x', shape=self.x_shape, dtype=self.dtype) + y = paddle.static.data( + name='y', shape=self.y_shape, dtype=self.dtype) + z = paddle.tensordot(x, y, self.axes) + exe = paddle.static.Executor(place) + paddle_res = exe.run(feed={'x': self.x, + 'y': self.y}, + fetch_list=[z]) + np_res = tensordot_np(self.x, self.y, self.axes) + np.testing.assert_allclose(paddle_res[0], np_res, rtol=1e-6) + + def test_cases(self): + self.all_axes = [] + axial_index = range(4) + all_permutations = list(it.permutations(axial_index, 0)) + list( + it.permutations(axial_index, 1)) + list( + it.permutations(axial_index, 2)) + list( + it.permutations(axial_index, 3)) + list( + it.permutations(axial_index, 4)) + self.all_axes.extend(list(i) for i in all_permutations) + + for axes_x in all_permutations: + for axes_y in all_permutations: + if len(axes_x) < len(axes_y): + supplementary_axes_x = axes_x + axes_y[len(axes_x):] + if any( + supplementary_axes_x.count(i) > 1 + for i in supplementary_axes_x): + continue + elif len(axes_y) < len(axes_x): + supplementary_axes_y = axes_y + axes_x[len(axes_y):] + if any( + supplementary_axes_y.count(i) > 1 + for i in supplementary_axes_y): + continue + self.all_axes.append([list(axes_x), list(axes_y)]) + + self.all_axes.extend(range(5)) + + places = [core.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(core.CUDAPlace(0)) + + for axes in self.all_axes: + self.axes = axes + for place in places: + self.run_dygraph(place) + self.run_static(place) + + +class TestTensordotAPIFloat64(TestTensordotAPI): + def set_dtype(self): + self.dtype = np.float64 + + +class TestTensordotAPIAxesType(TestTensordotAPI): + def set_input_shape(self): + self.x_shape = [3, 4, 4] + self.y_shape = [4, 4, 5] + + def test_cases(self): + self.all_axes = [ + 0, 1, 2, (1, ), [1], ((1, ), ), ([1], ), ((2, 1), (0, )), ( + (1, 2), (0, 1)), ([1, 2], [0, 1]), ([1, 2], [0, 1]), + [[1, 2], [0, 1]] + ] + + places = [core.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(core.CUDAPlace(0)) + + for axes in self.all_axes: + self.axes = axes + for place in places: + self.run_dygraph(place) + self.run_static(place) + + # The 'axes' with type 'Tensor' in tensordot is not available in static mode + paddle.disable_static() + for place in places: + self.all_axes = [ + paddle.to_tensor([1]), (paddle.to_tensor([1])), + (paddle.to_tensor([1, 2]), paddle.to_tensor([0, 1])), + [paddle.to_tensor([1, 2]), paddle.to_tensor([0, 1])], + paddle.to_tensor([[1, 2], [0, 1]]) + ] + for axes in self.all_axes: + self.axes = axes + for place in places: + self.run_dygraph(place) + + def test_error(self): + self.all_axes = [[[[0], [1]]], 0.1, -1, 100, [[1, 2], [0, 0]], + [[1, 2], [0, -1]], [0, 1, 2, 3]] + paddle.disable_static() + x = paddle.to_tensor(self.x) + y = paddle.to_tensor(self.y) + for axes in self.all_axes: + with self.assertRaises(BaseException): + paddle.tensordot(x, y, axes) + + +class TestTensordotAPIAxesTypeFloat64(TestTensordotAPIAxesType): + def set_dtype(self): + self.dtype = np.float64 + + +class TestTensordotAPIBroadcastCase1(TestTensordotAPI): + def set_input_shape(self): + self.x_shape = [1, 1, 1, 5] + self.y_shape = [1, 5, 1, 1] + + +class TestTensordotAPIBroadcastCase2(TestTensordotAPI): + def set_input_shape(self): + self.x_shape = [1, 5, 5, 5] + self.y_shape = [1, 1, 1, 5] + + +class 
TestTensordotAPIBroadcastCase3(TestTensordotAPI): + def set_input_shape(self): + self.x_shape = [5, 5, 5, 1] + self.y_shape = [5, 5, 1, 5] + + +class TestTensordotAPIBroadcastCase4(TestTensordotAPI): + def set_input_shape(self): + self.x_shape = [5, 5, 5, 1] + self.y_shape = [1, 1, 1, 1] + + +class TestTensordotAPIBroadcastCase5(TestTensordotAPI): + def set_input_shape(self): + self.x_shape = [1, 1, 5, 5] + self.y_shape = [5, 5, 1, 5] + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/tensor/__init__.py b/python/paddle/tensor/__init__.py index b5d79b60393202..c8f897c21648f5 100755 --- a/python/paddle/tensor/__init__.py +++ b/python/paddle/tensor/__init__.py @@ -105,6 +105,7 @@ from .manipulation import unbind # noqa: F401 from .manipulation import roll # noqa: F401 from .manipulation import chunk # noqa: F401 +from .manipulation import tensordot # noqa: F401 from .math import abs # noqa: F401 from .math import acos # noqa: F401 from .math import asin # noqa: F401 @@ -346,6 +347,7 @@ 'slice', 'split', 'chunk', + 'tensordot', 'squeeze', 'squeeze_', 'stack', diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py index 4129a1060daf95..5f7588cb2a9a06 100644 --- a/python/paddle/tensor/manipulation.py +++ b/python/paddle/tensor/manipulation.py @@ -2173,3 +2173,211 @@ def strided_slice(x, axes, starts, ends, strides, name=None): return paddle.fluid.layers.strided_slice( input=x, axes=axes, starts=starts, ends=ends, strides=strides) + + +def tensordot(x, y, axes=2, name=None): + r""" + This function computes a contraction, which sum the product of elements from two tensors along the given axes. + + Args: + x (Tensor): The left tensor for contraction with data type ``float32`` or ``float64``. + y (Tensor): The right tensor for contraction with the same data type as ``x``. + axes (int|tuple|list|Tensor, optional): The axes to contract for ``x`` and ``y``, defaulted to integer ``2``. + + 1. It could be a non-negative integer ``n``, + in which the function will sum over the last ``n`` axes of ``x`` and the first ``n`` axes of ``y`` in order. + + 2. It could be a 1-d tuple or list with data type ``int``, in which ``x`` and ``y`` will be contracted along the same given axes. + For example, ``axes`` =[0, 1] applies contraction along the first two axes for ``x`` and the first two axes for ``y``. + + 3. It could be a tuple or list containing one or two 1-d tuple|list|Tensor with data type ``int``. + When containing one tuple|list|Tensor, the data in tuple|list|Tensor specified the same axes for ``x`` and ``y`` to contract. + When containing two tuple|list|Tensor, the first will be applied to ``x`` and the second to ``y``. + When containing more than two tuple|list|Tensor, only the first two axis sequences will be used while the others will be ignored. + + 4. It could be a tensor, in which the ``axes`` tensor will be translated to a python list + and applied the same rules described above to determine the contraction axes. + Note that the ``axes`` with Tensor type is ONLY available in Dygraph mode. + name(str, optional): The default value is None. Normally there is no need for user to set this property. + For more information, please refer to :ref:`api_guide_Name` . + + Return: + Output (Tensor): The contraction result with the same data type as ``x`` and ``y``. + In general, :math:`output.ndim = x.ndim + y.ndim - 2 \times n_{axes}`, where :math:`n_{axes}` denotes the number of axes to be contracted. + + NOTES: + 1. 
This function supports tensor broadcast, + the size in the corresponding dimensions of ``x`` and ``y`` should be equal, or applies to the broadcast rules. + 2. This function also supports axes expansion, + when the two given axis sequences for ``x`` and ``y`` are of different lengths, + the shorter sequence will expand the same axes as the longer one at the end. + For example, if ``axes`` =[[0, 1, 2, 3], [1, 0]], + the axis sequence for ``x`` is [0, 1, 2, 3], + while the corresponding axis sequences for ``y`` will be expanded from [1, 0] to [1, 0, 2, 3]. + + Examples: + .. code-block:: python + + import paddle + + data_type = 'float64' + + # For two 2-d tensor x and y, the case axes=0 is equivalent to outer product. + # Note that tensordot supports empty axis sequence, so all the axes=0, axes=[], axes=[[]], and axes=[[],[]] are equivalent cases. + x = paddle.arange(4, dtype=data_type).reshape([2, 2]) + y = paddle.arange(4, dtype=data_type).reshape([2, 2]) + z = paddle.tensordot(x, y, axes=0) + # z = [[[[0., 0.], + # [0., 0.]], + # + # [[0., 1.], + # [2., 3.]]], + # + # + # [[[0., 2.], + # [4., 6.]], + # + # [[0., 3.], + # [6., 9.]]]] + + + # For two 1-d tensor x and y, the case axes=1 is equivalent to inner product. + x = paddle.arange(10, dtype=data_type) + y = paddle.arange(10, dtype=data_type) + z1 = paddle.tensordot(x, y, axes=1) + z2 = paddle.dot(x, y) + # z1 = z2 = [285.] + + + # For two 2-d tensor x and y, the case axes=1 is equivalent to matrix multiplication. + x = paddle.arange(6, dtype=data_type).reshape([2, 3]) + y = paddle.arange(12, dtype=data_type).reshape([3, 4]) + z1 = paddle.tensordot(x, y, axes=1) + z2 = paddle.matmul(x, y) + # z1 = z2 = [[20., 23., 26., 29.], + # [56., 68., 80., 92.]] + + + # When axes is a 1-d int list, x and y will be contracted along the same given axes. + # Note that axes=[1, 2] is equivalent to axes=[[1, 2]], axes=[[1, 2], []], axes=[[1, 2], [1]], and axes=[[1, 2], [1, 2]]. + x = paddle.arange(24, dtype=data_type).reshape([2, 3, 4]) + y = paddle.arange(36, dtype=data_type).reshape([3, 3, 4]) + z = paddle.tensordot(x, y, axes=[1, 2]) + # z = [[506. , 1298., 2090.], + # [1298., 3818., 6338.]] + + + # When axes is a list containing two 1-d int list, the first will be applied to x and the second to y. + x = paddle.arange(60, dtype=data_type).reshape([3, 4, 5]) + y = paddle.arange(24, dtype=data_type).reshape([4, 3, 2]) + z = paddle.tensordot(x, y, axes=([1, 0], [0, 1])) + # z = [[4400., 4730.], + # [4532., 4874.], + # [4664., 5018.], + # [4796., 5162.], + # [4928., 5306.]] + + + # Thanks to the support of axes expansion, axes=[[0, 1, 3, 4], [1, 0, 3, 4]] can be abbreviated as axes= [[0, 1, 3, 4], [1, 0]]. 
+ x = paddle.arange(720, dtype=data_type).reshape([2, 3, 4, 5, 6]) + y = paddle.arange(720, dtype=data_type).reshape([3, 2, 4, 5, 6]) + z = paddle.tensordot(x, y, axes=[[0, 1, 3, 4], [1, 0]]) + # z = [[23217330., 24915630., 26613930., 28312230.], + # [24915630., 26775930., 28636230., 30496530.], + # [26613930., 28636230., 30658530., 32680830.], + # [28312230., 30496530., 32680830., 34865130.]] + """ + op_type = 'tensordot' + input_dtype = ['float32', 'float64'] + + check_variable_and_dtype(x, 'x', input_dtype, op_type) + check_variable_and_dtype(y, 'y', input_dtype, op_type) + check_type(axes, 'axes', (int, tuple, list, Variable), op_type) + + def _var_to_list(var): + if in_dygraph_mode(): + return tolist(var) + raise TypeError( + "The 'axes' with type 'Tensor' in " + op_type + + " is not available in static graph mode, " + "please convert its type to int|Tuple|List, or use dynamic graph mode." + ) + + axes_x = [] + axes_y = [] + if np.issubdtype(type(axes), np.integer): + assert axes >= 0, ( + "The 'axes' in " + op_type + + f" should not be negative, but received axes={axes}.") + axes_x = range(x.ndim - axes, x.ndim) + axes_y = range(axes) + else: + if isinstance(axes, Variable): + axes = _var_to_list(axes) + + if not axes or np.issubdtype(type(axes[0]), np.integer): + axes_x = axes + else: + axes_x = axes[0] + if len(axes) > 1: + axes_y = axes[1] + + if isinstance(axes_x, Variable): + axes_x = _var_to_list(axes_x) + if isinstance(axes_y, Variable): + axes_y = _var_to_list(axes_y) + + axes_x, axes_y = list(axes_x), list(axes_y) + len_axes_x, len_axes_y = len(axes_x), len(axes_y) + if len_axes_x < len_axes_y: + axes_x.extend(axes_y[len_axes_x:]) + elif len_axes_y < len_axes_x: + axes_y.extend(axes_x[len_axes_y:]) + + shape_x, shape_y = list(x.shape), list(y.shape) + need_contracted_dim_x = np.zeros((x.ndim), dtype=bool) + need_contracted_dim_y = np.zeros((y.ndim), dtype=bool) + contraction_size = 1 + for i in range(len(axes_x)): + dim_x, dim_y = axes_x[i], axes_y[i] + sx, sy = shape_x[dim_x], shape_y[dim_y] + if sx == 1: + shape_y[dim_y] = 1 + y = y.sum(dim_y).reshape(shape_y) + elif sy == 1: + shape_x[dim_x] = 1 + x = x.sum(dim_x).reshape(shape_x) + else: + assert sx == sy, "The dimensional size for 'x' and 'y' in " + op_type + f" should match each other, but 'x' has size {sx} in dim {dim_x} while 'y' has size {sy} in dim {dim_y}." 
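The code that follows reduces the general contraction to a single 2-D matmul: the kept axes of ``x`` go to the front and its contracted axes to the back, ``y`` is permuted the other way round, both are flattened to matrices, and the product is reshaped to the output shape. A self-contained NumPy sketch of that equivalence:

.. code-block:: python

    import numpy as np

    x = np.random.rand(2, 3, 4)
    y = np.random.rand(4, 3, 5)
    axes_x, axes_y = [1, 2], [1, 0]            # contract x dims (3, 4) with y dims (3, 4)

    keep_x = [i for i in range(x.ndim) if i not in axes_x]
    keep_y = [i for i in range(y.ndim) if i not in axes_y]
    m = int(np.prod([x.shape[i] for i in keep_x]))   # rows of the collapsed matmul
    k = int(np.prod([x.shape[i] for i in axes_x]))   # contracted size
    n = int(np.prod([y.shape[i] for i in keep_y]))   # columns of the collapsed matmul

    x2d = x.transpose(keep_x + axes_x).reshape(m, k)   # kept axes first, contracted last
    y2d = y.transpose(axes_y + keep_y).reshape(k, n)   # contracted axes first, kept last

    out = (x2d @ y2d).reshape([x.shape[i] for i in keep_x] + [y.shape[i] for i in keep_y])
    np.testing.assert_allclose(out, np.tensordot(x, y, axes=(axes_x, axes_y)), rtol=1e-10)
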
+ + need_contracted_dim_x[dim_x] = True + need_contracted_dim_y[dim_y] = True + contraction_size *= shape_x[dim_x] + + perm_x = [] + perm_y = [] + shape_out = [] + not_contraction_size_x = 1 + not_contraction_size_y = 1 + for i in range(x.ndim): + if not need_contracted_dim_x[i]: + perm_x.append(i) + shape_out.append(shape_x[i]) + not_contraction_size_x *= shape_x[i] + perm_x.extend(axes_x) + perm_y.extend(axes_y) + for i in range(y.ndim): + if not need_contracted_dim_y[i]: + perm_y.append(i) + shape_out.append(shape_y[i]) + not_contraction_size_y *= shape_y[i] + + if not shape_out: + shape_out = [1] + + x = x.transpose(perm=perm_x).reshape( + [not_contraction_size_x, contraction_size]) + y = y.transpose(perm=perm_y).reshape( + [contraction_size, not_contraction_size_y]) + out = x.matmul(y).reshape(shape_out) + return out From cb620ca6de8909eed0ed14620dbb0c60628def86 Mon Sep 17 00:00:00 2001 From: Zeng Jinle <32832641+sneaxiy@users.noreply.github.com> Date: Sat, 9 Oct 2021 19:09:40 +0800 Subject: [PATCH 081/298] Add const for OpDesc::id() and VarDesc::id() (#36298) * add const OpDesc id() * add const for VarDesc::id() --- paddle/fluid/framework/op_desc.h | 2 +- paddle/fluid/framework/var_desc.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/framework/op_desc.h b/paddle/fluid/framework/op_desc.h index 0eafbb027f0421..9470fd9b699330 100644 --- a/paddle/fluid/framework/op_desc.h +++ b/paddle/fluid/framework/op_desc.h @@ -164,7 +164,7 @@ class OpDesc { // Note: the identity only used as a key for referring to its // distributed attribute now. - uint64_t Id() { return id_; } + uint64_t Id() const { return id_; } private: template diff --git a/paddle/fluid/framework/var_desc.h b/paddle/fluid/framework/var_desc.h index d1a1757d5309b6..a6f56ad4458348 100644 --- a/paddle/fluid/framework/var_desc.h +++ b/paddle/fluid/framework/var_desc.h @@ -160,7 +160,7 @@ class VarDesc { // Note: the identity only used as a key for referring to its // distributed attribute now. - uint64_t Id() { return id_; } + uint64_t Id() const { return id_; } private: const proto::VarType::TensorDesc &tensor_desc() const; From 91119271584dbf6cefe86a170e078d245bf912e5 Mon Sep 17 00:00:00 2001 From: Yiqun Liu Date: Sat, 9 Oct 2021 19:20:51 +0800 Subject: [PATCH 082/298] Enhance OpTest for bfloat16. (#36079) --- paddle/fluid/operators/cast_op.cu | 33 +++---- .../paddle/fluid/tests/unittests/op_test.py | 86 +++++++++++++------ .../fluid/tests/unittests/test_cast_op.py | 38 +++++++- 3 files changed, 106 insertions(+), 51 deletions(-) diff --git a/paddle/fluid/operators/cast_op.cu b/paddle/fluid/operators/cast_op.cu index 601735c2f148ad..05a110fe65b839 100644 --- a/paddle/fluid/operators/cast_op.cu +++ b/paddle/fluid/operators/cast_op.cu @@ -94,24 +94,19 @@ class CastCUDAOpKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; - -#ifdef PADDLE_WITH_HIP -REGISTER_OP_CUDA_KERNEL( - cast, ops::CastCUDAOpKernel, ops::CastCUDAOpKernel, - ops::CastCUDAOpKernel, ops::CastCUDAOpKernel, - ops::CastCUDAOpKernel, ops::CastCUDAOpKernel, - ops::CastCUDAOpKernel, - ops::CastCUDAOpKernel, - ops::CastCUDAOpKernel>, - ops::CastCUDAOpKernel>); +namespace plat = paddle::platform; + +#define REGISTER_CAST_CUDA_BASE(op_name, ...) 
\ + REGISTER_OP_CUDA_KERNEL( \ + op_name, ops::CastCUDAOpKernel, ops::CastCUDAOpKernel, \ + ops::CastCUDAOpKernel, ops::CastCUDAOpKernel, \ + ops::CastCUDAOpKernel, ops::CastCUDAOpKernel, \ + ops::CastCUDAOpKernel, ops::CastCUDAOpKernel, \ + ops::CastCUDAOpKernel>, \ + ops::CastCUDAOpKernel>, ##__VA_ARGS__); + +#if !defined(PADDLE_WITH_HIP) +REGISTER_CAST_CUDA_BASE(cast, ops::CastCUDAOpKernel) #else -REGISTER_OP_CUDA_KERNEL( - cast, ops::CastCUDAOpKernel, ops::CastCUDAOpKernel, - ops::CastCUDAOpKernel, ops::CastCUDAOpKernel, - ops::CastCUDAOpKernel, ops::CastCUDAOpKernel, - ops::CastCUDAOpKernel, - ops::CastCUDAOpKernel, - ops::CastCUDAOpKernel, - ops::CastCUDAOpKernel>, - ops::CastCUDAOpKernel>); +REGISTER_CAST_CUDA_BASE(cast) #endif diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index 3621d20fa24721..41fd0b442fe1c5 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -147,6 +147,9 @@ def get_output(): op.run(scope, place) for output_name in output_names: output_numpy = np.array(scope.find_var(output_name).get_tensor()) + # numpy.dtype does not have bfloat16, thus we use numpy.uint16 to + # store bfloat16 data, and need to be converted to float to check + # the floating precision. if tensor_to_check._dtype() == core.VarDesc.VarType.BF16: output_numpy = convert_uint16_to_float(output_numpy) sum.append(output_numpy.astype(tensor_to_check_dtype).mean()) @@ -362,11 +365,26 @@ def try_call_once(self, data_type): self.dtype = data_type def is_bfloat16_op(self): + # self.dtype is the dtype of inputs, and is set in infer_dtype_from_inputs_outputs. + # Make sure this function is called after calling infer_dtype_from_inputs_outputs. return self.dtype == np.uint16 or ( - hasattr(self, 'mkldnn_data_type') and - getattr(self, 'mkldnn_data_type') is "bfloat16") or ( - hasattr(self, 'attrs') and 'mkldnn_data_type' in self.attrs and - self.attrs['mkldnn_data_type'] == 'bfloat16') + hasattr(self, 'output_dtype') and + self.output_dtype == np.uint16) or ( + hasattr(self, 'mkldnn_data_type') and + getattr(self, 'mkldnn_data_type') is "bfloat16") or ( + hasattr(self, 'attrs') and + 'mkldnn_data_type' in self.attrs and + self.attrs['mkldnn_data_type'] == 'bfloat16') + + def is_mkldnn_op(self): + return (hasattr(self, "use_mkldnn") and self.use_mkldnn == True) or ( + hasattr(self, "attrs") and "use_mkldnn" in self.attrs and + self.attrs["use_mkldnn"] == True) + + def is_xpu_op(self): + return (hasattr(self, "use_xpu") and self.use_xpu == True) or ( + hasattr(self, "attrs") and "use_xpu" in self.attrs and + self.attrs["use_xpu"] == True) def infer_dtype_from_inputs_outputs(self, inputs, outputs): def is_np_data(input): @@ -398,8 +416,8 @@ def infer_dtype(numpy_dict, dtype_set): # infer dtype from inputs, and dtype means the precision of the test # collect dtype of all inputs - dtype_set = set() - infer_dtype(inputs, dtype_set) + input_dtype_set = set() + infer_dtype(inputs, input_dtype_set) dtype_list = [ np.dtype(np.float64), np.dtype(np.float32), np.dtype(np.float16), np.dtype(np.int64), np.dtype(np.int32), np.dtype(np.uint16), @@ -408,12 +426,20 @@ def infer_dtype(numpy_dict, dtype_set): ] # check the dtype in dtype_list in order, select the first dtype that in dtype_set for dtype in dtype_list: - if dtype in dtype_set: + if dtype in input_dtype_set: self.dtype = dtype break - # save dtype in class attr + # save input dtype in class attr self.__class__.dtype = self.dtype + # infer 
dtype of outputs + output_dtype_set = set() + infer_dtype(outputs, output_dtype_set) + for dtype in dtype_list: + if dtype in output_dtype_set: + self.output_dtype = dtype + break + def feed_var(self, input_vars, place): feed_map = {} for var_name in input_vars: @@ -439,14 +465,10 @@ def feed_var(self, input_vars, place): def _append_ops(self, block): self.__class__.op_type = self.op_type # for ci check, please not delete it for now - if (hasattr(self, "use_mkldnn") and self.use_mkldnn == True) or \ - (hasattr(self, "attrs") and "use_mkldnn" in self.attrs and \ - self.attrs["use_mkldnn"] == True): + if self.is_mkldnn_op(): self.__class__.use_mkldnn = True - if (hasattr(self, "use_xpu") and self.use_xpu == True) or \ - (hasattr(self, "attrs") and "use_xpu" in self.attrs and \ - self.attrs["use_xpu"] == True): + if self.is_xpu_op(): self.__class__.use_xpu = True op_proto = OpProtoHolder.instance().get_op_proto(self.op_type) @@ -1092,12 +1114,15 @@ def check_output_with_place(self, atol = 0 if self.is_bfloat16_op(): - check_dygraph = False - if hasattr(self, 'force_fp32_output') and getattr( - self, 'force_fp32_output'): - atol = 1e-2 + if self.is_mkldnn_op(): + check_dygraph = False + if hasattr(self, 'force_fp32_output') and getattr( + self, 'force_fp32_output'): + atol = 1e-2 + else: + atol = 2 else: - atol = 2 + atol = 1e-2 if no_check_set is not None: if self.op_type not in no_check_set_white_list.no_check_set_white_list: @@ -1193,6 +1218,7 @@ def find_actual(target_name, fetch_list): expect = self.outputs[out_name] expect_t = expect[0] if isinstance(expect, tuple) else expect + # np.uint16 represents bfloat16 if actual_t.dtype == np.uint16 and expect_t.dtype in [ np.float32, np.float64 ]: @@ -1205,6 +1231,7 @@ def find_actual(target_name, fetch_list): expect_t = convert_uint16_to_float(expect_t) actual_t = convert_uint16_to_float(actual_t) atol = max(atol, 0.03) + # NOTE(zhiqiu): np.allclose([], [1.]) returns True # see details: https://stackoverflow.com/questions/38331703/why-does-numpys-broadcasting-sometimes-allow-comparing-arrays-of-different-leng if expect_t.size == 0: @@ -1214,13 +1241,19 @@ def find_actual(target_name, fetch_list): np.allclose( actual_t, expect_t, - rtol=rtol, atol=atol, + rtol=rtol, equal_nan=equal_nan), "Output (" + out_name + ") has diff at " + str(place) + "\nExpect " + str(expect_t) + "\n" + "But Got" + str(actual_t) + " in class " + self.__class__.__name__) if check_dygraph: + if self.is_bfloat16_op(): + if imperative_actual_t.dtype == np.uint16: + imperative_actual_t = convert_uint16_to_float( + imperative_actual_t) + if expect_t.dtype == np.uint16: + expect_t = convert_uint16_to_float(expect_t) if six.moves.reduce( lambda x, y: x * y, imperative_actual_t.shape, 1) == 0 and six.moves.reduce( @@ -1232,6 +1265,7 @@ def find_actual(target_name, fetch_list): imperative_actual_t, expect_t, atol=atol, + rtol=rtol, equal_nan=equal_nan), "Output (" + out_name + ") has diff at " + str(place) + "\nExpect " + str(expect_t) + "\n" + @@ -1340,14 +1374,10 @@ def check_output(self, check_dygraph=True, inplace_atol=None): self.__class__.op_type = self.op_type - if (hasattr(self, "use_mkldnn") and self.use_mkldnn == True) or \ - (hasattr(self, "attrs") and "use_mkldnn" in self.attrs and \ - self.attrs["use_mkldnn"] == True): + if self.is_mkldnn_op(): self.__class__.use_mkldnn = True - if (hasattr(self, "use_xpu") and self.use_xpu == True) or \ - (hasattr(self, "attrs") and "use_xpu" in self.attrs and \ - self.attrs["use_xpu"] == True): + if self.is_xpu_op(): 
self.__class__.use_xpu = True places = self._get_places() @@ -1452,10 +1482,10 @@ def check_grad_with_place(self, op_outputs = self.outputs if hasattr(self, "outputs") else dict() op_attrs = self.attrs if hasattr(self, "attrs") else dict() - if self.is_bfloat16_op(): + self._check_grad_helper() + if self.is_bfloat16_op() and self.is_mkldnn_op(): check_dygraph = False - self._check_grad_helper() if self.dtype == np.float64 and \ self.op_type not in op_threshold_white_list.NEED_FIX_FP64_CHECK_GRAD_THRESHOLD_OP_LIST: numeric_grad_delta = 1e-5 diff --git a/python/paddle/fluid/tests/unittests/test_cast_op.py b/python/paddle/fluid/tests/unittests/test_cast_op.py index 0fc3dccab4a64d..948e344e4c158a 100644 --- a/python/paddle/fluid/tests/unittests/test_cast_op.py +++ b/python/paddle/fluid/tests/unittests/test_cast_op.py @@ -14,7 +14,6 @@ from __future__ import print_function -import op_test import unittest import numpy as np @@ -22,9 +21,10 @@ import paddle.fluid.core as core import paddle.fluid as fluid from paddle.fluid import compiler, Program, program_guard +from op_test import OpTest, convert_uint16_to_float, convert_float_to_uint16 -class TestCastOp1(op_test.OpTest): +class TestCastOpFp32ToFp64(OpTest): def setUp(self): ipt = np.random.random(size=[10, 10]) self.inputs = {'X': ipt.astype('float32')} @@ -42,7 +42,7 @@ def test_grad(self): self.check_grad(['X'], ['Out']) -class TestCastOp2(op_test.OpTest): +class TestCastOpFp16ToFp32(OpTest): def setUp(self): ipt = np.random.random(size=[10, 10]) self.inputs = {'X': ipt.astype('float16')} @@ -57,7 +57,7 @@ def test_check_output(self): self.check_output(atol=1e-3) -class TestCastOp3(op_test.OpTest): +class TestCastOpFp32ToFp16(OpTest): def setUp(self): ipt = np.random.random(size=[10, 10]) self.inputs = {'X': ipt.astype('float32')} @@ -72,6 +72,36 @@ def test_check_output(self): self.check_output(atol=1e-3) +class TestCastOpBf16ToFp32(OpTest): + def setUp(self): + ipt = np.array(np.random.randint(10, size=[10, 10])).astype('uint16') + self.inputs = {'X': ipt} + self.outputs = {'Out': convert_uint16_to_float(ipt)} + self.attrs = { + 'in_dtype': int(core.VarDesc.VarType.BF16), + 'out_dtype': int(core.VarDesc.VarType.FP32) + } + self.op_type = 'cast' + + def test_check_output(self): + self.check_output() + + +class TestCastOpFp32ToBf16(OpTest): + def setUp(self): + ipt = np.random.random(size=[10, 10]).astype('float32') + self.inputs = {'X': ipt} + self.outputs = {'Out': convert_float_to_uint16(ipt)} + self.attrs = { + 'in_dtype': int(core.VarDesc.VarType.FP32), + 'out_dtype': int(core.VarDesc.VarType.BF16) + } + self.op_type = 'cast' + + def test_check_output(self): + self.check_output() + + class TestCastOpError(unittest.TestCase): def test_errors(self): with program_guard(Program(), Program()): From 7e6c0ceef27ec8e0f7fa15d688babd4ee67d20f0 Mon Sep 17 00:00:00 2001 From: Zhang Zheng <32410583+ZzSean@users.noreply.github.com> Date: Sat, 9 Oct 2021 21:04:41 +0800 Subject: [PATCH 083/298] Implement Fused BN + Add + Relu with cudnnFusedOps API. 
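A side note on the ``np.uint16`` data used by the new cast tests and by ``OpTest`` above: NumPy has no native bfloat16 dtype, so bfloat16 values are carried around as the upper 16 bits of the corresponding float32 bit pattern. A minimal NumPy sketch of that round trip (plain truncation; the real ``convert_float_to_uint16`` helper may additionally round, which is ignored here):

.. code-block:: python

    import numpy as np

    def float32_to_bf16_bits(x):
        # Keep only the upper 16 bits of the IEEE-754 float32 bit pattern.
        return (np.asarray(x, dtype=np.float32).view(np.uint32) >> 16).astype(np.uint16)

    def bf16_bits_to_float32(b):
        # Shift the stored 16 bits back into the high half of a float32 pattern.
        return (np.asarray(b, dtype=np.uint16).astype(np.uint32) << 16).view(np.float32)

    x = np.random.rand(4).astype(np.float32)
    bits = float32_to_bf16_bits(x)
    print(bits.dtype, bf16_bits_to_float32(bits))
    # bfloat16 keeps ~7 mantissa bits, so the round trip is close but not exact:
    print(np.abs(x - bf16_bits_to_float32(bits)).max() < 1e-2)   # True
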
(#35955) --- paddle/fluid/operators/fused/CMakeLists.txt | 1 + .../operators/fused/cudnn_bn_add_relu_test.cc | 380 ++++++++++++++++++ .../fused/cudnn_bn_stats_finalize.cu.h | 181 +++++++++ .../fused/cudnn_scale_bias_add_relu.cu.h | 292 ++++++++++++++ 4 files changed, 854 insertions(+) create mode 100644 paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc create mode 100644 paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h create mode 100644 paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h diff --git a/paddle/fluid/operators/fused/CMakeLists.txt b/paddle/fluid/operators/fused/CMakeLists.txt index 599be6912b760e..2630c12db2fc9a 100644 --- a/paddle/fluid/operators/fused/CMakeLists.txt +++ b/paddle/fluid/operators/fused/CMakeLists.txt @@ -80,5 +80,6 @@ if (WITH_GPU OR WITH_ROCM) endif() if ((NOT WITH_ROCM) AND (NOT ${CUDNN_VERSION} VERSION_LESS 8000)) cc_test(test_cudnn_norm_conv SRCS cudnn_norm_conv_test.cc DEPS conv_op blas im2col vol2col depthwise_conv eigen_function tensor op_registry device_context generator memory) + cc_test(test_cudnn_bn_add_relu SRCS cudnn_bn_add_relu_test.cc DEPS batch_norm_op fused_bn_add_activation_op tensor op_registry device_context generator memory) endif() endif() diff --git a/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc b/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc new file mode 100644 index 00000000000000..7229754cb8ed82 --- /dev/null +++ b/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc @@ -0,0 +1,380 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h" +#include "paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/float16.h" + +DECLARE_bool(cudnn_batchnorm_spatial_persistent); + +namespace framework = paddle::framework; +namespace platform = paddle::platform; +namespace op = paddle::operators; +using Tensor = paddle::framework::Tensor; + +USE_OP(batch_norm); + +template +void InitRandomTensor(const std::vector &dims, + framework::Tensor *cpu_out) { + T *cpu_out_ptr = cpu_out->mutable_data(framework::make_ddim(dims), + platform::CPUPlace()); + std::default_random_engine random(0); + std::uniform_real_distribution dis(0.0, 1.0); + for (int i = 0; i < cpu_out->numel(); ++i) { + cpu_out_ptr[i] = static_cast(dis(random)); + } +} + +template +void InitConstantTensor(const std::vector &dims, T value, + framework::Tensor *cpu_out) { + T *cpu_out_ptr = cpu_out->mutable_data(framework::make_ddim(dims), + platform::CPUPlace()); + for (int i = 0; i < cpu_out->numel(); ++i) { + cpu_out_ptr[i] = value; + } +} + +template +void CheckOutput(std::string name, const framework::Tensor &cpu_res, + const framework::Tensor &cpu_base, float diff, + bool is_relative_atol = false) { + if (cpu_res.dims().size() == cpu_base.dims().size()) { + EXPECT_EQ(cpu_res.dims(), cpu_base.dims()); + } else { + EXPECT_EQ(cpu_res.numel(), cpu_base.numel()); + } + + const T *cpu_res_ptr = cpu_res.data(); + const T *cpu_base_ptr = cpu_base.data(); + float max_diff = 0; + int index = 0; + for (int i = 0; i < cpu_res.numel(); ++i) { + float cur_diff; + if (is_relative_atol) { + cur_diff = static_cast( + std::abs((cpu_res_ptr[i] - cpu_base_ptr[i]) / cpu_base_ptr[i])); + EXPECT_LT(static_cast(std::abs((cpu_res_ptr[i] - cpu_base_ptr[i]) / + cpu_base_ptr[i])), + diff); + } else { + cur_diff = static_cast(std::abs(cpu_res_ptr[i] - cpu_base_ptr[i])); + EXPECT_LT(static_cast(std::abs(cpu_res_ptr[i] - cpu_base_ptr[i])), + diff); + } + if (cur_diff > max_diff) { + max_diff = cur_diff; + index = i; + } + } + std::string error_type = is_relative_atol ? "relative" : "absolute"; + LOG(INFO) << "[" << name << "], The dims is [" << cpu_res.dims() + << "], maximum " << error_type << " error is " << max_diff << ": " + << cpu_res_ptr[index] << " vs " << cpu_base_ptr[index]; +} + +template +void ComputeSumAndSquareSum(const framework::Tensor &cpu_x, + framework::Tensor *cpu_sum, + framework::Tensor *cpu_sum_of_square) { + // x is in NHWC format. 
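``ComputeSumAndSquareSum``, whose body continues below, reduces an NHWC tensor over every axis except the channel axis; together with the element count, those two statistics are enough to recover the per-channel mean and biased variance that the BN stats finalize step needs. The same reduction in NumPy:

.. code-block:: python

    import numpy as np

    x = np.random.rand(4, 8, 8, 64).astype(np.float32)           # NHWC, as in the test

    sum_per_c = x.astype(np.float64).sum(axis=(0, 1, 2))          # shape (64,)
    sum_sq_per_c = (x.astype(np.float64) ** 2).sum(axis=(0, 1, 2))

    ele_count = x.shape[0] * x.shape[1] * x.shape[2]              # N * H * W
    mean = sum_per_c / ele_count
    var = sum_sq_per_c / ele_count - mean ** 2                    # biased (ddof=0) variance
    np.testing.assert_allclose(var, x.astype(np.float64).var(axis=(0, 1, 2)), rtol=1e-6)
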
+ auto dims = cpu_x.dims(); + int64_t c = dims[3]; + + const T *cpu_x_ptr = cpu_x.data(); + float *cpu_sum_ptr = + cpu_sum->mutable_data({1, 1, 1, c}, platform::CPUPlace()); + float *cpu_sum_square_ptr = cpu_sum_of_square->mutable_data( + {1, 1, 1, c}, platform::CPUPlace()); + + for (int j = 0; j < c; ++j) { + float tmp_sum = 0.0f; + float tmp_sum_of_squares = 0.0f; + for (int i = 0; i < cpu_x.numel() / c; ++i) { + float tmp_x = static_cast(cpu_x_ptr[i * c + j]); + tmp_sum += tmp_x; + tmp_sum_of_squares += tmp_x * tmp_x; + } + cpu_sum_ptr[j] = tmp_sum; + cpu_sum_square_ptr[j] = tmp_sum_of_squares; + } +} + +// get paddle batchnorm op results as baseline +void ComputeBatchNormForward(const platform::CUDADeviceContext &ctx, + const Tensor &cpu_x, const Tensor &cpu_scale, + const Tensor &cpu_bias, Tensor *cpu_mean, + Tensor *cpu_var, Tensor *cpu_saved_mean, + Tensor *cpu_saved_var, Tensor *cpu_y, + Tensor *cpu_reserve_space) { + framework::Scope scope; + auto *x = scope.Var("X")->GetMutable(); + auto *scale = scope.Var("Scale")->GetMutable(); + auto *bias = scope.Var("Bias")->GetMutable(); + auto *mean = scope.Var("Mean")->GetMutable(); + auto *var = scope.Var("Variance")->GetMutable(); + auto *y = scope.Var("Y")->GetMutable(); + auto *saved_mean = scope.Var("SavedMean")->GetMutable(); + auto *saved_var = + scope.Var("SavedVariance")->GetMutable(); + auto *reserve_space = + scope.Var("ReserveSpace")->GetMutable(); + + auto place = ctx.GetPlace(); + TensorCopySync(cpu_x, place, x); + TensorCopySync(cpu_scale, place, scale); + TensorCopySync(cpu_bias, place, bias); + TensorCopySync(*cpu_mean, place, mean); + TensorCopySync(*cpu_var, place, var); + + int64_t channels = x->dims()[3]; + scale->Resize({channels}); + bias->Resize({channels}); + mean->Resize({channels}); + var->Resize({channels}); + + framework::AttributeMap attrs; + std::string data_layout = "NHWC"; + attrs.insert({"data_layout", data_layout}); + + auto op = framework::OpRegistry::CreateOp( + "batch_norm", {{"X", {"X"}}, + {"Scale", {"Scale"}}, + {"Bias", {"Bias"}}, + {"Mean", {"Mean"}}, + {"Variance", {"Variance"}}}, + {{"Y", {"Y"}}, + {"MeanOut", {"Mean"}}, + {"VarianceOut", {"Variance"}}, + {"SavedMean", {"SavedMean"}}, + {"SavedVariance", {"SavedVariance"}}, + {"ReserveSpace", {"ReserveSpace"}}}, + attrs); + op->Run(scope, ctx.GetPlace()); + + TensorCopySync(*y, platform::CPUPlace(), cpu_y); + TensorCopySync(*mean, platform::CPUPlace(), cpu_mean); + TensorCopySync(*var, platform::CPUPlace(), cpu_var); + TensorCopySync(*saved_mean, platform::CPUPlace(), cpu_saved_mean); + TensorCopySync(*saved_var, platform::CPUPlace(), cpu_saved_var); + TensorCopySync(*reserve_space, platform::CPUPlace(), cpu_reserve_space); +} + +template +class CudnnBNAddReluTester { + public: + CudnnBNAddReluTester(int batch_size, int height, int width, int channels) { + batch_size_ = batch_size; + height_ = height; + width_ = width; + channels_ = channels; + ele_count_ = batch_size_ * height_ * width_; + SetUp(); + } + + ~CudnnBNAddReluTester() {} + + void CheckForward(float diff, bool is_relative_atol = false) { + platform::CUDADeviceContext *ctx = + static_cast( + platform::DeviceContextPool::Instance().Get( + platform::CUDAPlace(0))); + + framework::Tensor cpu_mean_base; + framework::Tensor cpu_var_base; + framework::Tensor cpu_saved_mean_base; + framework::Tensor cpu_saved_var_base; + framework::Tensor cpu_y_base; + framework::Tensor cpu_reserve_space_base; + BaselineForward(*ctx, &cpu_mean_base, &cpu_var_base, &cpu_saved_mean_base, + 
&cpu_saved_var_base, &cpu_y_base, &cpu_reserve_space_base); + + framework::Tensor cpu_mean; + framework::Tensor cpu_var; + framework::Tensor cpu_saved_mean; + framework::Tensor cpu_saved_var; + framework::Tensor cpu_y; + framework::Tensor cpu_bitmask; + FusedForward(*ctx, &cpu_mean, &cpu_var, &cpu_saved_mean, &cpu_saved_var, + &cpu_y, &cpu_bitmask); + + CheckOutput("Mean", cpu_mean, cpu_mean_base, diff, is_relative_atol); + CheckOutput("Variance", cpu_var, cpu_var_base, diff, + is_relative_atol); + CheckOutput("SavedMean", cpu_saved_mean, cpu_saved_mean_base, diff, + is_relative_atol); + CheckOutput("SavedVariance", cpu_saved_var, cpu_saved_var_base, diff, + is_relative_atol); + CheckOutput("Y", cpu_y, cpu_y_base, diff, is_relative_atol); + } + + private: + void SetUp() { + // Initialize input data + InitRandomTensor({batch_size_, height_, width_, channels_}, &cpu_x_); + ComputeSumAndSquareSum(cpu_x_, &cpu_sum_, &cpu_sum_of_square_); + + // scale and bias should be initialized randomly. + InitConstantTensor({channels_}, static_cast(1.0f), + &cpu_bn_scale_); + InitConstantTensor({channels_}, static_cast(0.0f), + &cpu_bn_bias_); + } + + void InitMeanVar(Tensor *cpu_mean, Tensor *cpu_var, Tensor *cpu_saved_mean, + Tensor *cpu_saved_var) { + InitConstantTensor({channels_}, static_cast(0.0f), cpu_mean); + InitConstantTensor({channels_}, static_cast(1.0f), cpu_var); + InitConstantTensor({channels_}, static_cast(0.0f), + cpu_saved_mean); + InitConstantTensor({channels_}, static_cast(0.0f), + cpu_saved_var); + } + + void BaselineForward(const platform::CUDADeviceContext &ctx, Tensor *cpu_mean, + Tensor *cpu_var, Tensor *cpu_saved_mean, + Tensor *cpu_saved_var, Tensor *cpu_y, + Tensor *cpu_reserve_space) { + InitMeanVar(cpu_mean, cpu_var, cpu_saved_mean, cpu_saved_var); + ComputeBatchNormForward(ctx, cpu_x_, cpu_bn_scale_, cpu_bn_bias_, cpu_mean, + cpu_var, cpu_saved_mean, cpu_saved_var, cpu_y, + cpu_reserve_space); + } + + // Get forward results of CudnnBNStatsFinalize + CudnnScaleBiasAddRelu + void FusedForward(const platform::CUDADeviceContext &ctx, Tensor *cpu_mean, + Tensor *cpu_var, Tensor *cpu_saved_mean, + Tensor *cpu_saved_var, Tensor *cpu_y, Tensor *cpu_bitmask) { + framework::Tensor x; + framework::Tensor sum; + framework::Tensor sum_of_square; + framework::Tensor bn_scale; + framework::Tensor bn_bias; + + auto place = ctx.GetPlace(); + TensorCopySync(cpu_x_, place, &x); + TensorCopySync(cpu_sum_, place, &sum); + TensorCopySync(cpu_sum_of_square_, place, &sum_of_square); + TensorCopySync(cpu_bn_scale_, place, &bn_scale); + TensorCopySync(cpu_bn_bias_, place, &bn_bias); + + bn_scale.Resize({1, 1, 1, channels_}); + bn_bias.Resize({1, 1, 1, channels_}); + + T *x_ptr = x.data(); + float *sum_ptr = sum.data(); + float *sum_of_square_ptr = sum_of_square.data(); + float *bn_scale_ptr = bn_scale.data(); + float *bn_bias_ptr = bn_bias.data(); + + framework::Tensor mean; + framework::Tensor var; + framework::Tensor saved_mean; + framework::Tensor saved_var; + framework::Tensor equiv_scale; + framework::Tensor equiv_bias; + framework::Tensor y; + framework::Tensor bitmask; + + InitMeanVar(cpu_mean, cpu_var, cpu_saved_mean, cpu_saved_var); + TensorCopySync(*cpu_mean, place, &mean); + TensorCopySync(*cpu_var, place, &var); + + mean.Resize({1, 1, 1, channels_}); + var.Resize({1, 1, 1, channels_}); + + float *mean_ptr = mean.data(); + float *var_ptr = var.data(); + float *saved_mean_ptr = + saved_mean.mutable_data({1, 1, 1, channels_}, place); + float *saved_var_ptr = + saved_var.mutable_data({1, 1, 1, 
channels_}, place); + T *equiv_scale_ptr = + equiv_scale.mutable_data({1, 1, 1, channels_}, place); + T *equiv_bias_ptr = equiv_bias.mutable_data({1, 1, 1, channels_}, place); + T *y_ptr = + y.mutable_data({batch_size_, height_, width_, channels_}, place); + + // bitmask + int c = channels_; + int64_t nhw = ele_count_; + int32_t c_int32_elems = ((c + 63) & ~63) / 32; + int32_t nhw_int32_elems = (nhw + 31) & ~31; + int32_t *bitmask_ptr = bitmask.mutable_data( + {nhw_int32_elems, c_int32_elems, 1}, place); + + auto data_shape = framework::vectorize(x.dims()); + auto param_shape = framework::vectorize(bn_scale.dims()); + auto bitmask_shape = framework::vectorize(bitmask.dims()); + + // 1. BN Stats Finalize + op::CudnnBNStatsFinalize bn_op(ctx, param_shape); + bn_op.Forward(ctx, sum_ptr, sum_of_square_ptr, bn_scale_ptr, bn_bias_ptr, + saved_mean_ptr, saved_var_ptr, mean_ptr, var_ptr, + equiv_scale_ptr, equiv_bias_ptr, eps_, momentum_, ele_count_, + true); + + // 2. Scale Bias + Relu (not fused add) + std::string act_type = ""; + op::CudnnScaleBiasAddRelu sbar_op( + ctx, act_type, false, false, data_shape, param_shape, bitmask_shape); + sbar_op.Forward(ctx, x_ptr, equiv_scale_ptr, equiv_bias_ptr, y_ptr, + bitmask_ptr); + + TensorCopySync(mean, platform::CPUPlace(), cpu_mean); + TensorCopySync(var, platform::CPUPlace(), cpu_var); + TensorCopySync(saved_mean, platform::CPUPlace(), cpu_saved_mean); + TensorCopySync(saved_var, platform::CPUPlace(), cpu_saved_var); + TensorCopySync(y, platform::CPUPlace(), cpu_y); + TensorCopySync(bitmask, platform::CPUPlace(), cpu_bitmask); + } + + private: + int batch_size_; + int height_; + int width_; + int channels_; + int ele_count_; + + // Forward input + framework::Tensor cpu_x_; + framework::Tensor cpu_sum_; + framework::Tensor cpu_sum_of_square_; + framework::Tensor cpu_bn_scale_; + framework::Tensor cpu_bn_bias_; + + double eps_ = 1e-5; + float momentum_ = 0.9; +}; + +TEST(CudnnBNAddReluForward, GPUCudnnBNAddReluForwardFp16) { + int batch_size = 4; + int height = 8; + int width = 8; + int channels = 64; + FLAGS_cudnn_batchnorm_spatial_persistent = true; + CudnnBNAddReluTester test(batch_size, height, + width, channels); + test.CheckForward(2e-3); +} diff --git a/paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h b/paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h new file mode 100644 index 00000000000000..7d4b24cd4fc3de --- /dev/null +++ b/paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h @@ -0,0 +1,181 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/operators/fused/cudnn_fusion_helper.h" +#include "paddle/fluid/platform/cudnn_desc.h" +#include "paddle/fluid/platform/cudnn_helper.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +namespace dynload = platform::dynload; +template +using BatchNormParamType = + typename platform::CudnnDataType::BatchNormParamType; + +#if CUDNN_VERSION >= 8000 + +template +struct BNStatsFinalizeArgs { + BNStatsFinalizeArgs() { + dtype = platform::CudnnDataType::type; + param_dtype = platform::CudnnDataType>::type; + format = CUDNN_TENSOR_NHWC; + } + + void Set(const std::vector ¶m_shape) { + PADDLE_ENFORCE_EQ( + param_shape.size(), 4U, + platform::errors::InvalidArgument( + "The size of param_shape is expected to 4. But recieved " + "param_shape's size is %d, param_shape is [%s].", + param_shape.size(), framework::make_ddim(param_shape))); + + in_desc.set(param_shape, format, param_dtype); + out_desc.set(param_shape, format, dtype); + } + + cudnnDataType_t dtype; + cudnnDataType_t param_dtype; + cudnnTensorFormat_t format; + + platform::TensorDescriptor in_desc; + platform::TensorDescriptor out_desc; +}; + +template +class CudnnBNStatsFinalize { + public: + CudnnBNStatsFinalize(const platform::CUDADeviceContext &ctx, + const std::vector ¶m_shape) + : train_op_(CUDNN_FUSED_BN_FINALIZE_STATISTICS_TRAINING), + inference_op_(CUDNN_FUSED_BN_FINALIZE_STATISTICS_INFERENCE) { + args_.Set(param_shape); + } + ~CudnnBNStatsFinalize() {} + + void Forward(const platform::CUDADeviceContext &ctx, float *sum_ptr, + float *sum_of_squares_ptr, float *scale_ptr, float *bias_ptr, + float *saved_mean_ptr, float *saved_invstd_ptr, + float *running_mean_ptr, float *running_var_ptr, + T *equiv_scale_ptr, T *equiv_bias_ptr, double eps, + float momentum, int64_t ele_count, bool is_train) { + if (is_train) { + TrainInit(ctx); + } else { + InferenceInit(ctx); + } + auto &op = is_train ? 
train_op_ : inference_op_; + + // Set variant_param for both inference_op_ and train_op_ + op.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_SCALE, scale_ptr); + op.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_BIAS, bias_ptr); + op.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_RUNNING_MEAN, running_mean_ptr); + op.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_RUNNING_VAR, running_var_ptr); + op.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_EQSCALE, equiv_scale_ptr); + op.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_EQBIAS, equiv_bias_ptr); + op.SetOpVariantParamAttrPtr(CUDNN_SCALAR_DOUBLE_BN_EPSILON, &eps); + + // Set extra variant_param only for train_op_: + if (is_train) { + op.SetOpVariantParamAttrPtr(CUDNN_PTR_YSUM, sum_ptr); + op.SetOpVariantParamAttrPtr(CUDNN_PTR_YSQSUM, sum_of_squares_ptr); + op.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_SAVED_MEAN, saved_mean_ptr); + op.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_SAVED_INVSTD, saved_invstd_ptr); + double avg_factor = 1.0 - momentum; + op.SetOpVariantParamAttrPtr(CUDNN_SCALAR_INT64_T_BN_ACCUMULATION_COUNT, + &ele_count); + op.SetOpVariantParamAttrPtr(CUDNN_SCALAR_DOUBLE_BN_EXP_AVG_FACTOR, + &avg_factor); + } + // fused op execute + auto handle = ctx.cudnn_handle(); + op.Execute(handle); + } + + private: + void TrainInit(const platform::CUDADeviceContext &ctx) { + // Set constant_param for train op + train_op_.SetOpConstParamAttr( + {CUDNN_PARAM_YSUM_PLACEHOLDER, CUDNN_PARAM_YSQSUM_PLACEHOLDER, + CUDNN_PARAM_BN_SCALE_PLACEHOLDER, CUDNN_PARAM_BN_BIAS_PLACEHOLDER, + CUDNN_PARAM_BN_SAVED_MEAN_PLACEHOLDER, + CUDNN_PARAM_BN_SAVED_INVSTD_PLACEHOLDER, + CUDNN_PARAM_BN_RUNNING_MEAN_PLACEHOLDER, + CUDNN_PARAM_BN_RUNNING_VAR_PLACEHOLDER, + CUDNN_PARAM_BN_EQSCALE_PLACEHOLDER, CUDNN_PARAM_BN_EQBIAS_PLACEHOLDER}, + CUDNN_PTR_16B_ALIGNED); + // Set input and output desc for train op + train_op_.SetOpConstParamDesc( + {CUDNN_PARAM_YSTATS_DESC, CUDNN_PARAM_BN_SCALEBIAS_MEANVAR_DESC}, + args_.in_desc.desc()); + train_op_.SetOpConstParamDesc(CUDNN_PARAM_BN_EQSCALEBIAS_DESC, + args_.out_desc.desc()); + + // Get workspace + auto handle = ctx.cudnn_handle(); + train_op_.SetOpConstParamAttr(CUDNN_PARAM_BN_MODE, + CUDNN_BATCHNORM_SPATIAL_PERSISTENT); + // Check workspace size, also creates plan. + size_t workspace_size_bytes = train_op_.GetWorkspaceSizeInBytes(handle); + PADDLE_ENFORCE_EQ(workspace_size_bytes, 0U, + platform::errors::InvalidArgument( + "Unexpected non-zero workspace size for " + "CudnnBNStatsFinalize.")); + train_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_WORKSPACE, + static_cast(nullptr)); + train_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_WORKSPACE, + &workspace_size_bytes); + } + + void InferenceInit(const platform::CUDADeviceContext &ctx) { + // Set constant_param for inference op + inference_op_.SetOpConstParamAttr( + {CUDNN_PARAM_BN_SCALE_PLACEHOLDER, CUDNN_PARAM_BN_BIAS_PLACEHOLDER, + CUDNN_PARAM_BN_RUNNING_MEAN_PLACEHOLDER, + CUDNN_PARAM_BN_RUNNING_VAR_PLACEHOLDER, + CUDNN_PARAM_BN_EQSCALE_PLACEHOLDER, CUDNN_PARAM_BN_EQBIAS_PLACEHOLDER}, + CUDNN_PTR_16B_ALIGNED); + // Set input and output desc for inference op + inference_op_.SetOpConstParamDesc(CUDNN_PARAM_BN_SCALEBIAS_MEANVAR_DESC, + args_.in_desc.desc()); + inference_op_.SetOpConstParamDesc(CUDNN_PARAM_BN_EQSCALEBIAS_DESC, + args_.out_desc.desc()); + + // Get workspace + auto handle = ctx.cudnn_handle(); + inference_op_.SetOpConstParamAttr(CUDNN_PARAM_BN_MODE, + CUDNN_BATCHNORM_SPATIAL_PERSISTENT); + // Check workspace size, also creates plan. 
+ size_t workspace_size_bytes = inference_op_.GetWorkspaceSizeInBytes(handle); + PADDLE_ENFORCE_EQ(workspace_size_bytes, 0U, + platform::errors::InvalidArgument( + "Unexpected non-zero workspace size for " + "CudnnBNStatsFinalize.")); + inference_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_WORKSPACE, + static_cast(nullptr)); + inference_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_WORKSPACE, + &workspace_size_bytes); + } + + BNStatsFinalizeArgs args_; + CudnnFusionOp train_op_; + CudnnFusionOp inference_op_; +}; +#endif +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h b/paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h new file mode 100644 index 00000000000000..2fdb3635e2e149 --- /dev/null +++ b/paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h @@ -0,0 +1,292 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/operators/fused/cudnn_fusion_helper.h" +#include "paddle/fluid/platform/cudnn_desc.h" +#include "paddle/fluid/platform/cudnn_helper.h" + +namespace paddle { +namespace operators { +using Tensor = framework::Tensor; +template +using CudnnDataType = platform::CudnnDataType; +namespace dynload = platform::dynload; +template +using BatchNormParamType = + typename platform::CudnnDataType::BatchNormParamType; + +#if CUDNN_VERSION >= 8000 + +template +struct ScaleBiasAddReluArgs { + ScaleBiasAddReluArgs() { + dtype = platform::CudnnDataType::type; + param_dtype = platform::CudnnDataType>::type; + format = CUDNN_TENSOR_NHWC; + } + + void Set(const std::string &act_type, const std::vector &data_shape, + const std::vector ¶m_shape, + const std::vector &bitmask_shape) { + PADDLE_ENFORCE_EQ( + data_shape.size(), 4U, + platform::errors::InvalidArgument( + "The size of data_shape is expected to 4. But recieved " + "data_shape's size is %d, data_shape is [%s].", + data_shape.size(), framework::make_ddim(data_shape))); + PADDLE_ENFORCE_EQ( + param_shape.size(), 4U, + platform::errors::InvalidArgument( + "The size of param_shape is expected to 4. But recieved " + "param_shape's size is %d, param_shape is [%s].", + param_shape.size(), framework::make_ddim(param_shape))); + PADDLE_ENFORCE_EQ( + bitmask_shape.size(), 3U, + platform::errors::InvalidArgument( + "The size of bitmask_shape is expected to 3. 
But recieved " + "bitmask_shape's size is %d, bitmask_shape is [%s].", + bitmask_shape.size(), framework::make_ddim(bitmask_shape))); + + in_desc.set(data_shape, format, dtype); + out_desc.set(data_shape, format, dtype); + equiv_scale_bias_desc.set(param_shape, format, dtype); + scale_bias_mean_var_desc.set(param_shape, format, param_dtype); + bitmask_desc.set(bitmask_shape, format, CUDNN_DATA_INT32); + // set activation desc + cudnnActivationMode_t mode = CUDNN_ACTIVATION_IDENTITY; + if (act_type != "") { + PADDLE_ENFORCE_EQ( + act_type, "relu", + platform::errors::InvalidArgument( + "Only relu activation supported in normalized convolution.")); + mode = CUDNN_ACTIVATION_RELU; + } + double dummy_clip = 0.0; + activation_desc.set(mode, dummy_clip); + } + + cudnnDataType_t dtype; + cudnnDataType_t param_dtype; + cudnnTensorFormat_t format; + + platform::TensorDescriptor in_desc; + platform::TensorDescriptor out_desc; + platform::TensorDescriptor equiv_scale_bias_desc; + platform::TensorDescriptor scale_bias_mean_var_desc; + platform::TensorDescriptor bitmask_desc; + platform::ActivationDescriptor activation_desc; +}; + +template +class CudnnScaleBiasAddRelu { + public: + CudnnScaleBiasAddRelu(const platform::CUDADeviceContext &ctx, + const std::string &act_type, bool fused_add, + bool has_shortcut, const std::vector &data_shape, + const std::vector ¶m_shape, + const std::vector &bitmask_shape) + : fwd_op_(CUDNN_FUSED_SCALE_BIAS_ADD_ACTIVATION_GEN_BITMASK), + bwd_op_(CUDNN_FUSED_DACTIVATION_FORK_DBATCHNORM) { + fused_add_ = fused_add; + has_shortcut_ = has_shortcut; + args_.Set(act_type, data_shape, param_shape, bitmask_shape); + } + + ~CudnnScaleBiasAddRelu() {} + + void Forward(const platform::CUDADeviceContext &ctx, T *x_ptr, T *x_scale_ptr, + T *x_bias_ptr, T *out_ptr, int32_t *bitmask_ptr, + T *z_ptr = nullptr, T *z_scale_ptr = nullptr, + T *z_bias_ptr = nullptr) { + ForwardInit(ctx); + auto handle = ctx.cudnn_handle(); + auto workspace_handle = ctx.cudnn_workspace_handle(); + fwd_workspace_byte_ = fwd_op_.GetWorkspaceSizeInBytes(handle); + // Set variant_param + // input ptr + fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_XDATA, x_ptr); + fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_EQSCALE, x_scale_ptr); + fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_EQBIAS, x_bias_ptr); + if (has_shortcut_) { + fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_ZDATA, z_ptr); + fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_Z_EQSCALE, z_scale_ptr); + fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_Z_EQBIAS, z_bias_ptr); + } else { + if (fused_add_) { + fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_ZDATA, z_ptr); + } + } + + fwd_op_.SetOpVariantParamAttrPtr( + CUDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES, &fwd_workspace_byte_); + + // output ptr + fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_YDATA, out_ptr); + fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_ACTIVATION_BITMASK, bitmask_ptr); + + workspace_handle.RunFunc( + [&](void *workspace_ptr) { + // workspace ptr + fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_WORKSPACE, workspace_ptr); + // workspace ptr + fwd_op_.Execute(handle); + }, + fwd_workspace_byte_); + } + + void Backward(const platform::CUDADeviceContext &ctx, T *dy_ptr, T *x_ptr, + float *scale_ptr, float *bias_ptr, float *saved_mean_ptr, + float *saved_invstd_ptr, int32_t *bitmask_ptr, T *dx_ptr, + T *dz_ptr, float *dscale_ptr, float *dbias_ptr, double eps) { + BackwardInit(ctx); + auto handle = ctx.cudnn_handle(); + auto workspace_handle = ctx.cudnn_workspace_handle(); + bwd_workspace_byte_ = 
bwd_op_.GetWorkspaceSizeInBytes(handle); + // Set variant_param + // input ptr + bwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_XDATA, x_ptr); + bwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_DYDATA, dy_ptr); + bwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_SCALE, scale_ptr); + bwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_BIAS, bias_ptr); + bwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_SAVED_MEAN, saved_mean_ptr); + bwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_SAVED_INVSTD, + saved_invstd_ptr); + bwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_ACTIVATION_BITMASK, bitmask_ptr); + + bwd_op_.SetOpVariantParamAttrPtr( + CUDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES, &bwd_workspace_byte_); + + // output ptr + bwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_DXDATA, dx_ptr); + bwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_DSCALE, dscale_ptr); + bwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_DBIAS, dbias_ptr); + bwd_op_.SetOpVariantParamAttrPtr(CUDNN_SCALAR_DOUBLE_BN_EPSILON, + &eps); + if (has_shortcut_ || fused_add_) { + bwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_DZDATA, dz_ptr); + } + + workspace_handle.RunFunc( + [&](void *workspace_ptr) { + // workspace ptr + bwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_WORKSPACE, workspace_ptr); + // workspace ptr + bwd_op_.Execute(handle); + }, + bwd_workspace_byte_); + } + + private: + void ForwardInit(const platform::CUDADeviceContext &ctx) { + // Set constant_param + fwd_op_.SetOpConstParamAttr( + {CUDNN_PARAM_XDATA_PLACEHOLDER, CUDNN_PARAM_BN_EQSCALE_PLACEHOLDER, + CUDNN_PARAM_BN_EQBIAS_PLACEHOLDER, CUDNN_PARAM_YDATA_PLACEHOLDER, + CUDNN_PARAM_ACTIVATION_BITMASK_PLACEHOLDER}, + CUDNN_PTR_16B_ALIGNED); + if (has_shortcut_) { + fwd_op_.SetOpConstParamAttr( + {CUDNN_PARAM_ZDATA_PLACEHOLDER, CUDNN_PARAM_BN_Z_EQSCALE_PLACEHOLDER, + CUDNN_PARAM_BN_Z_EQBIAS_PLACEHOLDER}, + CUDNN_PTR_16B_ALIGNED); + } else if (fused_add_) { + fwd_op_.SetOpConstParamAttr(CUDNN_PARAM_ZDATA_PLACEHOLDER, + CUDNN_PTR_16B_ALIGNED); + } + + // input desc + fwd_op_.SetOpConstParamDesc(CUDNN_PARAM_XDESC, args_.in_desc.desc()); + if (has_shortcut_ || fused_add_) { + fwd_op_.SetOpConstParamDesc(CUDNN_PARAM_ZDESC, args_.in_desc.desc()); + } + + // equiv scale/bias desc + fwd_op_.SetOpConstParamDesc(CUDNN_PARAM_BN_EQSCALEBIAS_DESC, + args_.equiv_scale_bias_desc.desc()); + if (has_shortcut_) { + fwd_op_.SetOpConstParamDesc(CUDNN_PARAM_BN_Z_EQSCALEBIAS_DESC, + args_.equiv_scale_bias_desc.desc()); + } + + // output desc + fwd_op_.SetOpConstParamDesc(CUDNN_PARAM_YDESC, args_.out_desc.desc()); + + // bitmask desc + fwd_op_.SetOpConstParamDesc(CUDNN_PARAM_ACTIVATION_BITMASK_DESC, + args_.bitmask_desc.desc()); + + // activation desc + fwd_op_.SetOpConstParamDesc(CUDNN_PARAM_ACTIVATION_DESC, + args_.activation_desc.desc()); + + // others + fwd_op_.SetOpConstParamAttr(CUDNN_PARAM_BN_MODE, + CUDNN_BATCHNORM_SPATIAL_PERSISTENT); + } + + void BackwardInit(const platform::CUDADeviceContext &ctx) { + // Set constant_param + bwd_op_.SetOpConstParamAttr( + {CUDNN_PARAM_XDATA_PLACEHOLDER, CUDNN_PARAM_DYDATA_PLACEHOLDER, + CUDNN_PARAM_DXDATA_PLACEHOLDER, CUDNN_PARAM_BN_SCALE_PLACEHOLDER, + CUDNN_PARAM_BN_BIAS_PLACEHOLDER, CUDNN_PARAM_BN_SAVED_MEAN_PLACEHOLDER, + CUDNN_PARAM_BN_SAVED_INVSTD_PLACEHOLDER, + CUDNN_PARAM_BN_DSCALE_PLACEHOLDER, CUDNN_PARAM_BN_DBIAS_PLACEHOLDER, + CUDNN_PARAM_ACTIVATION_BITMASK_PLACEHOLDER}, + CUDNN_PTR_16B_ALIGNED); + if (has_shortcut_ || fused_add_) { + bwd_op_.SetOpConstParamAttr(CUDNN_PARAM_DZDATA_PLACEHOLDER, + CUDNN_PTR_16B_ALIGNED); + } + + // input desc + 
bwd_op_.SetOpConstParamDesc(CUDNN_PARAM_XDESC, args_.in_desc.desc()); + bwd_op_.SetOpConstParamDesc(CUDNN_PARAM_DXDESC, args_.in_desc.desc()); + if (has_shortcut_ || fused_add_) { + bwd_op_.SetOpConstParamDesc(CUDNN_PARAM_DZDESC, args_.in_desc.desc()); + } + + // scale/bias/mean/var desc for backward + bwd_op_.SetOpConstParamDesc(CUDNN_PARAM_BN_SCALEBIAS_MEANVAR_DESC, + args_.scale_bias_mean_var_desc.desc()); + + // output desc + bwd_op_.SetOpConstParamDesc(CUDNN_PARAM_DYDESC, args_.out_desc.desc()); + + // bitmask desc + bwd_op_.SetOpConstParamDesc(CUDNN_PARAM_ACTIVATION_BITMASK_DESC, + args_.bitmask_desc.desc()); + + // activation desc + bwd_op_.SetOpConstParamDesc(CUDNN_PARAM_ACTIVATION_DESC, + args_.activation_desc.desc()); + + // others + bwd_op_.SetOpConstParamAttr(CUDNN_PARAM_BN_MODE, + CUDNN_BATCHNORM_SPATIAL_PERSISTENT); + } + + bool fused_add_ = false; + bool has_shortcut_ = false; + size_t fwd_workspace_byte_; + size_t bwd_workspace_byte_; + ScaleBiasAddReluArgs args_; + CudnnFusionOp fwd_op_; + CudnnFusionOp bwd_op_; +}; +#endif +} // namespace operators +} // namespace paddle From 9b987b3d95dd6b29f0fb03f4d96e9398c67afe47 Mon Sep 17 00:00:00 2001 From: JingZhuangzhuang <75348594+JZZ-NOTE@users.noreply.github.com> Date: Sun, 10 Oct 2021 20:59:55 -0500 Subject: [PATCH 084/298] Add skip case for conv2d convert test (#36301) --- .../inference/test_trt_convert_conv2d_transpose.py | 14 ++++++++++++-- .../inference/test_trt_convert_depthwise_conv2d.py | 11 ++++++++++- .../test_trt_convert_depthwise_conv2d_transpose.py | 12 +++++++++++- 3 files changed, 33 insertions(+), 4 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d_transpose.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d_transpose.py index 82dd492b5275fb..2c8f2592a737cd 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d_transpose.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d_transpose.py @@ -173,7 +173,7 @@ def generate_trt_nodes_num(attrs, dynamic_shape): attrs, False), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), (1e-5, 1e-5) + attrs, False), (1e-5, 1e-3) self.trt_param.precision = paddle_infer.PrecisionType.Int8 yield self.create_inference_config(), generate_trt_nodes_num( attrs, False), (1e-5, 1e-5) @@ -185,7 +185,7 @@ def generate_trt_nodes_num(attrs, dynamic_shape): True), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True), (1e-5, 1e-5) + attrs, True), (1e-5, 1e-3) self.trt_param.precision = paddle_infer.PrecisionType.Int8 yield self.create_inference_config(), generate_trt_nodes_num( attrs, True), (1e-5, 1e-5) @@ -214,6 +214,16 @@ def teller2(program_config, predictor_config): "When dilations's element is not equal 1, there are different behaviors between Trt and Paddle." ) + def teller3(program_config, predictor_config): + if self.trt_param.precision == paddle_infer.PrecisionType.Int8: + return True + return False + + self.add_skip_case( + teller3, SkipReasons.TRT_NOT_IMPLEMENTED, + "When precisionType is int8 without relu op, output is different between Trt and Paddle." 
+ ) + def test(self): self.add_skip_trt_case() self.run_test() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_depthwise_conv2d.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_depthwise_conv2d.py index e6b3aa30bf8962..fc2358bb116367 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_depthwise_conv2d.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_depthwise_conv2d.py @@ -165,7 +165,6 @@ def generate_trt_nodes_num(attrs, dynamic_shape): attrs, False), (1e-5, 1e-5) # for dynamic_shape - generate_dynamic_shape(attrs) self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), generate_trt_nodes_num(attrs, @@ -190,6 +189,16 @@ def teller1(program_config, predictor_config): "When padding_algorithm is 'SAME' or 'VALID', Trt dose not support. In this case, trt build error is caused by scale op." ) + def teller2(program_config, predictor_config): + if self.trt_param.precision == paddle_infer.PrecisionType.Int8: + return True + return False + + self.add_skip_case( + teller2, SkipReasons.TRT_NOT_IMPLEMENTED, + "When precisionType is int8 without relu op, output is different between Trt and Paddle." + ) + def test(self): self.add_skip_trt_case() self.run_test() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_depthwise_conv2d_transpose.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_depthwise_conv2d_transpose.py index 473925c6cdb794..2fcd2bf5aca974 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_depthwise_conv2d_transpose.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_depthwise_conv2d_transpose.py @@ -137,7 +137,7 @@ def generate_trt_nodes_num(attrs, dynamic_shape): attrs, False), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), (1e-5, 1e-5) + attrs, False), (1e-5, 1e-3) self.trt_param.precision = paddle_infer.PrecisionType.Int8 yield self.create_inference_config(), generate_trt_nodes_num( attrs, False), (1e-5, 1e-5) @@ -178,6 +178,16 @@ def teller2(program_config, predictor_config): "When dilations's element is not equal 1, there are different behaviors between Trt and Paddle." ) + def teller3(program_config, predictor_config): + if self.trt_param.precision == paddle_infer.PrecisionType.Int8: + return True + return False + + self.add_skip_case( + teller3, SkipReasons.TRT_NOT_IMPLEMENTED, + "When precisionType is int8 without relu op, output is different between Trt and Paddle." + ) + def test(self): self.add_skip_trt_case() self.run_test() From 5690666ce60baaee84fb92583bf10a259a8cd385 Mon Sep 17 00:00:00 2001 From: Huihuang Zheng Date: Mon, 11 Oct 2021 10:23:17 +0800 Subject: [PATCH 085/298] Add use_cinn Flag and RunFromCinn in PE (#36107) Add use_cinn flag and use it to control whether we run PaddlePaddle using CINN. 
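For example, switching the flag on from Python (as the new unit test does)

    paddle.set_flags({'FLAGS_use_cinn': True})

makes the executor route feeds and fetches through the new
ParallelExecutor::RunFromCinn path instead of the regular run path.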
Also add: Replace PaddlePaddle graph with a CINN graph in a pass PE Method to feed data and run the graph by CINN --- paddle/fluid/framework/CMakeLists.txt | 2 +- paddle/fluid/framework/details/CMakeLists.txt | 2 +- .../fluid/framework/details/build_strategy.cc | 7 ++- paddle/fluid/framework/ir/CMakeLists.txt | 2 + .../fluid/framework/ir/paddle_to_cinn_pass.cc | 31 ++++++++++ .../fluid/framework/ir/paddle_to_cinn_pass.h | 30 ++++++++++ .../framework/ir/paddle_to_cinn_pass_test.cc | 40 +++++++++++++ .../framework/paddle2cinn/cinn_runner.cc | 15 +++++ .../fluid/framework/paddle2cinn/cinn_runner.h | 12 +++- .../framework/paddle2cinn/cinn_runner_test.cc | 11 ++-- paddle/fluid/framework/parallel_executor.cc | 36 ++++++++++++ paddle/fluid/framework/parallel_executor.h | 5 ++ paddle/fluid/platform/flags.cc | 10 ++++ paddle/fluid/pybind/pybind.cc | 12 ++++ python/paddle/fluid/executor.py | 16 +++++- .../test_parallel_executor_run_cinn.py | 56 +++++++++++++++++++ 16 files changed, 277 insertions(+), 10 deletions(-) create mode 100644 paddle/fluid/framework/ir/paddle_to_cinn_pass.cc create mode 100644 paddle/fluid/framework/ir/paddle_to_cinn_pass.h create mode 100644 paddle/fluid/framework/ir/paddle_to_cinn_pass_test.cc create mode 100644 python/paddle/fluid/tests/unittests/test_parallel_executor_run_cinn.py diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 67073350d5a8aa..6e57b829ade4ed 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -351,7 +351,7 @@ target_link_libraries(executor while_op_helper executor_gc_helper recurrent_op_h cc_library(parallel_executor SRCS parallel_executor.cc DEPS threaded_ssa_graph_executor scope_buffered_ssa_graph_executor parallel_ssa_graph_executor async_ssa_graph_executor graph build_strategy bind_threaded_ssa_graph_executor collective_helper - fast_threaded_ssa_graph_executor variable_helper) + fast_threaded_ssa_graph_executor variable_helper cinn_runner) cc_library(executor_cache SRCS executor_cache.cc DEPS parallel_executor) if(WITH_PSCORE) diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 72f7f0e6011c1b..ad81b48847af9f 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -139,7 +139,7 @@ set(IR_PASS_DEPS graph_viz_pass multi_devices_graph_pass coalesce_grad_tensor_pass fuse_all_reduce_op_pass backward_optimizer_op_deps_pass fuse_adam_op_pass fuse_sgd_op_pass fuse_momentum_op_pass sync_batch_norm_pass runtime_context_cache_pass graph_to_program_pass - fix_op_run_order_pass) + paddle_to_cinn_pass fix_op_run_order_pass) if(NOT APPLE AND NOT WIN32 AND (WITH_GPU OR WITH_ROCM)) set(IR_PASS_DEPS ${IR_PASS_DEPS} fusion_group_pass) endif() diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index 0d55882953db35..a55b809055f3e7 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -19,8 +19,9 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/ir/graph_printer.h" #include "paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h" -DECLARE_bool(use_mkldnn); DECLARE_bool(convert_all_blocks); +DECLARE_bool(use_cinn); +DECLARE_bool(use_mkldnn); namespace paddle { namespace framework { @@ -71,6 +72,10 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { // Note: This pass is used to check whether the multi_device_graph is right. AppendPass("multi_devices_check_pass"); + // Note: This pass is used to enable cinn. + if (FLAGS_use_cinn) { + AppendPass("paddle_to_cinn_pass"); + } SetCollectiveContext(); } diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 99c691e6cf6f7a..6f5f27400752dd 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -59,6 +59,7 @@ cc_library(placement_pass_base SRCS placement_pass_base.cc DEPS pass) cc_library(coalesce_grad_tensor_pass SRCS coalesce_grad_tensor_pass.cc DEPS graph graph_helper) pass_library(graph_to_program_pass base) +pass_library(paddle_to_cinn_pass base DEPS cinn_runner) pass_library(graph_viz_pass base) pass_library(lock_free_optimize_pass base DEPS string_helper) pass_library(fc_fuse_pass inference) @@ -142,6 +143,7 @@ cc_test(pass_test SRCS pass_test.cc DEPS graph pass graph_helper) cc_test(graph_test SRCS graph_test.cc DEPS graph graph_helper op_registry) cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_registry) cc_test(graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph_to_program_pass) +cc_test(paddle_to_cinn_pass_test SRCS paddle_to_cinn_pass_test.cc DEPS paddle_to_cinn_pass proto_desc) cc_test(cost_model_test SRCS cost_model_test.cc DEPS cost_model op_registry) cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS graph_pattern_detector) cc_test(test_op_compat_sensible_pass SRCS op_compat_sensible_pass_tester.cc DEPS op_compat_sensible_pass) diff --git a/paddle/fluid/framework/ir/paddle_to_cinn_pass.cc b/paddle/fluid/framework/ir/paddle_to_cinn_pass.cc new file mode 100644 index 00000000000000..fbf2cfb8d41d6a --- /dev/null +++ b/paddle/fluid/framework/ir/paddle_to_cinn_pass.cc @@ -0,0 +1,31 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/ir/paddle_to_cinn_pass.h" + +#include "paddle/fluid/framework/paddle2cinn/cinn_runner.h" + +namespace paddle { +namespace framework { +namespace ir { + +void PaddleToCinnPass::ApplyImpl(ir::Graph* graph) const { + paddle2cinn::CinnRunner::GetInstance()->ReplaceWithCinn(graph); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(paddle_to_cinn_pass, paddle::framework::ir::PaddleToCinnPass); diff --git a/paddle/fluid/framework/ir/paddle_to_cinn_pass.h b/paddle/fluid/framework/ir/paddle_to_cinn_pass.h new file mode 100644 index 00000000000000..f3b9bd21ebf9ca --- /dev/null +++ b/paddle/fluid/framework/ir/paddle_to_cinn_pass.h @@ -0,0 +1,30 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +class PaddleToCinnPass : public Pass { + protected: + void ApplyImpl(ir::Graph* graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/paddle_to_cinn_pass_test.cc b/paddle/fluid/framework/ir/paddle_to_cinn_pass_test.cc new file mode 100644 index 00000000000000..49d2ce295f3852 --- /dev/null +++ b/paddle/fluid/framework/ir/paddle_to_cinn_pass_test.cc @@ -0,0 +1,40 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/ir/paddle_to_cinn_pass.h" + +#include "gtest/gtest.h" + +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/program_desc.h" + +namespace paddle { +namespace framework { +namespace ir { + +TEST(PaddleToCinnPassTest, TodoTest) { + ProgramDesc program; + Graph graph(program); + + auto pass = paddle::framework::ir::PassRegistry::Instance().Get( + "paddle_to_cinn_pass"); + + pass->Apply(&graph); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +USE_PASS(paddle_to_cinn_pass); diff --git a/paddle/fluid/framework/paddle2cinn/cinn_runner.cc b/paddle/fluid/framework/paddle2cinn/cinn_runner.cc index de5af910c99add..ba90095cae6799 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_runner.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_runner.cc @@ -15,6 +15,8 @@ #include "paddle/fluid/framework/paddle2cinn/cinn_runner.h" #include +#include +#include #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/scope.h" @@ -26,6 +28,19 @@ namespace paddle2cinn { using ir::Graph; +std::once_flag CinnRunner::get_instance_once_flag_; +std::shared_ptr CinnRunner::instance_; + +std::shared_ptr CinnRunner::GetInstance() { + std::call_once(get_instance_once_flag_, + [&]() { instance_.reset(new CinnRunner()); }); + return instance_; +} + +void CinnRunner::ReplaceWithCinn(Graph* graph) { + // TODO(zhhsplendid): call CINN Api when it is ready +} + std::map CinnRunner::Run( const Graph& graph, Scope* scope, std::map* feed_targets) { diff --git a/paddle/fluid/framework/paddle2cinn/cinn_runner.h b/paddle/fluid/framework/paddle2cinn/cinn_runner.h index 5f63d64545ff75..23d9565d2f3926 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_runner.h +++ b/paddle/fluid/framework/paddle2cinn/cinn_runner.h @@ -16,6 +16,7 @@ #include #include +#include #include #include "paddle/fluid/framework/ir/graph.h" @@ -36,15 +37,24 @@ namespace paddle2cinn { // cache. class CinnRunner { public: - CinnRunner() {} ~CinnRunner() {} + // Singleton + static std::shared_ptr GetInstance(); + + // Replace Paddle graph with some CINN subgraphs/ops + void ReplaceWithCinn(ir::Graph* graph); + // Feed LoDTensors to tun CINN compiled object and return fetched result std::map Run( const ir::Graph& graph, Scope* scope, std::map* feed_targets); private: + CinnRunner() {} + + static std::once_flag get_instance_once_flag_; + static std::shared_ptr instance_; std::unordered_map, CinnCacheKey::Hash> cache_; diff --git a/paddle/fluid/framework/paddle2cinn/cinn_runner_test.cc b/paddle/fluid/framework/paddle2cinn/cinn_runner_test.cc index 88aca0bd66b375..c02b994c147ca1 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_runner_test.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_runner_test.cc @@ -12,11 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. 
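// The test below exercises the new singleton interface. The intended call
// sequence (a sketch of what paddle_to_cinn_pass and ParallelExecutor do with
// it; ReplaceWithCinn is still a TODO stub at this point) is roughly:
//
//   auto runner = paddle2cinn::CinnRunner::GetInstance();  // lazy, call_once
//   runner->ReplaceWithCinn(&graph);              // rewrite graph for CINN
//   runner->Run(graph, scope, &feed_targets);     // execute and fetch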
-#include "gtest/gtest.h" +#include "paddle/fluid/framework/paddle2cinn/cinn_runner.h" + +#include +#include "gtest/gtest.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/paddle2cinn/cinn_runner.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" @@ -32,8 +34,9 @@ TEST(CinnRunnerTest, TodoTest) { Scope empty_scope; std::map empty_feed; - CinnRunner cinn_runner; - cinn_runner.Run(empty_graph, &empty_scope, &empty_feed); + std::shared_ptr cinn_runner = CinnRunner::GetInstance(); + cinn_runner->ReplaceWithCinn(&empty_graph); + cinn_runner->Run(empty_graph, &empty_scope, &empty_feed); } } // namespace paddle2cinn diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index d19ac0b65f4d1e..3b80e9c78677d1 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -34,6 +34,7 @@ limitations under the License. */ #include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h" #include "paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h" #include "paddle/fluid/framework/ir/multi_devices_graph_pass/set_reader_device_info_utils.h" +#include "paddle/fluid/framework/paddle2cinn/cinn_runner.h" #include "paddle/fluid/framework/variable_helper.h" #include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" #include "paddle/fluid/platform/event.h" @@ -43,6 +44,7 @@ limitations under the License. */ #include "paddle/fluid/platform/cuda_device_guard.h" #endif +DECLARE_bool(use_cinn); DECLARE_double(eager_delete_tensor_gb); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) @@ -943,6 +945,40 @@ void ParallelExecutor::RunWithoutFetch( member_->executor_->Run(/*fetch_tensors*/ {}, /*return_merged*/ false); } +FetchResultType ParallelExecutor::RunFromCinn( + const std::unordered_map &feed_tensors, + const std::vector &fetch_names) { + // Feed tensor to scope, now only support 1 scope + // TODO(zhhsplendid): handle multiple scope + size_t scope_id = 0; + std::map cinn_input_tensors; + for (auto &name_tensor_pair : feed_tensors) { + bool is_persistable = member_->IsPersistable(name_tensor_pair.first); + if (!is_persistable) { + member_->SetSkipMemoryReuse(scope_id, name_tensor_pair.first); + } + Scope *feed_scope = is_persistable ? member_->local_scopes_[scope_id] + : member_->local_exec_scopes_[scope_id]; + Variable *feed_var = feed_scope->Var(name_tensor_pair.first); + LoDTensor *trg = feed_var->GetMutable(); + trg->ShareDataWith(name_tensor_pair.second); + trg->set_lod(name_tensor_pair.second.lod()); + + cinn_input_tensors[name_tensor_pair.first] = trg; + } + + // TODO(zhhsplendid): get correct API after CINN API is ready + // now only return empty fetch result; + std::shared_ptr cinn_runner = + paddle2cinn::CinnRunner::GetInstance(); + + cinn_runner->Run(Graph(), member_->local_exec_scopes_[scope_id], + &cinn_input_tensors); + + paddle::framework::FetchResultType fetches = FetchList(fetch_names.size()); + return fetches; +} + void ParallelExecutor::SkipMemoryReuse( size_t scope_idx, const std::vector &skip_vars) { for (auto &var_name : skip_vars) { diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index 78774f04896389..f908ce3f013937 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -14,6 +14,7 @@ limitations under the License. 
*/ #pragma once +#include #include #include #include @@ -92,6 +93,10 @@ class ParallelExecutor { void RunWithoutFetch(const std::vector &skip_eager_vars); + FetchResultType RunFromCinn( + const std::unordered_map &feed_tensors, + const std::vector &fetch_names); + void ResetOpHandleScopeMapOfGraphs( const std::unordered_map &scope_map); diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc index 7a7666665511fa..18636f6f842785 100644 --- a/paddle/fluid/platform/flags.cc +++ b/paddle/fluid/platform/flags.cc @@ -681,6 +681,16 @@ PADDLE_DEFINE_EXPORTED_bool( apply_pass_to_program, false, "It controls whether to apply IR pass to program when using Fleet APIs"); +/** + * CINN related FLAG + * Name: FLAGS_use_cinn + * Since Version: 2.3 + * Value Range: bool, default=false + * Example: FLAGS_use_cinn=true would run PaddlePaddle using CINN + */ +PADDLE_DEFINE_EXPORTED_bool( + use_cinn, false, "It controls whether to run PaddlePaddle using CINN"); + DEFINE_int32(record_pool_max_size, 2000000, "SlotRecordDataset slot record pool max size"); DEFINE_int32(slotpool_thread_num, 1, "SlotRecordDataset slot pool thread num"); diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index f58c2a5db381c7..80350abb4fe219 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -3293,6 +3293,18 @@ All parameter, weight, gradient are variables in Paddle. BOOST_GET(paddle::framework::FetchUnmergedList, ret))); } }) + .def("run_from_cinn", + [](ParallelExecutor &self, + const std::unordered_map &feed_tensors, + const std::vector &fetch_names) -> py::object { + paddle::framework::FetchResultType ret; + { + pybind11::gil_scoped_release release; + ret = self.RunFromCinn(feed_tensors, fetch_names); + } + return py::cast( + std::move(BOOST_GET(paddle::framework::FetchList, ret))); + }) .def("device_count", &ParallelExecutor::DeviceCount); BindFleetWrapper(&m); diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index 8c118f31cbe87a..bea5b29ecafa65 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -23,7 +23,8 @@ from .wrapped_decorator import signature_safe_contextmanager import six from .data_feeder import convert_dtype -from .framework import Program, default_main_program, Variable, Operator, convert_np_dtype_to_dtype_ +from .framework import Program, default_main_program, Variable, Operator +from .framework import convert_np_dtype_to_dtype_, get_flags from . import core from . import unique_name from . 
import compiler @@ -1016,7 +1017,16 @@ def _run_parallel(self, program, scope, feed, fetch_list, fetch_var_name, check_feed_shape_type(var, feed_tensor, exe.device_count()) feed_tensor_dict[feed_name] = feed_tensor - exe.feed_and_split_tensor_into_local_scopes(feed_tensor_dict) + #TODO(zhhsplendid): handle other feed data format case for CINN + use_cinn = get_flags("FLAGS_use_cinn")["FLAGS_use_cinn"] + if use_cinn: + fetch_var_names = list(map(_to_name_str, fetch_list)) + fetch_tensors = exe.run_from_cinn( + feed_tensor_dict, fetch_var_names)._move_to_list() + return as_numpy( + fetch_tensors) if return_numpy else fetch_tensors + else: + exe.feed_and_split_tensor_into_local_scopes(feed_tensor_dict) elif isinstance(feed, list) or isinstance(feed, tuple): res = list() for i, each in enumerate(feed): @@ -1036,6 +1046,8 @@ def _run_parallel(self, program, scope, feed, fetch_list, fetch_var_name, check_feed_shape_type(var, tensor) res_dict[feed_name] = tensor res.append(res_dict) + + use_cinn = get_flags("FLAGS_use_cinn")["FLAGS_use_cinn"] exe.feed_tensors_into_local_scopes(res) if hasattr(program._program, 'lr_sheduler'): diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_run_cinn.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_run_cinn.py new file mode 100644 index 00000000000000..e8b1d838261f45 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_run_cinn.py @@ -0,0 +1,56 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
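# This test drives the new FLAGS_use_cinn path end to end: when the flag is
# set, _run_parallel in executor.py calls
# ParallelExecutor.run_from_cinn(feed_tensor_dict, fetch_var_names) instead of
# the regular feed/run/fetch sequence. The test below only checks that this
# path runs, since RunFromCinn currently feeds the scope and returns
# placeholder fetch results until the real CINN API is hooked up.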
+ +from __future__ import print_function + +import numpy as np +import paddle +import unittest + +paddle.enable_static() + + +class TestParallelExecutorRunCinn(unittest.TestCase): + def test_run_from_cinn(self): + paddle.set_flags({'FLAGS_use_cinn': True}) + + main_program = paddle.static.Program() + startup_program = paddle.static.Program() + with paddle.static.program_guard(main_program, startup_program): + data = paddle.static.data( + name='X', shape=[None, 1], dtype='float32') + prediction = paddle.static.nn.fc(data, 2) + loss = paddle.mean(prediction) + adam = paddle.optimizer.Adam() + adam.minimize(loss) + + place = paddle.CUDAPlace(0) if paddle.is_compiled_with_cuda( + ) else paddle.CPUPlace() + exe = paddle.static.Executor(place) + exe.run(startup_program) + compiled_program = paddle.static.CompiledProgram( + main_program).with_data_parallel(loss_name=loss.name) + + batch_size = 16 + x = np.random.random(size=(batch_size, 1)).astype('float32') + fetch = exe.run(compiled_program, + feed={'X': x}, + fetch_list=[prediction.name], + return_merged=False) + + paddle.set_flags({'FLAGS_use_cinn': False}) + + +if __name__ == '__main__': + unittest.main() From 34bd18ff330fa2095338af1da3caa386f63fed60 Mon Sep 17 00:00:00 2001 From: baoachun <962571062@qq.com> Date: Mon, 11 Oct 2021 10:45:37 +0800 Subject: [PATCH 086/298] add skip case in trt converter ut (#36287) * add skip case in trt converter ut * disable group_norm trt plugin --- paddle/fluid/inference/tensorrt/op_teller.cc | 8 +- .../tensorrt/plugin/elementwise_op_plugin.cu | 6 - .../inference/test_trt_convert_elementwise.py | 135 +++++++++++++----- .../test_trt_convert_emb_eltwise_layernorm.py | 12 ++ .../inference/test_trt_convert_group_norm.py | 26 +++- .../test_trt_convert_multihead_matmul.py | 31 +++- 6 files changed, 165 insertions(+), 53 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 5bfd2f12777952..44c001b0bc595e 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -48,9 +48,11 @@ struct SimpleOpTypeSetTeller : public Teller { int8_teller_set.insert("skip_layernorm"); int8_teller_set.insert("slice"); #endif -#if IS_TRT_VERSION_GE(7130) - teller_set.insert("group_norm"); -#endif +// TODO(baoachun) The group_norm trt plugin will check input's dim +// not -1 failed when dynamic shape mode. +// #if IS_TRT_VERSION_GE(7130) +// teller_set.insert("group_norm"); +// #endif #if IS_TRT_VERSION_GE(7000) teller_set.insert("tile"); #endif diff --git a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu index 69e0075729b0dc..d6a1cdb9e68a65 100644 --- a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu @@ -65,12 +65,6 @@ nvinfer1::Dims ElementWisePlugin::getOutputDimensions( } int ElementWisePlugin::initialize() TRT_NOEXCEPT { - PADDLE_ENFORCE_GT(dims_y_.nbDims, 0, - platform::errors::InvalidArgument( - "The dimension of input Y of TRT elementwise op plugin " - "should be greater than 0, but got %d.", - dims_y_.nbDims)); - axis_ = (axis_ == -1) ? 
dims_x_.nbDims - dims_y_.nbDims : axis_; int trimed_nb_dims = dims_y_.nbDims; for (; trimed_nb_dims > 0; --trimed_nb_dims) { diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_elementwise.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_elementwise.py index 2d18738b614cb5..c8cba0f3723807 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_elementwise.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_elementwise.py @@ -32,8 +32,8 @@ def generate_weight(): return np.random.randn(32).astype(np.float32) for batch in [1, 2, 4]: - for shape in [[32], [batch, 32], [batch, 64, 32], - [batch, 8, 16, 32]]: + for shape in [[32], [batch, 32], [batch, 32, 32], + [batch, 32, 16, 32]]: for op_type in ["elementwise_add", "elementwise_mul"]: for axis in [len(shape) - 1, -1]: self.dims = len(shape) @@ -68,26 +68,27 @@ def generate_weight(): def sample_predictor_configs( self, program_config) -> (paddle_infer.Config, List[int], float): def generate_dynamic_shape(attrs): + # The input.dims[1] must be equal to the weight's length. if self.dims == 1: self.dynamic_shape.min_input_shape = {"input_data": [4]} self.dynamic_shape.max_input_shape = {"input_data": [256]} self.dynamic_shape.opt_input_shape = {"input_data": [16]} elif self.dims == 2: - self.dynamic_shape.min_input_shape = {"input_data": [1, 4]} - self.dynamic_shape.max_input_shape = {"input_data": [4, 256]} - self.dynamic_shape.opt_input_shape = {"input_data": [2, 16]} + self.dynamic_shape.min_input_shape = {"input_data": [1, 32]} + self.dynamic_shape.max_input_shape = {"input_data": [4, 32]} + self.dynamic_shape.opt_input_shape = {"input_data": [2, 32]} elif self.dims == 3: - self.dynamic_shape.min_input_shape = {"input_data": [1, 4, 4]} + self.dynamic_shape.min_input_shape = {"input_data": [1, 32, 4]} self.dynamic_shape.max_input_shape = { - "input_data": [4, 256, 256] + "input_data": [4, 32, 256] } self.dynamic_shape.opt_input_shape = {"input_data": [2, 32, 16]} elif self.dims == 4: self.dynamic_shape.min_input_shape = { - "input_data": [1, 4, 4, 4] + "input_data": [1, 32, 4, 4] } self.dynamic_shape.max_input_shape = { - "input_data": [4, 256, 128, 256] + "input_data": [4, 32, 128, 256] } self.dynamic_shape.opt_input_shape = { "input_data": [2, 32, 32, 16] @@ -98,6 +99,11 @@ def clear_dynamic_shape(): self.dynamic_shape.min_input_shape = {} self.dynamic_shape.opt_input_shape = {} + def generate_trt_nodes_num(attrs, dynamic_shape): + if self.dims == 1: + return 0, 3 + return 1, 2 + attrs = [ program_config.ops[i].attrs for i in range(len(program_config.ops)) @@ -106,18 +112,52 @@ def clear_dynamic_shape(): # for static_shape clear_dynamic_shape() self.trt_param.precision = paddle_infer.PrecisionType.Float32 - yield self.create_inference_config(), (0, 3), 1e-5 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half - yield self.create_inference_config(), (0, 3), 1e-5 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), 1e-5 # for dynamic_shape generate_dynamic_shape(attrs) self.trt_param.precision = paddle_infer.PrecisionType.Float32 - yield self.create_inference_config(), (1, 2), 1e-5 + yield self.create_inference_config(), generate_trt_nodes_num(attrs, + True), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half - yield self.create_inference_config(), (1, 2), 1e-5 + yield self.create_inference_config(), 
generate_trt_nodes_num(attrs, + True), 1e-5 + + def add_skip_trt_case(self): + def teller1(program_config, predictor_config): + if self.dims == 2 and len(self.dynamic_shape.max_input_shape) == 0: + return True + return False + + self.add_skip_case( + teller1, SkipReasons.TRT_NOT_IMPLEMENTED, + "The output shape are not equal between gpu and tensorrt when input dim is 2." + ) + + def teller2(program_config, predictor_config): + if self.dims == 3: + return True + return False + + self.add_skip_case( + teller2, SkipReasons.TRT_NOT_IMPLEMENTED, + "The output has diff between gpu and tensorrt when input dim is 3.") + + def teller3(program_config, predictor_config): + if self.dims == 4: + return True + return False + + self.add_skip_case( + teller3, SkipReasons.TRT_NOT_IMPLEMENTED, + "The output has diff between gpu and tensorrt when input dim is 4.") def test(self): + self.add_skip_trt_case() self.run_test() @@ -245,15 +285,26 @@ def clear_dynamic_shape(): self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), (1, 3), 1e-5 + def add_skip_trt_case(self): + def teller1(program_config, predictor_config): + if self.dims == 2: + return True + return False + + self.add_skip_case( + teller1, SkipReasons.TRT_NOT_IMPLEMENTED, + "The output shape are not equal between gpu and tensorrt when input dim is 2." + ) + def test(self): + self.add_skip_trt_case() self.run_test() class TrtConvertElementwiseTest_two_input_with_broadcast(TrtLayerAutoScanTest): def is_program_valid(self, program_config: ProgramConfig) -> bool: inputs = program_config.inputs - if len(inputs['input_data1'].shape) == 1 or len(inputs['input_data2'] - .shape) == 1: + if len(inputs['input_data1'].shape) != len(inputs['input_data2'].shape): return False return True @@ -264,24 +315,27 @@ def generate_input(shape): input1_shape_list = [[4, 32], [2, 4, 32], [4, 2, 4, 32]] input2_shape1_list = [[32], [4, 32], [2, 4, 32]] - input2_shape2_list = [[1, 32], [1, 1, 32], [1, 1, 1, 32]] - input2_shape3_list = [[1, 32], [1, 4, 32], [4, 32]] + input2_shape2_list = [[4, 1], [2, 4, 1], [4, 2, 4, 1]] + input2_shape3_list = [[32], [2, 1, 1], [4, 2, 1, 1]] + input2_shape4_list = [[32], [4, 32], [4, 1, 1, 1]] input2_shape_list = [ - input2_shape1_list, input2_shape2_list, input2_shape3_list + input2_shape1_list, input2_shape2_list, input2_shape3_list, + input2_shape4_list ] axis1_list = [[-1], [1, -1], [1, -1]] - axis2_list = [[-1], [-1], [-1]] - axis3_list = [[-1], [-1], [2, -1]] - axis_list = [axis1_list, axis2_list, axis3_list] + axis2_list = [[-1], [0], [0]] + axis3_list = [[-1], [0], [0]] + axis4_list = [[-1], [-1], [0]] + axis_list = [axis1_list, axis2_list, axis3_list, axis4_list] for i in range(3): input1_shape = input1_shape_list[i] - for j in range(3): + for j in range(4): input2_shape = input2_shape_list[j][i] for op_type in ["elementwise_add", "elementwise_mul"]: for axis in axis_list[j][i]: - self.dims1 = len(input1_shape) - self.dims2 = len(input2_shape) + self.shape1 = input1_shape + self.shape2 = input2_shape dics = [{"axis": axis}] ops_config = [{ "op_type": op_type, @@ -318,16 +372,16 @@ def generate_dynamic_shape(attrs): opt_shape = [[32], [32, 32], [32, 32, 32], [32, 32, 32, 32]] self.dynamic_shape.min_input_shape = { - "input_data1": min_shape[self.dims1 - 1], - "input_data2": min_shape[self.dims2 - 1] + "input_data1": min_shape[len(self.shape1) - 1], + "input_data2": min_shape[len(self.shape2) - 1] } self.dynamic_shape.max_input_shape = { - "input_data1": max_shape[self.dims1 - 1], - 
"input_data2": max_shape[self.dims2 - 1] + "input_data1": max_shape[len(self.shape1) - 1], + "input_data2": max_shape[len(self.shape2) - 1] } self.dynamic_shape.opt_input_shape = { - "input_data1": opt_shape[self.dims1 - 1], - "input_data2": opt_shape[self.dims2 - 1] + "input_data1": opt_shape[len(self.shape1) - 1], + "input_data2": opt_shape[len(self.shape2) - 1] } def clear_dynamic_shape(): @@ -342,10 +396,11 @@ def clear_dynamic_shape(): # for static_shape clear_dynamic_shape() - self.trt_param.precision = paddle_infer.PrecisionType.Float32 - yield self.create_inference_config(), (1, 3), 1e-5 - self.trt_param.precision = paddle_infer.PrecisionType.Half - yield self.create_inference_config(), (1, 3), 1e-5 + if self.shape1[0] == self.shape2[0]: + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), (1, 3), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), (1, 3), 1e-5 # for dynamic_shape generate_dynamic_shape(attrs) @@ -354,7 +409,19 @@ def clear_dynamic_shape(): self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), (1, 3), 1e-5 + def add_skip_trt_case(self): + def teller1(program_config, predictor_config): + if len(self.shape1) == 2: + return True + return False + + self.add_skip_case( + teller1, SkipReasons.TRT_NOT_IMPLEMENTED, + "The output shape are not equal between gpu and tensorrt when input dim is 2." + ) + def test(self): + self.add_skip_trt_case() self.run_test() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_emb_eltwise_layernorm.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_emb_eltwise_layernorm.py index f25a3b82476dca..d7b0bcd908085c 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_emb_eltwise_layernorm.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_emb_eltwise_layernorm.py @@ -252,7 +252,19 @@ def clear_dynamic_shape(): self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), (1, 4), 1e-5 + def add_skip_trt_case(self): + def teller1(program_config, predictor_config): + if self.trt_param.precision == paddle_infer.PrecisionType.Half and len( + self.dynamic_shape.min_input_shape) != 0: + return True + return False + + self.add_skip_case( + teller1, SkipReasons.TRT_NOT_IMPLEMENTED, + "The output has diff between gpu and trt when dynamic fp16 mode.") + def test(self): + self.add_skip_trt_case() self.run_test() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_group_norm.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_group_norm.py index 0224f20ec747e1..b6b5aa9dbfe95c 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_group_norm.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_group_norm.py @@ -114,19 +114,33 @@ def generate_trt_nodes_num(attrs, dynamic_shape): clear_dynamic_shape() self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), 1e-5 + attrs, False), (1e-5, 1e-5) self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), 1e-5 + attrs, False), (1e-5, 1e-5) # for dynamic_shape generate_dynamic_shape(attrs) - # self.trt_param.precision = paddle_infer.PrecisionType.Float32 - # yield 
self.create_inference_config(), generate_trt_nodes_num(attrs, True), 1e-5 - # self.trt_param.precision = paddle_infer.PrecisionType.Half - # yield self.create_inference_config(), generate_trt_nodes_num(attrs, True), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True), (1e-5, 1e-5) + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True), (1e-5, 1e-5) + + def add_skip_trt_case(self): + def teller1(program_config, predictor_config): + if len(self.dynamic_shape.min_input_shape) != 0: + return True + return False + + self.add_skip_case( + teller1, SkipReasons.TRT_NOT_IMPLEMENTED, + "The goup_norm plugin will check dim not -1 failed when dynamic fp16 mode." + ) def test(self): + self.add_skip_trt_case() self.run_test() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_multihead_matmul.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_multihead_matmul.py index e772df522b5c50..0b98ab53fcc297 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_multihead_matmul.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_multihead_matmul.py @@ -38,6 +38,7 @@ def generate_weight2(): return np.random.randn(768).astype(np.float32) for batch in [1, 2, 4]: + self.batch = batch for reshape_shape in [[0, 0, 12, 64]]: for dim1 in [128]: input2_shapes = [[batch, reshape_shape[2], dim1, dim1], @@ -417,18 +418,40 @@ def clear_dynamic_shape(): # for static_shape clear_dynamic_shape() self.trt_param.precision = paddle_infer.PrecisionType.Float32 - yield self.create_inference_config(), (1, 4), 1e-5 + yield self.create_inference_config(), (1, 4), (1e-5, 1e-5) self.trt_param.precision = paddle_infer.PrecisionType.Half - yield self.create_inference_config(), (1, 4), 1e-5 + yield self.create_inference_config(), (1, 4), (1e-5, 1e-5) # for dynamic_shape generate_dynamic_shape(attrs) self.trt_param.precision = paddle_infer.PrecisionType.Float32 - yield self.create_inference_config(), (1, 3), 1e-5 + yield self.create_inference_config(), (1, 3), (1e-5, 1e-5) self.trt_param.precision = paddle_infer.PrecisionType.Half - yield self.create_inference_config(), (1, 3), 1e-5 + yield self.create_inference_config(), (1, 3), (1e-5, 1e-5) + + def add_skip_trt_case(self): + def teller1(program_config, predictor_config): + if self.trt_param.precision == paddle_infer.PrecisionType.Half: + return True + return False + + self.add_skip_case( + teller1, SkipReasons.TRT_NOT_IMPLEMENTED, + "The output has diff between gpu and trt in fp16 mode.") + + def teller2(program_config, predictor_config): + if self.trt_param.precision == paddle_infer.PrecisionType.Float32 and len( + self.dynamic_shape.min_input_shape) != 0 and self.batch > 2: + return True + return False + + self.add_skip_case( + teller2, SkipReasons.TRT_NOT_IMPLEMENTED, + "The output has diff between gpu and trt when dynamic fp32 mode and batch size > 2." + ) def test(self): + self.add_skip_trt_case() self.run_test() From 2b7b752a1c8eb5ffd24d06729c4d3d6bcb1f6b1a Mon Sep 17 00:00:00 2001 From: wangxinxin08 <69842442+wangxinxin08@users.noreply.github.com> Date: Mon, 11 Oct 2021 11:12:24 +0800 Subject: [PATCH 087/298] add mish trt plugin (#34123) * add mish trt plugin, compile & install success, run error. 
test=develop * modify code according to review * add TRT_NOEXCEPT for mish trt plugin * add unittest for mish trt plugin * remove unnecessary check of mish in op_teller.cc * fix some problem of trt8 * add check and modify unittest while converting mish to trt plugin Co-authored-by: dengkaipeng --- paddle/fluid/framework/ir/is_test_pass.cc | 2 +- .../fluid/inference/api/analysis_predictor.cc | 1 + .../inference/tensorrt/convert/CMakeLists.txt | 1 + .../inference/tensorrt/convert/mish_op.cc | 74 ++++++ .../tensorrt/convert/test_mish_op.cc | 47 ++++ paddle/fluid/inference/tensorrt/op_teller.cc | 41 ++- .../inference/tensorrt/plugin/CMakeLists.txt | 1 + .../tensorrt/plugin/mish_op_plugin.cu | 235 ++++++++++++++++++ .../tensorrt/plugin/mish_op_plugin.h | 175 +++++++++++++ .../ir/inference/test_trt_activation_pass.py | 36 +++ .../ir/inference/test_trt_convert_mish.py | 174 +++++++++++++ 11 files changed, 785 insertions(+), 2 deletions(-) create mode 100644 paddle/fluid/inference/tensorrt/convert/mish_op.cc create mode 100644 paddle/fluid/inference/tensorrt/convert/test_mish_op.cc create mode 100644 paddle/fluid/inference/tensorrt/plugin/mish_op_plugin.cu create mode 100644 paddle/fluid/inference/tensorrt/plugin/mish_op_plugin.h create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_mish.py diff --git a/paddle/fluid/framework/ir/is_test_pass.cc b/paddle/fluid/framework/ir/is_test_pass.cc index 25bf03f426a1d9..a97873e82f4554 100644 --- a/paddle/fluid/framework/ir/is_test_pass.cc +++ b/paddle/fluid/framework/ir/is_test_pass.cc @@ -35,7 +35,7 @@ void IsTestPass::ApplyImpl(ir::Graph* graph) const { "hard_shrink", "hard_sigmoid", "relu6", "soft_relu", "swish", "thresholded_relu", "log", "square", "softplus", - "softsign", "silu"}; + "softsign", "silu", "mish"}; for (const Node* n : graph->Nodes()) { if (n->IsOp()) { auto* op = n->Op(); diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 804f035a2e2cac..3136e53e74d090 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -1410,6 +1410,7 @@ USE_TRT_CONVERTER(reduce_mean); USE_TRT_CONVERTER(tile); USE_TRT_CONVERTER(conv3d); USE_TRT_CONVERTER(conv3d_transpose); +USE_TRT_CONVERTER(mish); #endif namespace paddle_infer { diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index c79915629b70d1..f2c7a4b62bbbb3 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -17,6 +17,7 @@ nv_library(tensorrt_converter gather_nd_op.cc tile_op.cc conv3d_op.cc + mish_op.cc DEPS tensorrt_engine tensorrt_plugin operator scope framework_proto op_registry) nv_test(test_op_converter SRCS test_op_converter.cc DEPS diff --git a/paddle/fluid/inference/tensorrt/convert/mish_op.cc b/paddle/fluid/inference/tensorrt/convert/mish_op.cc new file mode 100644 index 00000000000000..6b646d9935b528 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/mish_op.cc @@ -0,0 +1,74 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/plugin/mish_op_plugin.h" + +namespace paddle { +namespace framework { +class Scope; + +namespace proto { +class OpDesc; +} // namespace proto +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace inference { +namespace tensorrt { + +/* + * Mish converter from fluid to tensorRT. + */ +class MishOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + VLOG(4) << "convert fluid Mish op to tensorrt Mish plugin"; + + framework::OpDesc op_desc(op, nullptr); + // Declare inputs + int input_num = op_desc.Input("X").size(); + auto* input = engine_->GetITensor(op_desc.Input("X")[0]); + + const float threshold = + op_desc.HasAttr("threshold") + ? BOOST_GET_CONST(float, op_desc.GetAttr("threshold")) + : 20.0f; + + nvinfer1::ILayer* layer = nullptr; + if (engine_->with_dynamic_shape()) { + bool with_fp16 = + engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); + plugin::MishPluginDynamic* plugin = + new plugin::MishPluginDynamic(threshold, with_fp16); + layer = engine_->AddDynamicPlugin(&input, input_num, plugin); + } else { + bool with_fp16 = + engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); + plugin::MishPlugin* plugin = new plugin::MishPlugin(threshold, with_fp16); + layer = engine_->AddPlugin(&input, input_num, plugin); + } + + auto output_name = op_desc.Output("Out")[0]; + RreplenishLayerAndOutput(layer, "mish", {output_name}, test_mode); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(mish, MishOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/test_mish_op.cc b/paddle/fluid/inference/tensorrt/convert/test_mish_op.cc new file mode 100644 index 00000000000000..c84c30255fa962 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/test_mish_op.cc @@ -0,0 +1,47 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +TEST(mish_op, test_mish) { + std::unordered_set parameters; + framework::Scope scope; + TRTConvertValidation validator(10, parameters, scope, 1000); + validator.DeclInputVar("mish-X", nvinfer1::Dims3(3, 2, 2)); + validator.DeclOutputVar("mish-Out", nvinfer1::Dims3(3, 2, 2)); + + // Prepare Op description + framework::OpDesc desc; + desc.SetType("mish"); + desc.SetInput("X", {"mish-X"}); + desc.SetOutput("Out", {"mish-Out"}); + + desc.SetAttr("threshold", 20.0f); + + validator.SetOp(*desc.Proto()); + + validator.Execute(1); +} + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +USE_OP(mish); diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 44c001b0bc595e..7a70ceda60c1fb 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -136,7 +136,8 @@ struct SimpleOpTypeSetTeller : public Teller { "reduce_sum", "reduce_mean", "conv3d", - "conv3d_transpose"}; + "conv3d_transpose", + "mish"}; }; bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, @@ -1048,6 +1049,44 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, } } + if (op_type == "mish") { + if (desc.Input("X").size() != 1) { + VLOG(3) << "Invalid input X's size of mish TRT converter. " + "Expected 1, received " + << desc.Input("X").size() << "."; + return false; + } + if (desc.Output("Out").size() != 1) { + VLOG(3) << "Invalid output Out's size of mish TRT converter. " + "Expected 1, received " + << desc.Output("Out").size() << "."; + return false; + } + + auto* block = desc.Block(); + if (block == nullptr) { + VLOG(3) << "The block desc is nullptr, we can't continue to analyze. " + "Developers need to check whether block_desc is passed in " + "the pass."; + return false; + } + + auto x_var_name = desc.Input("X")[0]; + auto* x_var_desc = block->FindVar(x_var_name); + const auto x_shape = x_var_desc->GetShape(); + if (x_shape.size() == 1) { + VLOG(3) << "mish op does not support input's dim is 1 in tensorrt."; + return false; + } + + if (!with_dynamic_shape) { + if (x_shape.size() == 2) { + VLOG(3) << "mish op does not support input's dim is 2 in tensorrt."; + return false; + } + } + } + if (op_type == "roi_align") { if (!with_dynamic_shape) { VLOG(3) << "TRT roi align plugin only accept the dynamic shape, " diff --git a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt index 311c2312a9f45b..e6bcb59fd092c8 100644 --- a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt @@ -9,6 +9,7 @@ nv_library(tensorrt_plugin yolo_box_op_plugin.cu roi_align_op_plugin.cu gather_nd_op_plugin.cu + mish_op_plugin.cu DEPS enforce tensorrt_engine prelu tensor bert_encoder_functor) nv_test(test_split_plugin SRCS test_split_plugin.cc DEPS diff --git a/paddle/fluid/inference/tensorrt/plugin/mish_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/mish_op_plugin.cu new file mode 100644 index 00000000000000..6e268e7b0b330d --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/mish_op_plugin.cu @@ -0,0 +1,235 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include
+#include "glog/logging.h"
+#include "paddle/fluid/inference/tensorrt/plugin/mish_op_plugin.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+namespace plugin {
+
+int MishPlugin::initialize() TRT_NOEXCEPT { return 0; }
+
+bool MishPlugin::supportsFormat(
+    nvinfer1::DataType type, nvinfer1::PluginFormat format) const TRT_NOEXCEPT {
+  if (with_fp16_) {
+    return ((type == nvinfer1::DataType::kFLOAT ||
+             type == nvinfer1::DataType::kHALF) &&
+            (format == nvinfer1::PluginFormat::kLINEAR));
+  } else {
+    return ((type == nvinfer1::DataType::kFLOAT) &&
+            (format == nvinfer1::PluginFormat::kLINEAR));
+  }
+}
+
+nvinfer1::Dims MishPlugin::getOutputDimensions(int index,
+                                               const nvinfer1::Dims* in_dims,
+                                               int nb_inputs) TRT_NOEXCEPT {
+  PADDLE_ENFORCE_EQ(nb_inputs, 1, platform::errors::InvalidArgument(
+                                      "We expect [number of inputs] == 1"
+                                      "in TRT Mish op plugin, but got "
+                                      "[number of inputs] = %d.",
+                                      nb_inputs));
+  PADDLE_ENFORCE_LT(index, this->getNbOutputs(),
+                    platform::errors::InvalidArgument(
+                        "We expect [index] < [number of outputs]"
+                        "in TRT Mish op plugin, but got "
+                        "[index] = %d, [number of outputs] = %d.",
+                        index, this->getNbOutputs()));
+  nvinfer1::Dims const& input_dims = in_dims[0];
+  nvinfer1::Dims output_dims = input_dims;
+  return output_dims;
+}
+
+template <typename T>
+__device__ T kTanh(T x) {
+  return tanh(x);
+}
+
+template <>
+__device__ half kTanh(half x) {
+#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__)
+  const float tmp = tanhf(__half2float(x));
+  return __float2half(tmp);
+#endif
+}
+
+template <typename T>
+__device__ T kSoftplus(T x, T threshold) {
+  return x > threshold ? x : log(exp(x) + static_cast<T>(1.0f));
+}
+
+template <>
+__device__ half kSoftplus(half x, half threshold) {
+#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__)
+  return x > threshold ? x : hlog(hexp(x) + static_cast<half>(1.0f));
+#endif
+}
+
+template <typename T>
+__global__ void mish_kernel(float threshold, int n, const T* input, T* output) {
+  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx < n) {
+    const T in = input[idx];
+    output[idx] = in * kTanh(kSoftplus(in, static_cast<T>(threshold)));
+  }
+}
+
+template <>
+__global__ void mish_kernel(float threshold, int n, const half* input,
+                            half* output) {
+#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__)
+  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx < n) {
+    const half in = input[idx];
+    output[idx] =
+        in * kTanh(kSoftplus(in, static_cast<half>(threshold)));
+  }
+#endif
+}
+
+#if IS_TRT_VERSION_LT(8000)
+int MishPlugin::enqueue(int batchSize, const void* const* inputs,
+                        void** outputs,
+#else
+int MishPlugin::enqueue(int batchSize, const void* const* inputs,
+                        void* const* outputs,
+#endif
+                        void* workspace, cudaStream_t stream) TRT_NOEXCEPT {
+  const auto& input_dims = this->getInputDims(0);
+  int num = batchSize;
+  for (int i = 0; i < input_dims.nbDims; i++) {
+    num *= input_dims.d[i];
+  }
+
+  const int block_size = 256;
+  const int grid_size = (num + block_size - 1) / block_size;
+
+  auto type = getDataType();
+  if (type == nvinfer1::DataType::kFLOAT) {
+    VLOG(1) << "TRT Plugin DataType selected. Mish-->fp32";
+    const float* input = static_cast<const float*>(inputs[0]);
+    float* output = static_cast<float*>(outputs[0]);
+    mish_kernel<float><<<grid_size, block_size, 0, stream>>>(threshold_, num,
+                                                             input, output);
+  } else if (type == nvinfer1::DataType::kHALF) {
+    VLOG(1) << "TRT Plugin DataType selected. Mish-->fp16";
+    const half* input = static_cast<const half*>(inputs[0]);
+    half* output = static_cast<half*>(outputs[0]);
+    mish_kernel<half><<<grid_size, block_size, 0, stream>>>(threshold_, num,
+                                                            input, output);
+  } else {
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "The Mish TRT Plugin's input type should be float or half."));
+  }
+
+  return cudaGetLastError() != cudaSuccess;
+}
+
+// Dynamic Plugin below.
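For reference, the computation that mish_kernel above performs is a thresholded softplus followed by tanh and a multiply with the input; the threshold guard mirrors kSoftplus, which returns x directly once exp(x) would be large, presumably to avoid overflow. A rough NumPy sketch of the same math follows; mish_reference and its default threshold are illustrative only and not part of the patch.

import numpy as np

def mish_reference(x, threshold=20.0):
    # Thresholded softplus, matching kSoftplus above:
    # softplus(x) = x when x > threshold, else log(1 + exp(x)).
    softplus = np.where(x > threshold, x,
                        np.log1p(np.exp(np.minimum(x, threshold))))
    # mish(x) = x * tanh(softplus(x)), matching mish_kernel above.
    return x * np.tanh(softplus)

x = np.linspace(-5.0, 5.0, 11).astype(np.float32)
print(mish_reference(x))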
+int MishPluginDynamic::initialize() TRT_NOEXCEPT { + getPluginNamespace(); + return 0; +} + +size_t MishPluginDynamic::getSerializationSize() const TRT_NOEXCEPT { + return SerializedSize(threshold_) + SerializedSize(with_fp16_); +} + +void MishPluginDynamic::serialize(void* buffer) const TRT_NOEXCEPT { + SerializeValue(&buffer, threshold_); + SerializeValue(&buffer, with_fp16_); +} + +nvinfer1::DimsExprs MishPluginDynamic::getOutputDimensions( + int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs, + nvinfer1::IExprBuilder& expr_builder) TRT_NOEXCEPT { + return inputs[0]; +} + +bool MishPluginDynamic::supportsFormatCombination( + int pos, const nvinfer1::PluginTensorDesc* in_out, int nb_inputs, + int nb_outputs) TRT_NOEXCEPT { + PADDLE_ENFORCE_NOT_NULL( + in_out, platform::errors::InvalidArgument( + "The input of mish plugin shoule not be nullptr.")); + + PADDLE_ENFORCE_LT( + pos, nb_inputs + nb_outputs, + platform::errors::InvalidArgument("The pos(%d) should be less than the " + "num(%d) of the input and the output.", + pos, nb_inputs + nb_outputs)); + + const nvinfer1::PluginTensorDesc& in = in_out[pos]; + if (pos == 0) { + if (with_fp16_) { + return (in.type == nvinfer1::DataType::kFLOAT || + in.type == nvinfer1::DataType::kHALF) && + (in.format == nvinfer1::TensorFormat::kLINEAR); + } else { + return (in.type == nvinfer1::DataType::kFLOAT) && + (in.format == nvinfer1::TensorFormat::kLINEAR); + } + } + const nvinfer1::PluginTensorDesc& prev = in_out[pos - 1]; + // output + return in.type == prev.type && in.format == prev.format; +} + +nvinfer1::DataType MishPluginDynamic::getOutputDataType( + int index, const nvinfer1::DataType* input_types, + int nb_inputs) const TRT_NOEXCEPT { + PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument( + "The Mish Plugin only has one input, so the " + "index value should be 0, but get %d.", + index)); + return input_types[0]; +} + +int MishPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* input_desc, + const nvinfer1::PluginTensorDesc* output_desc, + const void* const* inputs, void* const* outputs, + void* workspace, + cudaStream_t stream) TRT_NOEXCEPT { + auto input_dims = input_desc[0].dims; + size_t num = ProductDim(input_dims); + const int block_size = 256; + const int grid_size = (num + block_size - 1) / block_size; + + auto input_type = input_desc[0].type; + if (input_type == nvinfer1::DataType::kFLOAT) { + VLOG(1) << "TRT Plugin DataType selected. Mish-->fp32"; + const float* input = static_cast(inputs[0]); + float* output = static_cast(outputs[0]); + mish_kernel<<>>(threshold_, num, + input, output); + } else if (input_type == nvinfer1::DataType::kHALF) { + VLOG(1) << "TRT Plugin DataType selected. Mish-->fp16"; + const half* input = static_cast(inputs[0]); + half* output = static_cast(outputs[0]); + mish_kernel<<>>(threshold_, num, + input, output); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "The Mish TRT Plugin's input type should be float or half.")); + } + return cudaGetLastError() != cudaSuccess; +} + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/mish_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/mish_op_plugin.h new file mode 100644 index 00000000000000..75390666ea097f --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/mish_op_plugin.h @@ -0,0 +1,175 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include "paddle/fluid/inference/tensorrt/engine.h" +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +class MishPlugin : public PluginTensorRT { + private: + float threshold_; + + protected: + size_t getSerializationSize() const TRT_NOEXCEPT override { + return getBaseSerializationSize() + SerializedSize(threshold_); + } + + // TRT will call this func to serialize the configuration of TRT + // It should not be called by users. + void serialize(void* buffer) const TRT_NOEXCEPT override { + serializeBase(buffer); + SerializeValue(&buffer, threshold_); + } + + public: + explicit MishPlugin(const float threshold, const bool with_fp16) + : threshold_(threshold) { + with_fp16_ = with_fp16; + } + + // It was used for tensorrt deserialization. + // It should not be called by users. + MishPlugin(void const* serialData, size_t serialLength) { + deserializeBase(serialData, serialLength); + DeserializeValue(&serialData, &serialLength, &threshold_); + } + + ~MishPlugin() {} + MishPlugin* clone() const TRT_NOEXCEPT override { + return new MishPlugin(threshold_, with_fp16_); + } + + const char* getPluginType() const TRT_NOEXCEPT override { + return "mish_plugin"; + } + int getNbOutputs() const TRT_NOEXCEPT override { return 1; } + int initialize() TRT_NOEXCEPT override; + bool supportsFormat(nvinfer1::DataType type, nvinfer1::PluginFormat format) + const TRT_NOEXCEPT override; + nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs, + int nbInputDims) TRT_NOEXCEPT override; +#if IS_TRT_VERSION_LT(8000) + int enqueue(int batchSize, const void* const* inputs, void** outputs, +#else + int enqueue(int batchSize, const void* const* inputs, void* const* outputs, +#endif + void* workspace, cudaStream_t stream) TRT_NOEXCEPT override; +}; + +class MishPluginCreator : public TensorRTPluginCreator { + public: + const char* getPluginName() const TRT_NOEXCEPT override { + return "mish_plugin"; + } + + const char* getPluginVersion() const TRT_NOEXCEPT override { return "1"; } + + nvinfer1::IPluginV2* deserializePlugin( + const char* name, const void* serial_data, + size_t serial_length) TRT_NOEXCEPT override { + return new MishPlugin(serial_data, serial_length); + } +}; + +REGISTER_TRT_PLUGIN_V2(MishPluginCreator); + +class MishPluginDynamic : public DynamicPluginTensorRT { + public: + explicit MishPluginDynamic(const float threshold, const bool with_fp16) + : threshold_(threshold) { + with_fp16_ = with_fp16; + } + MishPluginDynamic(void const* serialData, size_t serialLength) { + DeserializeValue(&serialData, &serialLength, &threshold_); + DeserializeValue(&serialData, &serialLength, &with_fp16_); + } + nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override { + return new MishPluginDynamic(threshold_, with_fp16_); + } + + const char* 
getPluginType() const TRT_NOEXCEPT override { + return "mish_plugin_dynamic"; + } + int getNbOutputs() const TRT_NOEXCEPT override { return 1; } + int initialize() TRT_NOEXCEPT override; + + size_t getSerializationSize() const TRT_NOEXCEPT override; + void serialize(void* buffer) const TRT_NOEXCEPT override; + + nvinfer1::DimsExprs getOutputDimensions( + int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs, + nvinfer1::IExprBuilder& expr_builder) TRT_NOEXCEPT override; + + bool supportsFormatCombination(int pos, + const nvinfer1::PluginTensorDesc* inOut, + int nbInputs, + int nbOutputs) TRT_NOEXCEPT override; + + void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, + int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* out, + int nbOutputs) TRT_NOEXCEPT override {} + + size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, + int nbInputs, + const nvinfer1::PluginTensorDesc* outputs, + int nbOutputs) const TRT_NOEXCEPT override { + return 0; + } + + int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, void* const* outputs, void* workspace, + cudaStream_t stream) TRT_NOEXCEPT override; + nvinfer1::DataType getOutputDataType( + int index, const nvinfer1::DataType* inputTypes, + int nbInputs) const TRT_NOEXCEPT override; + + void destroy() TRT_NOEXCEPT override { delete this; } + + private: + float threshold_; +}; + +class MishPluginDynamicCreator : public TensorRTPluginCreator { + public: + const char* getPluginName() const TRT_NOEXCEPT override { + return "mish_plugin_dynamic"; + } + + const char* getPluginVersion() const TRT_NOEXCEPT override { return "1"; } + + nvinfer1::IPluginV2* deserializePlugin( + const char* name, const void* serial_data, + size_t serial_length) TRT_NOEXCEPT override { + auto plugin = new MishPluginDynamic(serial_data, serial_length); + return plugin; + } +}; + +REGISTER_TRT_PLUGIN_V2(MishPluginDynamicCreator); + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_activation_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_activation_pass.py index 8e196f5081f735..62825caf5185cb 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_activation_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_activation_pass.py @@ -139,6 +139,42 @@ def append_act(self, x): return fluid.layers.swish(x) +class TensorRTSubgraphPassMishTest(TensorRTSubgraphPassActivationTest): + def setUpTensorRTParam(self): + self.enable_trt = True + self.trt_parameters = TensorRTSubgraphPassActivationTest.TensorRTParam( + 1 << 30, 32, 0, AnalysisConfig.Precision.Float32, True, False) + + def append_act(self, x): + return fluid.layers.mish(x) + + +class TensorRTSubgraphPassMishFp16SerializeTest( + TensorRTSubgraphPassActivationTest): + def setUpTensorRTParam(self): + self.enable_trt = True + self.trt_parameters = TensorRTSubgraphPassActivationTest.TensorRTParam( + 1 << 30, 32, 0, AnalysisConfig.Precision.Half, True, False) + + def append_act(self, x): + return fluid.layers.mish(x) + + +class TensorRTSubgraphPassDynamicMishFp16SerializeTest( + TensorRTSubgraphPassActivationTest): + def setUpTensorRTParam(self): + self.enable_trt = True + self.trt_parameters = TensorRTSubgraphPassActivationTest.TensorRTParam( + 1 << 30, 32, 0, AnalysisConfig.Precision.Half, False, False) + self.dynamic_shape_params = 
TensorRTSubgraphPassActivationTest.DynamicShapeParam( + { + 'data': [1, 6, 8, 8] + }, {'data': [1, 6, 512, 512]}, {'data': [1, 6, 256, 256]}, False) + + def append_act(self, x): + return fluid.layers.mish(x) + + class TensorRTSubgraphPassPreluAllTest(TensorRTSubgraphPassActivationTest): def append_act(self, x): return fluid.layers.prelu(x, mode='all') diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_mish.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_mish.py new file mode 100644 index 00000000000000..d223fd529ab174 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_mish.py @@ -0,0 +1,174 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from trt_layer_auto_scan_test import TrtLayerAutoScanTest, SkipReasons +from program_config import TensorConfig, ProgramConfig +import numpy as np +import paddle.inference as paddle_infer +from functools import partial +from typing import Optional, List, Callable, Dict, Any, Set +import unittest + + +class TrtConvertMishTest(TrtLayerAutoScanTest): + def is_program_valid(self, program_config: ProgramConfig) -> bool: + return True + + def sample_program_configs(self): + def generate_input(batch, dim1, dim2, dim3): + shape = [batch] + if dim1 != 0: + shape.append(dim1) + if dim2 != 0: + shape.append(dim2) + if dim3 != 0: + shape.append(dim3) + return np.random.random(shape).astype(np.float32) + + for batch in [1, 4]: + for dim1 in [0, 3]: + for dim2 in [0, 16]: + for dim3 in [0, 32]: + for thre in [5.0, 20.0]: + self.dim1 = dim1 + self.dim2 = dim2 + self.dim3 = dim3 + + if dim1 == 0 and dim2 != 0: + continue + if dim1 == 0 and dim2 == 0 and dim3 != 0: + continue + + ops_config = [{ + "op_type": "mish", + "op_inputs": { + "X": ["input_data"] + }, + "op_outputs": { + "Out": ["mish_output_data"] + }, + "op_attrs": { + "threshold": thre + } + }] + + ops = self.generate_op_config(ops_config) + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={ + "input_data": TensorConfig( + data_gen=partial(generate_input, batch, + dim1, dim2, dim3)) + }, + outputs=["mish_output_data"]) + + yield program_config + + def sample_predictor_configs(self, program_config): + def generate_dynamic_shape(attrs): + if self.dim1 == 0: + self.dynamic_shape.min_input_shape = {"input_data": [1], } + self.dynamic_shape.max_input_shape = {"input_data": [4], } + self.dynamic_shape.opt_input_shape = {"input_data": [2], } + else: + if self.dim2 == 0 and self.dim3 == 0: + self.dynamic_shape.min_input_shape = { + "input_data": [1, 1], + } + self.dynamic_shape.max_input_shape = { + "input_data": [4, 64], + } + self.dynamic_shape.opt_input_shape = { + "input_data": [2, 3], + } + elif self.dim2 != 0 and self.dim3 != 0: + self.dynamic_shape.min_input_shape = { + "input_data": [1, 1, 1, 1], + } + self.dynamic_shape.max_input_shape = { + "input_data": [4, 64, 128, 128], + } + self.dynamic_shape.opt_input_shape = { + "input_data": [2, 3, 16, 
32], + } + elif self.dim3 == 0: + self.dynamic_shape.min_input_shape = { + "input_data": [1, 1, 1], + } + self.dynamic_shape.max_input_shape = { + "input_data": [4, 64, 256], + } + self.dynamic_shape.opt_input_shape = { + "input_data": [2, 3, 128], + } + + def clear_dynamic_shape(): + self.dynamic_shape.min_input_shape = {} + self.dynamic_shape.max_input_shape = {} + self.dynamic_shape.opt_input_shape = {} + + def generate_trt_nodes_num(attrs, dynamic_shape): + return 1, 2 + + attrs = [ + program_config.ops[i].attrs + for i in range(len(program_config.ops)) + ] + + # for static_shape + clear_dynamic_shape() + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), 1e-5 + + # for dynamic_shape + generate_dynamic_shape(attrs) + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num(attrs, + True), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num(attrs, + True), 1e-5 + + def add_skip_trt_case(self): + def teller1(program_config, predictor_config): + if self.dim1 == 0 and self.dim2 == 0 and self.dim3 == 0: + return True + return False + + self.add_skip_case(teller1, SkipReasons.TRT_NOT_SUPPORT, + "Trt does not support 1-dimensional input.") + + def teller2(program_config, predictor_config): + if (len(self.dynamic_shape.min_input_shape) == 0): + if self.dim1 != 0 and self.dim2 == 0 and self.dim3 == 0: + return True + return False + + self.add_skip_case( + teller2, SkipReasons.TRT_NOT_SUPPORT, + "Need to repair the case: the output of GPU and tensorrt has diff when the input dimension is 2 in static shape mode." 
+ ) + + def test(self): + self.add_skip_trt_case() + self.run_test() + + +if __name__ == "__main__": + unittest.main() From ea76457c95fd5ab460c768f1d90a640b4b96a429 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=8E=E5=AD=A3?= <2042519524@qq.com> Date: Mon, 11 Oct 2021 11:14:17 +0800 Subject: [PATCH 088/298] fix the hidden method in paddle.distributed.utils file (#36210) --- python/paddle/distributed/utils.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/python/paddle/distributed/utils.py b/python/paddle/distributed/utils.py index 6d14b30d18c7f1..63585e167e8e32 100644 --- a/python/paddle/distributed/utils.py +++ b/python/paddle/distributed/utils.py @@ -489,9 +489,6 @@ def __ne__(self, pod): def parse_response(self, res_pods): pass - def rank(self): - return self.rank - def get_visible_gpus(self): r = "" for g in self.gpus: From 2bf82e7598bb319e6b959eb58579d39535c999e7 Mon Sep 17 00:00:00 2001 From: Feiyu Chan Date: Mon, 11 Oct 2021 11:24:40 +0800 Subject: [PATCH 089/298] fix fft axis (#36321) fix: `-1` is used when fft's axis is `0` --- python/paddle/tensor/fft.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/paddle/tensor/fft.py b/python/paddle/tensor/fft.py index 829399d14eaa08..f7990e3f89107b 100644 --- a/python/paddle/tensor/fft.py +++ b/python/paddle/tensor/fft.py @@ -1340,7 +1340,7 @@ def fft_c2c(x, n, axis, norm, forward, name): x = paddle.cast(x, _real_to_complex_dtype(x.dtype)) _check_normalization(norm) - axis = axis or -1 + axis = axis if axis is not None else -1 _check_fft_axis(x, axis) axes = [axis] axes = _normalize_axes(x, axes) @@ -1370,7 +1370,7 @@ def fft_r2c(x, n, axis, norm, forward, onesided, name): if is_interger(x): x = paddle.cast(x, paddle.get_default_dtype()) _check_normalization(norm) - axis = axis or -1 + axis = axis if axis is not None else -1 _check_fft_axis(x, axis) axes = [axis] axes = _normalize_axes(x, axes) @@ -1409,7 +1409,7 @@ def fft_c2r(x, n, axis, norm, forward, name): elif is_floating_point(x): x = paddle.cast(x, _real_to_complex_dtype(x.dtype)) _check_normalization(norm) - axis = axis or -1 + axis = axis if axis is not None else -1 _check_fft_axis(x, axis) axes = [axis] axes = _normalize_axes(x, axes) From 642aaa2e18ed6c7b548fc3b109e8cf6eac4aac63 Mon Sep 17 00:00:00 2001 From: Xiaoxu Chen Date: Mon, 11 Oct 2021 11:30:12 +0800 Subject: [PATCH 090/298] use unified external error message for cufft api (#36114) --- cmake/third_party.cmake | 4 +-- paddle/fluid/operators/spectral_op.cu | 5 ++-- paddle/fluid/platform/enforce.h | 14 ++++++++++ paddle/fluid/platform/enforce_test.cc | 22 +++++++++++++++- paddle/fluid/platform/external_error.proto | 1 + tools/externalError/README.md | 30 +++++++++++++++++----- tools/externalError/spider.py | 29 ++++++++++++++++++++- tools/externalError/start.sh | 2 +- 8 files changed, 92 insertions(+), 15 deletions(-) diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake index 892ae270267a79..b3260ba27b0729 100644 --- a/cmake/third_party.cmake +++ b/cmake/third_party.cmake @@ -251,8 +251,8 @@ if(WITH_GPU) include(external/cub) # download cub list(APPEND third_party_deps extern_cub) endif() - set(URL "https://paddlepaddledeps.bj.bcebos.com/externalErrorMsg.tar.gz" CACHE STRING "" FORCE) - file_download_and_uncompress(${URL} "externalError" MD5 061f3b7895aadcbe2c3ed592590f8b10) # download file externalErrorMsg.tar.gz + set(URL "https://paddlepaddledeps.bj.bcebos.com/externalErrorMsg_20210928.tar.gz" CACHE STRING "" FORCE) + file_download_and_uncompress(${URL} "externalError" MD5 
a712a49384e77ca216ad866712f7cafa) # download file externalErrorMsg.tar.gz if(WITH_TESTING) # copy externalErrorMsg.pb, just for unittest can get error message correctly. set(SRC_DIR ${THIRD_PARTY_PATH}/externalError/data) diff --git a/paddle/fluid/operators/spectral_op.cu b/paddle/fluid/operators/spectral_op.cu index 9aa5ca39d737e0..24dffaad41b5fc 100644 --- a/paddle/fluid/operators/spectral_op.cu +++ b/paddle/fluid/operators/spectral_op.cu @@ -83,9 +83,7 @@ static inline std::string get_cufft_error_info(cufftResult error) { } static inline void CUFFT_CHECK(cufftResult error) { - if (error != CUFFT_SUCCESS) { - PADDLE_THROW(platform::errors::External(get_cufft_error_info(error))); - } + PADDLE_ENFORCE_CUDA_SUCCESS(error); } // This struct is used to easily compute hashes of the @@ -413,6 +411,7 @@ void exec_fft(const DeviceContext& ctx, const Tensor* X, Tensor* out, ? framework::ToRealType(input.type()) : input.type(); auto fft_type = GetFFTTransformType(input.type(), output.type()); + PlanKey Key(framework::vectorize(input.dims()), framework::vectorize(output.dims()), signal_size, fft_type, value_type); diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index c420a5a64be068..7427060add8b10 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -31,6 +31,7 @@ limitations under the License. */ #ifdef PADDLE_WITH_CUDA #include #include +#include #include #include #include @@ -714,6 +715,7 @@ DEFINE_EXTERNAL_API_TYPE(curandStatus_t, CURAND_STATUS_SUCCESS, CURAND); DEFINE_EXTERNAL_API_TYPE(cudnnStatus_t, CUDNN_STATUS_SUCCESS, CUDNN); DEFINE_EXTERNAL_API_TYPE(cublasStatus_t, CUBLAS_STATUS_SUCCESS, CUBLAS); DEFINE_EXTERNAL_API_TYPE(cusolverStatus_t, CUSOLVER_STATUS_SUCCESS, CUSOLVER); +DEFINE_EXTERNAL_API_TYPE(cufftResult_t, CUFFT_SUCCESS, CUFFT); #if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) DEFINE_EXTERNAL_API_TYPE(ncclResult_t, ncclSuccess, NCCL); @@ -751,6 +753,8 @@ inline const char* GetErrorMsgUrl(T status) { return "https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/" "types.html#ncclresult-t"; break; + case platform::proto::ApiType::CUFFT: + return "https://docs.nvidia.com/cuda/cufft/index.html#cufftresult"; default: return "Unknown type of External API, can't get error message URL!"; break; @@ -839,6 +843,7 @@ template std::string GetExternalErrorMsg(curandStatus_t); template std::string GetExternalErrorMsg(cudnnStatus_t); template std::string GetExternalErrorMsg(cublasStatus_t); template std::string GetExternalErrorMsg(cusolverStatus_t); +template std::string GetExternalErrorMsg(cufftResult_t); #if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) template std::string GetExternalErrorMsg(ncclResult_t); #endif @@ -899,6 +904,15 @@ inline std::string build_nvidia_error_msg(cusolverStatus_t stat) { return sout.str(); } +/*************** CUFFT ERROR ***************/ +inline bool is_error(cufftResult_t stat) { return stat != CUFFT_SUCCESS; } + +inline std::string build_nvidia_error_msg(cufftResult_t stat) { + std::ostringstream sout; + sout << "CUFFT error(" << stat << "). 
" << GetExternalErrorMsg(stat); + return sout.str(); +} + /**************** NCCL ERROR ****************/ #if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) inline bool is_error(ncclResult_t nccl_result) { diff --git a/paddle/fluid/platform/enforce_test.cc b/paddle/fluid/platform/enforce_test.cc index 95a852ad6e92a3..c6d5f171ddce4d 100644 --- a/paddle/fluid/platform/enforce_test.cc +++ b/paddle/fluid/platform/enforce_test.cc @@ -9,10 +9,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/platform/enforce.h" + #include #include "gtest/gtest.h" -#include "paddle/fluid/platform/enforce.h" TEST(ENFORCE, OK) { PADDLE_ENFORCE(true, paddle::platform::errors::Unavailable( @@ -418,6 +419,25 @@ TEST(enforce, cuda_success) { "negative vector size, for example).To correct: ensure that all the " "parameters being passed have valid values")); + EXPECT_TRUE(CheckCudaStatusSuccess(CUFFT_SUCCESS)); + EXPECT_TRUE(CheckCudaStatusFailure(CUFFT_INVALID_PLAN, "CUFFT error")); + EXPECT_TRUE(CheckCudaStatusFailure(CUFFT_ALLOC_FAILED, "CUFFT error")); + EXPECT_TRUE(CheckCudaStatusFailure(CUFFT_INVALID_TYPE, "CUFFT error")); + EXPECT_TRUE(CheckCudaStatusFailure(CUFFT_INVALID_VALUE, "CUFFT error")); + EXPECT_TRUE(CheckCudaStatusFailure(CUFFT_INTERNAL_ERROR, "CUFFT error")); + EXPECT_TRUE(CheckCudaStatusFailure(CUFFT_EXEC_FAILED, "CUFFT error")); + EXPECT_TRUE(CheckCudaStatusFailure(CUFFT_SETUP_FAILED, "CUFFT error")); + EXPECT_TRUE(CheckCudaStatusFailure(CUFFT_INVALID_SIZE, "CUFFT error")); + EXPECT_TRUE(CheckCudaStatusFailure(CUFFT_UNALIGNED_DATA, "CUFFT error")); + EXPECT_TRUE( + CheckCudaStatusFailure(CUFFT_INCOMPLETE_PARAMETER_LIST, "CUFFT error")); + EXPECT_TRUE(CheckCudaStatusFailure(CUFFT_INVALID_DEVICE, "CUFFT error")); + EXPECT_TRUE(CheckCudaStatusFailure(CUFFT_PARSE_ERROR, "CUFFT error")); + EXPECT_TRUE(CheckCudaStatusFailure(CUFFT_NO_WORKSPACE, "CUFFT error")); + EXPECT_TRUE(CheckCudaStatusFailure(CUFFT_NOT_IMPLEMENTED, "CUFFT error")); + EXPECT_TRUE(CheckCudaStatusFailure(CUFFT_LICENSE_ERROR, "CUFFT error")); + EXPECT_TRUE(CheckCudaStatusFailure(CUFFT_NOT_SUPPORTED, "CUFFT error")); + #if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) EXPECT_TRUE(CheckCudaStatusSuccess(ncclSuccess)); EXPECT_TRUE(CheckCudaStatusFailure(ncclUnhandledCudaError, "NCCL error")); diff --git a/paddle/fluid/platform/external_error.proto b/paddle/fluid/platform/external_error.proto index 2094de7e10f69e..cbbf803492e64f 100644 --- a/paddle/fluid/platform/external_error.proto +++ b/paddle/fluid/platform/external_error.proto @@ -24,6 +24,7 @@ enum ApiType { CUBLAS = 3; CUSOLVER = 4; NCCL = 5; + CUFFT = 6; } message MessageDesc { diff --git a/tools/externalError/README.md b/tools/externalError/README.md index 029efd8cb94919..0c2ac626991da2 100644 --- a/tools/externalError/README.md +++ b/tools/externalError/README.md @@ -1,9 +1,25 @@ -Usage: +#### **Introduction for crawling new error message:** -Please run: -``` -bash start.sh -``` -If you want to update all external error message, you need to run command `bash start.sh` in current directory, -and upload the generated file `externalErrorMsg.tar.gz` to https://paddlepaddledeps.bj.bcebos.com/externalErrorMsg.tar.gz + +1. add new spider code in spider.py for crawling error message from website. + +2. 
run `bash start.sh` in current directory to generate new externalErrorMsg_${date}.tar.gz file, for example `externalErrorMsg_20210928.tar.gz`. + +3. upload above tar file into bos https://paddlepaddledeps.bj.bcebos.com **paddlepaddledeps** bucket, and copy download link `${download_url}`. ***\*Be careful not to delete original tar file\****. + +4. compute md5 value of above tar file `${md5}`, and modify cmake/third_party.cmake file + + ``` + set(URL "${download_url}" CACHE STRING "" FORCE) + file_download_and_uncompress(${URL} "externalError" MD5 ${md5}) + ``` + + for example: + + ``` + set(URL "https://paddlepaddledeps.bj.bcebos.com/externalErrorMsg_20210928.tar.gz" CACHE STRING "" FORCE) + file_download_and_uncompress(${URL} "externalError" MD5 a712a49384e77ca216ad866712f7cafa) + ``` + +5. commit your changes, and create pull request. diff --git a/tools/externalError/spider.py b/tools/externalError/spider.py index a74d82f40ebebd..e07f05f561cb51 100644 --- a/tools/externalError/spider.py +++ b/tools/externalError/spider.py @@ -17,8 +17,10 @@ import urllib.request import json import collections -import sys, getopt +import sys +import getopt import external_error_pb2 +from html.parser import HTMLParser def parsing(externalErrorDesc): @@ -335,6 +337,31 @@ def parsing(externalErrorDesc): _Messages.message = "'%s'. %s" % (error[0], m_message) print("End crawling errorMessage for nvidia NCCL API!\n") + #*************************************************************************************************# + #*********************************** CUFFT Error Message **************************************# + print("start crawling errorMessage for nvidia CUFFT API--->") + url = 'https://docs.nvidia.com/cuda/cufft/index.html#cufftresult' + + allMessageDesc = externalErrorDesc.errors.add() + allMessageDesc.type = external_error_pb2.CUFFT + + html = urllib.request.urlopen(url).read().decode('utf-8') + + class CUFFTHTMLParser(HTMLParser): + '''CUFFTHTML Parser + ''' + + def handle_data(self, data): + if 'typedef enum cufftResult_t' in data: + for line in data.strip().splitlines()[1:-1]: + status, code, desc = re.split('=|//', line.strip()) + _Messages = allMessageDesc.messages.add() + _Messages.code = int(code.strip(' ,')) + _Messages.message = "'%s'. %s" % (status.strip(), + desc.strip()) + + CUFFTHTMLParser().feed(html) + def main(argv): try: diff --git a/tools/externalError/start.sh b/tools/externalError/start.sh index 32ef63c2612681..82715dd47326c1 100644 --- a/tools/externalError/start.sh +++ b/tools/externalError/start.sh @@ -32,4 +32,4 @@ fi protobuf/bin/protoc -I../../paddle/fluid/platform/ --python_out . ../../paddle/fluid/platform/external_error.proto python3.7 spider.py -tar czvf externalErrorMsg.tar.gz externalErrorMsg.pb +tar czvf externalErrorMsg_$(date +'%Y%m%d').tar.gz externalErrorMsg.pb From 64d08c0e4b141fb951f984c7793180b255a060a9 Mon Sep 17 00:00:00 2001 From: Sing_chan <51314274+betterpig@users.noreply.github.com> Date: Mon, 11 Oct 2021 11:43:55 +0800 Subject: [PATCH 091/298] fix bug of upload third party to bos (#36311) --- paddle/scripts/paddle_build.bat | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat index 0283de66ba5af8..d675f4fdbdb617 100644 --- a/paddle/scripts/paddle_build.bat +++ b/paddle/scripts/paddle_build.bat @@ -395,15 +395,15 @@ if not exist %THIRD_PARTY_PATH% ( echo Getting third party: extracting ... tar -xf %md5%.tar.gz if !ERRORLEVEL! 
EQU 0 ( - echo Get third party from bos successfully + echo Get third party from bos successfully. ) else ( - echo Get third party failed, reason: extract failed, will build locally + echo Get third party failed, reason: extract failed, will build locally. ) del %md5%.tar.gz ) else ( - echo Get third party failed, reason: download failed, will build locally + echo Get third party failed, reason: download failed, will build locally. ) - if not exist %THIRD_PARTY_PATH% ( set UPLOAD_TP_FILE=ON ) + if not exist %THIRD_PARTY_PATH% set UPLOAD_TP_FILE=ON cd %work_dir%\%BUILD_DIR% ) else ( echo Found reusable third_party cache in %THIRD_PARTY_PATH%, will reuse it. @@ -540,18 +540,18 @@ if "%UPLOAD_TP_FILE%"=="ON" ( tar -zcf %md5%.tar.gz %md5% if !errorlevel! EQU 0 ( echo Uploading third_party: uploading ... - %PYTHON_ROOT%\python.exe %BCE_FILE% %md5%.tar.gz paddle-windows/third_party/%sub_dir% 1>nul + %PYTHON_ROOT%\python.exe !BCE_FILE! %md5%.tar.gz paddle-windows/third_party/%sub_dir% 1>nul if !errorlevel! EQU 0 ( - echo Upload third party to bos paddle-windows/third_party/%sub_dir% successfully + echo Upload third party %md5% to bos paddle-windows/third_party/%sub_dir% successfully. ) else ( - echo Failed upload third party to bos, reason: upload failed + echo Failed upload third party to bos, reason: upload failed. ) ) else ( - echo Failed upload third party to bos, reason: compress failed + echo Failed upload third party to bos, reason: compress failed. ) del %md5%.tar.gz ) else ( - echo Failed upload third party to bos, reason: install bce failed + echo Failed upload third party to bos, reason: install bce failed. ) cd %work_dir%\%BUILD_DIR% ) From 110613256898b2431654ab21cbd0ba869f99ec40 Mon Sep 17 00:00:00 2001 From: Qi Li Date: Mon, 11 Oct 2021 12:17:21 +0800 Subject: [PATCH 092/298] [NPU] fix softmax_with_cross_entropy in dygraph, test=develop (#36297) --- .../operators/softmax_with_cross_entropy_op.cc | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc index 0c2d39e7519ef4..78e813edda930c 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc @@ -13,10 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/softmax_with_cross_entropy_op.h" -#include -#include -#include -#include #include "paddle/fluid/framework/op_version_registry.h" namespace paddle { @@ -54,8 +50,7 @@ class SoftmaxWithCrossEntropyOpMaker "exp(logits -max_logits) / sum(exp(logits - max_logits)) - labels, " "where labels is ont-hot." "Currently, the tensor is generated and used in npu kernel only. 
") - .AsIntermediate() - .AsDispensable(); + .AsIntermediate(); #endif AddOutput("Loss", "(Tensor, default: Tensor), A tensor in same shape with " @@ -136,6 +131,11 @@ class SoftmaxWithCrossEntropyOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(ctx->HasOutput("Softmax"), true, platform::errors::InvalidArgument( "Output(Softmax) should be not null.")); +#ifdef PADDLE_WITH_ASCEND_CL + PADDLE_ENFORCE_EQ(ctx->HasOutput("Backprop"), true, + platform::errors::InvalidArgument( + "Output(Backprop) should be not null.")); +#endif PADDLE_ENFORCE_EQ( ctx->HasOutput("Loss"), true, platform::errors::InvalidArgument("Output(Loss) should be not null.")); @@ -225,6 +225,11 @@ class SoftmaxWithCrossEntropyOpGrad : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(ctx->HasInput("Softmax"), true, platform::errors::InvalidArgument( "Input(Softmax) should be not null.")); +#ifdef PADDLE_WITH_ASCEND_CL + PADDLE_ENFORCE_EQ(ctx->HasInput("Backprop"), true, + platform::errors::InvalidArgument( + "Input(Backprop) should be not null.")); +#endif PADDLE_ENFORCE_EQ( ctx->HasInput("Label"), true, platform::errors::InvalidArgument("Input(Label) should be not null.")); From 83541fd45eb03d1d86e5403e17fd41274db65ced Mon Sep 17 00:00:00 2001 From: Qi Li Date: Mon, 11 Oct 2021 12:17:56 +0800 Subject: [PATCH 093/298] [NPU] fix set_value, test=develop (#36272) * [NPU] fix set_value, test=develop * fix typo, test=develop * fix typo, test=develop --- paddle/fluid/operators/set_value_op_npu.cc | 464 +++++------------- .../unittests/npu/test_set_value_op_npu.py | 334 ++++++------- 2 files changed, 274 insertions(+), 524 deletions(-) diff --git a/paddle/fluid/operators/set_value_op_npu.cc b/paddle/fluid/operators/set_value_op_npu.cc index 3a8d81920f262c..e7b124d5bddd64 100644 --- a/paddle/fluid/operators/set_value_op_npu.cc +++ b/paddle/fluid/operators/set_value_op_npu.cc @@ -1,8 +1,11 @@ /* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -10,291 +13,25 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/set_value_op.h" -#include "paddle/fluid/operators/assign_value_op.h" #include "paddle/fluid/operators/npu_op_runner.h" -#include "paddle/fluid/operators/slice_utils.h" -#include "paddle/fluid/operators/utils.h" -#include "paddle/fluid/platform/enforce.h" namespace paddle { namespace operators { -template -class SetValueNPUKernel : public framework::OpKernel { - private: - using Vector_Int64 = std::vector; - void GetNPUStartEndSteps(const Vector_Int64& start, const Vector_Int64& end, - const Vector_Int64& steps, const Vector_Int64& axes, - const framework::DDim& in_dim, - std::vector>& output) const { - int rank = in_dim.size(); - for (int i = 0; i < rank; ++i) { - int axis_size = in_dim[i]; - auto iter = find(axes.begin(), axes.end(), i); - if (iter != axes.end()) { - int idx = iter - axes.begin(); - output[0].push_back(start[idx]); // set as the same as raw input - output[1].push_back(end[idx]); - output[2].push_back(steps[idx]); - } else { - output[0].push_back(0); // begin 0 - output[1].push_back(axis_size); // end = last one - output[2].push_back(1); // step = 1 - } - } - } - - inline std::vector MininumPadNumberMakeSureLastDimGT8( - const std::vector>& npu_slice) const { - int rank = npu_slice[0].size(); - int last_dim_start = npu_slice[0][rank - 1]; - int last_dim_end = npu_slice[1][rank - 1]; - int last_dim_step = npu_slice[2][rank - 1]; - int min_end = last_dim_start + last_dim_step * min_last_dim_value_; - int raw_last_dim_len = (last_dim_end - last_dim_start) / last_dim_step; - return std::vector({std::max(0, min_end - last_dim_end), - min_last_dim_value_ - raw_last_dim_len}); - } - - inline void TileTensor(const framework::ExecutionContext* ctx, - const Tensor* input, Tensor* output) const { - VLOG(4) << "start to tile tensor function, which calls the npu operator " - "TileWithAxis"; - // UNSQUEEZE last dim + TILE last dim * min_last_dim_value_ - Tensor reshape_tensor; - auto reshape_dims = framework::vectorize(input->dims()); - reshape_dims.push_back(1); - reshape_tensor.ShareDataWith(*input); - reshape_tensor.Resize(framework::make_ddim(reshape_dims)); - - auto output_dims = framework::vectorize(input->dims()); - output_dims.push_back(min_last_dim_value_); - output->mutable_data(framework::make_ddim(output_dims), ctx->GetPlace()); - - framework::NPUAttributeMap attr; - attr["axis"] = static_cast(reshape_dims.size() - 1); - attr["tiles"] = min_last_dim_value_; - auto stream = - ctx->template device_context() - .stream(); - NpuOpRunner("TileWithAxis", {reshape_tensor}, {*output}, attr).Run(stream); - } - - inline void BroadcastToD(const framework::ExecutionContext* ctx, - const Tensor* input, - const std::vector* shape, - Tensor* output) const { - VLOG(4) << "Start BroadCast To"; - auto new_shape = std::vector(shape->begin(), shape->end()); - output->mutable_data(framework::make_ddim(new_shape), ctx->GetPlace()); - framework::NPUAttributeMap attr; - attr["shape"] = new_shape; - auto stream = - ctx->template device_context() - .stream(); - NpuOpRunner("BroadcastToD", {*input}, {*output}, attr).Run(stream); - } - - inline void CropTensor(const framework::ExecutionContext* ctx, - const Tensor* input, Tensor* output) const { - auto out_dims = output->dims(); - auto in_dims = input->dims(); - int rank = in_dims.size(); - in_dims[rank - 1] = 1; - output->Resize(in_dims); // unsqueeze output -> [..., 1] - framework::NPUAttributeMap attr; - attr["axis"] = 0; - attr["offsets"] = std::vector(rank, 0); - auto stream = - ctx->template device_context() - 
.stream(); - NpuOpRunner("Crop", {*input, *output}, {*output}, attr).Run(stream); - output->Resize(out_dims); // restore it - } - - void SliceAssignNPU(const framework::ExecutionContext* ctx, - const Tensor* value_tensor, Vector_Int64& start, - Vector_Int64& end, Vector_Int64& steps, - Vector_Int64& axes, Tensor* assigned_tensor) const { - // must ensure assigned_tensor and value_tensor have the same shape - // not support steps < 0 - // output is also the assigned_tensor. - VLOG(4) << "start function SliceAssignND"; - auto stream = - ctx->template device_context() - .stream(); - for (size_t i = 0; i < steps.size(); ++i) { - PADDLE_ENFORCE_GT(steps[i], 0, - platform::errors::InvalidArgument( - "Currently NPU set_value operator doesn't support " - "negative steps, but got %d as step", - steps[i])); - } - std::vector> npu_slice(3); - GetNPUStartEndSteps(start, end, steps, axes, assigned_tensor->dims(), - npu_slice); - auto tile_numbers = MininumPadNumberMakeSureLastDimGT8(npu_slice); - int assigned_tensor_tile_number = tile_numbers[0]; - int value_tensor_tile_number = tile_numbers[1]; - VLOG(4) << "tile number is : " << assigned_tensor_tile_number << " " - << value_tensor_tile_number; - - Tensor tiled_assigned_tns, tiled_value_tns; - if (assigned_tensor_tile_number > 0) { - TileTensor(ctx, assigned_tensor, &tiled_assigned_tns); - TileTensor(ctx, value_tensor, &tiled_value_tns); - // output have different shape, so use a tmp variable before_crop_output; - // add last dim = min_last_dim_value_ in slice - npu_slice[0].push_back(0); - npu_slice[1].push_back(min_last_dim_value_); - npu_slice[2].push_back(1); - } - - framework::NPUAttributeMap attr_input; - attr_input["begin"] = - std::vector(npu_slice[0].begin(), npu_slice[0].end()); - attr_input["end"] = - std::vector(npu_slice[1].begin(), npu_slice[1].end()); - attr_input["strides"] = - std::vector(npu_slice[2].begin(), npu_slice[2].end()); - attr_input["begin_mask"] = 0; - attr_input["end_mask"] = 0; - attr_input["ellipsis_mask"] = 0; - attr_input["new_axis_mask"] = 0; - attr_input["shrink_axis_mask"] = 0; - if (assigned_tensor_tile_number > 0) { - NpuOpRunner("StridedSliceAssignD", {tiled_assigned_tns, tiled_value_tns}, - {tiled_assigned_tns}, attr_input) - .Run(stream); // Remember, set output = input, and this op will - // change the input value. - } else { - NpuOpRunner("StridedSliceAssignD", {*assigned_tensor, *value_tensor}, - {*assigned_tensor}, attr_input) - .Run(stream); - } - if (assigned_tensor_tile_number > 0) { - CropTensor(ctx, &tiled_assigned_tns /*initialzied*/, - assigned_tensor /*initalized*/); - } - } - - void ModifyAxesAccordingNoneAxes(const Vector_Int64& none_axes, - Vector_Int64& axes_to_modify) const { - if (none_axes.empty()) return; - auto none_axes_copy = none_axes; - sort(none_axes_copy.begin(), none_axes_copy.end()); - for (size_t i = 0; i < axes_to_modify.size(); ++i) { - int axis = axes_to_modify[i]; - auto upper = - upper_bound(none_axes_copy.begin(), none_axes_copy.end(), axis); - // Example: none_axes = [1,3,4,5,7] - // axis = 4 - // find the element number less or equal than 4, which is - // 3(1,3,4) - // axis becomes 4 + 3 = 7 ; - axes_to_modify[i] = axis + (upper - none_axes_copy.begin()); - } - } - - void UnsqueezeAccordingNoneAxes(const Vector_Int64& none_axes, - Vector_Int64& slice_dims) const { - // note : axes will change, because new axes inserted. - // sum array to modify the axes. 
because more simply - if (none_axes.empty()) return; - Vector_Int64 slice_dims_with_none; - size_t none_axes_cur = 0; - for (size_t i = 0; i < slice_dims.size(); ++i) { - while (none_axes_cur < none_axes.size() && - none_axes[none_axes_cur] <= static_cast(i)) { - slice_dims_with_none.push_back(1); - none_axes_cur++; - } - slice_dims_with_none.push_back(slice_dims[i]); - } - // if the none_axes.size() > slice_dims.size(), append 1 after last dim - while (none_axes_cur < none_axes.size()) { - slice_dims_with_none.push_back(1); - none_axes_cur++; - } - slice_dims = slice_dims_with_none; - } +using NPUDeviceContext = platform::NPUDeviceContext; - void ModiftyDimsAccordingNoneAndDecrease(Vector_Int64& slice_dim, - Vector_Int64& value_dim, - Vector_Int64& axes, - Vector_Int64& none_axes, - Vector_Int64& dec_axes) const { - // change the value of slice_dim, value_dim, start, end, steps, axes by none - // and decrease axes - // after change, this values can be passed to SliceAssignNPU() directly. - - // Modity Slice Dim - UnsqueezeAccordingNoneAxes(none_axes, slice_dim); - ModifyAxesAccordingNoneAxes(none_axes, dec_axes); - ModifyAxesAccordingNoneAxes(none_axes, axes); - // Modity Value Dim by new slice dim - auto slice_dim_reverse = slice_dim; - auto value_dim_reverse = value_dim; - std::reverse(slice_dim_reverse.begin(), slice_dim_reverse.end()); - std::reverse(value_dim_reverse.begin(), value_dim_reverse.end()); - - Vector_Int64 new_value_dim; - PADDLE_ENFORCE_GE( - slice_dim.size(), value_dim.size(), - platform::errors::InvalidArgument("The size of expanded slice_dim(%d) " - "must greater than the value_dim(%d)", - slice_dim.size(), value_dim.size())); +template +class SetValueNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const { + auto* in = ctx.Input("Input"); + auto* value_tensor = ctx.Input("ValueTensor"); + auto* out = ctx.Output("Out"); - size_t value_cur = 0; - size_t rank = slice_dim.size(); - for (size_t i = 0; i < rank; ++i) { - auto& xsize = slice_dim_reverse[i]; - if (value_cur >= value_dim_reverse.size()) { - new_value_dim.push_back(1); - continue; - } - auto& vsize = value_dim_reverse[value_cur]; - auto it = find(dec_axes.begin(), dec_axes.end(), rank - 1 - i); - if (it != dec_axes.end()) { - // found, insert one dim ; - PADDLE_ENFORCE_EQ(xsize, 1, platform::errors::InvalidArgument( - "The dims refered by decrease axes is " - "not equal to 1, some wrongs happen")); - new_value_dim.push_back(1); - continue; - } - if (xsize == vsize || vsize == 1) { - new_value_dim.push_back(vsize); - ++value_cur; - continue; - } - PADDLE_THROW(platform::errors::InvalidArgument( - "The shape of value_tensor can't be broadcast to value tensor, " - "please check input")); - } - for (; value_cur < value_dim_reverse.size(); ++value_cur) { - if (value_dim_reverse[value_cur] != 1) { - PADDLE_THROW(platform::errors::InvalidArgument( - "The shape of value_tensor can't be broadcast to value tensor, " - "please check input")); - } - } - std::reverse(new_value_dim.begin(), new_value_dim.end()); - value_dim = new_value_dim; - return; - } + auto starts_tensor_list = ctx.MultiInput("StartsTensorList"); + auto ends_tensor_list = ctx.MultiInput("EndsTensorList"); + auto steps_tensor_list = ctx.MultiInput("StepsTensorList"); - public: - void Compute(const framework::ExecutionContext& ctx) const override { - VLOG(2) << "Start Set Value Npu Kernel"; - auto* in = ctx.Input("Input"); - auto* out = ctx.Output("Out"); - auto* value_tensor = 
ctx.Input("ValueTensor"); - auto starts_tensor_list = - ctx.MultiInput("StartsTensorList"); - auto ends_tensor_list = ctx.MultiInput("EndsTensorList"); - auto steps_tensor_list = - ctx.MultiInput("StepsTensorList"); auto axes = ctx.Attr>("axes"); auto starts = ctx.Attr>("starts"); auto ends = ctx.Attr>("ends"); @@ -302,17 +39,6 @@ class SetValueNPUKernel : public framework::OpKernel { auto shape = ctx.Attr>("shape"); auto decrease_axes = ctx.Attr>("decrease_axes"); auto none_axes = ctx.Attr>("none_axes"); - auto dtype = in->type(); - - if (dtype == framework::proto::VarType::FP64 || - dtype == framework::proto::VarType::INT64 || - dtype == framework::proto::VarType::BOOL) { - auto value_type_name = GetValueName(dtype); - PADDLE_THROW(platform::errors::InvalidArgument( - "The NPU setvalue kernel currently only support FLOAT32 and INT32, " - "but got type: %s", - value_type_name.data())); - } if (!starts_tensor_list.empty()) { starts = GetDataFromTensorList(starts_tensor_list); @@ -327,65 +53,137 @@ class SetValueNPUKernel : public framework::OpKernel { auto in_dims = in->dims(); CheckAndUpdateSliceAttrs(in_dims, axes, &starts, &ends, &steps); auto slice_dims = GetSliceDims(in_dims, axes, starts, ends, &steps); - auto place = ctx.GetPlace(); + auto decrease_slice_dims = GetDecreasedDims(slice_dims, decrease_axes); + + auto slice_dims_for_assign = decrease_slice_dims; + if (!none_axes.empty()) { + std::vector slice_dims_with_none; + + size_t none_axes_cur = 0, decrease_axes_cur = 0; + for (int i = 0; i < slice_dims.size(); ++i) { + while (none_axes_cur < none_axes.size() && + none_axes[none_axes_cur] <= i) { + slice_dims_with_none.push_back(1); + none_axes_cur++; + } + if (decrease_axes_cur < decrease_axes.size() && + decrease_axes[decrease_axes_cur] == i) { + decrease_axes_cur++; + } else { + slice_dims_with_none.push_back(slice_dims[i]); + } + } + while (none_axes_cur < none_axes.size()) { + slice_dims_with_none.push_back(1); + none_axes_cur++; + } - // aforementioned code is copyed directly from CPU kernel. - // (@xiongkun03) the following is redesigned by xiongkun. because NPU can do - // step slice assignment. so we deal with all none_axes and decrease_axes - // here. - // 1. we insert 1 into assigned_tensor_shape according to none_axes; - // 2. we insert 1 into value_tensor_shape(value tensor) according to - // decrease_axes; - // 3. we reshape back the assigned_tensor. and return it. - // note : we use a tmp_value_tensor as value_tns. it shares data with - // value_tensor; - // I believe the logic is more simple than cpu logic. 
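For readers following this hunk: the rewritten kernel flattens the strided slice into linear indices over the flattened input and then applies ScatterUpdate (the index_indices loop appears just below). A rough NumPy sketch of that flattening is given here, assuming starts/ends/steps have already been normalized the way CheckAndUpdateSliceAttrs does; set_value_by_scatter is an illustrative helper name, not a Paddle API.

import numpy as np

def set_value_by_scatter(x, axes, starts, ends, steps, value):
    # Expand (axes, starts, ends, steps) to a full per-dimension slice,
    # defaulting untouched dimensions to [0, dim) with step 1.
    dims = x.shape
    s, e, st = [0] * x.ndim, list(dims), [1] * x.ndim
    for i, axis in enumerate(axes):
        s[axis], e[axis], st[axis] = starts[i], ends[i], steps[i]

    # Flatten the slice into linear indices, dimension by dimension,
    # the same expansion the index_indices loop performs.
    flat_idx = [0]
    stride = int(np.prod(dims))
    for i in range(x.ndim):
        stride //= dims[i]
        flat_idx = [base + k * stride
                    for base in flat_idx
                    for k in range(s[i], e[i], st[i])]

    # Scatter the (broadcast) value into the flattened copy of x.
    out = x.reshape(-1).copy()
    out[flat_idx] = np.broadcast_to(value, (len(flat_idx),))
    return out.reshape(dims)

x = np.zeros((2, 3, 4), dtype=np.float32)
print(set_value_by_scatter(x, axes=[1], starts=[1], ends=[3], steps=[1], value=7.0))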
+ slice_dims_for_assign = framework::make_ddim(slice_dims_with_none); + } + + TensorCopy(*in, ctx.GetPlace(), out); + + auto starts_indices = std::vector(in_dims.size(), 0); + auto ends_indices = std::vector(in_dims.size(), 0); + auto strides_indices = std::vector(in_dims.size(), 0); + + for (int i = 0; i < in_dims.size(); ++i) { + starts_indices[i] = 0; + ends_indices[i] = slice_dims[i]; + strides_indices[i] = 1; + } + for (size_t i = 0; i < axes.size(); i++) { + int axis_index = axes[i]; + starts_indices[axis_index] = starts[i]; + ends_indices[axis_index] = ends[i]; + strides_indices[axis_index] = steps[i]; + } + + int64_t stride_step = framework::product(in_dims); + std::vector index_indices(1, 0); + for (size_t i = 0; i < strides_indices.size(); ++i) { + auto index_size = index_indices.size(); + stride_step /= in_dims[i]; + for (size_t j = 0; j < index_size; ++j) { + auto start_index = *index_indices.begin(); + if (strides_indices[i] > 0) { + for (int64_t k = starts_indices[i]; k < ends_indices[i]; + k += strides_indices[i]) { + index_indices.push_back(start_index + k * stride_step); + } + } else { + for (int64_t k = starts_indices[i]; k > ends_indices[i]; + k += strides_indices[i]) { + index_indices.push_back(start_index + k * stride_step); + } + } + index_indices.erase(index_indices.begin()); + } + } - TensorCopy(*in, place, out); - Tensor value_t(dtype); + PADDLE_ENFORCE_EQ( + static_cast(index_indices.size()), + framework::product(slice_dims_for_assign), + platform::errors::InvalidArgument( + "OP(set_value) error index indices and value update not match ")); - if (value_tensor == nullptr) { + Tensor value_t(in->type()); + if (value_tensor != nullptr) { + value_t.ShareDataWith(*value_tensor); + } else { auto value_dims = framework::make_ddim(shape); - value_t.mutable_data(value_dims, place); - auto value_name = GetValueName(dtype); + CheckIsDimsMatch(slice_dims_for_assign, value_dims); + + value_t.mutable_data(value_dims, ctx.GetPlace()); + auto value_name = GetValueName(in->type()); CopyVecotorToTensor(value_name.c_str(), &value_t, ctx); value_t.Resize(value_dims); } - const Tensor* value_tensor_ptr = - (value_tensor == nullptr) ? 
&value_t : value_tensor; - auto value_dims_vec = framework::vectorize(value_tensor_ptr->dims()); - auto slice_dims_vec = framework::vectorize(slice_dims); - auto in_dims_vec = framework::vectorize(in_dims); - - UnsqueezeAccordingNoneAxes(none_axes, in_dims_vec); - ModiftyDimsAccordingNoneAndDecrease(slice_dims_vec, value_dims_vec, axes, - none_axes, - decrease_axes); // Modify and Check + auto stream = ctx.template device_context().stream(); - Tensor reshaped_value_tensor, broadcast_value_tensor; - reshaped_value_tensor.ShareDataWith(*value_tensor_ptr); - reshaped_value_tensor.Resize(framework::make_ddim(value_dims_vec)); - - BroadcastToD(&ctx, &reshaped_value_tensor, &slice_dims_vec, - &broadcast_value_tensor /*inner function initialized*/); + Tensor value_temp(in->type()); + if (slice_dims_for_assign == value_t.dims()) { + value_temp.ShareDataWith(value_t); + } else { + value_temp.Resize(slice_dims_for_assign); + value_temp.mutable_data(ctx.GetPlace()); + NpuOpRunner runner_brd; + runner_brd.SetType("BroadcastTo") + .AddInput(value_t) + .AddInput(framework::vectorize(slice_dims_for_assign)) + .AddOutput(value_temp) + .Run(stream); + } - out->Resize(framework::make_ddim(in_dims_vec)); - SliceAssignNPU(&ctx, &broadcast_value_tensor, starts, ends, steps, axes, - out); - out->Resize(in_dims); // Reshape Back + int64_t input_numel = framework::product(in_dims); + int64_t index_numel = index_indices.size(); + + Tensor in_temp, out_temp, val_temp; + in_temp.ShareDataWith(*in); + out_temp.ShareDataWith(*out); + val_temp.ShareDataWith(value_temp); + in_temp.Resize(framework::make_ddim({input_numel})); + out_temp.Resize(framework::make_ddim({input_numel})); + val_temp.Resize(framework::make_ddim({index_numel})); + + NpuOpRunner runner; + runner.SetType("ScatterUpdate") + .AddInput(in_temp) + .AddInput(std::move(index_indices)) + .AddInput(val_temp) + .AddOutput(out_temp) + .Run(stream); } - - private: - const int min_last_dim_value_ = - 32 / sizeof(T); // 16 for float16 , 8 for float32 }; } // namespace operators } // namespace paddle namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_NPU_KERNEL( - set_value, ops::SetValueNPUKernel, - ops::SetValueNPUKernel) + +REGISTER_OP_NPU_KERNEL(set_value, ops::SetValueNPUKernel, +#ifdef PADDLE_WITH_ASCEND_INT64 + ops::SetValueNPUKernel, +#endif + ops::SetValueNPUKernel) diff --git a/python/paddle/fluid/tests/unittests/npu/test_set_value_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_set_value_op_npu.py index e819f422f2b441..421ea1df4cff09 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_set_value_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_set_value_op_npu.py @@ -23,13 +23,15 @@ import paddle.fluid as fluid from paddle.fluid import core -SEED = 2021 - class TestSetValueBase(unittest.TestCase): - def set_input(self): + def set_npu(self): + self.__class__.use_npu = True + self.place = paddle.NPUPlace(0) + + def setUp(self): + paddle.enable_static() self.set_npu() - paddle.device.set_device('npu') self.set_dtype() self.set_value() self.set_shape() @@ -51,9 +53,6 @@ def _call_setitem(self, x): def _get_answer(self): self.data[0, 0] = self.value - def set_npu(self): - self.__class__.use_npu = True - class TestSetValueApi(TestSetValueBase): def _run_static(self): @@ -62,13 +61,13 @@ def _run_static(self): x = paddle.ones(shape=self.shape, dtype=self.dtype) self._call_setitem(x) - exe = paddle.static.Executor(paddle.NPUPlace(0)) + exe = paddle.static.Executor(self.place) out = 
exe.run(self.program, fetch_list=[x]) paddle.disable_static() return out def _run_dynamic(self): - paddle.disable_static(paddle.NPUPlace(0)) + paddle.disable_static(self.place) x = paddle.ones(shape=self.shape, dtype=self.dtype) self._call_setitem(x) out = x.numpy() @@ -76,7 +75,6 @@ def _run_dynamic(self): return out def test_api(self): - self.set_input() static_out = self._run_static() dynamic_out = self._run_dynamic() self._get_answer() @@ -134,23 +132,22 @@ def _get_answer(self): self.data[0:, 1:2, :] = self.value -""" FIXEME : it seams that NPU don't support while operator ??? -class TestSetValueItemSliceInWhile(TestSetValueApi): - def _call_setitem(self, x): - def cond(i, x): - return i < 1 +# TODO(qili93): Fix this after NPU support while_loop +# class TestSetValueItemSliceInWhile(TestSetValueApi): +# def _call_setitem(self, x): +# def cond(i, x): +# return i < 1 - def body(i, x): - x[i] = self.value - i = i + 1 - return i, x - with paddle.static.device_guard("npu"): - i = paddle.zeros(shape=(1, ), dtype='int32') - i, x = paddle.fluid.layers.while_loop(cond, body, [i, x]) +# def body(i, x): +# x[i] = self.value +# i = i + 1 +# return i, x - def _get_answer(self): - self.data[0] = self.value -""" +# i = paddle.zeros(shape=(1, ), dtype='int32') +# i, x = paddle.fluid.layers.while_loop(cond, body, [i, x]) + +# def _get_answer(self): +# self.data[0] = self.value # 1.2.2 step > 1 @@ -192,6 +189,60 @@ def _get_answer(self): self.data[0:, 1:2:2, :] = self.value +# 1.2.3 step < 0 +class TestSetValueItemSliceNegetiveStep(TestSetValueApi): + def set_shape(self): + self.shape = [5, 2] + + def set_value(self): + self.value = np.array([3, 4]) + + def _call_setitem(self, x): + x[5:2:-1] = self.value + + def _get_answer(self): + self.data[5:2:-1] = self.value + + +class TestSetValueItemSliceNegetiveStep2(TestSetValueApi): + def set_shape(self): + self.shape = [5] + + def set_value(self): + self.value = np.array([3, 4]) + + def _call_setitem(self, x): + x[1::-1] = self.value + + def _get_answer(self): + self.data[1::-1] = self.value + + +class TestSetValueItemSliceNegetiveStep3(TestSetValueApi): + def set_shape(self): + self.shape = [3] + + def set_value(self): + self.value = np.array([3, 4, 5]) + + def _call_setitem(self, x): + x[::-1] = self.value + + def _get_answer(self): + self.data[::-1] = self.value + + +class TestSetValueItemSliceNegetiveStep4(TestSetValueApi): + def set_shape(self): + self.shape = [3, 4, 5] + + def _call_setitem(self, x): + x[2:0:-1, 0:2, ::-1] = self.value + + def _get_answer(self): + self.data[2:0:-1, 0:2, ::-1] = self.value + + # 1.3 item is Ellipsis @@ -277,6 +328,19 @@ def _get_answer(self): self.data[0:, 1:2:2, :] = self.value +class TestSetValueItemTensor6(TestSetValueApi): + def set_shape(self): + self.shape = [3, 4, 5] + + def _call_setitem(self, x): + minus1 = paddle.full([1], -1, dtype="int32") + zero = paddle.full([1], 0, dtype="int32") + x[2:zero:minus1, 0:2, 10:-6:minus1] = self.value + + def _get_answer(self): + self.data[2:0:-1, 0:2, ::-1] = self.value + + # 1.5 item is None class TestSetValueItemNone1(TestSetValueApi): def _call_setitem(self, x): @@ -350,133 +414,99 @@ def _get_answer(self): self.data[None, :, 1, ..., None] = np.zeros(self.shape)[0, 0, :, None] -""" FIXME : current NPU set_value don't support negative step !!! 
- @xiongkun03 +# 1.5 item is list or Tensor of bol +class TestSetValueItemBool1(TestSetValueApi): + def _call_setitem(self, x): + x[[True, False]] = self.value -class TestSetValueItemTensor6(TestSetValueApi): - def set_shape(self): - self.shape = [3, 4, 5] + def _get_answer(self): + self.data[[True, False]] = self.value + +class TestSetValueItemBool2(TestSetValueApi): def _call_setitem(self, x): - minus1 = paddle.full([1], -1, dtype="int32") - zero = paddle.full([1], 0, dtype="int32") - x[2:zero:minus1, 0:2, 10:-6:minus1] = self.value + x[[False, False]] = self.value def _get_answer(self): - self.data[2:0:-1, 0:2, ::-1] = self.value -""" + self.data[[False, False]] = self.value -# 2. Test different type of value: int, float, numpy.ndarray, Tensor -# 2.1 value is int32, int64, float32, float64, bool +class TestSetValueItemBool3(TestSetValueApi): + def _call_setitem(self, x): + x[[False, True]] = np.zeros(self.shape[2]) -def create_test_value_int32(parent): - class TestValueInt(parent): - def set_value(self): - self.value = 7 + def _get_answer(self): + self.data[[False, True]] = np.zeros(self.shape[2]) - def set_dtype(self): - self.dtype = "int32" - - cls_name = "{0}_{1}".format(parent.__name__, "ValueInt32") - TestValueInt.__name__ = cls_name - globals()[cls_name] = TestValueInt +class TestSetValueItemBool4(TestSetValueApi): + def _call_setitem(self, x): + idx = paddle.assign(np.array([False, True])) + x[idx] = np.zeros(self.shape[2]) -create_test_value_int32(TestSetValueItemInt) -create_test_value_int32(TestSetValueItemSlice) -create_test_value_int32(TestSetValueItemSlice2) -create_test_value_int32(TestSetValueItemSlice3) -create_test_value_int32(TestSetValueItemSlice4) + def _get_answer(self): + self.data[np.array([False, True])] = np.zeros(self.shape[2]) -def create_test_value_numpy_fp32(parent): - class TestValueInt(parent): - def set_value(self): - self.value = np.array([1]) +class TestSetValueItemBool5(TestSetValueApi): + def _call_setitem(self, x): + idx = paddle.assign( + np.array([[False, True, False], [True, True, False]])) + x[idx] = self.value - def set_dtype(self): - self.dtype = "float32" + def _get_answer(self): + self.data[np.array([[False, True, False], [True, True, False] + ])] = self.value - cls_name = "{0}_{1}".format(parent.__name__, "ValueNumpyFp32") - TestValueInt.__name__ = cls_name - globals()[cls_name] = TestValueInt +class TestSetValueItemBool6(TestSetValueApi): + def _call_setitem(self, x): + x[0, ...] = 0 + x[x > 0] = self.value -create_test_value_numpy_fp32(TestSetValueItemInt) -create_test_value_numpy_fp32(TestSetValueItemSlice) -create_test_value_numpy_fp32(TestSetValueItemSlice2) -create_test_value_numpy_fp32(TestSetValueItemSlice3) -create_test_value_numpy_fp32(TestSetValueItemSlice4) + def _get_answer(self): + self.data[0, ...] 
= 0 + self.data[self.data > 0] = self.value -def create_test_value_numpy_fp64(parent): +def create_test_value_int32(parent): class TestValueInt(parent): def set_value(self): - self.value = np.array([2**127]).astype("float64") - - def set_dtype(self): - self.dtype = "float64" - - cls_name = "{0}_{1}".format(parent.__name__, "ValueNumpyFp64") - TestValueInt.__name__ = cls_name - globals()[cls_name] = TestValueInt - - -create_test_value_numpy_fp64(TestSetValueItemInt) -create_test_value_numpy_fp64(TestSetValueItemSlice) -create_test_value_numpy_fp64(TestSetValueItemSlice2) -create_test_value_numpy_fp64(TestSetValueItemSlice3) -create_test_value_numpy_fp64(TestSetValueItemSlice4) - + self.value = 7 -# 2.3 value is a Paddle Tensor (int32, int64, float32, float64, bool) -def create_test_value_tensor_int32(parent): - class TestValueInt(parent): def set_dtype(self): self.dtype = "int32" - def _call_setitem(self, x): - value = paddle.full(shape=[1], fill_value=3, dtype=self.dtype) - x[0, 1] = value - - def _get_answer(self): - self.data[0, 1] = 3 - - cls_name = "{0}_{1}".format(parent.__name__, "ValueTensorInt32") + cls_name = "{0}_{1}".format(parent.__name__, "ValueInt32") TestValueInt.__name__ = cls_name globals()[cls_name] = TestValueInt -create_test_value_tensor_int32(TestSetValueItemInt) -create_test_value_tensor_int32(TestSetValueItemSlice) -create_test_value_tensor_int32(TestSetValueItemSlice2) -create_test_value_tensor_int32(TestSetValueItemSlice3) -create_test_value_tensor_int32(TestSetValueItemSlice4) +create_test_value_int32(TestSetValueItemInt) +create_test_value_int32(TestSetValueItemSlice) +create_test_value_int32(TestSetValueItemSlice2) +create_test_value_int32(TestSetValueItemSlice3) +create_test_value_int32(TestSetValueItemSlice4) -def create_test_value_tensor_int64(parent): +def create_test_value_int64(parent): class TestValueInt(parent): + def set_value(self): + self.value = 7 + def set_dtype(self): self.dtype = "int64" - def _call_setitem(self, x): - value = paddle.full(shape=[1], fill_value=3, dtype=self.dtype) - x[0, 1] = value - - def _get_answer(self): - self.data[0, 1] = 3 - - cls_name = "{0}_{1}".format(parent.__name__, "ValueTensorInt64") + cls_name = "{0}_{1}".format(parent.__name__, "ValueInt64") TestValueInt.__name__ = cls_name globals()[cls_name] = TestValueInt -create_test_value_tensor_int64(TestSetValueItemInt) -create_test_value_tensor_int64(TestSetValueItemSlice) -create_test_value_tensor_int64(TestSetValueItemSlice2) -create_test_value_tensor_int64(TestSetValueItemSlice3) -create_test_value_tensor_int64(TestSetValueItemSlice4) +create_test_value_int64(TestSetValueItemInt) +create_test_value_int64(TestSetValueItemSlice) +create_test_value_int64(TestSetValueItemSlice2) +create_test_value_int64(TestSetValueItemSlice3) +create_test_value_int64(TestSetValueItemSlice4) def create_test_value_tensor_fp32(parent): @@ -503,30 +533,6 @@ def _get_answer(self): create_test_value_tensor_fp32(TestSetValueItemSlice4) -def create_test_value_tensor_fp64(parent): - class TestValueInt(parent): - def set_dtype(self): - self.dtype = "float64" - - def _call_setitem(self, x): - value = paddle.full(shape=[1], fill_value=3, dtype=self.dtype) - x[0, 1] = value - - def _get_answer(self): - self.data[0, 1] = 3 - - cls_name = "{0}_{1}".format(parent.__name__, "ValueTensorFp64") - TestValueInt.__name__ = cls_name - globals()[cls_name] = TestValueInt - - -create_test_value_tensor_fp64(TestSetValueItemInt) -create_test_value_tensor_fp64(TestSetValueItemSlice) 
-create_test_value_tensor_fp64(TestSetValueItemSlice2) -create_test_value_tensor_fp64(TestSetValueItemSlice3) -create_test_value_tensor_fp64(TestSetValueItemSlice4) - - # 3. Test different shape of value class TestSetValueValueShape1(TestSetValueApi): def set_value(self): @@ -589,59 +595,5 @@ def _get_answer(self): self.data[:, 0] = self.value -# 4. Test error -class TestError(TestSetValueBase): - def _value_type_error(self): - with self.assertRaisesRegexp( - TypeError, - "Only support to assign an integer, float, numpy.ndarray or paddle.Tensor" - ): - x = paddle.ones(shape=self.shape, dtype=self.dtype) - value = [1] - x[0] = value - - def _dtype_error(self): - with self.assertRaisesRegexp( - TypeError, - "When assign a numpy.ndarray, integer or float to a paddle.Tensor, " - ): - y = paddle.ones(shape=self.shape, dtype="float16") - y[0] = 1 - - def _step_error(self): - with self.assertRaisesRegexp(ValueError, "step can not be 0"): - x = paddle.ones(shape=self.shape, dtype=self.dtype) - x[0:1:0] = self.value - - def _ellipsis_error(self): - with self.assertRaisesRegexp( - IndexError, "An index can only have a single ellipsis"): - x = paddle.ones(shape=self.shape, dtype=self.dtype) - x[..., ...] = self.value - with self.assertRaisesRegexp(ValueError, "the start or end is None"): - x = paddle.ones(shape=self.shape, dtype=self.dtype) - one = paddle.ones([1]) - x[::one] = self.value - - def _broadcast_mismatch(self): - program = paddle.static.Program() - with paddle.static.program_guard(program): - x = paddle.ones(shape=self.shape, dtype=self.dtype) - value = np.array([3, 4, 5, 6, 7]) - x[0] = value - exe = paddle.static.Executor(paddle.CPUPlace()) - with self.assertRaises(ValueError): - exe.run(program) - - def test_error(self): - self.set_input() - paddle.enable_static() - with paddle.static.program_guard(self.program): - self._value_type_error() - self._dtype_error() - self._step_error() - self._broadcast_mismatch() - - if __name__ == '__main__': unittest.main() From 7850f7ce0ac70cb52dd071579aea64cdd235efd5 Mon Sep 17 00:00:00 2001 From: Qi Li Date: Mon, 11 Oct 2021 14:12:56 +0800 Subject: [PATCH 094/298] [NPU] fix matmul_v2 and utils.run_check, test=develop (#36164) * [NPU] fix matmul_v2 and utils.run_check, test=develop * remove debug files, test=develop * fix install_check, test=develop * fix doc, test=develop * fix review comments, test=develop --- paddle/fluid/operators/matmul_v2_op_npu.cc | 477 ++++++++++++----- python/paddle/fluid/framework.py | 70 +++ .../fluid/tests/unittests/npu/CMakeLists.txt | 1 + .../unittests/npu/test_matmulv2_op_npu.py | 504 +++++++++++------- python/paddle/static/__init__.py | 2 + python/paddle/utils/install_check.py | 58 +- 6 files changed, 768 insertions(+), 344 deletions(-) diff --git a/paddle/fluid/operators/matmul_v2_op_npu.cc b/paddle/fluid/operators/matmul_v2_op_npu.cc index b23b408e9c59a7..6d7e8f3478c848 100644 --- a/paddle/fluid/operators/matmul_v2_op_npu.cc +++ b/paddle/fluid/operators/matmul_v2_op_npu.cc @@ -21,166 +21,387 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -template +using Tensor = framework::Tensor; +using NPUDeviceContext = platform::NPUDeviceContext; + +template +static void MatMul2D(const framework::ExecutionContext& ctx, + const aclrtStream& stream, const Tensor& X, + const Tensor& Y, Tensor* Out, const bool trans_x, + const bool trans_y) { + Out->mutable_data(ctx.GetPlace()); + const auto& runner = + NpuOpRunner("MatMul", {X, Y}, {*Out}, + {{"transpose_x1", trans_x}, {"transpose_x2", trans_y}}); + runner.Run(stream); +} + +template +static void MatMulND(const framework::ExecutionContext& ctx, + const aclrtStream& stream, const Tensor& X, + const Tensor& Y, Tensor* Out, const bool trans_x, + const bool trans_y) { + Out->mutable_data(ctx.GetPlace()); + const auto& runner = NpuOpRunner("BatchMatMul", {X, Y}, {*Out}, + {{"adj_x1", trans_x}, {"adj_x2", trans_y}}); + runner.Run(stream); +} + +template +static void ReduceDims(const framework::ExecutionContext& ctx, + const aclrtStream& stream, + const std::vector& dims, + const std::vector& brd_dims, const Tensor& in, + Tensor* out) { + std::vector axes; + int64_t size = brd_dims.size(); + int64_t diff = brd_dims.size() - dims.size(); + for (int64_t i = 0; i < size; ++i) { + if (i < diff) { + axes.push_back(i); + continue; + } + if (brd_dims[i] > dims[i - diff]) { + axes.push_back(i); + } + } + out->mutable_data(ctx.GetPlace()); + const auto& runner = NpuOpRunner("ReduceSumD", {in}, {*out}, + {{"axes", axes}, {"keep_dims", false}}); + runner.Run(stream); +} + +template class MatMulV2NPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* out = ctx.Output("Out"); - bool transpose_x = ctx.Attr("trans_x"); - bool transpose_y = ctx.Attr("trans_y"); - - if (x->dims().size() == 2) { - out->mutable_data(ctx.GetPlace()); - - const auto& runner = NpuOpRunner( - "MatMul", {*x, *y}, {*out}, - {{"transpose_x1", transpose_x}, {"transpose_x2", transpose_y}}); - - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); + auto* X = ctx.Input("X"); + auto* Y = ctx.Input("Y"); + auto* Out = ctx.Output("Out"); + const bool trans_x = ctx.Attr("trans_x"); + const bool trans_y = ctx.Attr("trans_y"); + + std::vector x_dims = framework::vectorize(X->dims()); + std::vector y_dims = framework::vectorize(Y->dims()); + std::vector out_dims = framework::vectorize(Out->dims()); + int x_ndim = x_dims.size(); + int y_ndim = y_dims.size(); + int out_ndim = out_dims.size(); - } else if (x->dims().size() > 2) { - out->mutable_data(ctx.GetPlace()); + auto stream = ctx.template device_context().stream(); - const auto& runner = - NpuOpRunner("BatchMatMul", {*x, *y}, {*out}, - {{"adj_x1", transpose_x}, {"adj_x2", transpose_y}}); + // Case 1: [K] x [K] = [1] + if (x_ndim == 1 && y_ndim == 1) { + PADDLE_ENFORCE_EQ( + X->numel(), Y->numel(), + platform::errors::InvalidArgument( + "X's numbers must be equal to Y's numbers," + "when X/Y's dims =1. 
But received X has [%d] elements," + "received Y has [%d] elements", + X->numel(), Y->numel())); + Out->Resize({1}); + Out->mutable_data(ctx.GetPlace()); - auto stream = - ctx.template device_context() - .stream(); + const auto& runner = NpuOpRunner("Dot", {*X, *Y}, {*Out}); runner.Run(stream); + return; + } + + // Resize dim 1 to 2 + Tensor x_temp, y_temp; + x_temp.ShareDataWith(*X); + y_temp.ShareDataWith(*Y); + if (x_ndim == 1) { + x_dims.insert(x_dims.begin(), 1); + out_dims.insert(out_dims.end() - 1, 1); + x_temp.Resize(framework::make_ddim(x_dims)); + x_ndim = 2; + out_ndim += 1; + } + if (y_ndim == 1) { + y_dims.push_back(1); + out_dims.push_back(1); + y_temp.Resize(framework::make_ddim(y_dims)); + y_ndim = 2; + out_ndim += 1; + } + + const int K = trans_x ? x_dims[x_ndim - 2] : x_dims[x_ndim - 1]; + if (trans_y) { + PADDLE_ENFORCE_EQ(y_dims[y_ndim - 1], K, + platform::errors::InvalidArgument( + "Input(Y) has error dim." + "Y'dims[%d] must be equal to %d" + "But received Y'dims[%d] is %d", + y_ndim - 1, K, y_ndim - 1, y_dims[y_ndim - 1])); + } else { + PADDLE_ENFORCE_EQ(y_dims[y_ndim - 2], K, + platform::errors::InvalidArgument( + "Input(Y) has error dim." + "Y'dims[%d] must be equal to %d" + "But received Y'dims[%d] is %d", + y_ndim - 2, K, y_ndim - 2, y_dims[y_ndim - 2])); } + + // Case 2: [M, K] x [K, N] = [M, N] + if (x_ndim == 2 && y_ndim == 2) { + MatMul2D(ctx, stream, x_temp, y_temp, Out, trans_x, trans_y); + return; + } + + // Case 3: [B, M, K] x [K, N] = [B, M, N], when trans_x = false + // Equal: [B * M, K] x [K, N] = [B * M, N] => [B, M, N] + if (trans_x == false && y_ndim == 2) { + std::vector vec_dim = {x_temp.numel() / K, K}; + x_temp.Resize(framework::make_ddim(vec_dim)); + MatMul2D(ctx, stream, x_temp, y_temp, Out, trans_x, trans_y); + return; + } + + // Case 4: [B, M, K] x [B, K, N] = [B, M, N] + std::vector x_broadcast_dims(out_ndim, 1); + std::vector y_broadcast_dims(out_ndim, 1); + std::copy(out_dims.begin(), out_dims.end() - 2, x_broadcast_dims.begin()); + std::copy(out_dims.begin(), out_dims.end() - 2, y_broadcast_dims.begin()); + std::copy(x_dims.end() - 2, x_dims.end(), x_broadcast_dims.end() - 2); + std::copy(y_dims.end() - 2, y_dims.end(), y_broadcast_dims.end() - 2); + + Tensor x_temp_brd(X->type()); + if (x_dims == x_broadcast_dims) { + x_temp_brd.ShareDataWith(*X); + x_temp_brd.Resize(framework::make_ddim(x_broadcast_dims)); + } else { + x_temp_brd.Resize(framework::make_ddim(x_broadcast_dims)); + x_temp_brd.mutable_data(ctx.GetPlace()); + NpuOpRunner runner_brd; + runner_brd.SetType("BroadcastTo") + .AddInput(x_temp) + .AddInput(std::move(x_broadcast_dims)) + .AddOutput(x_temp_brd) + .Run(stream); + } + + Tensor y_temp_brd(Y->type()); + if (y_dims == y_broadcast_dims) { + y_temp_brd.ShareDataWith(*Y); + y_temp_brd.Resize(framework::make_ddim(y_broadcast_dims)); + } else { + y_temp_brd.Resize(framework::make_ddim(y_broadcast_dims)); + y_temp_brd.mutable_data(ctx.GetPlace()); + NpuOpRunner runner_brd; + runner_brd.SetType("BroadcastTo") + .AddInput(y_temp) + .AddInput(std::move(y_broadcast_dims)) + .AddOutput(y_temp_brd) + .Run(stream); + } + MatMulND(ctx, stream, x_temp_brd, y_temp_brd, Out, trans_x, trans_y); } }; -template +template class MatMulV2GradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = 
ctx.Output(framework::GradVarName("Y")); - bool transpose_y = ctx.Attr("trans_y"); - auto stream = - ctx.template device_context() - .stream(); - - if (x->dims().size() == 2) { - if (transpose_y) { - if (dx) { - dx->mutable_data(ctx.GetPlace()); - const auto& runner_dx = - NpuOpRunner("MatMul", {*dout, *y}, {*dx}, - {{"transpose_x1", false}, {"transpose_x2", false}}); - - runner_dx.Run(stream); - } - if (dy) { - dy->mutable_data(ctx.GetPlace()); - const auto& runner_dy = - NpuOpRunner("MatMul", {*dout, *x}, {*dy}, - {{"transpose_x1", true}, {"transpose_x2", false}}); + auto* X = ctx.Input("X"); + auto* Y = ctx.Input("Y"); + auto* dOut = ctx.Input(framework::GradVarName("Out")); + auto* dX = ctx.Output(framework::GradVarName("X")); + auto* dY = ctx.Output(framework::GradVarName("Y")); + const bool trans_x = ctx.Attr("trans_x"); + const bool trans_y = ctx.Attr("trans_y"); - runner_dy.Run(stream); - } + std::vector x_dims = framework::vectorize(X->dims()); + std::vector y_dims = framework::vectorize(Y->dims()); + std::vector out_dims = framework::vectorize(dOut->dims()); + int x_ndim = x_dims.size(); + int y_ndim = y_dims.size(); + int out_ndim = out_dims.size(); - } else { - if (dx) { - dx->mutable_data(ctx.GetPlace()); - const auto& runner_dx = - NpuOpRunner("MatMul", {*dout, *y}, {*dx}, - {{"transpose_x1", false}, {"transpose_x2", true}}); + auto stream = ctx.template device_context().stream(); - runner_dx.Run(stream); - } - if (dy) { - dy->mutable_data(ctx.GetPlace()); - const auto& runner_dy = - NpuOpRunner("MatMul", {*x, *dout}, {*dy}, - {{"transpose_x1", true}, {"transpose_x2", false}}); + // Case 1: [K] x [K] = [1] + if (x_ndim == 1 && y_ndim == 1) { + Tensor dout_temp(dOut->type()); + dout_temp.Resize(X->dims()); + dout_temp.mutable_data(ctx.GetPlace()); + NpuOpRunner runner; + runner.SetType("BroadcastTo") + .AddInput(*dOut) + .AddInput(std::move(x_dims)) + .AddOutput(dout_temp) + .Run(stream); - runner_dy.Run(stream); + if (dX) { + dX->mutable_data(ctx.GetPlace()); + const auto& runner_dx = NpuOpRunner("Mul", {dout_temp, *Y}, {*dX}, {}); + runner_dx.Run(stream); + } + if (dY) { + dY->mutable_data(ctx.GetPlace()); + const auto& runner_dy = NpuOpRunner("Mul", {dout_temp, *X}, {*dY}, {}); + runner_dy.Run(stream); + } + return; + } + + // Resize dim 1 to 2 + Tensor x_temp, y_temp, dout_temp; + x_temp.ShareDataWith(*X); + y_temp.ShareDataWith(*Y); + dout_temp.ShareDataWith(*dOut); + if (x_ndim == 1) { + x_dims.insert(x_dims.begin(), 1); + out_dims.insert(out_dims.end() - 1, 1); + x_temp.Resize(framework::make_ddim(x_dims)); + dout_temp.Resize(framework::make_ddim(out_dims)); + x_ndim = 2; + out_ndim += 1; + } + if (y_ndim == 1) { + y_dims.push_back(1); + out_dims.push_back(1); + y_temp.Resize(framework::make_ddim(y_dims)); + dout_temp.Resize(framework::make_ddim(out_dims)); + y_ndim = 2; + out_ndim += 1; + } + + // Case 2: [M, K] x [K, N] = [M, N] + if (out_ndim == 2) { + if (dX) { + dX->Resize(framework::make_ddim(x_dims)); + if (trans_x) { + MatMul2D(ctx, stream, y_temp, dout_temp, dX, trans_y, true); + } else { + MatMul2D(ctx, stream, dout_temp, y_temp, dX, false, !trans_y); } + dX->Resize(X->dims()); } - } else if (x->dims().size() > 2) { - if (transpose_y) { - if (dx) { - dx->mutable_data(ctx.GetPlace()); - const auto& runner_dx = - NpuOpRunner("BatchMatMul", {*dout, *y}, {*dx}, - {{"adj_x1", false}, {"adj_x2", false}}); - - runner_dx.Run(stream); + if (dY) { + dY->Resize(framework::make_ddim(y_dims)); + if (trans_y) { + MatMul2D(ctx, stream, dout_temp, x_temp, dY, true, trans_x); 
+ } else { + MatMul2D(ctx, stream, x_temp, dout_temp, dY, !trans_x, false); } - if (dy) { - dy->mutable_data(ctx.GetPlace()); - const auto& runner_dy = - NpuOpRunner("BatchMatMul", {*dout, *x}, {*dy}, - {{"adj_x1", true}, {"adj_x2", false}}); + dY->Resize(Y->dims()); + } + return; + } + + const int K = trans_x ? x_dims[x_ndim - 2] : x_dims[x_ndim - 1]; + const int N = trans_y ? y_dims[y_ndim - 2] : y_dims[y_ndim - 1]; - runner_dy.Run(stream); + // Case 3: [B, M, K] x [K, N] = [B, M, N], when trans_x = false + // Equal: [B * M, K] x [K, N] = [B * M, N] => [B, M, N] + if (trans_x == false && y_ndim == 2) { + std::vector x_vec_dim = {x_temp.numel() / K, K}; + dout_temp.Resize( + framework::make_ddim(std::vector{dout_temp.numel() / N, N})); + if (dX) { + dX->Resize(framework::make_ddim(x_vec_dim)); + MatMul2D(ctx, stream, dout_temp, y_temp, dX, false, !trans_y); + dX->Resize(X->dims()); + } + if (dY) { + x_temp.Resize(framework::make_ddim(x_vec_dim)); + if (trans_y) { + MatMul2D(ctx, stream, dout_temp, x_temp, dY, true, false); + } else { + MatMul2D(ctx, stream, x_temp, dout_temp, dY, true, false); } - } else { - if (dx) { - dx->mutable_data(ctx.GetPlace()); - const auto& runner_dx = - NpuOpRunner("BatchMatMul", {*dout, *y}, {*dx}, - {{"adj_x1", false}, {"adj_x2", true}}); + } + return; + } + + // Case 4: [B, M, K] x [B, K, N] = [B, M, N] + std::vector x_broadcast_dims(out_ndim, 1); + std::vector y_broadcast_dims(out_ndim, 1); + std::copy(out_dims.begin(), out_dims.end() - 2, x_broadcast_dims.begin()); + std::copy(out_dims.begin(), out_dims.end() - 2, y_broadcast_dims.begin()); + std::copy(x_dims.end() - 2, x_dims.end(), x_broadcast_dims.end() - 2); + std::copy(y_dims.end() - 2, y_dims.end(), y_broadcast_dims.end() - 2); + + Tensor x_temp_brd(X->type()); + if (x_dims == x_broadcast_dims) { + x_temp_brd.ShareDataWith(*X); + x_temp_brd.Resize(framework::make_ddim(x_broadcast_dims)); + } else { + x_temp_brd.Resize(framework::make_ddim(x_broadcast_dims)); + x_temp_brd.mutable_data(ctx.GetPlace()); + NpuOpRunner runner_brd; + runner_brd.SetType("BroadcastTo") + .AddInput(x_temp) + .AddInput(std::move(x_broadcast_dims)) + .AddOutput(x_temp_brd) + .Run(stream); + } - runner_dx.Run(stream); + Tensor y_temp_brd(Y->type()); + if (y_dims == y_broadcast_dims) { + y_temp_brd.ShareDataWith(*Y); + y_temp_brd.Resize(framework::make_ddim(y_broadcast_dims)); + } else { + y_temp_brd.Resize(framework::make_ddim(y_broadcast_dims)); + y_temp_brd.mutable_data(ctx.GetPlace()); + NpuOpRunner runner_brd; + runner_brd.SetType("BroadcastTo") + .AddInput(y_temp) + .AddInput(std::move(y_broadcast_dims)) + .AddOutput(y_temp_brd) + .Run(stream); + } + + if (dX) { + if (x_dims == x_broadcast_dims) { + if (trans_x) { + MatMulND(ctx, stream, y_temp_brd, dout_temp, dX, trans_y, true); + } else { + MatMulND(ctx, stream, dout_temp, y_temp_brd, dX, false, !trans_y); } - if (dy) { - dy->mutable_data(ctx.GetPlace()); - if ((x->dims().size() == 3) && (dout->dims().size() == 3) && - (dy->dims().size() == 2)) { - framework::Tensor dout_tmp; - dout_tmp.ShareDataWith(*dout); - std::vector vec_dim = - framework::vectorize(dout_tmp.dims()); - std::vector vec_dim_v{vec_dim[0] * vec_dim[1], vec_dim[2]}; - dout_tmp.Resize(framework::make_ddim(vec_dim_v)); - - framework::Tensor x_tmp; - x_tmp.ShareDataWith(*x); - std::vector vec_dim_x = - framework::vectorize(x_tmp.dims()); - std::vector vec_dim_x_v{vec_dim_x[0] * vec_dim_x[1], - vec_dim_x[2]}; - x_tmp.Resize(framework::make_ddim(vec_dim_x_v)); - const auto& runner_dy = - NpuOpRunner("MatMul", 
{x_tmp, dout_tmp}, {*dy}, - {{"transpose_x1", true}, {"transpose_x2", false}}); - runner_dy.Run(stream); - } else { - const auto& runner_dy = - NpuOpRunner("BatchMatMul", {*x, *dout}, {*dy}, - {{"adj_x1", true}, {"adj_x2", false}}); - runner_dy.Run(stream); - } + } else { + Tensor dx_temp(X->type()); + dx_temp.Resize(framework::make_ddim(x_broadcast_dims)); + if (trans_x) { + MatMulND(ctx, stream, y_temp_brd, dout_temp, &dx_temp, trans_y, + true); + } else { + MatMulND(ctx, stream, dout_temp, y_temp_brd, &dx_temp, false, + !trans_y); } + ReduceDims(ctx, stream, x_dims, x_broadcast_dims, dx_temp, dX); + } + } + if (dY) { + if (y_dims == y_broadcast_dims) { + if (trans_y) { + MatMulND(ctx, stream, dout_temp, x_temp_brd, dY, true, trans_x); + } else { + MatMulND(ctx, stream, x_temp_brd, dout_temp, dY, !trans_x, false); + } + } else { + Tensor dy_temp(Y->type()); + dy_temp.Resize(framework::make_ddim(y_broadcast_dims)); + if (trans_y) { + MatMulND(ctx, stream, dout_temp, x_temp_brd, &dy_temp, true, + trans_x); + } else { + MatMulND(ctx, stream, x_temp_brd, dout_temp, &dy_temp, !trans_x, + false); + } + ReduceDims(ctx, stream, y_dims, y_broadcast_dims, dy_temp, dY); } } } }; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_NPU_KERNEL( - matmul_v2, - ops::MatMulV2NPUKernel, - ops::MatMulV2NPUKernel); -REGISTER_OP_NPU_KERNEL( - matmul_v2_grad, - ops::MatMulV2GradNPUKernel, - ops::MatMulV2GradNPUKernel); +REGISTER_OP_NPU_KERNEL(matmul_v2, ops::MatMulV2NPUKernel, + ops::MatMulV2NPUKernel); +REGISTER_OP_NPU_KERNEL(matmul_v2_grad, ops::MatMulV2GradNPUKernel, + ops::MatMulV2GradNPUKernel); diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 7f2937b9af7643..4d90b9159470eb 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -55,6 +55,7 @@ 'is_compiled_with_cuda', 'is_compiled_with_rocm', 'is_compiled_with_xpu', + 'is_compiled_with_npu', 'Variable', 'require_version', 'device_guard', @@ -380,6 +381,15 @@ def _xpu_ids(): return device_ids +def _npu_ids(): + npus_env = os.getenv("FLAGS_selected_npus") + if npus_env: + device_ids = [int(s) for s in npus_env.split(",")] + else: + device_ids = six.moves.range(core.get_npu_device_count()) + return device_ids + + def is_compiled_with_xpu(): """ Whether this whl package can be used to run the model on XPU. @@ -395,6 +405,21 @@ def is_compiled_with_xpu(): return core.is_compiled_with_xpu() +def is_compiled_with_npu(): + """ + Whether this whl package can be used to run the model on NPU. + + Returns (bool): support npu or not. + + Examples: + .. code-block:: python + + import paddle.fluid as fluid + support_npu = fluid.is_compiled_with_npu() + """ + return core.is_compiled_with_npu() + + def disable_signal_handler(): """ Reset signal handler registered by Paddle. @@ -538,6 +563,47 @@ def xpu_places(device_ids=None): return [core.XPUPlace(dev_id) for dev_id in device_ids] +def npu_places(device_ids=None): + """ + **Note**: + For multi-card tasks, please use `FLAGS_selected_npus` environment variable to set the visible NPU device. + + This function creates a list of :code:`paddle.NPUPlace` objects. + If :code:`device_ids` is None, environment variable of + :code:`FLAGS_selected_npus` would be checked first. For example, if + :code:`FLAGS_selected_npus=0,1,2`, the returned list would + be [paddle.NPUPlace(0), paddle.NPUPlace(1), paddle.NPUPlace(2)]. + If :code:`FLAGS_selected_npus` is not set, all visible + npu places would be returned. 
+ If :code:`device_ids` is not None, it should be the device + ids of NPUs. For example, if :code:`device_ids=[0,1,2]`, + the returned list would be + [paddle.NPUPlace(0), paddle.NPUPlace(1), paddle.NPUPlace(2)]. + + Parameters: + device_ids (list or tuple of int, optional): list of NPU device ids. + Returns: + list of paddle.NPUPlace: Created NPU place list. + Examples: + .. code-block:: python + + # required: npu + + import paddle + import paddle.static as static + + paddle.enable_static() + npu_places = static.npu_places() + """ + assert core.is_compiled_with_npu(), \ + "Not compiled with NPU" + if device_ids is None: + device_ids = _npu_ids() + elif not isinstance(device_ids, (list, tuple)): + device_ids = [device_ids] + return [core.NPUPlace(dev_id) for dev_id in device_ids] + + def cpu_places(device_count=None): """ This function creates a list of :code:`paddle.CPUPlace` objects, and returns the created list. @@ -1927,6 +1993,10 @@ def set_value(self, value, scope=None): p = core.Place() p.set_place(t._place()) place = core.XPUPlace(p.xpu_device_id()) + elif p.is_npu_place(): + p = core.Place() + p.set_place(t._place()) + place = core.NPUPlace(p.npu_device_id()) else: p = core.Place() p.set_place(t._place()) diff --git a/python/paddle/fluid/tests/unittests/npu/CMakeLists.txt b/python/paddle/fluid/tests/unittests/npu/CMakeLists.txt index 44b3c6764a7cfa..4e81bb9544ceb9 100644 --- a/python/paddle/fluid/tests/unittests/npu/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/npu/CMakeLists.txt @@ -20,4 +20,5 @@ if (WITH_ASCEND_CL) set_tests_properties(test_stack_op_npu PROPERTIES TIMEOUT 300) set_tests_properties(test_conv2d_transpose_op_npu PROPERTIES TIMEOUT 200) set_tests_properties(test_conv2d_op_npu PROPERTIES TIMEOUT 300) + set_tests_properties(test_matmulv2_op_npu PROPERTIES TIMEOUT 300) endif() diff --git a/python/paddle/fluid/tests/unittests/npu/test_matmulv2_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_matmulv2_op_npu.py index 53766c5eb61b7a..882043ef6eb911 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_matmulv2_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_matmulv2_op_npu.py @@ -21,56 +21,35 @@ from op_test import OpTest import paddle import paddle.fluid as fluid +from test_matmul_v2_op import reference_matmul paddle.enable_static() SEED = 2021 -def reference_matmul(X, Y, transpose_X=False, transpose_Y=False): - """Reference forward implementation using np.matmul.""" - # np.matmul does not support the transpose flags, so we manually - # transpose X and Y appropriately. - if transpose_X: - if X.ndim == 1: - X = X.reshape((X.size)) - elif X.ndim == 2: - X = X.T - else: - dim = [i for i in range(len(X.shape))] - dim[-1], dim[len(X.shape) - 2] = dim[len(X.shape) - 2], dim[-1] - X = np.transpose(X, tuple(dim)) - if transpose_Y: - if Y.ndim == 1: - Y = Y.reshape((Y.size)) - else: - dim = [i for i in range(len(Y.shape))] - dim[-1], dim[len(Y.shape) - 2] = dim[len(Y.shape) - 2], dim[-1] - Y = np.transpose(Y, tuple(dim)) - - Out = np.matmul(X, Y) - if not Out.shape: - # We do not support 0-dimensional Tensors (scalars). So where - # np.matmul outputs a scalar, we must convert to a Tensor of - # shape (1) instead. - # Everywhere else, we are compatible with np.matmul. 
- Out = np.array([Out], dtype="float64") - return Out - - -class TestMatMul(OpTest): +class TestMatMulV2Op(OpTest): + """ + case 1 + """ + + def set_npu(self): + self.__class__.use_npu = True + self.place = paddle.NPUPlace(0) + def config(self): - self.x_shape = (100, 24) - self.y_shape = (24, 100) + self.x_shape = (100, ) + self.y_shape = (100, ) self.trans_x = False self.trans_y = False + def init_kernel_type(self): + self.dtype = "float32" + def setUp(self): self.set_npu() - self.op_type = "matmul_v2" - self.place = paddle.NPUPlace(0) - self.init_dtype() + self.init_kernel_type() self.config() - np.random.seed(SEED) + self.op_type = "matmul_v2" x = np.random.random(self.x_shape).astype(self.dtype) y = np.random.random(self.y_shape).astype(self.dtype) # -0.1 ~ 0.1 @@ -85,201 +64,314 @@ def setUp(self): self.attrs = {'trans_x': self.trans_x, 'trans_y': self.trans_y} self.outputs = {'Out': result} - def set_npu(self): - self.__class__.use_npu = True - self.__class__.no_need_check_grad = True - - def init_dtype(self): - self.dtype = np.float32 - def test_check_output(self): - self.check_output_with_place(self.place, atol=1e-5) + self.check_output_with_place(self.place, atol=1e-7) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ['X', 'Y'], 'Out') - # TODO(ascendrc): Add grad test - # def test_check_grad(self): - # if self.dtype == np.float16: - # return - # self.check_grad(['X'], 'Out') - # -class TestMatMul2(TestMatMul): +class TestMatMuklOp2(TestMatMulV2Op): """ case 2 """ def config(self): - self.x_shape = (32, 24) - self.y_shape = (32, 24) + self.x_shape = (100, ) + self.y_shape = (1, 3, 2, 100) self.trans_x = False self.trans_y = True -class TestMatMul3(TestMatMul): +class TestMatMuklOp3(TestMatMulV2Op): """ case 3 """ - def init_dtype(self): - self.dtype = np.float16 + def config(self): + self.x_shape = (100, ) + self.y_shape = (1, 1, 100, 2) + self.trans_x = False + self.trans_y = False -class TestMatMul4(TestMatMul): +class TestMatMuklOp4(TestMatMulV2Op): """ - case 4 dim=3 + case 4 """ def config(self): - self.x_shape = (2, 3, 4) - self.y_shape = (2, 4, 3) + self.x_shape = (100, ) + self.y_shape = (1, 2, 100, 2) self.trans_x = False self.trans_y = False -class TestMatMulNet(unittest.TestCase): - def _test(self, run_npu=True): - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = SEED - startup_prog.random_seed = SEED - np.random.seed(SEED) - - a_np = np.random.random(size=(2, 3)).astype('float32') - b_np = np.random.random(size=(2, 3)).astype('float32') - c_np = np.random.random(size=(3, 2)).astype('float32') - d_np = np.random.random(size=(3, 2)).astype('float32') - label_np = np.random.randint(2, size=(2, 1)).astype('int64') - - with paddle.static.program_guard(main_prog, startup_prog): - a = paddle.static.data(name="a", shape=[2, 3], dtype='float32') - b = paddle.static.data(name="b", shape=[2, 3], dtype='float32') - c = paddle.static.data(name="c", shape=[3, 2], dtype='float32') - d = paddle.static.data(name="d", shape=[3, 2], dtype='float32') - label = paddle.static.data( - name="label", shape=[2, 1], dtype='int64') - - sum_1 = paddle.add(a, b) - sum_2 = paddle.add(c, d) - result = paddle.matmul(sum_1, sum_2) - - fc_1 = fluid.layers.fc(input=result, size=8) - prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax') - - cost = fluid.layers.cross_entropy(input=prediction, label=label) - loss = fluid.layers.reduce_mean(cost) - sgd = fluid.optimizer.SGD(learning_rate=0.01) - sgd.minimize(loss) - - if 
run_npu: - place = paddle.NPUPlace(0) - else: - place = paddle.CPUPlace() - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - print("Start run on {}".format(place)) - for epoch in range(100): - - pred_res, loss_res = exe.run(main_prog, - feed={ - "a": a_np, - "b": b_np, - "c": c_np, - "d": d_np, - "label": label_np - }, - fetch_list=[prediction, loss]) - if epoch % 10 == 0: - print("Epoch {} | Prediction[0]: {}, Loss: {}".format( - epoch, pred_res[0], loss_res)) - - return pred_res, loss_res - - def test_npu(self): - cpu_pred, cpu_loss = self._test(False) - npu_pred, npu_loss = self._test(True) - - self.assertTrue(np.allclose(npu_pred, cpu_pred)) - self.assertTrue(np.allclose(npu_loss, cpu_loss)) - - -# The precision is aligned in NPU and GPU separately, which is only used for the usage method. - - -class TestMatMulNet3_2(unittest.TestCase): - def _test(self, run_npu=True): - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = SEED - startup_prog.random_seed = SEED - np.random.seed(SEED) - self._dtype = "float32" - - a_np = np.random.random(size=(2, 1, 3)).astype(self._dtype) - b_np = np.random.random(size=(2, 1, 3)).astype(self._dtype) - c_np = np.random.random(size=(3, 2)).astype(self._dtype) - d_np = np.random.random(size=(3, 2)).astype(self._dtype) - label_np = np.random.randint(2, size=(2, 1)).astype('int64') - - with paddle.static.program_guard(main_prog, startup_prog): - a = paddle.static.data(name="a", shape=[2, 1, 3], dtype=self._dtype) - b = paddle.static.data(name="b", shape=[2, 1, 3], dtype=self._dtype) - c = paddle.static.data(name="c", shape=[3, 2], dtype=self._dtype) - d = paddle.static.data(name="d", shape=[3, 2], dtype=self._dtype) - label = paddle.static.data( - name="label", shape=[2, 1], dtype='int64') - - sum_1 = paddle.add(a, b) - sum_2 = paddle.add(c, d) - sum_1 = paddle.cast(sum_1, 'float16') - sum_2 = paddle.cast(sum_2, 'float16') - if not run_npu: - sum_1 = paddle.cast(sum_1, 'float32') - sum_2 = paddle.cast(sum_2, 'float32') - - result = paddle.matmul(sum_1, sum_2) - if run_npu: - result = paddle.cast(result, 'float32') - - result = paddle.reshape(result, shape=[2, 2]) - fc_1 = fluid.layers.fc(input=result, size=8) - prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax') - - cost = fluid.layers.cross_entropy(input=prediction, label=label) - loss = fluid.layers.reduce_mean(cost) - sgd = fluid.optimizer.SGD(learning_rate=0.01) - sgd.minimize(loss) - - if run_npu: +class TestMatMuklOp5(TestMatMulV2Op): + """ + case 5 + """ + + def config(self): + self.x_shape = (1, 1, 100, 1) + self.y_shape = (100, ) + self.trans_x = True + self.trans_y = False + + +class TestMatMuklOp6(TestMatMulV2Op): + """ + case 6 + """ + + def config(self): + self.x_shape = (1, 2, 102, 1) + self.y_shape = (102, ) + self.trans_x = True + self.trans_y = False + + +class TestMatMuklOp7(TestMatMulV2Op): + """ + case 7 + """ + + def config(self): + self.x_shape = (1, 2, 1, 100) + self.y_shape = (100, ) + self.trans_x = False + self.trans_y = False + + +class TestMatMuklOp8(TestMatMulV2Op): + """ + case 8 + """ + + def config(self): + self.x_shape = (1, 1, 2, 100) + self.y_shape = (1, 1, 100, 2) + self.trans_x = False + self.trans_y = False + + +class TestMatMuklOp9(TestMatMulV2Op): + """ + case 9 + """ + + def config(self): + self.x_shape = (1, 1, 1, 100) + self.y_shape = (2, 1, 2, 100) + self.trans_x = False + self.trans_y = True + + +class TestMatMuklOp10(TestMatMulV2Op): + """ + case 10 + """ + + def config(self): + 
self.x_shape = (1, 1, 25, 4) + self.y_shape = (1, 2, 4, 25) + self.trans_x = False + self.trans_y = False + + +class TestMatMuklOp11(TestMatMulV2Op): + """ + case 11 + """ + + def config(self): + self.x_shape = (2, 1, 2, 100) + self.y_shape = (1, 1, 100, 2) + self.trans_x = False + self.trans_y = False + + +class TestMatMuklOp12(TestMatMulV2Op): + """ + case 12 + """ + + def config(self): + self.x_shape = (2, 1, 4, 25) + self.y_shape = (1, 1, 4, 25) + self.trans_x = True + self.trans_y = False + + +class TestMatMuklOp13(TestMatMulV2Op): + """ + case 13 + """ + + def config(self): + self.x_shape = (2, 2, 10, 10) + self.y_shape = (2, 2, 10, 10) + self.trans_x = True + self.trans_y = False + + +class TestMatMuklOp14(TestMatMulV2Op): + """ + case 14_1 + """ + + def config(self): + self.x_shape = (3, 1, 6, 6) + self.y_shape = (1, 2, 6, 9) + self.trans_x = True + self.trans_y = False + + +class TestMatMuklOp15(TestMatMulV2Op): + """ + case 14_2 + """ + + def config(self): + self.x_shape = (3, 1, 6, 6) + self.y_shape = (1, 2, 6, 9) + self.trans_x = False + self.trans_y = False + + +class TestMatMuklOp16(TestMatMulV2Op): + """ + case 16 : to check the gradient for special case + """ + + def config(self): + self.x_shape = (100) + self.y_shape = (1, 2, 2, 100, 2) + self.trans_x = False + self.trans_y = False + + +class TestMatMuklOp17(TestMatMulV2Op): + """ + case 17 : to check the gradient for special case + """ + + def config(self): + self.x_shape = (2, 1, 100) + self.y_shape = (100) + self.trans_x = False + self.trans_y = False + + +class TestMatMuklOpBroadcast1(TestMatMulV2Op): + """ + case 14_3 + """ + + def config(self): + self.x_shape = (3, 1, 10, 10) + self.y_shape = (1, 2, 10, 10) + self.trans_x = True + self.trans_y = True + + +class TestMatMuklOpBroadcast2(TestMatMulV2Op): + """ + case 14_4 + """ + + def config(self): + self.x_shape = (3, 1, 10, 10) + self.y_shape = (1, 2, 10, 10) + self.trans_x = False + self.trans_y = True + + +#--------------------test matmul fp16-------------------- + + +def create_test_fp16_class(parent, atol=0.001, max_relative_error=2.5): + class TestMatMulOpFp16Case(parent): + def init_kernel_type(self): + self.dtype = np.float16 + + def test_check_output(self): + self.check_output_with_place(self.place, atol=atol) + + def test_check_grad(self): + self.check_grad_with_place( + self.place, ['X', 'Y'], + 'Out', + max_relative_error=max_relative_error) + + cls_name = "{0}_{1}".format(parent.__name__, "Fp16") + TestMatMulOpFp16Case.__name__ = cls_name + globals()[cls_name] = TestMatMulOpFp16Case + + +create_test_fp16_class(TestMatMulV2Op) +create_test_fp16_class(TestMatMuklOp2) +create_test_fp16_class(TestMatMuklOp3) +create_test_fp16_class(TestMatMuklOp4) +create_test_fp16_class(TestMatMuklOp5) +create_test_fp16_class(TestMatMuklOp6) +create_test_fp16_class(TestMatMuklOp7) +create_test_fp16_class(TestMatMuklOp8) +create_test_fp16_class(TestMatMuklOp9) +create_test_fp16_class(TestMatMuklOp10) +create_test_fp16_class(TestMatMuklOp11) +create_test_fp16_class(TestMatMuklOp12) +create_test_fp16_class(TestMatMuklOp13) +create_test_fp16_class(TestMatMuklOp14) +create_test_fp16_class(TestMatMuklOp15) +create_test_fp16_class(TestMatMuklOp16) +create_test_fp16_class(TestMatMuklOp17) + + +class TestMatMulV2API(unittest.TestCase): + def setUp(self): + self.places = [paddle.CPUPlace()] + if paddle.is_compiled_with_npu(): + self.places.append(paddle.NPUPlace(0)) + + def check_static_result(self, place): + with fluid.program_guard(fluid.Program(), fluid.Program()): + input_x = 
fluid.data(name="input_x", shape=[4, 3], dtype="float32") + input_y = fluid.data(name="input_y", shape=[3, 4], dtype="float32") + + result = paddle.matmul(input_x, input_y) + + x_np = np.random.random([4, 3]).astype("float32") + y_np = np.random.random([3, 4]).astype("float32") + + exe = fluid.Executor(place) + fetches = exe.run(fluid.default_main_program(), + feed={"input_x": x_np, + "input_y": y_np}, + fetch_list=[result]) + + def test_static(self): + for place in self.places: + self.check_static_result(place=place) + + def test_dygraph(self): + for place in self.places: + with fluid.dygraph.guard(place): + input_x = np.random.random([4, 3]).astype("float32") + input_y = np.random.random([3, 4]).astype("float32") + x = paddle.to_tensor(input_x) + y = paddle.to_tensor(input_y) + result = paddle.matmul(x, y) + + def test_dygraph_fp16(self): + if paddle.is_compiled_with_npu(): place = paddle.NPUPlace(0) - else: - place = paddle.CPUPlace() - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - print("Start run on {}".format(place)) - for epoch in range(100): - - pred_res, loss_res = exe.run(main_prog, - feed={ - "a": a_np, - "b": b_np, - "c": c_np, - "d": d_np, - "label": label_np - }, - fetch_list=[prediction, loss]) - if epoch % 10 == 0: - print("Epoch {} | Prediction[0]: {}, Loss: {}".format( - epoch, pred_res[0], loss_res)) - - return pred_res, loss_res - - def test_npu(self): - cpu_pred, cpu_loss = self._test(False) - npu_pred, npu_loss = self._test(True) - - self.assertTrue(np.allclose(npu_pred, cpu_pred, atol=1e-4)) - self.assertTrue(np.allclose(npu_loss, cpu_loss, atol=1e-4)) + with fluid.dygraph.guard(place): + input_x = np.random.random([4, 3]).astype("float16") + input_y = np.random.random([3, 4]).astype("float16") + x = paddle.to_tensor(input_x) + y = paddle.to_tensor(input_y) + result = paddle.matmul(x, y) if __name__ == '__main__': diff --git a/python/paddle/static/__init__.py b/python/paddle/static/__init__.py index 0f463b0c7d9418..20af4158df48fd 100644 --- a/python/paddle/static/__init__.py +++ b/python/paddle/static/__init__.py @@ -43,6 +43,7 @@ from ..fluid.framework import cpu_places # noqa: F401 from ..fluid.framework import cuda_places # noqa: F401 from ..fluid.framework import xpu_places # noqa: F401 +from ..fluid.framework import npu_places # noqa: F401 from ..fluid.framework import Variable # noqa: F401 from ..fluid.layers.control_flow import Print # noqa: F401 from ..fluid.layers.nn import py_func # noqa: F401 @@ -99,6 +100,7 @@ 'cpu_places', 'cuda_places', 'xpu_places', + 'npu_places', 'Variable', 'create_global_var', 'accuracy', diff --git a/python/paddle/utils/install_check.py b/python/paddle/utils/install_check.py index 69baa4facfa96c..efdc6847f00561 100644 --- a/python/paddle/utils/install_check.py +++ b/python/paddle/utils/install_check.py @@ -74,7 +74,22 @@ def _is_cuda_available(): return False -def _run_dygraph_single(use_cuda): +def _is_npu_available(): + """ + Check whether NPU is avaiable. + """ + try: + assert len(paddle.static.npu_places()) > 0 + return True + except Exception as e: + logging.warning( + "You are using NPU version PaddlePaddle, but there is no NPU " + "detected on your machine. Maybe NPU devices is not set properly." + "\n Original Error is {}".format(e)) + return False + + +def _run_dygraph_single(use_cuda, use_npu): """ Testing the simple network in dygraph mode using one CPU/GPU. 
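For context on how the NPU pieces added in this patch fit together from the Python side, a minimal usage sketch follows. It is illustrative only: it assumes an Ascend (WITH_ASCEND_CL) build of PaddlePaddle with at least one visible NPU device, and it uses only the public APIs touched by this series (paddle.is_compiled_with_npu, paddle.static.npu_places, paddle.utils.run_check).

    import paddle

    if paddle.is_compiled_with_npu():
        # FLAGS_selected_npus controls which devices npu_places() reports,
        # mirroring the _npu_ids() helper added in framework.py above.
        print("visible NPU places:", paddle.static.npu_places())

    # run_check() now routes its small single-device and multi-device test
    # networks through NPU when the wheel is an NPU build and a device is found.
    paddle.utils.run_check()
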
@@ -84,6 +99,8 @@ def _run_dygraph_single(use_cuda): paddle.disable_static() if use_cuda: paddle.set_device('gpu') + elif use_npu: + paddle.set_device('npu') else: paddle.set_device('cpu') weight_attr = paddle.ParamAttr( @@ -102,7 +119,7 @@ def _run_dygraph_single(use_cuda): opt.step() -def _run_static_single(use_cuda): +def _run_static_single(use_cuda, use_npu): """ Testing the simple network with executor running directly, using one CPU/GPU. @@ -119,8 +136,14 @@ def _run_static_single(use_cuda): param_grads = paddle.static.append_backward( out, parameter_list=[weight.name])[0] - exe = paddle.static.Executor( - paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()) + if use_cuda: + place = paddle.CUDAPlace(0) + elif use_npu: + place = paddle.NPUPlace(0) + else: + place = paddle.CPUPlace() + + exe = paddle.static.Executor(place) exe.run(startup_prog) exe.run(train_prog, feed={input.name: _prepare_data(1)}, @@ -128,7 +151,7 @@ def _run_static_single(use_cuda): paddle.disable_static() -def _run_static_parallel(use_cuda, device_list): +def _run_static_parallel(use_cuda, use_npu, device_list): """ Testing the simple network in data parallel mode, using multiple CPU/GPU. @@ -150,8 +173,15 @@ def _run_static_parallel(use_cuda, device_list): train_prog).with_data_parallel( loss_name=loss.name, places=device_list) - exe = paddle.static.Executor( - paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()) + if use_cuda: + place = paddle.CUDAPlace(0) + elif use_npu: + place = paddle.NPUPlace(0) + compiled_prog = train_prog + else: + place = paddle.CPUPlace() + + exe = paddle.static.Executor(place) exe.run(startup_prog) exe.run(compiled_prog, feed={input.name: _prepare_data(len(device_list))}, @@ -182,23 +212,31 @@ def run_check(): if paddle.is_compiled_with_cuda(): use_cuda = _is_cuda_available() + use_npu = False + elif paddle.is_compiled_with_npu(): + use_npu = _is_npu_available() + use_cuda = False else: + use_npu = False use_cuda = False if use_cuda: device_str = "GPU" device_list = paddle.static.cuda_places() + elif use_npu: + device_str = "NPU" + device_list = paddle.static.npu_places() else: device_str = "CPU" device_list = paddle.static.cpu_places(device_count=2) device_count = len(device_list) - _run_static_single(use_cuda) - _run_dygraph_single(use_cuda) + _run_static_single(use_cuda, use_npu) + _run_dygraph_single(use_cuda, use_npu) print("PaddlePaddle works well on 1 {}.".format(device_str)) try: - _run_static_parallel(use_cuda, device_list) + _run_static_parallel(use_cuda, use_npu, device_list) print("PaddlePaddle works well on {} {}s.".format(device_count, device_str)) print( From 71cb3ff805c1abc4762e6f302c7f8c46942e6f7c Mon Sep 17 00:00:00 2001 From: wangxinxin08 <69842442+wangxinxin08@users.noreply.github.com> Date: Mon, 11 Oct 2021 14:41:01 +0800 Subject: [PATCH 095/298] enhance yolobox trt plugin (#34128) * enhance yolobox plugin --- .../inference/tensorrt/convert/yolo_box_op.cc | 9 ++- .../tensorrt/plugin/yolo_box_op_plugin.cu | 65 ++++++++++++++----- .../tensorrt/plugin/yolo_box_op_plugin.h | 3 + .../ir/inference/test_trt_yolo_box_op.py | 51 +++++++++++++++ 4 files changed, 111 insertions(+), 17 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/convert/yolo_box_op.cc b/paddle/fluid/inference/tensorrt/convert/yolo_box_op.cc index 2d12eaf736b754..17d217dff43fdb 100644 --- a/paddle/fluid/inference/tensorrt/convert/yolo_box_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/yolo_box_op.cc @@ -48,13 +48,20 @@ class YoloBoxOpConverter : public OpConverter { float conf_thresh = 
BOOST_GET_CONST(float, op_desc.GetAttr("conf_thresh")); bool clip_bbox = BOOST_GET_CONST(bool, op_desc.GetAttr("clip_bbox")); float scale_x_y = BOOST_GET_CONST(float, op_desc.GetAttr("scale_x_y")); + bool iou_aware = op_desc.HasAttr("iou_aware") + ? BOOST_GET_CONST(bool, op_desc.GetAttr("iou_aware")) + : false; + float iou_aware_factor = + op_desc.HasAttr("iou_aware_factor") + ? BOOST_GET_CONST(float, op_desc.GetAttr("iou_aware_factor")) + : 0.5; int type_id = static_cast(engine_->WithFp16()); auto input_dim = X_tensor->getDimensions(); auto* yolo_box_plugin = new plugin::YoloBoxPlugin( type_id ? nvinfer1::DataType::kHALF : nvinfer1::DataType::kFLOAT, anchors, class_num, conf_thresh, downsample_ratio, clip_bbox, scale_x_y, - input_dim.d[1], input_dim.d[2]); + iou_aware, iou_aware_factor, input_dim.d[1], input_dim.d[2]); std::vector yolo_box_inputs; yolo_box_inputs.push_back(X_tensor); diff --git a/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.cu index 10123cd4fa0e1b..57177cfa8b421e 100644 --- a/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.cu @@ -12,8 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include -#include #include #include @@ -29,7 +27,8 @@ YoloBoxPlugin::YoloBoxPlugin(const nvinfer1::DataType data_type, const std::vector& anchors, const int class_num, const float conf_thresh, const int downsample_ratio, const bool clip_bbox, - const float scale_x_y, const int input_h, + const float scale_x_y, const bool iou_aware, + const float iou_aware_factor, const int input_h, const int input_w) : data_type_(data_type), class_num_(class_num), @@ -37,6 +36,8 @@ YoloBoxPlugin::YoloBoxPlugin(const nvinfer1::DataType data_type, downsample_ratio_(downsample_ratio), clip_bbox_(clip_bbox), scale_x_y_(scale_x_y), + iou_aware_(iou_aware), + iou_aware_factor_(iou_aware_factor), input_h_(input_h), input_w_(input_w) { anchors_.insert(anchors_.end(), anchors.cbegin(), anchors.cend()); @@ -45,6 +46,7 @@ YoloBoxPlugin::YoloBoxPlugin(const nvinfer1::DataType data_type, assert(class_num_ > 0); assert(input_h_ > 0); assert(input_w_ > 0); + assert((iou_aware_factor_ > 0 && iou_aware_factor_ < 1)); cudaMalloc(&anchors_device_, anchors.size() * sizeof(int)); cudaMemcpy(anchors_device_, anchors.data(), anchors.size() * sizeof(int), @@ -59,6 +61,8 @@ YoloBoxPlugin::YoloBoxPlugin(const void* data, size_t length) { DeserializeValue(&data, &length, &downsample_ratio_); DeserializeValue(&data, &length, &clip_bbox_); DeserializeValue(&data, &length, &scale_x_y_); + DeserializeValue(&data, &length, &iou_aware_); + DeserializeValue(&data, &length, &iou_aware_factor_); DeserializeValue(&data, &length, &input_h_); DeserializeValue(&data, &length, &input_w_); } @@ -133,8 +137,19 @@ __device__ inline void GetYoloBox(float* box, const T* x, const int* anchors, __device__ inline int GetEntryIndex(int batch, int an_idx, int hw_idx, int an_num, int an_stride, int stride, - int entry) { - return (batch * an_num + an_idx) * an_stride + entry * stride + hw_idx; + int entry, bool iou_aware) { + if (iou_aware) { + return (batch * an_num + an_idx) * an_stride + + (batch * an_num + an_num + entry) * stride + hw_idx; + } else { + return (batch * an_num + an_idx) * an_stride + entry * stride + hw_idx; + } +} + +__device__ inline int GetIoUIndex(int batch, int an_idx, int hw_idx, int an_num, + int an_stride, int stride) 
{ + return batch * an_num * an_stride + (batch * an_num + an_idx) * stride + + hw_idx; } template @@ -178,7 +193,8 @@ __global__ void KeYoloBoxFw(const T* const input, const int* const imgsize, const int w, const int an_num, const int class_num, const int box_num, int input_size_h, int input_size_w, bool clip_bbox, const float scale, - const float bias) { + const float bias, bool iou_aware, + const float iou_aware_factor) { int tid = blockIdx.x * blockDim.x + threadIdx.x; int stride = blockDim.x * gridDim.x; float box[4]; @@ -193,11 +209,16 @@ __global__ void KeYoloBoxFw(const T* const input, const int* const imgsize, int img_height = imgsize[2 * i]; int img_width = imgsize[2 * i + 1]; - int obj_idx = - GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 4); + int obj_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 4, + iou_aware); float conf = sigmoid(static_cast(input[obj_idx])); - int box_idx = - GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 0); + if (iou_aware) { + int iou_idx = GetIoUIndex(i, j, k * w + l, an_num, an_stride, grid_num); + float iou = sigmoid(input[iou_idx]); + conf = powf(conf, 1. - iou_aware_factor) * powf(iou, iou_aware_factor); + } + int box_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 0, + iou_aware); if (conf < conf_thresh) { for (int i = 0; i < 4; ++i) { @@ -212,8 +233,8 @@ __global__ void KeYoloBoxFw(const T* const input, const int* const imgsize, box_idx = (i * box_num + j * grid_num + k * w + l) * 4; CalcDetectionBox(boxes, box, box_idx, img_height, img_width, clip_bbox); - int label_idx = - GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 5); + int label_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, + 5, iou_aware); int score_idx = (i * box_num + j * grid_num + k * w + l) * class_num; CalcLabelScore(scores, input, label_idx, score_idx, class_num, conf, grid_num); @@ -240,7 +261,8 @@ int YoloBoxPlugin::enqueue_impl(int batch_size, const void* const* inputs, reinterpret_cast(inputs[1]), reinterpret_cast(outputs[0]), reinterpret_cast(outputs[1]), conf_thresh_, anchors_device_, n, h, w, an_num, class_num_, box_num, - input_size_h, input_size_w, clip_bbox_, scale_x_y_, bias); + input_size_h, input_size_w, clip_bbox_, scale_x_y_, bias, iou_aware_, + iou_aware_factor_); return cudaGetLastError() != cudaSuccess; } @@ -274,6 +296,8 @@ size_t YoloBoxPlugin::getSerializationSize() const TRT_NOEXCEPT { serialize_size += SerializedSize(scale_x_y_); serialize_size += SerializedSize(input_h_); serialize_size += SerializedSize(input_w_); + serialize_size += SerializedSize(iou_aware_); + serialize_size += SerializedSize(iou_aware_factor_); return serialize_size; } @@ -285,6 +309,8 @@ void YoloBoxPlugin::serialize(void* buffer) const TRT_NOEXCEPT { SerializeValue(&buffer, downsample_ratio_); SerializeValue(&buffer, clip_bbox_); SerializeValue(&buffer, scale_x_y_); + SerializeValue(&buffer, iou_aware_); + SerializeValue(&buffer, iou_aware_factor_); SerializeValue(&buffer, input_h_); SerializeValue(&buffer, input_w_); } @@ -326,8 +352,8 @@ void YoloBoxPlugin::configurePlugin( nvinfer1::IPluginV2Ext* YoloBoxPlugin::clone() const TRT_NOEXCEPT { return new YoloBoxPlugin(data_type_, anchors_, class_num_, conf_thresh_, - downsample_ratio_, clip_bbox_, scale_x_y_, input_h_, - input_w_); + downsample_ratio_, clip_bbox_, scale_x_y_, + iou_aware_, iou_aware_factor_, input_h_, input_w_); } YoloBoxPluginCreator::YoloBoxPluginCreator() {} @@ -367,6 +393,8 @@ nvinfer1::IPluginV2Ext* 
YoloBoxPluginCreator::createPlugin( float scale_x_y = 1.; int h = -1; int w = -1; + bool iou_aware = false; + float iou_aware_factor = 0.5; for (int i = 0; i < fc->nbFields; ++i) { const std::string field_name(fc->fields[i].name); @@ -386,6 +414,10 @@ nvinfer1::IPluginV2Ext* YoloBoxPluginCreator::createPlugin( clip_bbox = *static_cast(fc->fields[i].data); } else if (field_name.compare("scale_x_y")) { scale_x_y = *static_cast(fc->fields[i].data); + } else if (field_name.compare("iou_aware")) { + iou_aware = *static_cast(fc->fields[i].data); + } else if (field_name.compare("iou_aware_factor")) { + iou_aware_factor = *static_cast(fc->fields[i].data); } else if (field_name.compare("h")) { h = *static_cast(fc->fields[i].data); } else if (field_name.compare("w")) { @@ -397,7 +429,8 @@ nvinfer1::IPluginV2Ext* YoloBoxPluginCreator::createPlugin( return new YoloBoxPlugin( type_id ? nvinfer1::DataType::kHALF : nvinfer1::DataType::kFLOAT, anchors, - class_num, conf_thresh, downsample_ratio, clip_bbox, scale_x_y, h, w); + class_num, conf_thresh, downsample_ratio, clip_bbox, scale_x_y, iou_aware, + iou_aware_factor, h, w); } nvinfer1::IPluginV2Ext* YoloBoxPluginCreator::deserializePlugin( diff --git a/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.h index c9e9f9a0567aee..ae9a6739cedd34 100644 --- a/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.h @@ -31,6 +31,7 @@ class YoloBoxPlugin : public nvinfer1::IPluginV2Ext { const std::vector& anchors, const int class_num, const float conf_thresh, const int downsample_ratio, const bool clip_bbox, const float scale_x_y, + const bool iou_aware, const float iou_aware_factor, const int input_h, const int input_w); YoloBoxPlugin(const void* data, size_t length); ~YoloBoxPlugin() override; @@ -89,6 +90,8 @@ class YoloBoxPlugin : public nvinfer1::IPluginV2Ext { float scale_x_y_; int input_h_; int input_w_; + bool iou_aware_; + float iou_aware_factor_; std::string namespace_; }; diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_yolo_box_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_yolo_box_op.py index 2166bbaa98b2fe..b0124f055b4e19 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_yolo_box_op.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_yolo_box_op.py @@ -116,5 +116,56 @@ def test_check_output(self): PassVersionChecker.IsCompatible('tensorrt_subgraph_pass')) +class TRTYoloBoxIoUAwareTest(InferencePassTest): + def setUp(self): + self.set_params() + with fluid.program_guard(self.main_program, self.startup_program): + image_shape = [self.bs, self.channel, self.height, self.width] + image = fluid.data(name='image', shape=image_shape, dtype='float32') + image_size = fluid.data( + name='image_size', shape=[self.bs, 2], dtype='int32') + boxes, scores = self.append_yolobox(image, image_size) + + self.feeds = { + 'image': np.random.random(image_shape).astype('float32'), + 'image_size': np.random.randint( + 32, 64, size=(self.bs, 2)).astype('int32'), + } + self.enable_trt = True + self.trt_parameters = TRTYoloBoxTest.TensorRTParam( + 1 << 30, self.bs, 1, AnalysisConfig.Precision.Float32, False, False) + self.fetch_list = [scores, boxes] + + def set_params(self): + self.bs = 4 + self.channel = 258 + self.height = 64 + self.width = 64 + self.class_num = 80 + self.anchors = [10, 13, 16, 30, 33, 23] + self.conf_thresh = .1 + self.downsample_ratio = 32 
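# iou_aware adds one IoU-prediction channel per anchor, so the input channel
# count above is anchor_num * (6 + class_num) = 3 * 86 = 258 (it would be
# 3 * 85 = 255 without it), and the plugin thresholds and scales boxes with the
# fused score sigmoid(obj) ** (1 - iou_aware_factor) * sigmoid(iou) ** iou_aware_factor.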
+ self.iou_aware = True + self.iou_aware_factor = 0.5 + + def append_yolobox(self, image, image_size): + return fluid.layers.yolo_box( + x=image, + img_size=image_size, + class_num=self.class_num, + anchors=self.anchors, + conf_thresh=self.conf_thresh, + downsample_ratio=self.downsample_ratio, + iou_aware=self.iou_aware, + iou_aware_factor=self.iou_aware_factor) + + def test_check_output(self): + if core.is_compiled_with_cuda(): + use_gpu = True + self.check_output_with_option(use_gpu, flatten=True) + self.assertTrue( + PassVersionChecker.IsCompatible('tensorrt_subgraph_pass')) + + if __name__ == "__main__": unittest.main() From 414c252ae79fa2ca31b2159d3b2c56e491d55cd4 Mon Sep 17 00:00:00 2001 From: tianshuo78520a <707759223@qq.com> Date: Mon, 11 Oct 2021 16:48:56 +0800 Subject: [PATCH 096/298] Fix, test=document_fix (#36336) --- paddle/scripts/paddle_build.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 0c2580929081d0..2cc4bd8d05fb8c 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -1076,7 +1076,6 @@ function get_quickly_disable_ut() { function card_test() { set -m - echo "$2 bengingggggg!!!!!" case_count $1 $2 ut_startTime_s=`date +%s` From 7a724ddb30c677b994b907e967b308a42ac8c7ad Mon Sep 17 00:00:00 2001 From: yaoxuefeng Date: Mon, 11 Oct 2021 17:02:01 +0800 Subject: [PATCH 097/298] fix multi-node (#36329) --- paddle/fluid/framework/fleet/ps_gpu_wrapper.h | 10 +++++++++- paddle/fluid/platform/collective_helper.cc | 8 ++++---- python/paddle/fluid/dataset.py | 2 ++ 3 files changed, 15 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h index b7e8bbb3694922..fa2ff6cbdb8c78 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h @@ -117,6 +117,15 @@ class PSGPUWrapper { resource_ = std::make_shared(dev_ids); resource_->enable_p2p(); keys_tensor.resize(resource_->total_gpu()); +#ifdef PADDLE_WITH_GLOO + auto gloo = paddle::framework::GlooWrapper::GetInstance(); + if (gloo->Size() > 1) { + multi_node_ = 1; + } +#else + PADDLE_THROW( + platform::errors::Unavailable("heter ps need compile with GLOO")); +#endif if (multi_node_) { int dev_size = dev_ids.size(); // init inner comm @@ -127,7 +136,6 @@ class PSGPUWrapper { // init inter comm #ifdef PADDLE_WITH_GLOO inter_comms_.resize(dev_size); - auto gloo = paddle::framework::GlooWrapper::GetInstance(); if (gloo->Rank() == 0) { for (int i = 0; i < dev_size; ++i) { platform::dynload::ncclGetUniqueId(&inter_ncclids_[i]); diff --git a/paddle/fluid/platform/collective_helper.cc b/paddle/fluid/platform/collective_helper.cc index a765f344daf8aa..03359d932b5ab9 100644 --- a/paddle/fluid/platform/collective_helper.cc +++ b/paddle/fluid/platform/collective_helper.cc @@ -148,7 +148,7 @@ void NCCLCommContext::CreateNCCLCommMultiTrainer( paddle::platform::errors::InvalidArgument( "dev ids = [%d], it should greater than 0.", dev_ids.size())); const int kDevices = dev_ids.size(); - VLOG(3) << "Begin CreateNCCLCommMultiTrainer. device number: " << kDevices + VLOG(1) << "Begin CreateNCCLCommMultiTrainer. 
device number: " << kDevices << ", ntrainers: " << ntrainers << ", train_id: " << train_id << ", rind_id: " << ring_id; ncclComm_t comms[kDevices]; @@ -162,10 +162,10 @@ void NCCLCommContext::CreateNCCLCommMultiTrainer( #endif platform::dynload::ncclCommInitRank(comms + i, kDevices * ntrainers, *nccl_id, train_id * kDevices + i); - VLOG(3) << "ncclCommInitRank: " << i; + VLOG(1) << "ncclCommInitRank: " << i; } PADDLE_ENFORCE_CUDA_SUCCESS(dynload::ncclGroupEnd()); - VLOG(3) << "nccl group end seccessss"; + VLOG(1) << "nccl group end seccessss"; } PADDLE_ENFORCE_EQ(comm_map_.count(ring_id), 0, platform::errors::InvalidArgument( @@ -174,7 +174,7 @@ void NCCLCommContext::CreateNCCLCommMultiTrainer( for (int i = 0; i < kDevices; ++i) { AssignNCCLComm(comms[i], kDevices * ntrainers, train_id * kDevices + i, dev_ids[i], ring_id); - VLOG(3) << "nccl communicator of train_id " << train_id * kDevices + i + VLOG(1) << "nccl communicator of train_id " << train_id * kDevices + i << " in ring " << ring_id << " has been created on device " << dev_ids[i]; } diff --git a/python/paddle/fluid/dataset.py b/python/paddle/fluid/dataset.py index 438831208b66ac..d683e36fbe5ab3 100644 --- a/python/paddle/fluid/dataset.py +++ b/python/paddle/fluid/dataset.py @@ -396,6 +396,8 @@ def set_feed_type(self, data_feed_type): Set data_feed_desc """ self.proto_desc.name = data_feed_type + if (self.proto_desc.name == "SlotRecordInMemoryDataFeed"): + self.dataset = core.Dataset("SlotRecordDataset") @deprecated( since="2.0.0", From c38b04883e8b3079d8321b5cce03f9ec07df1fd1 Mon Sep 17 00:00:00 2001 From: caozhou <48191911+Caozhou1995@users.noreply.github.com> Date: Mon, 11 Oct 2021 17:45:18 +0800 Subject: [PATCH 098/298] add reshard module (#35779) * add reshard module * fix conflict * update reshard module * update and add unitest * update reshard module and unitest * add more unitests --- .../distributed/auto_parallel/__init__.py | 2 + .../distributed/auto_parallel/completion.py | 170 +++ .../distributed/auto_parallel/context.py | 3 + .../auto_parallel/operators/dist_embedding.py | 14 +- .../distributed/auto_parallel/parallelizer.py | 9 +- .../distributed/auto_parallel/reshard.py | 1002 +++++++++++++++++ .../fluid/tests/unittests/CMakeLists.txt | 12 + .../unittests/test_auto_parallel_reshard.py | 287 +++++ .../test_auto_parallel_reshard_dpmppp.py | 173 +++ .../test_auto_parallel_reshard_mppp.py | 231 ++++ .../test_auto_parallel_reshard_serial.py | 184 +++ 11 files changed, 2083 insertions(+), 4 deletions(-) create mode 100644 python/paddle/distributed/auto_parallel/reshard.py create mode 100644 python/paddle/fluid/tests/unittests/test_auto_parallel_reshard.py create mode 100644 python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_dpmppp.py create mode 100644 python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_mppp.py create mode 100644 python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_serial.py diff --git a/python/paddle/distributed/auto_parallel/__init__.py b/python/paddle/distributed/auto_parallel/__init__.py index 5b0fdc1f1f1665..31f92e2575a1f8 100644 --- a/python/paddle/distributed/auto_parallel/__init__.py +++ b/python/paddle/distributed/auto_parallel/__init__.py @@ -19,5 +19,7 @@ from .interface import set_pipeline_stage # noqa: F401 from .interface import ProcessMesh # noqa: F401 from .completion import complete_annotation # noqa: F401 +from .completion import complete_backward_annotation # noqa: F401 +from .reshard import reshard # noqa: F401 __all__ = [] diff --git 
a/python/paddle/distributed/auto_parallel/completion.py b/python/paddle/distributed/auto_parallel/completion.py index 6e886d09d67bde..3fdbad6950db51 100644 --- a/python/paddle/distributed/auto_parallel/completion.py +++ b/python/paddle/distributed/auto_parallel/completion.py @@ -23,6 +23,7 @@ from .utils import print_program_with_distributed_attr from .context import get_default_distributed_context from .operators import find_best_compatible_distributed_operator_impl +from .attribute import OperatorDistributedAttribute, TensorDistributedAttribute ELEMENTWISE_LIKE_OP_LIST = ["elementwise_add", "gelu", "dropout", "cast"] @@ -597,3 +598,172 @@ def sort_key_fun(node): dist_context.amend_distributed_attr_for_program() return program + + +def complete_backward_annotation(auto_parallel_main_prog, dist_context): + """Complete the annotation of vars and ops in the backward phase for parallel program.""" + + def _is_grad_var_name(name): + if "@GRAD" in name: + return True + return False + + grad_start_idx = None + for idx, op in enumerate(auto_parallel_main_prog.global_block().ops): + for var_name in op.output_arg_names: + # TODO: use _is_loss_op to judge + if "@GRAD" in var_name and op.type == "fill_constant": + grad_start_idx = idx + break + assert grad_start_idx is not None, "No backward procedure found in this program." + + ops = list(auto_parallel_main_prog.global_block().ops) + vars = auto_parallel_main_prog.global_block().vars + for idx in range(grad_start_idx, len(ops)): + # complete the loss op + if idx == grad_start_idx: + grad_var = vars[ops[idx].output_arg_names[0]] + grad_var_name = grad_var.name + forward_var_name = grad_var_name[:grad_var_name.find("@GRAD")] + forward_var = vars[forward_var_name] + tensor_attr = TensorDistributedAttribute(grad_var, dist_context) + process_mesh = dist_context.get_tensor_distributed_attr_for_program( + forward_var).get_process_mesh() + dims_mapping = dist_context.get_tensor_distributed_attr_for_program( + forward_var).get_dims_mapping() + tensor_attr.set_dims_mapping(dims_mapping) + tensor_attr.set_process_mesh(process_mesh) + dist_context.set_tensor_distributed_attr_for_program(grad_var, + tensor_attr) + op_attr = OperatorDistributedAttribute(ops[idx], dist_context) + op_attr.set_process_mesh(process_mesh) + dist_context.set_op_distributed_attr_for_program(ops[idx], op_attr) + + # in the data parallel mode, the loss op followed by scale op. + if ops[idx + 1].type == "scale" and grad_var_name in ops[idx + 1].input_arg_names \ + and grad_var_name in ops[idx + 1].output_arg_names: + op_attr = OperatorDistributedAttribute(ops[idx + 1], + dist_context) + op_attr.set_process_mesh(process_mesh) + dist_context.set_op_distributed_attr_for_program(ops[idx + 1], + op_attr) + continue + + # complete the annotation of the optimizer op. + # TODO: use _is_optimizer_op to judge + if "Grad" in ops[idx].input_names and "Param" in ops[idx].input_names: + assert len(ops[idx].input( + "Param")) == 1, "Only support one-to-one now." + assert len(ops[idx].input( + "Grad")) == 1, "Only support one-to-one now." 
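# The optimizer op reuses the parameter's process mesh, and its Grad input
# takes the parameter's dims mapping, since a gradient is partitioned the same
# way as the parameter it updates.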
+ var = vars[ops[idx].input("Param")[0]] + grad_var = vars[ops[idx].input("Grad")[0]] + process_mesh = dist_context.get_tensor_distributed_attr_for_program( + var).get_process_mesh() + dims_mapping = dist_context.get_tensor_distributed_attr_for_program( + var).get_dims_mapping() + op_attr = OperatorDistributedAttribute(ops[idx], dist_context) + op_attr.set_process_mesh(process_mesh) + op_attr.set_input_dims_mapping(grad_var.name, dims_mapping) + dist_context.set_op_distributed_attr_for_program(ops[idx], op_attr) + continue + + # complete the c_allreduce_sum op for gradient in the data parallel mode. + if ops[idx].type == "c_allreduce_sum" and ops[ + idx].input_arg_names == ops[idx].output_arg_names: + grad_var = vars[ops[idx].output_arg_names[0]] + op_attr = OperatorDistributedAttribute(ops[idx], dist_context) + process_mesh = dist_context.get_tensor_distributed_attr_for_program( + grad_var).get_process_mesh() + op_attr.set_process_mesh(process_mesh) + dist_context.set_op_distributed_attr_for_program(ops[idx], op_attr) + continue + + # complete the annotation of grad op + grad_op = ops[idx] + for i, op in enumerate(ops[:grad_start_idx]): + match_op = None + grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc(op.desc, + set(), + []) + grad_op_input = [] + for input_arg_name in grad_op.desc.input_arg_names(): + if "@GRAD" in input_arg_name: + name = input_arg_name[:input_arg_name.find("@GRAD") + 5] + grad_op_input.append(name) + else: + grad_op_input.append(input_arg_name) + + # like sum op: the count of grad op will larger than 1 + if len(grad_op_desc_list) > 1: + for grad_op_desc in grad_op_desc_list: + if grad_op_input == grad_op_desc.input_arg_names() \ + and grad_op.desc.type() == grad_op_desc.type(): + match_op = op + break + elif len(grad_op_desc_list) == 1: + if grad_op_input == grad_op_desc_list[0].input_arg_names() \ + and grad_op.desc.type() == grad_op_desc_list[0].type(): + match_op = op + + if match_op is not None: + op_attr = dist_context.get_op_distributed_attr_for_program(op) + grad_op_attr = OperatorDistributedAttribute(grad_op, + dist_context) + grad_op_attr.set_process_mesh(op_attr.get_process_mesh()) + for var_name in grad_op.input_arg_names: + if "@GRAD" in var_name: + dims_mapping = dist_context.get_tensor_distributed_attr_for_program( + vars[var_name]).get_dims_mapping() + grad_op_attr.set_input_dims_mapping(var_name, + dims_mapping) + else: + dims_mapping = op_attr.get_input_dims_mapping(var_name) + grad_op_attr.set_input_dims_mapping(var_name, + dims_mapping) + dist_context.set_op_distributed_attr_for_program(grad_op, + grad_op_attr) + + for var_name in grad_op.output_arg_names: + if "@GRAD" in var_name: + forward_var = vars[var_name[:var_name.find("@GRAD")]] + tensor_attr = TensorDistributedAttribute(vars[var_name], + dist_context) + process_mesh = grad_op_attr.get_process_mesh() + dims_mapping = grad_op_attr.get_input_dims_mapping( + forward_var.name) + tensor_attr.set_process_mesh(process_mesh) + tensor_attr.set_dims_mapping(dims_mapping) + dist_context.set_tensor_distributed_attr_for_program( + vars[var_name], tensor_attr) + break + + # complete the annotation of sum op for multiple renamed grad var + if grad_op.type == "sum" and all( + map(_is_grad_var_name, grad_op.input_arg_names)): + assert len(grad_op.output_arg_names + ) == 1, "The output count of sum op should be one." 
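# Every (possibly renamed) grad input of sum traces back to the same forward
# var, so that var's dims mapping and process mesh are reused for the inputs
# and for the single summed grad output.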
+ grad_op_attr = OperatorDistributedAttribute(grad_op, dist_context) + for var_name in grad_op.input_arg_names: + if "@GRAD" in var_name: + forward_var = vars[var_name[:var_name.find("@GRAD")]] + dims_mapping = dist_context.get_tensor_distributed_attr_for_program( + forward_var).get_dims_mapping() + grad_op_attr.set_input_dims_mapping(var_name, dims_mapping) + for var_name in grad_op.output_arg_names: + forward_var = vars[var_name[:var_name.find("@GRAD")]] + tensor_attr = TensorDistributedAttribute(vars[var_name], + dist_context) + process_mesh = dist_context.get_tensor_distributed_attr_for_program( + forward_var).get_process_mesh() + dims_mapping = dist_context.get_tensor_distributed_attr_for_program( + forward_var).get_dims_mapping() + tensor_attr.set_dims_mapping(dims_mapping) + tensor_attr.set_process_mesh(process_mesh) + dist_context.set_tensor_distributed_attr_for_program( + vars[var_name], tensor_attr) + grad_op_attr.set_process_mesh( + dist_context.get_tensor_distributed_attr_for_program( + forward_var).get_process_mesh()) + dist_context.set_op_distributed_attr_for_program(grad_op, + grad_op_attr) diff --git a/python/paddle/distributed/auto_parallel/context.py b/python/paddle/distributed/auto_parallel/context.py index 4958c5adfae910..5e6565aa3d84cb 100644 --- a/python/paddle/distributed/auto_parallel/context.py +++ b/python/paddle/distributed/auto_parallel/context.py @@ -59,6 +59,9 @@ def __init__(self): if self._process_mesh.ndim == 1: self._data_parallel_axis = 0 self._model_parallel_axis = 0 + elif self._process_mesh.ndim == 3: + self._data_parallel_axis = 1 + self._model_parallel_axis = 2 else: self._data_parallel_axis = 0 self._model_parallel_axis = 1 diff --git a/python/paddle/distributed/auto_parallel/operators/dist_embedding.py b/python/paddle/distributed/auto_parallel/operators/dist_embedding.py index 141c3d14a7fb26..3f8fbf9cc3a7af 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_embedding.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_embedding.py @@ -146,8 +146,18 @@ def static_handle(dst_block, assert mesh_shape <= 2, "row_parallel_embedding only support 1 or 2 dimensional process mesh, but got {}".format( process_mesh_shape) num_partition = process_mesh_shape[embedding_row_dim_mapping] - # TODO generalize here, support any mesh group + # TODO generalize here, support any mesh group + model_parallel_axis, process_mesh = op_dist_attr.get_owner_context( + )._get_model_parallel_info() if mesh_shape == 1: + if rank_id not in process_mesh_group: + assert len( + process_mesh.topology + ) == 2, " row_parallel_embedding process mapping only support 2 dimensional process mesh, \ + but got {}".format(len(process_mesh.topology)) + rank_id = process_mesh_group[ + process_mesh.process_group.index(rank_id) % + process_mesh_shape[0]] relative_idx = process_mesh_group.index(rank_id) else: relative_idx = rank_id % num_partition @@ -156,8 +166,6 @@ def static_handle(dst_block, relative_idx = relative_idx * per_part_size # TODO caculate ring id - model_parallel_axis, process_mesh = op_dist_attr.get_owner_context( - )._get_model_parallel_info() group_ranks = _get_comm_group(process_mesh.process_group, process_mesh.topology, model_parallel_axis, rank_id) diff --git a/python/paddle/distributed/auto_parallel/parallelizer.py b/python/paddle/distributed/auto_parallel/parallelizer.py index a08da13a39cafa..2994d35ef9202a 100644 --- a/python/paddle/distributed/auto_parallel/parallelizer.py +++ b/python/paddle/distributed/auto_parallel/parallelizer.py @@ -17,9 
+17,10 @@ import paddle.fluid.core as core from .context import DistributedContext from .context import get_default_distributed_context -from .completion import complete_annotation +from .completion import complete_annotation, complete_backward_annotation from .partitioner import Partitioner from .process import get_all_process_groups +from .reshard import reshard class AutoParallelizer: @@ -85,10 +86,16 @@ def parallelize(self, # instantiate communication by process_mapping. all_process_groups = get_all_process_groups() for process_group in all_process_groups: + if rank not in process_group._ranks: + continue process_group.instantiate() # The last step: remove all distributed attributes to be compatiable # with inference. self._remove_distributed_attrs(partitioned_main_prog) + complete_backward_annotation(partitioned_main_prog, self._dist_context) + reshard(partitioned_main_prog, partitioned_startup_prog, rank, + self._dist_context) + return dist_optimize_ops, dist_params_grads, partitioned_startup_prog, partitioned_main_prog diff --git a/python/paddle/distributed/auto_parallel/reshard.py b/python/paddle/distributed/auto_parallel/reshard.py new file mode 100644 index 00000000000000..d66d799c6e0f91 --- /dev/null +++ b/python/paddle/distributed/auto_parallel/reshard.py @@ -0,0 +1,1002 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License + +import copy +from functools import reduce + +import paddle +import paddle.fluid.core as core +from paddle.utils import unique_name +from paddle.fluid.layer_helper import LayerHelper +from paddle.fluid.framework import Program, OpProtoHolder +import paddle.fluid.layers.utils as utils +from ..collective import _get_global_env +from .context import DistributedContext +from .attribute import OperatorDistributedAttribute, TensorDistributedAttribute +from .process import new_process_group, ProcessGroup, PROCESS_GROUP_MAP + + +class AllGatherOpDesc: + """ + Describe the allgather op in the reshard phase. + + Args: + group (list): Process group. + """ + + def __init__(self, group): + self._group = group + self._desc = "all_gather" + + @property + def group(self): + return self._group + + @property + def desc(self): + return self._desc + + def __repr__(self): + return f"op: {self._desc}, group: {self._group}." + + +class SendOpDesc: + """ + Describe the send op in the reshard phase. + + Args: + partition_index (list): The index of partition in complete tensor. + dst (int): The destination process to receive. + """ + + def __init__(self, partition_index, dst): + self._dst = dst + self._partition_index = partition_index + self._desc = "send" + + @property + def partition_index(self): + return self._partition_index + + @property + def dst(self): + return self._dst + + @property + def desc(self): + return self._desc + + def __repr__(self): + return f"op: {self._desc}, partition_index: {self._partition_index}, dst: {self._dst}." + + +class RecvOpDesc: + """ + Describe the recv op in the reshard op. 
+ + Args: + partition_index (list): The index of partition in complete tensor. + src (int): The source process to send. + """ + + def __init__(self, partition_index, src): + self._src = src + self._partition_index = partition_index + self._desc = "recv" + + @property + def partition_index(self): + return self._partition_index + + @property + def src(self): + return self._src + + @property + def desc(self): + return self._desc + + def __repr__(self): + return f"op: {self._desc}, partition_index: {self._partition_index}, src: {self._src}." + + +class SliceOpDesc: + """ + Describe the slice op in the reshard phase. + + Args: + starts (list): It represents starting indices of corresponding axis in ``axes``. + ends (list): It represents ending indices of corresponding axis in ``axes``. + axes (list): Axes that `starts` and `ends` apply to . + """ + + def __init__(self, starts, ends, axes): + self._starts = starts + self._ends = ends + self._axes = axes + self._desc = "slice" + + @property + def starts(self): + return self._starts + + @property + def ends(self): + return self._ends + + @property + def axes(self): + return self._axes + + @property + def desc(self): + return self._desc + + def __repr__(self): + return f"op: {self._desc}, starts: {self._starts}, ends: {self._ends}, axes: {self._axes}." + + +class ConcatOpDesc: + """ + Describe the concat op in the reshard phase. + + Args: + partition_index_list (list): A list contains all partition index. + """ + + def __init__(self, partition_index_list): + self._partition_index_list = partition_index_list + self._desc = "concat" + + @property + def partition_index_list(self): + return self._partition_index_list + + @property + def desc(self): + return self._desc + + def __repr__(self): + return f"op: {self._desc}, partition_index_list: {self._partition_index_list}." 
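# A worked example of the partition bookkeeping below (hypothetical shapes,
# not taken from this patch): for a complete tensor of shape [4, 8] with
# dims_mapping [0, -1] on a 2 x 2 mesh over process_group [0, 1, 2, 3], every
# rank owns a [2, 8] slice, and
#   _compute_partition_index(3, [4, 8], [0, -1], [2, 2], [0, 1, 2, 3])
# evaluates to [[2, 4], [0, 8]], i.e. rank 3 holds rows 2:4 and all columns.
# find_op_desc_seq then chains the descriptors above (send/recv or allgather,
# then concat, then slice) into a per-process plan that rebuilds whatever
# slice the consuming op's dist_attr requires.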
+ + +def _compute_partition_shape(complete_shape, dims_mapping, process_shape): + """Compute the shape of partition.""" + partition_shape = [] + for idx, item in enumerate(complete_shape): + if dims_mapping[idx] == -1: + partition_shape.append(item) + else: + partition_shape.append(item // process_shape[dims_mapping[idx]]) + + return partition_shape + + +def _compute_process_index(process, process_group, process_shape): + """Compute the index of process_shape corresponding to the process.""" + relative_process = process_group.index(process) + process_index = [] + product = reduce(lambda x, y: x * y, process_shape) + + for i in range(len(process_shape)): + idx = relative_process // (product // process_shape[i]) + product = product // process_shape[i] + relative_process = relative_process - relative_process // product * product + process_index.append(idx) + + return process_index + + +def _compute_partition_index(process, complete_shape, dims_mapping, + process_shape, process_group): + """Compute the partition index in complete tensor.""" + partition_shape = _compute_partition_shape(complete_shape, dims_mapping, + process_shape) + process_index = _compute_process_index(process, process_group, + process_shape) + partition_index = [] + + for i in range(len(complete_shape)): + if dims_mapping[i] == -1: + partition_index.append([0, partition_shape[i]]) + else: + partition_index.append([ + process_index[dims_mapping[i]] * partition_shape[i], + (process_index[dims_mapping[i]] + 1) * partition_shape[i] + ]) + + return partition_index + + +def _compute_concat_info(partition_index_x, partition_index_y): + """Judge whether two partition can be concatenated and compute concatenated partition index.""" + differ_count = 0 + concat_axis = -1 + first_order = 0 + new_partition = [] + + for idx, item in enumerate(partition_index_x): + if item != partition_index_y[idx]: + differ_count += 1 + if item[1] == partition_index_y[idx][0] and item[ + 0] < partition_index_y[idx][1]: + concat_axis = idx + new_partition.append([item[0], partition_index_y[idx][1]]) + elif item[0] == partition_index_y[idx][1] and item[ + 1] > partition_index_y[idx][0]: + first_order = 1 + concat_axis = idx + new_partition.append([partition_index_y[idx][0], item[1]]) + else: + new_partition.append(item) + + if differ_count == 1: + return concat_axis, first_order, new_partition + else: + return -1, first_order, new_partition + + +def _concat_partitions(partition_index_list, partition_index): + """Concat the given partitions without inserting concat op.""" + if not partition_index_list: + partition_index_list.append(partition_index) + else: + i = 0 + has_concat = False + while i < len(partition_index_list): + concat_axis, _, new_partition = _compute_concat_info( + partition_index_list[i], partition_index) + if concat_axis != -1: + has_concat = True + partition_index_list.pop(i) + _concat_partitions(partition_index_list, new_partition) + break + i += 1 + if not has_concat: + partition_index_list.append(partition_index) + + +def _is_overlapped(shape_x, shape_y): + """Judge whether two partitions intersect on the specified dimension.""" + overlapped = False + if (shape_y[0] <= shape_x[0] < shape_y[1]) or ( + shape_x[0] <= shape_y[0] < shape_x[1]): + overlapped = True + return overlapped + + +def _need_reshard(tensor_dist_attr, op_dist_attr): + """Judge the tensor whether needs to be resharded.""" + is_reshard = False + tensor_dims_mapping = tensor_dist_attr.get_dims_mapping() + tensor_process_mesh = tensor_dist_attr.get_process_mesh() + 
op_input_dims_mapping = op_dist_attr.get_input_dims_mapping( + tensor_dist_attr.get_owner_tensor().name) + op_process_mesh = op_dist_attr.get_process_mesh() + if all( + map(lambda x: x is not None, [ + tensor_dims_mapping, tensor_process_mesh, op_input_dims_mapping, + op_process_mesh + ])): + if tensor_dims_mapping != op_input_dims_mapping or tensor_process_mesh._id != op_process_mesh._id: + is_reshard = True + return is_reshard + + +def _compute_complete_shape(slice_shape, process_shape, dims_mapping): + """compute the complete shape of the slice tensor with its process mesh and dims mapping""" + complete_shape = [] + for idx, item in enumerate(slice_shape): + if dims_mapping[idx] == -1: + complete_shape.append(item) + else: + complete_shape.append(item * process_shape[dims_mapping[idx]]) + return complete_shape + + +def find_op_desc_seq(source_tensor, tensor_dist_attr, op_dist_attr): + """ + Find the op description sequence to reshard the source tensor for matching the op requirement. + + Args: + source_tensor (Variable): A tensor with distributed attribute. + tensor_dist_attr (TensorDistributedAttribute): The distributed attribute of tensor. + op_dist_attr (OperatorDistributedAttribute): The distributed attribute of operator. + + Returns: + Dict, the dict represents the required op description sequence corresponding to process, The key of dict is + process and value is a list containing op description. + """ + source_dims_mapping = tensor_dist_attr.get_dims_mapping() + source_process_mesh = tensor_dist_attr.get_process_mesh() + source_process_group = source_process_mesh.process_group + source_process_shape = source_process_mesh.topology + + target_process_mesh = op_dist_attr.get_process_mesh() + target_dims_mapping = op_dist_attr.get_input_dims_mapping( + tensor_dist_attr.get_owner_tensor().name) + target_process_group = target_process_mesh.process_group + target_process_shape = target_process_mesh.topology + + complete_shape = _compute_complete_shape( + source_tensor.shape, source_process_shape, source_dims_mapping) + op_desc_seq = {} + + # TODO: if the target process group has the same process with source process group + if set(target_process_group).intersection(set( + source_process_group)) and set(target_process_group).difference( + set(source_process_group)): + pass + + # in the different process group, it will use send, recv, concat and slice op + elif target_process_group != source_process_group: + partition_process_mapping_list = [] + for source_process in source_process_group: + source_partition_index = _compute_partition_index(source_process, complete_shape, source_dims_mapping, \ + source_process_shape, source_process_group) + if not partition_process_mapping_list: + partition_process_mapping_list.append( + [source_partition_index, [source_process], [False]]) + else: + partition_list = list( + [item[0] for item in partition_process_mapping_list]) + process_list = list( + [item[1] for item in partition_process_mapping_list]) + has_used = list( + [item[2] for item in partition_process_mapping_list]) + if partition_list.count(source_partition_index) == 1: + index = partition_list.index(source_partition_index) + process_list[index].append(source_process) + has_used[index].append(False) + else: + partition_process_mapping_list.append( + [source_partition_index, [source_process], [False]]) + + for target_process in target_process_group: + has_sent = [] + target_partition_index = _compute_partition_index( + target_process, complete_shape, target_dims_mapping, + target_process_shape, 
target_process_group) + partition_index_list = [] + all_partition_index_list = [] + for source_process in source_process_group: + source_partition_index = _compute_partition_index( + source_process, complete_shape, source_dims_mapping, + source_process_shape, source_process_group) + to_send_process = None + if all(_ for _ in list(map(_is_overlapped, source_partition_index, target_partition_index))) \ + and source_partition_index not in has_sent: + idx = list([ + item[0] for item in partition_process_mapping_list + ]).index(source_partition_index) + has_used = list( + [item[2] + for item in partition_process_mapping_list])[idx] + process_list = list( + [item[1] + for item in partition_process_mapping_list])[idx] + i = 0 + while i < len(has_used): + if not has_used[i]: + to_send_process = process_list[i] + has_used[i] = True + break + i += 1 + if i == len(has_used): + has_used = list(map(lambda x: False, has_used)) + to_send_process = process_list[0] + has_used[0] = True + assert to_send_process is not None, "Failed to find the send process." + + if to_send_process not in op_desc_seq.keys(): + op_desc_seq[to_send_process] = [] + if target_process not in op_desc_seq.keys(): + op_desc_seq[target_process] = [] + all_partition_index_list.append(source_partition_index) + + # append send and recv op desc + send_op_desc = SendOpDesc(source_partition_index, + target_process) + recv_op_desc = RecvOpDesc(source_partition_index, + to_send_process) + op_desc_seq[to_send_process].append(send_op_desc) + op_desc_seq[target_process].append(recv_op_desc) + has_sent.append(source_partition_index) + _concat_partitions(partition_index_list, + source_partition_index) + + # append concat op desc + op_desc_seq[target_process].append( + ConcatOpDesc(all_partition_index_list)) + + # append slice op desc + slice_starts = [] + slice_ends = [] + slices_axes = [] + concatenated_partition_index = partition_index_list[0] + for idx, item in enumerate(concatenated_partition_index): + slice_starts.append(target_partition_index[idx][0] - item[0]) + slice_ends.append(target_partition_index[idx][1] - item[0]) + slices_axes.append(idx) + op_desc_seq[target_process].append( + SliceOpDesc(slice_starts, slice_ends, slices_axes)) + + # in the same process group, it will use allgahther and slice op + else: + partition_index_list = [] + all_partition_index_list = [] + process_index = [] + for source_process in source_process_group: + source_partition_index = _compute_partition_index( + source_process, complete_shape, source_dims_mapping, + source_process_shape, source_process_group) + if source_partition_index not in partition_index_list: + partition_index_list.append(source_partition_index) + process_index.append( + [[source_process, ], source_partition_index]) + else: + process_index[partition_index_list.index( + source_partition_index)][0].append(source_process) + + for i in range(len(process_index[0][0])): + group = [] + for j in range(len(process_index)): + group.append(process_index[j][0][i]) + if i == 0: + all_partition_index_list.append(process_index[j][1]) + for process in group: + # append slice op desc + slice_starts = [] + slice_ends = [] + slices_axes = [] + target_partition_index = _compute_partition_index( + process, complete_shape, target_dims_mapping, + target_process_shape, target_process_group) + for idx, item in enumerate(target_partition_index): + slice_starts.append(item[0]) + slice_ends.append(item[1]) + slices_axes.append(idx) + + slice_op_desc = SliceOpDesc( + starts=slice_starts, ends=slice_ends, 
axes=slices_axes) + op_desc_seq[process] = [AllGatherOpDesc(group=group), + ConcatOpDesc(partition_index_list=all_partition_index_list), slice_op_desc] \ + if len(group) > 1 else [slice_op_desc] + + return op_desc_seq + + +def _insert_send_op(block, idx, tensor, dst): + """Insert send op into block at the given index.""" + op_type = 'send_v2' + block._insert_op( + idx, + type=op_type, + inputs={'X': [tensor]}, + attrs={ + 'ring_id': 0, + 'peer': dst, + 'use_calc_stream': True, + }) + + +def _insert_recv_op(block, idx, tensor, src): + """Insert recv op into block at the given index.""" + op_type = 'recv_v2' + block._insert_op( + idx, + type=op_type, + inputs={'X': [tensor]}, + outputs={'Out': [tensor]}, + attrs={ + 'ring_id': 0, + 'peer': src, + 'out_shape': tensor.shape, + 'dtype': tensor.dtype, + 'use_calc_stream': True, + }) + + +def _insert_concat_op(block, idx, tensors, axis): + """Insert concat op into block at the given block.""" + inputs = {'X': tensors} + attrs = {} + attrs['axis'] = axis + helper = LayerHelper('concat', **locals()) + with paddle.static.program_guard(block.program): + out = helper.create_variable_for_type_inference( + dtype=helper.input_dtype()) + block._insert_op( + idx, type='concat', inputs=inputs, outputs={'Out': [out]}, attrs=attrs) + return out + + +def _insert_slice_op(block, idx, tensor, starts, ends, axes, new_var_name): + """Insert slice op into block at the given block.""" + inputs = {'Input': tensor} + infer_flags = list(1 for i in range(len(axes))) + attrs = { + "axes": axes, + "starts": starts, + "ends": ends, + "infer_flags": infer_flags + } + helper = LayerHelper('slice', **locals()) + out = block.create_var( + name=new_var_name, + dtype=tensor.dtype, + type=core.VarDesc.VarType.LOD_TENSOR) + block._insert_op( + idx, type="slice", inputs=inputs, outputs={'Out': [out]}, attrs=attrs) + return out + + +def _insert_split_op(block, idx, tensor, num_or_sections): + """Insert split op into block at the given index.""" + helper = LayerHelper('split', **locals()) + input_shape = tensor.shape + inputs = {'X': tensor} + attrs = {'num': num_or_sections, "axis": 0} + with paddle.static.program_guard(block.program): + outs = [ + helper.create_variable_for_type_inference( + dtype=helper.input_dtype()) for i in range(num_or_sections) + ] + block._insert_op( + idx, type="split", inputs=inputs, outputs={'Out': outs}, attrs=attrs) + return outs + + +def _insert_allgather_op(block, idx, tensor, ranks): + """Insert allgather op into block at the given index.""" + + def _insert_fill_constant_op(block, idx): + """Insert fill constant op into block at the given index.""" + helper = LayerHelper("fill_constant", **locals()) + with paddle.static.program_guard(block.program): + out = helper.create_variable_for_type_inference(dtype="int32") + inputs = {} + attrs = {'force_cpu': False} + attrs['str_value'] = str(int("1")) + attrs['value'] = int("1") + attrs['dtype'] = out.dtype + utils.get_shape_tensor_inputs( + inputs=inputs, attrs=attrs, shape=[0], op_type='fill_constant') + block._insert_op( + idx, + type='fill_constant', + inputs=inputs, + outputs={'Out': [out]}, + attrs=attrs) + out.stop_gradient = True + return out + + tensor_list = [] + group = new_process_group(ranks) + idx_offset = 0 + + # instant process group before insert allgather op. 
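# (The fill_constant / c_allreduce_sum / c_sync_calc_stream ops below run a
# trivial collective over the default ring when the new group has not been
# instantiated yet; this looks like a barrier-style warm-up so all ranks are
# in step before the first c_allgather on the new group.)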
+ if not group.is_instantiate(): + # insert fill_constant op + fill_constant_out = _insert_fill_constant_op(block, idx) + fill_constant_out.stop_gradient = True + + # insert c_allreduce_sum op + block._insert_op( + idx + 1, + type="c_allreduce_sum", + inputs={'X': [fill_constant_out]}, + outputs={'Out': [fill_constant_out]}, + attrs={'ring_id': 0, + 'use_calc_stream': True}) + + # insert c_sync_calc_stream op + block._insert_op( + idx + 2, + type="c_sync_calc_stream", + inputs={'X': [fill_constant_out]}, + outputs={'Out': [fill_constant_out]}) + idx_offset = 3 + + # insert c_allgather op + op_type = 'c_allgather' + helper = LayerHelper(op_type, **locals()) + with paddle.static.program_guard(block.program): + allgather_out = helper.create_variable_for_type_inference( + dtype=tensor.dtype) + block._insert_op( + idx + idx_offset, + type=op_type, + inputs={'X': [tensor]}, + outputs={'Out': [allgather_out]}, + attrs={ + 'ring_id': group.id, + 'use_calc_stream': True, + 'nranks': group._nranks + }) + idx_offset += 1 + + # insert split op + split_out = _insert_split_op(block, idx + idx_offset, allgather_out, + group._nranks) + idx_offset += 1 + tensor_list.extend(split_out) + return tensor_list, idx_offset + + +def _concat_partitions_with_op(partition_tensor_list, tensor, partition_index, + block, idx): + """Concat the tensors and insert concat op.""" + if not partition_tensor_list: + partition_tensor_list.append((tensor, partition_index)) + else: + i = 0 + has_concat = False + while i < len(partition_tensor_list): + concat_axis, first_order, new_partition = _compute_concat_info( + partition_tensor_list[i][1], partition_index) + if concat_axis != -1: + has_concat = True + _ = _insert_concat_op(block, idx[0], [partition_tensor_list[i][0], tensor], concat_axis) \ + if first_order == 0 else \ + _insert_concat_op(block, idx[0], [tensor, partition_tensor_list[i][0]], concat_axis) + partition_tensor_list.pop(i) + idx[0] += 1 + _concat_partitions_with_op(partition_tensor_list, _, + new_partition, block, idx) + break + i += 1 + if not has_concat: + partition_tensor_list.append((tensor, partition_index)) + + +def _init_comm_for_send_recv(): + if not PROCESS_GROUP_MAP["global_group"].is_instantiate(): + PROCESS_GROUP_MAP["global_group"].instantiate() + + +HAS_SENT = {} +HAS_RECV = {} +HAS_ALLGATHER = {} + + +def parse_op_desc(program, rank_id, op_desc_seq, var_name, reshard_op, + dist_context): + """Parse op desc sequence and insert op in the block""" + global HAS_SENT + global HAS_RECV + global HAS_ALLGATHER + tensor_list = [] + partition_tensor_list = [] + if rank_id not in op_desc_seq.keys(): + return + op_desc_list = op_desc_seq[rank_id] + block = program.global_block() + assert var_name in block.vars.keys( + ), "The {} cannot be found in the {} program.".format(var_name, rank_id) + + idx = None + for index, op in list(enumerate(block.ops)): + if op.desc.id == reshard_op.desc.id: + idx = index + break + assert idx is not None, "The op for reshard cannot be found in the rank {} program.".format( + rank_id) + + matched_op = block.ops[idx] + source_tensor = block.vars[var_name] + for op_desc in op_desc_list: + if isinstance(op_desc, AllGatherOpDesc): # noqa: F401 + if var_name not in HAS_ALLGATHER.keys(): + HAS_ALLGATHER[var_name] = [] + if not HAS_ALLGATHER[var_name] or op_desc.group not in list( + map(lambda x: x[0], HAS_ALLGATHER[var_name])): + tensor_list, idx_offset = _insert_allgather_op( + block, idx, source_tensor, op_desc.group) + idx += idx_offset + tensor_name_list = [var.name for var in 
tensor_list] + HAS_ALLGATHER[var_name].append( + [op_desc.group, tensor_name_list]) + else: + for item in HAS_ALLGATHER[var_name]: + if op_desc.group == item[0]: + tensor_list = [ + program.global_block().vars[var_name] + for var_name in item[1] + ] + break + assert tensor_list, "The result of parsing allgather op should not be None." + + elif isinstance(op_desc, SendOpDesc): + _init_comm_for_send_recv() + if var_name not in HAS_SENT.keys(): + HAS_SENT[var_name] = [] + if op_desc.dst not in HAS_SENT[var_name]: + _insert_send_op(block, idx, source_tensor, op_desc.dst) + idx += 1 + HAS_SENT[var_name].append(op_desc.dst) + + elif isinstance(op_desc, RecvOpDesc): + _init_comm_for_send_recv() + if var_name not in HAS_RECV.keys(): + HAS_RECV[var_name] = {} + if op_desc.src not in HAS_RECV[var_name].keys(): + partition_index = op_desc.partition_index + shape = [] + for index in partition_index: + shape.append(index[1] - index[0]) + recv_tensor = block.create_var( + name=unique_name.generate(var_name + "@recv"), + shape=shape, + dtype=source_tensor.dtype) + _insert_recv_op(block, idx, recv_tensor, op_desc.src) + tensor_list.append(recv_tensor) + idx += 1 + HAS_RECV[var_name][op_desc.src] = recv_tensor + else: + tensor_list.append(HAS_RECV[var_name][op_desc.src]) + + elif isinstance(op_desc, ConcatOpDesc): + partition_index_list = op_desc.partition_index_list + idx_list = [idx] + for index, tensor in enumerate(tensor_list): + _concat_partitions_with_op(partition_tensor_list, tensor, + partition_index_list[index], block, + idx_list) + idx = idx_list[0] + + elif isinstance(op_desc, SliceOpDesc): + assert len(partition_tensor_list) == 1 or not partition_tensor_list + to_slice_tensor = partition_tensor_list[0][0] if len( + partition_tensor_list) == 1 else source_tensor + new_name = unique_name.generate(var_name + "@RESHARD") + target_tensor = _insert_slice_op( + block, + idx, + to_slice_tensor, + starts=op_desc.starts, + ends=op_desc.ends, + axes=op_desc.axes, + new_var_name=new_name) + + tensor_attr = TensorDistributedAttribute(target_tensor, + dist_context) + process_mesh = dist_context.get_op_distributed_attr_for_program( + matched_op).get_process_mesh() + dims_mapping = dist_context.get_op_distributed_attr_for_program( + matched_op).get_input_dims_mapping(var_name) + tensor_attr.set_dims_mapping(dims_mapping) + tensor_attr.set_process_mesh(process_mesh) + dist_context.set_tensor_distributed_attr_for_program(target_tensor, + tensor_attr) + + # rename op input name according to new name + for op in block.ops: + for name in op.input_arg_names: + op_dist_attr = dist_context.get_op_distributed_attr_for_program( + op) + if name == var_name and op_dist_attr is not None: + op_process_mesh = op_dist_attr.get_process_mesh() + op_input_dims_mapping = op_dist_attr.get_input_dims_mapping( + var_name) + if op_process_mesh._id == process_mesh._id and op_input_dims_mapping == dims_mapping: + op.desc._rename_input(name, target_tensor.name) + op_dist_attr.set_input_dims_mapping( + target_tensor.name, dims_mapping) + op_dist_attr._dims_mapping.pop(name, None) + + +def _remove_no_need_ops(auto_parallel_main_prog, dist_context, rank_id): + """Remove no need ops in the main program""" + not_remove_op_ref = [ + "create_py_reader", "create_double_buffer_reader", "read" + ] + remove_op_idx = [] + block = auto_parallel_main_prog.global_block() + ops = block.ops + vars = block.vars + for idx, op in enumerate(ops): + # handle read op in the pipeline scene specially, it will be removed in the future. 
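# Re-attaching the read op's output dims to the matching create_py_reader's
# shape_concat attribute keeps the reader consistent with the shapes this
# rank's remaining pipeline stage actually consumes.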
+ if op.type == "read": + dim_list = [] + for var_name in op.output_arg_names: + dim_list.extend(vars[var_name].shape) + for i in range(idx, -1, -1): + if ops[i].type == "create_py_reader": + ops[i]._set_attr("shape_concat", dim_list) + break + continue + + # replace the input and output of c_sync_comm_stream op when in pipeline scene. + if op.type == "c_sync_comm_stream": + need_save = [] + for var_name in op.input_arg_names: + process_mesh = dist_context.get_tensor_distributed_attr_for_program( + vars[var_name]).get_process_mesh() + if rank_id in process_mesh.process_group: + need_save.append(var_name) + if not need_save: + remove_op_idx.append(idx) + continue + + proto = OpProtoHolder.instance().get_op_proto(op.type) + op.desc.set_input(proto.inputs[0].name, need_save) + op.desc.set_output(proto.outputs[0].name, need_save) + continue + + # judge the other op whether should be removed. + op_dist_attr = dist_context.get_op_distributed_attr_for_program(op) + if op_dist_attr is not None: + op_process_mesh = op_dist_attr.get_process_mesh() + if rank_id not in op_process_mesh.process_group and op.type not in not_remove_op_ref: + remove_op_idx.append(idx) + + for idx in remove_op_idx[::-1]: + block._remove_op(idx) + + +def _remove_no_need_vars(auto_parallel_main_prog): + """Remove no need vars in the main program""" + remove_vars = set() + block = auto_parallel_main_prog.global_block() + ops = block.ops + vars = block.vars + need_vars = set() + for op in ops: + for var_name in op.input_arg_names: + if var_name in vars: + need_vars.add(var_name) + for var_name in op.output_arg_names: + if var_name in vars: + need_vars.add(var_name) + for var in vars: + if var not in need_vars: + remove_vars.add(var) + for var in remove_vars: + block._remove_var(var) + + +def remove_no_need_in_main(auto_parallel_main_prog, dist_context, rank_id): + """Remove no need vars and ops in the main program.""" + _remove_no_need_ops(auto_parallel_main_prog, dist_context, rank_id) + _remove_no_need_vars(auto_parallel_main_prog) + + +def remove_no_need_in_startup(auto_parallel_main_prog, + auto_parallel_startup_prog): + """Remove no need vars and ops in the startup program.""" + main_input_vars = set() + main_ops = auto_parallel_main_prog.global_block().ops + for op in main_ops: + for var_name in op.input_arg_names: + main_input_vars.add(var_name) + + startup_block = auto_parallel_startup_prog.global_block() + startup_output_vars = set() + startup_ops = startup_block.ops + for op in startup_ops: + # skip c_sync_comm_stream op + if op.type == "c_sync_comm_stream": + continue + for var_name in op.output_arg_names: + startup_output_vars.add(var_name) + + need_vars = set() + for var_name in startup_output_vars: + if var_name in main_input_vars: + need_vars.add(var_name) + + startup_ops = startup_block.ops + actual_need_vars = set() + for idx, op in enumerate(startup_ops): + is_need_op = False + if op.type == "c_sync_comm_stream": + continue + for var_name in op.output_arg_names: + if var_name in need_vars: + is_need_op = True + break + if is_need_op: + for var_name in op.output_arg_names: + actual_need_vars.add(var_name) + for var_name in op.input_arg_names: + actual_need_vars.add(var_name) + + remove_vars = set() + for var_name in startup_block.vars: + if var_name not in actual_need_vars: + remove_vars.add(var_name) + for var in remove_vars: + startup_block._remove_var(var) + + remove_op_idx = [] + vars = startup_block.vars + for idx, op in enumerate(startup_block.ops): + is_no_need_op = False + if op.type == 
"c_sync_comm_stream": + var_names = [] + for var_name in op.input_arg_names: + if var_name in vars: + var_names.append(var_name) + if not var_names: + remove_op_idx.append(idx) + else: + proto = OpProtoHolder.instance().get_op_proto(op.type) + op.desc.set_input(proto.inputs[0].name, var_names) + op.desc.set_output(proto.outputs[0].name, var_names) + continue + + for var_name in op.output_arg_names: + if var_name not in vars: + is_no_need_op = True + break + if is_no_need_op: + remove_op_idx.append(idx) + for idx in remove_op_idx[::-1]: + startup_block._remove_op(idx) + + +def reshard(auto_parallel_main_prog, auto_parallel_startup_prog, rank_id, + dist_context): + """ + Reshard tensor in the program according to its dist attr and corresponding op dist attr. + + Args: + auto_parallel_main_prog (Program): An auto parallel main program. + auto_parallel_startup_prog (Program): An auto parallel startup program. + rank_id (int): The process id. + """ + assert isinstance(auto_parallel_main_prog, Program), "The type of auto_parallel_main_prog should be Program, " \ + "but got {}.".format(type(auto_parallel_main_prog)) + assert isinstance(auto_parallel_main_prog, Program), "The type of auto_parallel_startup_prog should be Program, " \ + "but got {}.".format(type(auto_parallel_startup_prog)) + assert isinstance(rank_id, int), "The type of rank_id should be int, " \ + "but got {}.".format(type(rank_id)) + assert isinstance(dist_context, DistributedContext), "The type of dist_context should be DistributedContext, " \ + "but got {}.".format(type(dist_context)) + + block = auto_parallel_main_prog.global_block() + idx = 0 + while idx < len(block.ops): + pre_op_count = len(block.ops) + op = block.ops[idx] + op_dist_attr = dist_context.get_op_distributed_attr_for_program(op) + if op_dist_attr is not None: + idx_offset = 0 + for var_name in op.input_arg_names: + # skip lod_tensor_blocking_queue_0 + if var_name == "lod_tensor_blocking_queue_0": + continue + var = block.vars[var_name] + tensor_dist_attr = dist_context.get_tensor_distributed_attr_for_program( + var) + if tensor_dist_attr is not None and _need_reshard( + tensor_dist_attr, op_dist_attr): + reshard_op_desc = find_op_desc_seq(var, tensor_dist_attr, + op_dist_attr) + parse_op_desc(auto_parallel_main_prog, rank_id, + reshard_op_desc, var_name, op, dist_context) + cur_op_count = len(block.ops) + idx_offset = idx_offset + cur_op_count - pre_op_count + pre_op_count = cur_op_count + idx = idx + idx_offset + 1 + else: + idx += 1 + + # remove no need vars and ops in the main program + remove_no_need_in_main(auto_parallel_main_prog, dist_context, rank_id) + + # remove no need vars and ops in the startip program + remove_no_need_in_startup(auto_parallel_main_prog, + auto_parallel_startup_prog) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 61a43aeb44e848..0c2731bc45258d 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -86,6 +86,10 @@ list(APPEND MIXED_DIST_TEST_OPS test_fleet_auto) list(APPEND MIXED_DIST_TEST_OPS test_fleet_static_mp_layers) list(APPEND MIXED_DIST_TEST_OPS test_auto_parallel_partitioner) list(APPEND MIXED_DIST_TEST_OPS test_auto_parallel_partitioner_gpt) +list(APPEND MIXED_DIST_TEST_OPS test_auto_parallel_reshard) +list(APPEND MIXED_DIST_TEST_OPS test_auto_parallel_reshard_serial) +list(APPEND MIXED_DIST_TEST_OPS test_auto_parallel_reshard_mppp) +list(APPEND MIXED_DIST_TEST_OPS 
test_auto_parallel_reshard_dpmppp) foreach(TEST_OP ${MIXED_DIST_TEST_OPS}) list(REMOVE_ITEM TEST_OPS ${TEST_OP}) endforeach() @@ -225,6 +229,10 @@ if ((NOT WITH_GPU) AND (NOT WITH_ROCM)) LIST(REMOVE_ITEM TEST_OPS test_parallel_margin_cross_entropy) LIST(REMOVE_ITEM TEST_OPS test_auto_parallel_partitioner) LIST(REMOVE_ITEM TEST_OPS test_auto_parallel_partitioner_gpt) + LIST(REMOVE_ITEM TEST_OPS test_auto_parallel_reshard) + LIST(REMOVE_ITEM TEST_OPS test_auto_parallel_reshard_serial) + LIST(REMOVE_ITEM TEST_OPS test_auto_parallel_reshard_mppp) + LIST(REMOVE_ITEM TEST_OPS test_auto_parallel_reshard_dpmppp) elseif(WITH_GPU) if (${CUDNN_VERSION} VERSION_LESS 7100) LIST(REMOVE_ITEM TEST_OPS test_conv2d_fusion_op) @@ -589,6 +597,10 @@ if(WITH_DISTRIBUTE) py_test_modules(test_fleet_lamb_meta_optimizer MODULES test_fleet_lamb_meta_optimizer ENVS ${dist_ENVS}) py_test_modules(test_auto_parallel_partitioner MODULES test_auto_parallel_partitioner ENVS ${dist_ENVS}) py_test_modules(test_auto_parallel_partitioner_gpt MODULES test_auto_parallel_partitioner_gpt ENVS ${dist_ENVS}) + py_test_modules(test_auto_parallel_reshard MODULES test_auto_parallel_reshard ENVS ${dist_ENVS}) + py_test_modules(test_auto_parallel_reshard_serial MODULES test_auto_parallel_reshard_serial ENVS ${dist_ENVS}) + py_test_modules(test_auto_parallel_reshard_mppp MODULES test_auto_parallel_reshard_mppp ENVS ${dist_ENVS}) + py_test_modules(test_auto_parallel_reshard_dpmppp MODULES test_auto_parallel_reshard_dpmppp ENVS ${dist_ENVS}) endif(NOT WIN32) endif(NOT APPLE) if(WITH_DGC) diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard.py new file mode 100644 index 00000000000000..89e9b7e817f457 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard.py @@ -0,0 +1,287 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
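# These tests build a two-rank MLP, run annotation, partitioning, backward
# completion and reshard, then inspect the resulting programs: the pipeline
# case should contain paired send_v2/recv_v2 ops and rank-specific parameter
# initialization, while the pure data-parallel case should only broadcast
# parameters and insert no send/recv.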
+ +from __future__ import print_function + +import unittest + +import paddle +import paddle.nn as nn +import paddle.static as static +import paddle.nn.functional as F +import paddle.utils as utils +import paddle.distributed.auto_parallel as auto +from paddle.distributed.auto_parallel.context import DistributedContext +from paddle.distributed import fleet +from paddle.distributed.auto_parallel.partitioner import Partitioner +from paddle.distributed.auto_parallel.completion import complete_backward_annotation +from paddle.distributed.auto_parallel.reshard import reshard + +paddle.enable_static() +_global_parallel_strategy = None +_global_process_mesh = None +ROOT_MESH = auto.ProcessMesh([0, 1]) +PP_MESH_0 = None +PP_MESH_1 = None + + +class MLPLayer(nn.Layer): + def __init__(self, + hidden_size=1024, + intermediate_size=4 * 1024, + initializer_range=0.02): + super(MLPLayer, self).__init__() + d_model = hidden_size + dim_feedforward = intermediate_size + weight_attr = paddle.ParamAttr(initializer=nn.initializer.Normal( + mean=0.0, std=initializer_range)) + bias_attr = None + + self.linear0 = nn.Linear( + d_model, dim_feedforward, weight_attr, bias_attr=bias_attr) + self.linear1 = nn.Linear( + dim_feedforward, d_model, weight_attr, bias_attr=bias_attr) + self.norm = nn.LayerNorm(d_model, epsilon=1e-5) + + def forward(self, input): + if _global_parallel_strategy == "pp": + auto.shard_tensor( + self.linear0.weight, PP_MESH_0, dim_mapping=[-1, -1]) + auto.shard_tensor( + self.linear1.weight, PP_MESH_1, dim_mapping=[-1, -1]) + else: + auto.shard_tensor( + self.linear0.weight, _global_process_mesh, + dim_mapping=[-1, -1]) + auto.shard_tensor( + self.linear1.weight, _global_process_mesh, + dim_mapping=[-1, -1]) + + out = self.norm(input) + out = self.linear0(out) + out = F.gelu(out, approximate=True) + out = self.linear1(out) + + return out + + +def mlp_forward(train_program, start_program): + with static.program_guard(train_program, + start_program), utils.unique_name.guard(): + batch_size = 4 + hidden_size = 1024 + sequence_len = 512 + input = static.data( + name="input", shape=[batch_size, hidden_size], dtype='float32') + label = static.data( + name="label", shape=[batch_size, 1], dtype='float32') + + if _global_parallel_strategy == "pp": + auto.shard_tensor(input, PP_MESH_0, dim_mapping=[-1, -1]) + auto.shard_tensor(label, PP_MESH_1, dim_mapping=[-1, -1]) + elif _global_parallel_strategy == "dp": + auto.shard_tensor(input, _global_process_mesh, dim_mapping=[0, -1]) + else: + auto.shard_tensor(input, _global_process_mesh, dim_mapping=[-1, -1]) + + mlp = MLPLayer( + hidden_size=hidden_size, + intermediate_size=4 * hidden_size, + initializer_range=0.02) + + predict = mlp(input) + error_cost = paddle.nn.functional.square_error_cost(predict, label) + loss = paddle.mean(error_cost) + + return loss, train_program, start_program + + +def get_dist_prog(train_program, startup_program, dist_context, rank_id): + global _global_process_mesh + dist_context.set_process_mesh(_global_process_mesh) + loss, train_program, startup_program = mlp_forward(train_program, + startup_program) + + # auto completion + complete_train_program = auto.complete_annotation(train_program, + dist_context) + + dist_strategy = fleet.DistributedStrategy() + partitioner = Partitioner(dist_strategy, dist_context, rank_id) + # logical partition + auto_parallel_main_prog, auto_parallel_startup_prog = partitioner.transpile_forward( + complete_train_program, startup_program) + dist_params_grads = partitioner.apply_backward( + loss, 
complete_train_program, startup_program, auto_parallel_main_prog, + auto_parallel_startup_prog) + optimizer = paddle.fluid.optimizer.AdamOptimizer() + opt_ops = partitioner.apply_optimize(optimizer, dist_params_grads, + auto_parallel_main_prog, + auto_parallel_startup_prog) + return auto_parallel_main_prog, auto_parallel_startup_prog + + +def check_backward_dist_attr(dist_context, dist_main_prog, op_need_check): + has_dist_attr = True + vars = dist_main_prog.global_block().vars + + op_dist_attr = dist_context.get_op_distributed_attr_for_program( + op_need_check) + if not op_dist_attr or not op_dist_attr.get_process_mesh(): + has_dist_attr = False + + for var_name in op_need_check.input_arg_names: + if not op_dist_attr.get_input_dims_mapping(var_name) or \ + not dist_context.get_tensor_distributed_attr_for_program(vars[var_name]).get_dims_mapping() or \ + not dist_context.get_tensor_distributed_attr_for_program(vars[var_name]).get_process_mesh(): + has_dist_attr = False + break + + if has_dist_attr: + for var_name in op_need_check.output_arg_names: + if not dist_context.get_tensor_distributed_attr_for_program(vars[var_name]).get_dims_mapping() or \ + not dist_context.get_tensor_distributed_attr_for_program(vars[var_name]).get_process_mesh(): + has_dist_attr = False + break + + return has_dist_attr + + +def check_send_recv_result(dist_main_prog, rank_id): + send_result = False + recv_result = False + ops = dist_main_prog.global_block().ops + if rank_id == 0: + for idx, op in enumerate(ops): + if op.type == "send_v2" and "gelu_0.tmp_0" in op.input_arg_names: + send_result = True + if op.type == "recv_v2" and "gelu_0.tmp_0@GRAD" in op.output_arg_names[ + 0]: + recv_result = True + else: + for idx, op in enumerate(ops): + if op.type == "send_v2" and "gelu_0.tmp_0@GRAD" in op.input_arg_names: + send_result = True + if op.type == "recv_v2" and "gelu_0.tmp_0" in op.output_arg_names[ + 0]: + recv_result = True + + return send_result and recv_result + + +def check_initialization(dist_startup_prog, rank_id): + if rank_id == 0: + need_check_params = [ + "layer_norm_0.b_0", "layer_norm_0.w_0", "linear_0.w_0", + "linear_0.b_0" + ] + else: + need_check_params = ['linear_1.w_0', 'linear_1.b_0'] + + params = [] + for var_name, var in dist_startup_prog.global_block().vars.items(): + if var.is_parameter: + params.append(var_name) + + return params == need_check_params + + +def check_initialization_for_dp(dist_startup_prog): + need_check_params = [ + "layer_norm_0.b_0", "layer_norm_0.w_0", "linear_0.w_0", "linear_0.b_0" + ] + ['linear_1.w_0', 'linear_1.b_0'] + params = [] + for var_name, var in dist_startup_prog.global_block().vars.items(): + if var.is_parameter: + params.append(var_name) + broadcast_varnames = [] + for op in dist_startup_prog.global_block().ops: + if op.type == "c_broadcast": + broadcast_varnames.append(op.output_arg_names[0]) + + return params == need_check_params == broadcast_varnames + + +class TestMLPReshard(unittest.TestCase): + def test_complete_backward_annotation(self): + global _global_process_mesh + _global_process_mesh = auto.ProcessMesh(mesh=[0, 1], parent=ROOT_MESH) + + train_program = paddle.static.Program() + startup_program = paddle.static.Program() + dist_context = DistributedContext() + rank_id = 0 + dist_main_prog, dist_startup_prog = get_dist_prog( + train_program, startup_program, dist_context, 0) + complete_backward_annotation(dist_main_prog, dist_context) + + op_need_check = None + for op in dist_main_prog.global_block().ops: + if op.type == "gelu_grad": + 
op_need_check = op + break + + # grad op should have dist attr + self.assertTrue( + check_backward_dist_attr(dist_context, dist_main_prog, + op_need_check)) + + def test_mlp_pp(self): + global _global_parallel_strategy + _global_parallel_strategy = "pp" + global _global_process_mesh + _global_process_mesh = auto.ProcessMesh(mesh=[0, 1], parent=ROOT_MESH) + global PP_MESH_0 + PP_MESH_0 = auto.ProcessMesh(mesh=[0], parent=ROOT_MESH) + global PP_MESH_1 + PP_MESH_1 = auto.ProcessMesh(mesh=[1], parent=ROOT_MESH) + + train_program = paddle.static.Program() + startup_program = paddle.static.Program() + dist_context = DistributedContext() + rank_id = 1 + dist_main_prog, dist_startup_prog = get_dist_prog( + train_program, startup_program, dist_context, rank_id) + complete_backward_annotation(dist_main_prog, dist_context) + reshard(dist_main_prog, dist_startup_prog, rank_id, dist_context) + + # check send and recv result + self.assertTrue(check_send_recv_result(dist_main_prog, rank_id)) + + # parameter initialization of every rank should be different in the pipeline scene + self.assertTrue(check_initialization(dist_startup_prog, rank_id)) + + def test_mlp_dp(self): + global _global_parallel_strategy + _global_parallel_strategy = "dp" + global _global_process_mesh + _global_process_mesh = auto.ProcessMesh(mesh=[0, 1], parent=ROOT_MESH) + + train_program = paddle.static.Program() + startup_program = paddle.static.Program() + dist_context = DistributedContext() + rank_id = 0 + dist_main_prog, dist_startup_prog = get_dist_prog( + train_program, startup_program, dist_context, rank_id) + complete_backward_annotation(dist_main_prog, dist_context) + reshard(dist_main_prog, dist_startup_prog, rank_id, dist_context) + # send and recv should not exist in dp scene. + self.assertFalse(check_send_recv_result(dist_main_prog, rank_id)) + + # all parameters should be initialized in dp scene + self.assertTrue(check_initialization_for_dp(dist_startup_prog)) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_dpmppp.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_dpmppp.py new file mode 100644 index 00000000000000..1e134eebfd23bb --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_dpmppp.py @@ -0,0 +1,173 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
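+# NOTE (illustrative comment, not part of the original patch): ROOT_MESH below
+# is a 3-D mesh that appears to nest the parallel axes as
+# [pipeline][data][model]: PP_MESH_0/PP_MESH_1 slice off the two pipeline
+# stages, the batch is split along the first axis of each stage mesh (data
+# parallel), and the linear weights along the second (model parallel). A tiny
+# sketch of locating a rank's coordinates under that assumption:
+#
+#   import numpy as np
+#   mesh = np.array([[[0, 1], [4, 5]], [[2, 3], [6, 7]]])
+#   pp, dp, mp = (int(i[0]) for i in np.where(mesh == 6))
+#   # rank 6 -> pp=1 (second stage), dp=1, mp=0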
+ +from __future__ import print_function + +import unittest + +import paddle +import paddle.nn as nn +import paddle.static as static +import paddle.nn.functional as F +import paddle.utils as utils +import paddle.distributed.auto_parallel as auto +from paddle.distributed.auto_parallel.context import DistributedContext +from paddle.distributed import fleet +from paddle.distributed.auto_parallel.partitioner import Partitioner +from paddle.distributed.auto_parallel.completion import complete_backward_annotation +from paddle.distributed.auto_parallel.reshard import reshard + +paddle.enable_static() +_global_parallel_strategy = "dp_mp_pp" +ROOT_MESH = auto.ProcessMesh([[[0, 1], [4, 5]], [[2, 3], [6, 7]]]) +_global_process_mesh = auto.ProcessMesh( + [[[0, 1], [4, 5]], [[2, 3], [6, 7]]], parent=ROOT_MESH) +PP_MESH_0 = auto.ProcessMesh([[0, 1], [4, 5]], parent=ROOT_MESH) +PP_MESH_1 = auto.ProcessMesh([[2, 3], [6, 7]], parent=ROOT_MESH) + + +class MLPLayer(nn.Layer): + def __init__(self, + hidden_size=1024, + intermediate_size=4 * 1024, + initializer_range=0.02): + super(MLPLayer, self).__init__() + d_model = hidden_size + dim_feedforward = intermediate_size + weight_attr = paddle.ParamAttr(initializer=nn.initializer.Normal( + mean=0.0, std=initializer_range)) + bias_attr = None + + self.linear0 = nn.Linear( + d_model, dim_feedforward, weight_attr, bias_attr=bias_attr) + self.linear1 = nn.Linear( + dim_feedforward, d_model, weight_attr, bias_attr=bias_attr) + self.norm = nn.LayerNorm(d_model, epsilon=1e-5) + + def forward(self, input): + auto.shard_tensor(self.linear0.weight, PP_MESH_0, dim_mapping=[-1, 1]) + auto.shard_tensor(self.linear1.weight, PP_MESH_1, dim_mapping=[1, -1]) + + out = self.norm(input) + out = self.linear0(out) + out = F.gelu(out, approximate=True) + out = self.linear1(out) + + return out + + +def mlp_forward(train_program, start_program): + with static.program_guard(train_program, + start_program), utils.unique_name.guard(): + batch_size = 4 + hidden_size = 1024 + sequence_len = 512 + input = static.data( + name="input", shape=[batch_size, hidden_size], dtype='float32') + label = static.data( + name="label", shape=[batch_size, 1], dtype='float32') + + auto.shard_tensor(input, PP_MESH_0, dim_mapping=[0, -1]) + auto.shard_tensor(label, PP_MESH_1, dim_mapping=[0, -1]) + + mlp = MLPLayer( + hidden_size=hidden_size, + intermediate_size=4 * hidden_size, + initializer_range=0.02) + + predict = mlp(input) + error_cost = paddle.nn.functional.square_error_cost(predict, label) + loss = paddle.mean(error_cost) + + return loss, train_program, start_program + + +def get_dist_prog(train_program, startup_program, dist_context, rank_id): + global _global_process_mesh + dist_context.set_process_mesh(_global_process_mesh) + loss, train_program, startup_program = mlp_forward(train_program, + startup_program) + + # auto completion + complete_train_program = auto.complete_annotation(train_program, + dist_context) + + dist_strategy = fleet.DistributedStrategy() + partitioner = Partitioner(dist_strategy, dist_context, rank_id) + # logical partition + auto_parallel_main_prog, auto_parallel_startup_prog = partitioner.transpile_forward( + complete_train_program, startup_program) + dist_params_grads = partitioner.apply_backward( + loss, complete_train_program, startup_program, auto_parallel_main_prog, + auto_parallel_startup_prog) + optimizer = paddle.fluid.optimizer.AdamOptimizer() + opt_ops = partitioner.apply_optimize(optimizer, dist_params_grads, + auto_parallel_main_prog, + auto_parallel_startup_prog) 
+ return auto_parallel_main_prog, auto_parallel_startup_prog + + +def check_send_recv_result(dist_main_prog, rank_id): + send_result = False + recv_result = False + ops = dist_main_prog.global_block().ops + if rank_id in [0, 1, 4, 5]: + for idx, op in enumerate(ops): + if op.type == "send_v2" and "gelu_0.tmp_0" in op.input_arg_names: + send_result = True + if op.type == "recv_v2" and "gelu_0.tmp_0@GRAD" in op.output_arg_names[ + 0]: + recv_result = True + else: + for idx, op in enumerate(ops): + if op.type == "send_v2" and "gelu_0.tmp_0@GRAD" in op.input_arg_names: + send_result = True + if op.type == "recv_v2" and "gelu_0.tmp_0" in op.output_arg_names[ + 0]: + recv_result = True + + return send_result and recv_result + + +def check_initialization_for_dpmppp(dist_startup_prog): + broadcast_varnames = [] + for op in dist_startup_prog.global_block().ops: + if op.type == "c_broadcast": + broadcast_varnames.append(op.output_arg_names[0]) + result = len(broadcast_varnames) > 0 + return result + + +class TestMLPReshard(unittest.TestCase): + def test_mlp_dpmppp(self): + train_program = paddle.static.Program() + startup_program = paddle.static.Program() + dist_context = DistributedContext() + rank_id = 2 + dist_main_prog, dist_startup_prog = get_dist_prog( + train_program, startup_program, dist_context, rank_id) + print(dist_main_prog) + complete_backward_annotation(dist_main_prog, dist_context) + reshard(dist_main_prog, dist_startup_prog, rank_id, dist_context) + print(dist_main_prog) + print(dist_startup_prog) + # check send and recv result + self.assertTrue(check_send_recv_result(dist_main_prog, rank_id)) + + # check parameter initialization + self.assertTrue(check_initialization_for_dpmppp(dist_startup_prog)) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_mppp.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_mppp.py new file mode 100644 index 00000000000000..5a10a218345705 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_mppp.py @@ -0,0 +1,231 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
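+# NOTE (illustrative comment, not part of the original patch): besides the
+# mp_pp reshard checks, test_allgather below shards x (shape [4, 4]) along
+# dim 0 over a 2-rank mesh while the matmul consuming it is annotated as
+# replicated, so reshard is expected to insert an allgather whose output
+# ("x@RESHARD_0", shape (4, 4)) feeds the matmul. Conceptually, assuming each
+# rank holds a [2, 4] slice:
+#
+#   import numpy as np
+#   x = np.arange(16, dtype=np.float32).reshape(4, 4)
+#   shards = [x[0:2], x[2:4]]                        # per-rank local slices
+#   gathered = np.concatenate(shards, axis=0)        # what allgather restores
+#   assert np.array_equal(gathered, x)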
+ +from __future__ import print_function + +import unittest + +import paddle +import paddle.nn as nn +import paddle.static as static +import paddle.nn.functional as F +import paddle.utils as utils +import paddle.distributed.auto_parallel as auto +from paddle.distributed.auto_parallel.context import DistributedContext +from paddle.distributed import fleet +from paddle.distributed.auto_parallel.partitioner import Partitioner +from paddle.distributed.auto_parallel.completion import complete_backward_annotation +from paddle.distributed.auto_parallel.reshard import reshard + +paddle.enable_static() +_global_parallel_strategy = "mp_pp" +ROOT_MESH = auto.ProcessMesh([[0, 1], [2, 3]]) +_global_process_mesh = auto.ProcessMesh([[0, 1], [2, 3]], parent=ROOT_MESH) +PP_MESH_0 = auto.ProcessMesh([0, 1], parent=ROOT_MESH) +PP_MESH_1 = auto.ProcessMesh([2, 3], parent=ROOT_MESH) + + +class MLPLayer(nn.Layer): + def __init__(self, + hidden_size=1024, + intermediate_size=4 * 1024, + initializer_range=0.02): + super(MLPLayer, self).__init__() + d_model = hidden_size + dim_feedforward = intermediate_size + weight_attr = paddle.ParamAttr(initializer=nn.initializer.Normal( + mean=0.0, std=initializer_range)) + bias_attr = None + + self.word_embeddings = nn.Embedding( + hidden_size, + hidden_size, + weight_attr=paddle.ParamAttr( + name="word_embeddings", + initializer=nn.initializer.Normal( + mean=0.0, std=initializer_range))) + + self.linear0 = nn.Linear( + d_model, dim_feedforward, weight_attr, bias_attr=bias_attr) + self.linear1 = nn.Linear( + dim_feedforward, d_model, weight_attr, bias_attr=bias_attr) + self.linear2 = nn.Linear( + dim_feedforward, d_model, weight_attr, bias_attr=bias_attr) + + def forward(self, input): + auto.shard_tensor( + self.word_embeddings.weight, PP_MESH_0, dim_mapping=[0, -1]) + auto.shard_tensor(self.linear0.weight, PP_MESH_0, dim_mapping=[-1, 0]) + auto.shard_tensor(self.linear1.weight, PP_MESH_1, dim_mapping=[0, -1]) + auto.shard_tensor(self.linear2.weight, PP_MESH_1, dim_mapping=[0, -1]) + w_out = self.word_embeddings(input) + out = self.linear0(w_out) + gelu_out = F.gelu(out, approximate=True) + out = self.linear1(gelu_out) + out1 = self.linear2(gelu_out) + out = out + out1 + + return out + + +def mlp_forward(train_program, start_program): + with static.program_guard(train_program, + start_program), utils.unique_name.guard(): + batch_size = 4 + hidden_size = 1024 + sequence_len = 512 + input = static.data(name="input", shape=[batch_size], dtype='int32') + label = static.data( + name="label", shape=[batch_size, 1], dtype='float32') + + auto.shard_tensor(input, PP_MESH_0, dim_mapping=[-1]) + auto.shard_tensor(label, PP_MESH_1, dim_mapping=[-1, -1]) + + mlp = MLPLayer( + hidden_size=hidden_size, + intermediate_size=4 * hidden_size, + initializer_range=0.02) + + predict = mlp(input) + error_cost = paddle.nn.functional.square_error_cost(predict, label) + loss = paddle.mean(error_cost) + + return loss, train_program, start_program + + +def get_dist_prog(train_program, startup_program, dist_context, rank_id): + global _global_process_mesh + dist_context.set_process_mesh(_global_process_mesh) + loss, train_program, startup_program = mlp_forward(train_program, + startup_program) + + # auto completion + complete_train_program = auto.complete_annotation(train_program, + dist_context) + + dist_strategy = fleet.DistributedStrategy() + partitioner = Partitioner(dist_strategy, dist_context, rank_id) + # logical partition + auto_parallel_main_prog, auto_parallel_startup_prog = 
partitioner.transpile_forward( + complete_train_program, startup_program) + dist_params_grads = partitioner.apply_backward( + loss, complete_train_program, startup_program, auto_parallel_main_prog, + auto_parallel_startup_prog) + optimizer = paddle.fluid.optimizer.AdamOptimizer() + opt_ops = partitioner.apply_optimize(optimizer, dist_params_grads, + auto_parallel_main_prog, + auto_parallel_startup_prog) + return auto_parallel_main_prog, auto_parallel_startup_prog + + +def check_send_recv_result(dist_main_prog, rank_id): + send_result = False + recv_result = False + ops = dist_main_prog.global_block().ops + if rank_id in [0, 1]: + for idx, op in enumerate(ops): + if op.type == "send_v2" and "gelu_0.tmp_0" in op.input_arg_names: + send_result = True + if op.type == "recv_v2" and "gelu_0.tmp_0@GRAD" in op.output_arg_names[ + 0]: + recv_result = True + else: + for idx, op in enumerate(ops): + if op.type == "send_v2" and "gelu_0.tmp_0@GRAD" in op.input_arg_names[ + 0]: + send_result = True + if op.type == "recv_v2" and "gelu_0.tmp_0" in op.output_arg_names[ + 0]: + recv_result = True + + return send_result and recv_result + + +def check_initialization_for_mppp(dist_startup_prog, rank_id): + if rank_id in [0, 1]: + need_check_params = [] + else: + need_check_params = ["linear_1.b_0", "linear_2.b_0"] + broadcast_varnames = [] + for op in dist_startup_prog.global_block().ops: + if op.type == "c_broadcast": + broadcast_varnames.append(op.output_arg_names[0]) + + return need_check_params == broadcast_varnames + + +def check_allgather(dist_main_program): + allgather_out = "x@RESHARD_0" + var_result = False + op_result = False + vars = dist_main_program.global_block().vars + if allgather_out in vars and vars[allgather_out].shape == (4, 4): + var_result = True + for op in dist_main_program.global_block().ops: + if op.type == "matmul_v2": + if allgather_out in op.input_arg_names: + op_result = True + return var_result and op_result + + +class TestMLPReshard(unittest.TestCase): + def test_mlp_mppp(self): + train_program = paddle.static.Program() + startup_program = paddle.static.Program() + dist_context = DistributedContext() + rank_id = 2 + dist_main_prog, dist_startup_prog = get_dist_prog( + train_program, startup_program, dist_context, rank_id) + complete_backward_annotation(dist_main_prog, dist_context) + reshard(dist_main_prog, dist_startup_prog, rank_id, dist_context) + + # check send and recv result + self.assertTrue(check_send_recv_result(dist_main_prog, rank_id)) + + # parameter which not been sliced should be the same in the mp scene + self.assertTrue( + check_initialization_for_mppp(dist_startup_prog, rank_id)) + + def test_allgather(self): + train_program = paddle.static.Program() + startup_program = paddle.static.Program() + process_mesh = auto.ProcessMesh(mesh=[0, 3], parent=ROOT_MESH) + with static.program_guard(train_program, startup_program): + x = paddle.static.data(name="x", shape=[4, 4], dtype='float32') + x = auto.shard_tensor(x, process_mesh, dim_mapping=[0, -1]) + + w = paddle.static.data(name="w", shape=[4, 4], dtype='float32') + w = auto.shard_tensor(w, process_mesh, dim_mapping=[-1, -1]) + + y = paddle.distributed.shard_op(paddle.matmul, process_mesh, { + x.name: [-1, -1], + w.name: [-1, -1] + }, **{"x": x, + "y": w})[0] + + rank_id = 0 + dist_context = DistributedContext() + dist_strategy = fleet.DistributedStrategy() + partitioner = Partitioner(dist_strategy, dist_context, rank_id) + complete_train_program = auto.complete_annotation(train_program, + dist_context) + 
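+        # x was annotated as sharded along dim 0, but the matmul consumes it
+        # as replicated, so the reshard pass below is expected to insert an
+        # allgather ("x@RESHARD_0" with full shape (4, 4)) in front of the
+        # matmul; check_allgather asserts exactly that.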
auto_parallel_main_prog, auto_parallel_startup_prog = partitioner.transpile_forward( + complete_train_program, startup_program) + reshard(auto_parallel_main_prog, startup_program, rank_id, dist_context) + # the x should not be slice + self.assertTrue(check_allgather(auto_parallel_main_prog)) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_serial.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_serial.py new file mode 100644 index 00000000000000..bf2ba9f061fd85 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_serial.py @@ -0,0 +1,184 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest + +import os +if os.getenv("CUDA_VISIBLE_DEVICES", None) is None: + os.environ["CUDA_VISIBLE_DEVICES"] = '0' + +import paddle +import paddle.nn as nn +import paddle.static as static +import paddle.nn.functional as F +import paddle.utils as utils +import paddle.distributed.auto_parallel as auto +from paddle.distributed.auto_parallel.context import get_default_distributed_context +from paddle.distributed import fleet +from paddle.distributed.auto_parallel.partitioner import Partitioner +from paddle.distributed.auto_parallel.reshard import reshard +from paddle.distributed.auto_parallel.process import new_process_group + +paddle.enable_static() +_global_parallel_strategy = None +_global_process_mesh = None +ROOT_MESH = auto.ProcessMesh([0]) + + +class MLPLayer(nn.Layer): + def __init__(self, + hidden_size=1024, + intermediate_size=4 * 1024, + initializer_range=0.02): + super(MLPLayer, self).__init__() + d_model = hidden_size + dim_feedforward = intermediate_size + weight_attr = paddle.ParamAttr(initializer=nn.initializer.Normal( + mean=0.0, std=initializer_range)) + bias_attr = None + + self.linear0 = nn.Linear( + d_model, dim_feedforward, weight_attr, bias_attr=bias_attr) + self.linear1 = nn.Linear( + dim_feedforward, d_model, weight_attr, bias_attr=bias_attr) + self.norm = nn.LayerNorm(d_model, epsilon=1e-5) + + def forward(self, input): + if _global_parallel_strategy == "pp": + auto.shard_tensor( + self.linear0.weight, PP_MESH_0, dim_mapping=[-1, -1]) + auto.shard_tensor( + self.linear1.weight, PP_MESH_1, dim_mapping=[-1, -1]) + else: + auto.shard_tensor( + self.linear0.weight, _global_process_mesh, + dim_mapping=[-1, -1]) + auto.shard_tensor( + self.linear1.weight, _global_process_mesh, + dim_mapping=[-1, -1]) + + out = self.norm(input) + out = self.linear0(out) + out = F.gelu(out, approximate=True) + out = self.linear1(out) + + return out + + +def mlp_forward(train_program, start_program): + with static.program_guard(train_program, + start_program), utils.unique_name.guard(): + batch_size = 4 + hidden_size = 1024 + sequence_len = 512 + input = static.data( + name="input", shape=[batch_size, hidden_size], dtype='float32') + label = static.data( + name="label", 
shape=[batch_size, 1], dtype='float32') + + if _global_parallel_strategy == "pp": + auto.shard_tensor(input, PP_MESH_0, dim_mapping=[-1, -1]) + auto.shard_tensor(label, PP_MESH_1, dim_mapping=[-1, -1]) + elif _global_parallel_strategy == "dp": + auto.shard_tensor(input, _global_process_mesh, dim_mapping=[0, -1]) + else: + auto.shard_tensor(input, _global_process_mesh, dim_mapping=[-1, -1]) + + mlp = MLPLayer( + hidden_size=hidden_size, + intermediate_size=4 * hidden_size, + initializer_range=0.02) + + predict = mlp(input) + error_cost = paddle.nn.functional.square_error_cost(predict, label) + loss = paddle.mean(error_cost) + + return loss, train_program, start_program + + +def get_dist_prog_with_parallelizer(train_program, startup_program, + dist_context): + global _global_process_mesh + + dist_strategy = fleet.DistributedStrategy() + dist_strategy.amp = False + dist_strategy.pipeline = False + dist_strategy.recompute = False + + # init parallel optimizer + dist_strategy.semi_auto = True + fleet.init(is_collective=True, strategy=dist_strategy) + + loss, train_program, startup_program = mlp_forward(train_program, + startup_program) + + optimizer = paddle.fluid.optimizer.AdamOptimizer( + learning_rate=0.00001, + beta1=0.9, + beta2=0.999, + epsilon=1e-08, + grad_clip=None) + optimizer = fleet.distributed_optimizer(optimizer) + + # fake a comm group + pg = new_process_group([3, 4]) + _, _, distributed_startup_program, distributed_main_program = optimizer.minimize( + loss, startup_program) + + return distributed_main_program, distributed_startup_program + + +def check_send_recv_result(dist_main_prog, rank_id): + send_result = False + recv_result = False + ops = dist_main_prog.global_block().ops + if rank_id == 0: + for idx, op in enumerate(ops): + if op.type == "send_v2" and "gelu_0.tmp_0" in op.input_arg_names: + send_result = True + if op.type == "recv_v2" and "gelu_0.tmp_0@GRAD" in op.output_arg_names[ + 0]: + recv_result = True + else: + for idx, op in enumerate(ops): + if op.type == "send_v2" and "gelu_0.tmp_0@GRAD" in op.input_arg_names: + send_result = True + if op.type == "recv_v2" and "gelu_0.tmp_0" in op.output_arg_names[ + 0]: + recv_result = True + + return send_result and recv_result + + +class TestMLPReshard(unittest.TestCase): + def test_mlp_serial(self): + global _global_parallel_strategy + _global_parallel_strategy = None + global _global_process_mesh + _global_process_mesh = auto.ProcessMesh(mesh=[0], parent=ROOT_MESH) + + train_program = paddle.static.Program() + startup_program = paddle.static.Program() + dist_context = get_default_distributed_context() + rank_id = 0 + dist_main_prog, dist_startup_prog = get_dist_prog_with_parallelizer( + train_program, startup_program, dist_context) + # send and recv should not exist in serial scene. + self.assertFalse(check_send_recv_result(dist_main_prog, rank_id)) + + +if __name__ == "__main__": + unittest.main() From 00245cfd2e5fe175a80d13a67b5c75e27930ce59 Mon Sep 17 00:00:00 2001 From: zlsh80826 Date: Mon, 11 Oct 2021 18:40:07 +0800 Subject: [PATCH 099/298] [Paddle-ASP] Revise 4d tensor sparsity mask pattern for conv2d sparsity (#36054) Sparse tensor core for convolution requires the input channel dimension is 2:4 structed sparse. 
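Illustrative aside (not part of the original commit message): "2:4 structured
sparse" means that in every group of 4 consecutive values along the sparse
dimension, at most 2 are nonzero. Below is a minimal NumPy sketch of masking a
hypothetical conv weight of layout (h, w, in, out) along its input-channel
dimension; the function name and the magnitude-based selection are assumptions
for illustration only, not the ASP implementation itself:

    import numpy as np

    def mask_2_of_4_along_axis(weight, axis):
        # Keep the 2 largest-magnitude values in every group of 4 consecutive
        # elements along `axis` and zero the rest (n=2, m=4 sparsity).
        w = np.moveaxis(weight, axis, -1)
        groups = w.reshape(-1, 4)
        smallest = np.argsort(np.abs(groups), axis=1)[:, :2]
        keep = np.ones_like(groups, dtype=bool)
        np.put_along_axis(keep, smallest, False, axis=1)
        masked = np.where(keep, groups, 0).reshape(w.shape)
        return np.moveaxis(masked, -1, axis)

    # Conv2D weight (h, w, in, out): the input-channel axis (2) is the one
    # that has to satisfy 2:4 sparsity for sparse tensor cores.
    w = np.random.randn(3, 3, 8, 16).astype(np.float32)
    w_sparse = mask_2_of_4_along_axis(w, axis=2)
    groups = w_sparse.transpose(0, 1, 3, 2).reshape(-1, 4)
    assert (np.count_nonzero(groups, axis=1) <= 2).all()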
So we have to mask the input channel dimension for using sparse tensor core --- python/paddle/fluid/contrib/sparsity/utils.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/python/paddle/fluid/contrib/sparsity/utils.py b/python/paddle/fluid/contrib/sparsity/utils.py index bb030cbac1beaf..a72ea4d9b85108 100644 --- a/python/paddle/fluid/contrib/sparsity/utils.py +++ b/python/paddle/fluid/contrib/sparsity/utils.py @@ -518,9 +518,13 @@ def create_mask(tensor, func_name=MaskAlgo.MASK_1D, n=2, m=4): t = t.reshape(shape[0], shape[1]) elif len(shape) == 3: t = t.reshape(shape[0] * shape[1], shape[2]) - # 4d-tensor conv (out, in, h, w) -> (out, in*h*w) in GemmConvKernel Op + # 4d-tensor conv (h, w, in, out) -> (h*w*out, in) in GemmConvKernel Op elif len(shape) == 4: - t = t.reshape(shape[0], shape[1] * shape[2] * shape[3]) + t = t.transpose([0, 1, 3, 2]).reshape(shape[0] * shape[1] * shape[3], + shape[2]) + mask = func(t, n=n, m=m) + return mask.reshape([shape[0], shape[1], shape[3], + shape[2]]).transpose([0, 1, 3, 2]).astype(dtype) else: raise ValueError("The dimension of input tensor is not supported in create_mask, " \ "Only dimension < 4 is supported but got {}".format(len(shape))) @@ -572,9 +576,10 @@ def check_sparsity(tensor, func_name=CheckMethod.CHECK_1D, n=2, m=4): t = t.reshape(shape[0], shape[1]) elif len(shape) == 3: t = t.reshape(shape[0] * shape[1], shape[2]) - # 4d-tensor conv (out, in, h, w) -> (out, in*h*w) in GemmConvKernel Op + # 4d-tensor conv (h, w, in, out) -> (h*w*out, in) in GemmConvKernel Op elif len(shape) == 4: - t = t.reshape(shape[0], shape[1] * shape[2] * shape[3]) + t = t.transpose([0, 1, 3, 2]).reshape( + [shape[0] * shape[1] * shape[3], shape[2]]) else: raise ValueError("The dimension of input tensor is not supported in create_mask, " \ "Only dimension < 4 is supported but got {}".format(len(shape))) From 1026052caa2dc18747790b002572c21970f6c6b5 Mon Sep 17 00:00:00 2001 From: Yuang Liu Date: Mon, 11 Oct 2021 19:01:49 +0800 Subject: [PATCH 100/298] fix_dp_grad_merge_with_grad_clip_by_global_norm (#36334) --- python/paddle/fluid/clip.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/clip.py b/python/paddle/fluid/clip.py index 5a9ea1a445e2da..4cca41b527bc2f 100644 --- a/python/paddle/fluid/clip.py +++ b/python/paddle/fluid/clip.py @@ -28,6 +28,7 @@ from .data_feeder import check_variable_and_dtype from .framework import in_dygraph_mode from .layer_helper import LayerHelper +from .framework import default_main_program __all__ = [ 'set_gradient_clip', 'ErrorClipByValue', 'ClipGradByValue', @@ -547,7 +548,12 @@ def _static_clip(self, params_grads): scale_input = (scale_var.astype('float16') if g.dtype == core.VarDesc.VarType.FP16 else scale_var) - p.block.append_op( + # NOTE(Yuang Liu): For pure dp with gradient merge, the p and g + # will be in different blocks with the gradient clip related ops. + # We need to handle the correct block, otherwise will encounter + # a 'NotFoundError' during compile time. 
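+            # default_main_program().current_block() is the block these clip
+            # ops are being appended into; under DP gradient merge it is not
+            # the same block as p.block (where the parameter itself lives), so
+            # appending to p.block would refer to vars that block cannot find.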
+ block = default_main_program().current_block() + block.append_op( type='elementwise_mul', inputs={'X': g, 'Y': scale_input}, From fc5415d66859712bfdf37c2e0d330d1aa5d52679 Mon Sep 17 00:00:00 2001 From: Sing_chan <51314274+betterpig@users.noreply.github.com> Date: Mon, 11 Oct 2021 19:18:40 +0800 Subject: [PATCH 101/298] change exit code of pip install dependencies to 5 (#36016) --- paddle/scripts/paddle_build.bat | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat index d675f4fdbdb617..c4528fdc75e233 100644 --- a/paddle/scripts/paddle_build.bat +++ b/paddle/scripts/paddle_build.bat @@ -89,7 +89,7 @@ if "%WITH_PYTHON%" == "ON" ( pip install -r %work_dir%\python\requirements.txt --user if !ERRORLEVEL! NEQ 0 ( echo pip install requirements.txt failed! - exit /b 7 + exit /b 5 ) ) @@ -309,7 +309,7 @@ if %GENERATOR% == "Ninja" ( pip install ninja if %errorlevel% NEQ 0 ( echo pip install ninja failed! - exit /b 7 + exit /b 5 ) ) @@ -627,7 +627,7 @@ git diff --name-only %BRANCH% | findstr /V "\.py" || set CI_SKIP_CPP_TEST=ON pip install -r %work_dir%\python\unittest_py\requirements.txt --user if %ERRORLEVEL% NEQ 0 ( echo pip install unittest requirements.txt failed! - exit /b 7 + exit /b 5 ) for /F %%# in ('wmic os get localdatetime^|findstr 20') do set start=%%# From eaeeb884f17d5c60f1faf4d1f26c63d14944af97 Mon Sep 17 00:00:00 2001 From: Sing_chan <51314274+betterpig@users.noreply.github.com> Date: Mon, 11 Oct 2021 19:18:51 +0800 Subject: [PATCH 102/298] fix bug of clear third_party cache every 10 days (#36332) --- paddle/scripts/paddle_build.bat | 29 +++++++++++------------------ 1 file changed, 11 insertions(+), 18 deletions(-) diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat index c4528fdc75e233..e6320d5bd154d4 100644 --- a/paddle/scripts/paddle_build.bat +++ b/paddle/scripts/paddle_build.bat @@ -138,6 +138,17 @@ if %day_now% NEQ %day_before% ( echo %day_now% > %cache_dir%\day.txt type %cache_dir%\day.txt rmdir %BUILD_DIR% /s/q + + : clear third party cache every once in a while + if %day_now% EQU 21 ( + rmdir %cache_dir%\third_party /s/q + ) + if %day_now% EQU 11 ( + rmdir %cache_dir%\third_party /s/q + ) + if %day_now% EQU 01 ( + rmdir %cache_dir%\third_party /s/q + ) goto :mkbuild ) @@ -333,24 +344,6 @@ rem set CLCACHE_OBJECT_CACHE_TIMEOUT_MS=1000000 rem clcache.exe -M 21474836480 rem ------set third_party cache dir------ -: clear third party cache every once in a while -for /F %%# in ('wmic os get localdatetime^|findstr 20') do set datetime=%%# -set day_now=%datetime:~6,2% -set day_before=-1 -set /p day_before=< %cache_dir%\day.txt -if %day_now% NEQ %day_before% ( - echo %day_now% > %cache_dir%\day.txt - type %cache_dir%\day.txt - if %day_now% EQU 21 ( - rmdir %cache_dir%\third_party /s/q - ) - if %day_now% EQU 11 ( - rmdir %cache_dir%\third_party /s/q - ) - if %day_now% EQU 01 ( - rmdir %cache_dir%\third_party /s/q - ) -) if "%WITH_TPCACHE%"=="OFF" ( set THIRD_PARTY_PATH=%work_dir:\=/%/%BUILD_DIR%/third_party From 830debc2da15fb42ca9a03f4d331e446248c643e Mon Sep 17 00:00:00 2001 From: niuliling123 <51102941+niuliling123@users.noreply.github.com> Date: Mon, 11 Oct 2021 19:29:07 +0800 Subject: [PATCH 103/298] Add functor_primitives.h for kernel primtive api (#36203) * Add functor_primitives.h for kernel primtive api * update * move namespace kps * subFunctor init_data * delete InvalidArgumentError --- .../kernel_primitives/functor_primitives.h | 230 ++++++++++++++++++ 
.../kernel_primitives/kernel_primitives.h | 1 + 2 files changed, 231 insertions(+) create mode 100644 paddle/fluid/operators/kernel_primitives/functor_primitives.h diff --git a/paddle/fluid/operators/kernel_primitives/functor_primitives.h b/paddle/fluid/operators/kernel_primitives/functor_primitives.h new file mode 100644 index 00000000000000..fcfcdc28b1f009 --- /dev/null +++ b/paddle/fluid/operators/kernel_primitives/functor_primitives.h @@ -0,0 +1,230 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +namespace paddle { +namespace operators { +namespace kernel_primitives { +namespace details { + +static __device__ __forceinline__ platform::float16 Exp(platform::float16 x) { + return ::Eigen::numext::exp(x); +} + +static __device__ __forceinline__ float Exp(float x) { return expf(x); } + +static __device__ __forceinline__ double Exp(double x) { return exp(x); } + +static __device__ __forceinline__ platform::float16 Log(platform::float16 x) { + return ::Eigen::numext::log(x); +} + +static __device__ __forceinline__ float Log(float x) { return logf(x); } + +static __device__ __forceinline__ double Log(double x) { return log(x); } + +} // namespace details + +/******************************** Unary Functor *******************************/ + +/** + * @brief Default unary exp functor + */ +template +struct ExpFunctor { + HOSTDEVICE inline ExpFunctor() {} + + HOSTDEVICE explicit inline ExpFunctor(int n) {} + + HOSTDEVICE inline Ty operator()(const Tx& x) const { + return static_cast(details::Exp(x)); + } +}; + +/** + * @brief Default unary identity functor + */ +template +struct IdentityFunctor { + HOSTDEVICE inline IdentityFunctor() {} + + HOSTDEVICE explicit inline IdentityFunctor(int n) {} + + HOSTDEVICE inline Ty operator()(const Tx& x) const { + return static_cast(x); + } +}; + +/** + * @brief Default unary div functor. Divide by a constant + */ +template +struct DivideFunctor { + HOSTDEVICE inline DivideFunctor() { n_inv = static_cast(1.0f); } + + HOSTDEVICE explicit inline DivideFunctor(int n) : n_inv((Tx)(1.0 / n)) {} + + HOSTDEVICE inline Ty operator()(const Tx& x) const { + return static_cast(x * n_inv); + } + + private: + Tx n_inv; +}; + +/** + * @brief Default unary square functor + */ +template +struct SquareFunctor { + HOSTDEVICE inline SquareFunctor() {} + + HOSTDEVICE explicit inline SquareFunctor(int n) {} + + HOSTDEVICE inline Ty operator()(const Tx& x) const { + return static_cast(x) * static_cast(x); + } +}; + +/****************************** Binary Functor ********************************/ + +/** + * @brief Default binary min functor + */ +template +struct MinFunctor { + inline T initial() { return static_cast(std::numeric_limits::max()); } + + __device__ __forceinline__ T operator()(const T& a, const T& b) const { + return (b < a) ? 
b : a; + } +}; + +/** + * @brief Default binary max functor + */ +template +struct MaxFunctor { + inline T initial() { + return static_cast(std::numeric_limits::lowest()); + } + + __device__ __forceinline__ T operator()(const T& a, const T& b) const { + return (b > a) ? b : a; + } +}; + +/** + * @brief Default binary add functor + */ +template +struct AddFunctor { + inline T initial() { return static_cast(0.0f); } + + __device__ __forceinline__ T operator()(const T& a, const T& b) const { + return b + a; + } +}; + +/** + * @brief Default binary add functor + */ +template +struct MulFunctor { + inline T initial() { return static_cast(1.0f); } + + __device__ __forceinline__ T operator()(const T& a, const T& b) const { + return b * a; + } +}; + +/** + * @brief Default binary logic or functor + */ +template +struct LogicalOrFunctor { + inline T initial() { return static_cast(false); } + + __device__ __forceinline__ T operator()(const T& a, const T& b) const { + return b || a; + } +}; + +/** + * @brief Default binary logic and functor + */ +template +struct LogicalAndFunctor { + inline T initial() { return static_cast(true); } + + __device__ __forceinline__ T operator()(const T& a, const T& b) const { + return b && a; + } +}; + +/** + * @brief Default binary sub functor + */ +template +struct SubFunctor { + inline T initial() { return static_cast(0.0f); } + + inline HOSTDEVICE T operator()(const T& a, const T& b) const { return a - b; } +}; + +/** + * @brief Default binary div functor + */ +template +struct DivFunctor { + inline T initial() { return static_cast(1.0f); } + + inline HOSTDEVICE T operator()(const T& a, const T& b) const { return a / b; } +}; + +template +struct DivFunctor::value>::type> { + inline T initial() { return static_cast(1.0f); } + + inline HOSTDEVICE T operator()(const T& a, const T& b) const { + // For int32/int64, need to check whether the divison is zero. + PADDLE_ENFORCE_NE(b, 0, + platform::errors::InvalidArgument( + "Integer division by zero encountered " + "in (floor) divide. Please check the input value.")); + return a / b; + } +}; + +/** + * @brief Default binary floor divide functor + */ +template +struct FloorDivFunctor { + inline T initial() { return static_cast(1.0f); } + + inline HOSTDEVICE T operator()(const T& a, const T& b) const { + PADDLE_ENFORCE_NE(b, 0, + platform::errors::InvalidArgument( + "Integer division by zero encountered " + "in (floor) divide. 
Please check the input value.")); + return static_cast(std::trunc(a / b)); + } +}; + +} // namespace kernel_primitives +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/kernel_primitives/kernel_primitives.h b/paddle/fluid/operators/kernel_primitives/kernel_primitives.h index 45ee4fd738174b..9a4f8bb026b9da 100644 --- a/paddle/fluid/operators/kernel_primitives/kernel_primitives.h +++ b/paddle/fluid/operators/kernel_primitives/kernel_primitives.h @@ -16,6 +16,7 @@ #include "paddle/fluid/operators/kernel_primitives/compute_primitives.h" #include "paddle/fluid/operators/kernel_primitives/datamover_primitives.h" +#include "paddle/fluid/operators/kernel_primitives/functor_primitives.h" #include "paddle/fluid/operators/kernel_primitives/helper_primitives.h" namespace paddle { From a679fcbb26f9f7abb5938d4c201ef5125cd5c580 Mon Sep 17 00:00:00 2001 From: Zhang Zheng <32410583+ZzSean@users.noreply.github.com> Date: Mon, 11 Oct 2021 19:33:43 +0800 Subject: [PATCH 104/298] Add more tests and fix bugs for cudnn_norm_conv_test and cudnn_bn_and_relu_test (#36314) --- .../operators/fused/cudnn_bn_add_relu_test.cc | 650 +++++++++++++++--- .../operators/fused/cudnn_norm_conv_test.cc | 71 +- 2 files changed, 599 insertions(+), 122 deletions(-) diff --git a/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc b/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc index 7229754cb8ed82..837bca6c2cf4e3 100644 --- a/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc +++ b/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc @@ -33,6 +33,8 @@ namespace op = paddle::operators; using Tensor = paddle::framework::Tensor; USE_OP(batch_norm); +USE_CUDA_ONLY_OP(fused_bn_add_activation); +USE_CUDA_ONLY_OP(fused_bn_add_activation_grad); template void InitRandomTensor(const std::vector &dims, @@ -40,7 +42,7 @@ void InitRandomTensor(const std::vector &dims, T *cpu_out_ptr = cpu_out->mutable_data(framework::make_ddim(dims), platform::CPUPlace()); std::default_random_engine random(0); - std::uniform_real_distribution dis(0.0, 1.0); + std::uniform_real_distribution dis(-1.0, 1.0); for (int i = 0; i < cpu_out->numel(); ++i) { cpu_out_ptr[i] = static_cast(dis(random)); } @@ -89,7 +91,7 @@ void CheckOutput(std::string name, const framework::Tensor &cpu_res, } } std::string error_type = is_relative_atol ? "relative" : "absolute"; - LOG(INFO) << "[" << name << "], The dims is [" << cpu_res.dims() + LOG(INFO) << "[" << name << "] The dims is [" << cpu_res.dims() << "], maximum " << error_type << " error is " << max_diff << ": " << cpu_res_ptr[index] << " vs " << cpu_base_ptr[index]; } @@ -121,13 +123,33 @@ void ComputeSumAndSquareSum(const framework::Tensor &cpu_x, } } -// get paddle batchnorm op results as baseline +template +void ComputeInplaceAdd(const framework::Tensor &cpu_x, + framework::Tensor *cpu_y) { + EXPECT_EQ(cpu_x.dims(), cpu_y->dims()); + + const T *cpu_x_ptr = cpu_x.data(); + T *cpu_y_ptr = cpu_y->data(); + for (int64_t i = 0; i < cpu_x.numel(); ++i) { + cpu_y_ptr[i] += cpu_x_ptr[i]; + } +} + +template +void ComputeInplaceRelu(framework::Tensor *cpu_x) { + T *cpu_x_ptr = cpu_x->data(); + for (int64_t i = 0; i < cpu_x->numel(); ++i) { + cpu_x_ptr[i] = + cpu_x_ptr[i] > static_cast(0) ? 
cpu_x_ptr[i] : static_cast(0); + } +} + void ComputeBatchNormForward(const platform::CUDADeviceContext &ctx, const Tensor &cpu_x, const Tensor &cpu_scale, const Tensor &cpu_bias, Tensor *cpu_mean, Tensor *cpu_var, Tensor *cpu_saved_mean, Tensor *cpu_saved_var, Tensor *cpu_y, - Tensor *cpu_reserve_space) { + Tensor *saved_reserve_space) { framework::Scope scope; auto *x = scope.Var("X")->GetMutable(); auto *scale = scope.Var("Scale")->GetMutable(); @@ -178,68 +200,258 @@ void ComputeBatchNormForward(const platform::CUDADeviceContext &ctx, TensorCopySync(*var, platform::CPUPlace(), cpu_var); TensorCopySync(*saved_mean, platform::CPUPlace(), cpu_saved_mean); TensorCopySync(*saved_var, platform::CPUPlace(), cpu_saved_var); - TensorCopySync(*reserve_space, platform::CPUPlace(), cpu_reserve_space); + // reserved_space will stay on GPU and used in grad op. + saved_reserve_space->ShareDataWith(*reserve_space); +} + +void ComputeFusedBNAddReluForward(const platform::CUDADeviceContext &ctx, + const Tensor &cpu_x, const Tensor &cpu_z, + const Tensor &cpu_scale, + const Tensor &cpu_bias, Tensor *cpu_mean, + Tensor *cpu_var, Tensor *cpu_saved_mean, + Tensor *cpu_saved_var, Tensor *cpu_y, + Tensor *saved_reserve_space) { + framework::Scope scope; + auto *x = scope.Var("X")->GetMutable(); + auto *z = scope.Var("Z")->GetMutable(); + auto *scale = scope.Var("Scale")->GetMutable(); + auto *bias = scope.Var("Bias")->GetMutable(); + auto *mean = scope.Var("Mean")->GetMutable(); + auto *var = scope.Var("Variance")->GetMutable(); + auto *y = scope.Var("Y")->GetMutable(); + auto *saved_mean = scope.Var("SavedMean")->GetMutable(); + auto *saved_var = + scope.Var("SavedVariance")->GetMutable(); + auto *reserve_space = + scope.Var("ReserveSpace")->GetMutable(); + + auto place = ctx.GetPlace(); + TensorCopySync(cpu_x, place, x); + TensorCopySync(cpu_z, place, z); + TensorCopySync(cpu_scale, place, scale); + TensorCopySync(cpu_bias, place, bias); + TensorCopySync(*cpu_mean, place, mean); + TensorCopySync(*cpu_var, place, var); + + int64_t channels = x->dims()[3]; + scale->Resize({channels}); + bias->Resize({channels}); + mean->Resize({channels}); + var->Resize({channels}); + + framework::AttributeMap attrs; + + auto op = framework::OpRegistry::CreateOp( + "fused_bn_add_activation", + {{"X", {"X"}}, {"Z", {"Z"}}, {"Scale", {"Scale"}}, {"Bias", {"Bias"}}}, + {{"Y", {"Y"}}, + {"MeanOut", {"Mean"}}, + {"VarianceOut", {"Variance"}}, + {"SavedMean", {"SavedMean"}}, + {"SavedVariance", {"SavedVariance"}}, + {"ReserveSpace", {"ReserveSpace"}}}, + attrs); + op->Run(scope, ctx.GetPlace()); + + TensorCopySync(*y, platform::CPUPlace(), cpu_y); + TensorCopySync(*mean, platform::CPUPlace(), cpu_mean); + TensorCopySync(*var, platform::CPUPlace(), cpu_var); + TensorCopySync(*saved_mean, platform::CPUPlace(), cpu_saved_mean); + TensorCopySync(*saved_var, platform::CPUPlace(), cpu_saved_var); + // reserved_space will stay on GPU and used in grad op. 
+ saved_reserve_space->ShareDataWith(*reserve_space); +} + +void ComputeFusedBNAddReluBackward( + const platform::CUDADeviceContext &ctx, const Tensor &cpu_dy, + const Tensor &cpu_x, const Tensor &cpu_scale, const Tensor &cpu_bias, + const Tensor &cpu_saved_mean, const Tensor &cpu_saved_var, + const Tensor &cpu_y, const Tensor &saved_reserve_space, Tensor *cpu_dx, + Tensor *cpu_dz, Tensor *cpu_dscale, Tensor *cpu_dbias) { + framework::Scope scope; + auto *x = scope.Var("X")->GetMutable(); + auto *y = scope.Var("Y")->GetMutable(); + auto *dy = scope.Var("Y@GRAD")->GetMutable(); + auto *scale = scope.Var("Scale")->GetMutable(); + auto *bias = scope.Var("Bias")->GetMutable(); + auto *saved_mean = scope.Var("SavedMean")->GetMutable(); + auto *saved_var = + scope.Var("SavedVariance")->GetMutable(); + auto *reserve_space = + scope.Var("ReserveSpace")->GetMutable(); + auto *dx = scope.Var("X@GRAD")->GetMutable(); + auto *dz = scope.Var("Z@GRAD")->GetMutable(); + auto *dscale = scope.Var("Scale@GRAD")->GetMutable(); + auto *dbias = scope.Var("Bias@GRAD")->GetMutable(); + + auto place = ctx.GetPlace(); + TensorCopySync(cpu_x, place, x); + TensorCopySync(cpu_y, place, y); + TensorCopySync(cpu_dy, place, dy); + TensorCopySync(cpu_scale, place, scale); + TensorCopySync(cpu_bias, place, bias); + TensorCopySync(cpu_saved_mean, place, saved_mean); + TensorCopySync(cpu_saved_var, place, saved_var); + reserve_space->ShareDataWith(saved_reserve_space); + + int64_t channels = x->dims()[3]; + scale->Resize({channels}); + bias->Resize({channels}); + saved_mean->Resize({channels}); + saved_var->Resize({channels}); + + framework::AttributeMap attrs; + float momentum = 0.9; + float epsilon = 1e-5; + std::string act_type = "relu"; + attrs.insert({"momentum", momentum}); + attrs.insert({"epsilon", epsilon}); + attrs.insert({"act_type", act_type}); + + auto op = framework::OpRegistry::CreateOp( + "fused_bn_add_activation_grad", {{"X", {"X"}}, + {"Y", {"Y"}}, + {"Y@GRAD", {"Y@GRAD"}}, + {"Scale", {"Scale"}}, + {"Bias", {"Bias"}}, + {"SavedMean", {"SavedMean"}}, + {"SavedVariance", {"SavedVariance"}}, + {"ReserveSpace", {"ReserveSpace"}}}, + {{"X@GRAD", {"X@GRAD"}}, + {"Z@GRAD", {"Z@GRAD"}}, + {"Scale@GRAD", {"Scale@GRAD"}}, + {"Bias@GRAD", {"Bias@GRAD"}}}, + attrs); + op->Run(scope, ctx.GetPlace()); + + TensorCopySync(*dx, platform::CPUPlace(), cpu_dx); + TensorCopySync(*dz, platform::CPUPlace(), cpu_dz); + TensorCopySync(*dscale, platform::CPUPlace(), cpu_dscale); + TensorCopySync(*dbias, platform::CPUPlace(), cpu_dbias); } template class CudnnBNAddReluTester { public: - CudnnBNAddReluTester(int batch_size, int height, int width, int channels) { + CudnnBNAddReluTester(int batch_size, int height, int width, int channels, + std::string act_type, bool fuse_add, bool has_shortcut) { batch_size_ = batch_size; height_ = height; width_ = width; channels_ = channels; ele_count_ = batch_size_ * height_ * width_; + act_type_ = act_type; + fuse_add_ = fuse_add; + has_shortcut_ = has_shortcut; SetUp(); } ~CudnnBNAddReluTester() {} void CheckForward(float diff, bool is_relative_atol = false) { + LOG(INFO) << "[CheckForward, diff=" << diff + << ", is_relative_atol=" << is_relative_atol + << "] act_type=" << act_type_ << ", fuse_add=" << fuse_add_ + << ", has_shortcut=" << has_shortcut_; platform::CUDADeviceContext *ctx = static_cast( platform::DeviceContextPool::Instance().Get( platform::CUDAPlace(0))); - framework::Tensor cpu_mean_base; - framework::Tensor cpu_var_base; - framework::Tensor cpu_saved_mean_base; - 
framework::Tensor cpu_saved_var_base; - framework::Tensor cpu_y_base; - framework::Tensor cpu_reserve_space_base; - BaselineForward(*ctx, &cpu_mean_base, &cpu_var_base, &cpu_saved_mean_base, - &cpu_saved_var_base, &cpu_y_base, &cpu_reserve_space_base); - - framework::Tensor cpu_mean; - framework::Tensor cpu_var; - framework::Tensor cpu_saved_mean; - framework::Tensor cpu_saved_var; - framework::Tensor cpu_y; - framework::Tensor cpu_bitmask; - FusedForward(*ctx, &cpu_mean, &cpu_var, &cpu_saved_mean, &cpu_saved_var, - &cpu_y, &cpu_bitmask); + auto select = [&](Tensor *in) { return has_shortcut_ ? in : nullptr; }; - CheckOutput("Mean", cpu_mean, cpu_mean_base, diff, is_relative_atol); - CheckOutput("Variance", cpu_var, cpu_var_base, diff, + framework::Tensor cpu_mean_base_x; + framework::Tensor cpu_var_base_x; + framework::Tensor cpu_mean_base_z; + framework::Tensor cpu_var_base_z; + if (!has_shortcut_ && fuse_add_ && (act_type_ == "relu")) { + BaselineForwardFusedBNAddRelu( + *ctx, &cpu_mean_base_x, &cpu_var_base_x, &cpu_saved_mean_base_x_, + &cpu_saved_var_base_x_, &cpu_y_base_, &saved_reserve_space_x_); + } else { + BaselineForward( + *ctx, &cpu_mean_base_x, &cpu_var_base_x, &cpu_saved_mean_base_x_, + &cpu_saved_var_base_x_, &cpu_y_base_, &saved_reserve_space_x_, + select(&cpu_mean_base_z), select(&cpu_var_base_z), + select(&cpu_saved_mean_base_z_), select(&cpu_saved_var_base_z_), + select(&saved_reserve_space_z_)); + } + + framework::Tensor cpu_mean_x; + framework::Tensor cpu_var_x; + framework::Tensor cpu_y; + framework::Tensor cpu_mean_z; + framework::Tensor cpu_var_z; + FusedForward(*ctx, &cpu_mean_x, &cpu_var_x, &cpu_saved_mean_x_, + &cpu_saved_var_x_, &cpu_y, &cpu_bitmask_, select(&cpu_mean_z), + select(&cpu_var_z), select(&cpu_saved_mean_z_), + select(&cpu_saved_var_z_)); + + CheckOutput("Mean", cpu_mean_x, cpu_mean_base_x, diff, + is_relative_atol); + CheckOutput("Variance", cpu_var_x, cpu_var_base_x, diff, is_relative_atol); - CheckOutput("SavedMean", cpu_saved_mean, cpu_saved_mean_base, diff, + CheckOutput("SavedMean", cpu_saved_mean_x_, cpu_saved_mean_base_x_, + diff, is_relative_atol); + CheckOutput("SavedVariance", cpu_saved_var_x_, cpu_saved_var_base_x_, + diff, is_relative_atol); + if (has_shortcut_) { + CheckOutput("MeanZ", cpu_mean_z, cpu_mean_base_z, diff, + is_relative_atol); + CheckOutput("VarianceZ", cpu_var_z, cpu_var_base_z, diff, + is_relative_atol); + CheckOutput("SavedMeanZ", cpu_saved_mean_z_, + cpu_saved_mean_base_z_, diff, is_relative_atol); + CheckOutput("SavedVarianceZ", cpu_saved_var_z_, + cpu_saved_var_base_z_, diff, is_relative_atol); + } + CheckOutput("Y", cpu_y, cpu_y_base_, diff, is_relative_atol); + } + + void CheckBackward(float diff, bool is_relative_atol = false) { + platform::CUDADeviceContext *ctx = + static_cast( + platform::DeviceContextPool::Instance().Get( + platform::CUDAPlace(0))); + + framework::Tensor cpu_dx_base; + framework::Tensor cpu_dz_base; + framework::Tensor cpu_dscale_base; + framework::Tensor cpu_dbias_base; + BaselineBackwardFusedBNAddRelu(*ctx, &cpu_dx_base, &cpu_dz_base, + &cpu_dscale_base, &cpu_dbias_base); + + framework::Tensor cpu_dx; + framework::Tensor cpu_dz; + framework::Tensor cpu_dscale; + framework::Tensor cpu_dbias; + FusedBackward(*ctx, &cpu_dx, &cpu_dz, &cpu_dscale, &cpu_dbias); + + CheckOutput("DX", cpu_dx, cpu_dx_base, diff, is_relative_atol); + CheckOutput("DZ", cpu_dz, cpu_dz_base, diff, is_relative_atol); + CheckOutput("DScale", cpu_dscale, cpu_dscale_base, diff, is_relative_atol); - 
CheckOutput("SavedVariance", cpu_saved_var, cpu_saved_var_base, diff, + CheckOutput("DBias", cpu_dbias, cpu_dbias_base, diff, is_relative_atol); - CheckOutput("Y", cpu_y, cpu_y_base, diff, is_relative_atol); } private: void SetUp() { - // Initialize input data InitRandomTensor({batch_size_, height_, width_, channels_}, &cpu_x_); - ComputeSumAndSquareSum(cpu_x_, &cpu_sum_, &cpu_sum_of_square_); + InitRandomTensor({channels_}, &cpu_bn_scale_x_); + InitRandomTensor({channels_}, &cpu_bn_bias_x_); - // scale and bias should be initialized randomly. - InitConstantTensor({channels_}, static_cast(1.0f), - &cpu_bn_scale_); - InitConstantTensor({channels_}, static_cast(0.0f), - &cpu_bn_bias_); + if (has_shortcut_) { + InitRandomTensor({batch_size_, height_, width_, channels_}, &cpu_z_); + InitRandomTensor({channels_}, &cpu_bn_scale_z_); + InitRandomTensor({channels_}, &cpu_bn_bias_z_); + } else { + if (fuse_add_) { + InitRandomTensor({batch_size_, height_, width_, channels_}, &cpu_z_); + } + } + + InitRandomTensor({batch_size_, height_, width_, channels_}, &cpu_dy_); } void InitMeanVar(Tensor *cpu_mean, Tensor *cpu_var, Tensor *cpu_saved_mean, @@ -252,71 +464,178 @@ class CudnnBNAddReluTester { cpu_saved_var); } - void BaselineForward(const platform::CUDADeviceContext &ctx, Tensor *cpu_mean, - Tensor *cpu_var, Tensor *cpu_saved_mean, - Tensor *cpu_saved_var, Tensor *cpu_y, - Tensor *cpu_reserve_space) { + void BaselineForward(const platform::CUDADeviceContext &ctx, + Tensor *cpu_mean_x, Tensor *cpu_var_x, + Tensor *cpu_saved_mean_x, Tensor *cpu_saved_var_x, + Tensor *cpu_y, Tensor *saved_reserve_space_x, + Tensor *cpu_mean_z = nullptr, + Tensor *cpu_var_z = nullptr, + Tensor *cpu_saved_mean_z = nullptr, + Tensor *cpu_saved_var_z = nullptr, + Tensor *saved_reserve_space_z = nullptr) { + InitMeanVar(cpu_mean_x, cpu_var_x, cpu_saved_mean_x, cpu_saved_var_x); + ComputeBatchNormForward(ctx, cpu_x_, cpu_bn_scale_x_, cpu_bn_bias_x_, + cpu_mean_x, cpu_var_x, cpu_saved_mean_x, + cpu_saved_var_x, cpu_y, saved_reserve_space_x); + if (has_shortcut_) { + framework::Tensor cpu_z_out; + InitMeanVar(cpu_mean_z, cpu_var_z, cpu_saved_mean_z, cpu_saved_var_z); + ComputeBatchNormForward( + ctx, cpu_z_, cpu_bn_scale_z_, cpu_bn_bias_z_, cpu_mean_z, cpu_var_z, + cpu_saved_mean_z, cpu_saved_var_z, &cpu_z_out, saved_reserve_space_z); + ComputeInplaceAdd(cpu_z_out, cpu_y); + } else { + if (fuse_add_) { + ComputeInplaceAdd(cpu_z_, cpu_y); + } + } + if (act_type_ == "relu") { + ComputeInplaceRelu(cpu_y); + } + } + + void BaselineForwardFusedBNAddRelu(const platform::CUDADeviceContext &ctx, + Tensor *cpu_mean, Tensor *cpu_var, + Tensor *cpu_saved_mean, + Tensor *cpu_saved_var, Tensor *cpu_y, + Tensor *saved_reserve_space) { InitMeanVar(cpu_mean, cpu_var, cpu_saved_mean, cpu_saved_var); - ComputeBatchNormForward(ctx, cpu_x_, cpu_bn_scale_, cpu_bn_bias_, cpu_mean, - cpu_var, cpu_saved_mean, cpu_saved_var, cpu_y, - cpu_reserve_space); + ComputeFusedBNAddReluForward( + ctx, cpu_x_, cpu_z_, cpu_bn_scale_x_, cpu_bn_bias_x_, cpu_mean, cpu_var, + cpu_saved_mean, cpu_saved_var, cpu_y, saved_reserve_space); + } + + void BaselineBackwardFusedBNAddRelu(const platform::CUDADeviceContext &ctx, + Tensor *cpu_dx, Tensor *cpu_dz, + Tensor *cpu_dscale, Tensor *cpu_dbias) { + ComputeFusedBNAddReluBackward( + ctx, cpu_dy_, cpu_x_, cpu_bn_scale_x_, cpu_bn_bias_x_, + cpu_saved_mean_base_x_, cpu_saved_var_base_x_, cpu_y_base_, + saved_reserve_space_x_, cpu_dx, cpu_dz, cpu_dscale, cpu_dbias); + } + + void ComputeFusedBNStatsFinalize(const 
platform::CUDADeviceContext &ctx, + const Tensor &cpu_x, + const Tensor &cpu_bn_scale, + const Tensor &cpu_bn_bias, Tensor *sum, + Tensor *sum_of_square, Tensor *bn_scale, + Tensor *bn_bias, Tensor *mean, Tensor *var, + Tensor *saved_mean, Tensor *saved_var, + Tensor *equiv_scale, Tensor *equiv_bias) { + framework::Tensor cpu_sum; + framework::Tensor cpu_sum_of_square; + ComputeSumAndSquareSum(cpu_x, &cpu_sum, &cpu_sum_of_square); + + auto place = ctx.GetPlace(); + TensorCopySync(cpu_sum, place, sum); + TensorCopySync(cpu_sum_of_square, place, sum_of_square); + TensorCopySync(cpu_bn_scale, place, bn_scale); + TensorCopySync(cpu_bn_bias, place, bn_bias); + + bn_scale->Resize({1, 1, 1, channels_}); + bn_bias->Resize({1, 1, 1, channels_}); + + // input + float *sum_ptr = sum->data(); + float *sum_of_square_ptr = sum_of_square->data(); + float *bn_scale_ptr = bn_scale->data(); + float *bn_bias_ptr = bn_bias->data(); + + mean->Resize({1, 1, 1, channels_}); + var->Resize({1, 1, 1, channels_}); + + // output + float *mean_ptr = mean->data(); + float *var_ptr = var->data(); + float *saved_mean_ptr = + saved_mean->mutable_data({1, 1, 1, channels_}, place); + float *saved_var_ptr = + saved_var->mutable_data({1, 1, 1, channels_}, place); + T *equiv_scale_ptr = + equiv_scale->mutable_data({1, 1, 1, channels_}, place); + T *equiv_bias_ptr = + equiv_bias->mutable_data({1, 1, 1, channels_}, place); + + auto param_shape = framework::vectorize(bn_scale->dims()); + op::CudnnBNStatsFinalize bn_op(ctx, param_shape); + bn_op.Forward(ctx, sum_ptr, sum_of_square_ptr, bn_scale_ptr, bn_bias_ptr, + saved_mean_ptr, saved_var_ptr, mean_ptr, var_ptr, + equiv_scale_ptr, equiv_bias_ptr, eps_, momentum_, ele_count_, + true); } // Get forward results of CudnnBNStatsFinalize + CudnnScaleBiasAddRelu - void FusedForward(const platform::CUDADeviceContext &ctx, Tensor *cpu_mean, - Tensor *cpu_var, Tensor *cpu_saved_mean, - Tensor *cpu_saved_var, Tensor *cpu_y, Tensor *cpu_bitmask) { + void FusedForward(const platform::CUDADeviceContext &ctx, Tensor *cpu_mean_x, + Tensor *cpu_var_x, Tensor *cpu_saved_mean_x, + Tensor *cpu_saved_var_x, Tensor *cpu_y, Tensor *cpu_bitmask, + Tensor *cpu_mean_z = nullptr, Tensor *cpu_var_z = nullptr, + Tensor *cpu_saved_mean_z = nullptr, + Tensor *cpu_saved_var_z = nullptr) { framework::Tensor x; - framework::Tensor sum; - framework::Tensor sum_of_square; - framework::Tensor bn_scale; - framework::Tensor bn_bias; + framework::Tensor sum_x; + framework::Tensor sum_of_square_x; + framework::Tensor bn_scale_x; + framework::Tensor bn_bias_x; + + framework::Tensor z; + framework::Tensor sum_z; + framework::Tensor sum_of_square_z; + framework::Tensor bn_scale_z; + framework::Tensor bn_bias_z; auto place = ctx.GetPlace(); TensorCopySync(cpu_x_, place, &x); - TensorCopySync(cpu_sum_, place, &sum); - TensorCopySync(cpu_sum_of_square_, place, &sum_of_square); - TensorCopySync(cpu_bn_scale_, place, &bn_scale); - TensorCopySync(cpu_bn_bias_, place, &bn_bias); + if (fuse_add_ || has_shortcut_) { + TensorCopySync(cpu_z_, place, &z); + } - bn_scale.Resize({1, 1, 1, channels_}); - bn_bias.Resize({1, 1, 1, channels_}); + framework::Tensor mean_x; + framework::Tensor var_x; + framework::Tensor saved_mean_x; + framework::Tensor saved_var_x; + framework::Tensor equiv_scale_x; + framework::Tensor equiv_bias_x; - T *x_ptr = x.data(); - float *sum_ptr = sum.data(); - float *sum_of_square_ptr = sum_of_square.data(); - float *bn_scale_ptr = bn_scale.data(); - float *bn_bias_ptr = bn_bias.data(); + framework::Tensor mean_z; 
+ framework::Tensor var_z; + framework::Tensor saved_mean_z; + framework::Tensor saved_var_z; + framework::Tensor equiv_scale_z; + framework::Tensor equiv_bias_z; - framework::Tensor mean; - framework::Tensor var; - framework::Tensor saved_mean; - framework::Tensor saved_var; - framework::Tensor equiv_scale; - framework::Tensor equiv_bias; framework::Tensor y; framework::Tensor bitmask; - InitMeanVar(cpu_mean, cpu_var, cpu_saved_mean, cpu_saved_var); - TensorCopySync(*cpu_mean, place, &mean); - TensorCopySync(*cpu_var, place, &var); + InitMeanVar(cpu_mean_x, cpu_var_x, cpu_saved_mean_x, cpu_saved_var_x); + TensorCopySync(*cpu_mean_x, place, &mean_x); + TensorCopySync(*cpu_var_x, place, &var_x); + if (has_shortcut_) { + InitMeanVar(cpu_mean_z, cpu_var_z, cpu_saved_mean_z, cpu_saved_var_z); + TensorCopySync(*cpu_mean_z, place, &mean_z); + TensorCopySync(*cpu_var_z, place, &var_z); + } - mean.Resize({1, 1, 1, channels_}); - var.Resize({1, 1, 1, channels_}); + // 1. BN Stats Finalize + ComputeFusedBNStatsFinalize(ctx, cpu_x_, cpu_bn_scale_x_, cpu_bn_bias_x_, + &sum_x, &sum_of_square_x, &bn_scale_x, + &bn_bias_x, &mean_x, &var_x, &saved_mean_x, + &saved_var_x, &equiv_scale_x, &equiv_bias_x); + if (has_shortcut_) { + ComputeFusedBNStatsFinalize(ctx, cpu_z_, cpu_bn_scale_z_, cpu_bn_bias_z_, + &sum_z, &sum_of_square_z, &bn_scale_z, + &bn_bias_z, &mean_z, &var_z, &saved_mean_z, + &saved_var_z, &equiv_scale_z, &equiv_bias_z); + } - float *mean_ptr = mean.data(); - float *var_ptr = var.data(); - float *saved_mean_ptr = - saved_mean.mutable_data({1, 1, 1, channels_}, place); - float *saved_var_ptr = - saved_var.mutable_data({1, 1, 1, channels_}, place); - T *equiv_scale_ptr = - equiv_scale.mutable_data({1, 1, 1, channels_}, place); - T *equiv_bias_ptr = equiv_bias.mutable_data({1, 1, 1, channels_}, place); + T *x_ptr = x.data(); + T *z_ptr = (fuse_add_ || has_shortcut_) ? z.data() : nullptr; + T *equiv_scale_x_ptr = equiv_scale_x.data(); + T *equiv_bias_x_ptr = equiv_bias_x.data(); + T *equiv_scale_z_ptr = has_shortcut_ ? equiv_scale_z.data() : nullptr; + T *equiv_bias_z_ptr = has_shortcut_ ? equiv_bias_z.data() : nullptr; T *y_ptr = y.mutable_data({batch_size_, height_, width_, channels_}, place); - // bitmask int c = channels_; int64_t nhw = ele_count_; int32_t c_int32_elems = ((c + 63) & ~63) / 32; @@ -325,31 +644,90 @@ class CudnnBNAddReluTester { {nhw_int32_elems, c_int32_elems, 1}, place); auto data_shape = framework::vectorize(x.dims()); - auto param_shape = framework::vectorize(bn_scale.dims()); + auto param_shape = framework::vectorize(bn_scale_x.dims()); auto bitmask_shape = framework::vectorize(bitmask.dims()); - // 1. BN Stats Finalize - op::CudnnBNStatsFinalize bn_op(ctx, param_shape); - bn_op.Forward(ctx, sum_ptr, sum_of_square_ptr, bn_scale_ptr, bn_bias_ptr, - saved_mean_ptr, saved_var_ptr, mean_ptr, var_ptr, - equiv_scale_ptr, equiv_bias_ptr, eps_, momentum_, ele_count_, - true); - - // 2. Scale Bias + Relu (not fused add) - std::string act_type = ""; - op::CudnnScaleBiasAddRelu sbar_op( - ctx, act_type, false, false, data_shape, param_shape, bitmask_shape); - sbar_op.Forward(ctx, x_ptr, equiv_scale_ptr, equiv_bias_ptr, y_ptr, - bitmask_ptr); - - TensorCopySync(mean, platform::CPUPlace(), cpu_mean); - TensorCopySync(var, platform::CPUPlace(), cpu_var); - TensorCopySync(saved_mean, platform::CPUPlace(), cpu_saved_mean); - TensorCopySync(saved_var, platform::CPUPlace(), cpu_saved_var); + // 2. 
Scale Bias + Relu + op::CudnnScaleBiasAddRelu sbar_op(ctx, act_type_, fuse_add_, + has_shortcut_, data_shape, param_shape, + bitmask_shape); + sbar_op.Forward(ctx, x_ptr, equiv_scale_x_ptr, equiv_bias_x_ptr, y_ptr, + bitmask_ptr, z_ptr, equiv_scale_z_ptr, equiv_bias_z_ptr); + + TensorCopySync(mean_x, platform::CPUPlace(), cpu_mean_x); + TensorCopySync(var_x, platform::CPUPlace(), cpu_var_x); + TensorCopySync(saved_mean_x, platform::CPUPlace(), cpu_saved_mean_x); + TensorCopySync(saved_var_x, platform::CPUPlace(), cpu_saved_var_x); + if (has_shortcut_) { + TensorCopySync(mean_z, platform::CPUPlace(), cpu_mean_z); + TensorCopySync(var_z, platform::CPUPlace(), cpu_var_z); + TensorCopySync(saved_mean_z, platform::CPUPlace(), cpu_saved_mean_z); + TensorCopySync(saved_var_z, platform::CPUPlace(), cpu_saved_var_z); + } TensorCopySync(y, platform::CPUPlace(), cpu_y); TensorCopySync(bitmask, platform::CPUPlace(), cpu_bitmask); } + // Get backward results of CudnnBNStatsFinalize + CudnnScaleBiasAddRelu + void FusedBackward(const platform::CUDADeviceContext &ctx, Tensor *cpu_dx, + Tensor *cpu_dz, Tensor *cpu_dscale, Tensor *cpu_dbias) { + framework::Tensor dy; + framework::Tensor x; + framework::Tensor bn_scale; + framework::Tensor bn_bias; + framework::Tensor saved_mean; + framework::Tensor saved_var; + framework::Tensor bitmask; + framework::Tensor dx; + framework::Tensor dz; + framework::Tensor dscale; + framework::Tensor dbias; + + auto place = ctx.GetPlace(); + TensorCopySync(cpu_dy_, place, &dy); + TensorCopySync(cpu_x_, place, &x); + TensorCopySync(cpu_bn_scale_x_, place, &bn_scale); + TensorCopySync(cpu_bn_bias_x_, place, &bn_bias); + TensorCopySync(cpu_saved_mean_x_, place, &saved_mean); + TensorCopySync(cpu_saved_var_x_, place, &saved_var); + TensorCopySync(cpu_bitmask_, place, &bitmask); + + bn_scale.Resize({1, 1, 1, channels_}); + bn_bias.Resize({1, 1, 1, channels_}); + saved_mean.Resize({1, 1, 1, channels_}); + saved_var.Resize({1, 1, 1, channels_}); + + T *dy_ptr = dy.data(); + T *x_ptr = x.data(); + float *bn_scale_ptr = bn_scale.data(); + float *bn_bias_ptr = bn_bias.data(); + float *saved_mean_ptr = saved_mean.data(); + float *saved_var_ptr = saved_var.data(); + int32_t *bitmask_ptr = bitmask.data(); + T *dx_ptr = + dx.mutable_data({batch_size_, height_, width_, channels_}, place); + T *dz_ptr = + dz.mutable_data({batch_size_, height_, width_, channels_}, place); + float *dscale_ptr = dscale.mutable_data({1, 1, 1, channels_}, place); + float *dbias_ptr = dbias.mutable_data({1, 1, 1, channels_}, place); + + auto data_shape = framework::vectorize(x.dims()); + auto param_shape = framework::vectorize(bn_scale.dims()); + auto bitmask_shape = framework::vectorize(bitmask.dims()); + + std::string act_type = "relu"; + op::CudnnScaleBiasAddRelu sbar_op(ctx, act_type, true, false, data_shape, + param_shape, bitmask_shape); + sbar_op.Backward(ctx, dy_ptr, x_ptr, bn_scale_ptr, bn_bias_ptr, + saved_mean_ptr, saved_var_ptr, bitmask_ptr, dx_ptr, dz_ptr, + dscale_ptr, dbias_ptr, eps_); + + TensorCopySync(dx, platform::CPUPlace(), cpu_dx); + TensorCopySync(dz, platform::CPUPlace(), cpu_dz); + TensorCopySync(dscale, platform::CPUPlace(), cpu_dscale); + TensorCopySync(dbias, platform::CPUPlace(), cpu_dbias); + } + private: int batch_size_; int height_; @@ -357,24 +735,80 @@ class CudnnBNAddReluTester { int channels_; int ele_count_; + std::string act_type_; + bool fuse_add_; + bool has_shortcut_; + // Forward input framework::Tensor cpu_x_; - framework::Tensor cpu_sum_; - framework::Tensor 
cpu_sum_of_square_; - framework::Tensor cpu_bn_scale_; - framework::Tensor cpu_bn_bias_; + framework::Tensor cpu_bn_scale_x_; + framework::Tensor cpu_bn_bias_x_; + framework::Tensor cpu_z_; + framework::Tensor cpu_bn_scale_z_; + framework::Tensor cpu_bn_bias_z_; + + // Backward input + framework::Tensor cpu_dy_; + framework::Tensor cpu_bitmask_; + framework::Tensor cpu_saved_mean_x_; + framework::Tensor cpu_saved_var_x_; + framework::Tensor cpu_saved_mean_z_; + framework::Tensor cpu_saved_var_z_; + framework::Tensor cpu_saved_mean_base_x_; + framework::Tensor cpu_saved_var_base_x_; + framework::Tensor saved_reserve_space_x_; + framework::Tensor cpu_saved_mean_base_z_; + framework::Tensor cpu_saved_var_base_z_; + framework::Tensor saved_reserve_space_z_; + framework::Tensor cpu_y_base_; double eps_ = 1e-5; float momentum_ = 0.9; }; -TEST(CudnnBNAddReluForward, GPUCudnnBNAddReluForwardFp16) { +TEST(CudnnBNAddReluFp16, BNAdd) { + int batch_size = 4; + int height = 8; + int width = 8; + int channels = 64; + std::string act_type = ""; + bool has_shortcut = false; + FLAGS_cudnn_batchnorm_spatial_persistent = true; + for (auto fuse_add : {false, true}) { + CudnnBNAddReluTester test( + batch_size, height, width, channels, act_type, fuse_add, has_shortcut); + test.CheckForward(2e-3); + } +} + +TEST(CudnnBNAddReluFp16, BNAddRelu) { + int batch_size = 4; + int height = 8; + int width = 8; + int channels = 64; + std::string act_type = "relu"; + bool has_shortcut = false; + FLAGS_cudnn_batchnorm_spatial_persistent = true; + for (auto fuse_add : {false, true}) { + CudnnBNAddReluTester test( + batch_size, height, width, channels, act_type, fuse_add, has_shortcut); + test.CheckForward(2e-3); + if (fuse_add) { + test.CheckBackward(2e-4); + } + } +} + +TEST(CudnnBNAddReluFp16, HasShortcut) { int batch_size = 4; int height = 8; int width = 8; int channels = 64; + std::string act_type = ""; + bool fuse_add = false; + bool has_shortcut = true; FLAGS_cudnn_batchnorm_spatial_persistent = true; - CudnnBNAddReluTester test(batch_size, height, - width, channels); - test.CheckForward(2e-3); + CudnnBNAddReluTester test( + batch_size, height, width, channels, act_type, fuse_add, has_shortcut); + test.CheckForward(5e-3); } diff --git a/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc b/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc index fff7b327f3f2ec..4c14029b99c69c 100644 --- a/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc +++ b/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc @@ -92,10 +92,9 @@ void CheckOutput(const framework::Tensor &cpu_res, } // Use Paddle conv2d op results as baseline -template void ComputeConv2DForward(const platform::CUDADeviceContext &ctx, const Tensor &cpu_input, const Tensor &cpu_filter, - Tensor *cpu_output) { + Tensor *cpu_output, int stride, int padding) { framework::Scope scope; auto *input = scope.Var("Input")->GetMutable(); auto *filter = scope.Var("Filter")->GetMutable(); @@ -108,10 +107,12 @@ void ComputeConv2DForward(const platform::CUDADeviceContext &ctx, framework::AttributeMap attrs; bool use_cudnn = true; std::string data_format = "NHWC"; - std::string padding_algorithm = "SAME"; + std::vector strides = {stride, stride}; + std::vector paddings = {padding, padding}; + attrs.insert({"strides", strides}); + attrs.insert({"paddings", paddings}); attrs.insert({"use_cudnn", use_cudnn}); attrs.insert({"data_format", data_format}); - attrs.insert({"padding_algorithm", padding_algorithm}); auto op = framework::OpRegistry::CreateOp( "conv2d", {{"Input", {"Input"}}, 
{"Filter", {"Filter"}}}, @@ -122,7 +123,6 @@ void ComputeConv2DForward(const platform::CUDADeviceContext &ctx, } // Use Paddle conv2d_grad op results as baseline -template void ComputeConv2DBackward(const platform::CUDADeviceContext &ctx, const Tensor &cpu_input, const Tensor &cpu_filter, const Tensor &cpu_output_grad, @@ -147,7 +147,7 @@ void ComputeConv2DBackward(const platform::CUDADeviceContext &ctx, framework::AttributeMap attrs; bool use_cudnn = true; std::string data_format = "NHWC"; - std::string padding_algorithm = "SAME"; + std::string padding_algorithm = "EXPLICIT"; std::vector strides = {stride, stride}; std::vector paddings = {padding, padding}; std::vector dilations = {dilation, dilation}; @@ -216,6 +216,8 @@ class CudnnNormConvolutionTester { kernel_size_ = kernel_size; stride_ = stride; padding_ = (kernel_size_ - 1) / 2; + out_height_ = (height_ + 2 * padding_ - kernel_size_) / stride_ + 1; + out_width_ = (width_ + 2 * padding_ - kernel_size_) / stride_ + 1; SetUp(); } @@ -227,6 +229,15 @@ class CudnnNormConvolutionTester { platform::DeviceContextPool::Instance().Get( platform::CUDAPlace(0))); + if (!Support(*ctx)) { + LOG(INFO) + << "Current test is only supported in the platforms with " + << "compatiblity greater than or equal to 70 and the kernel size " + << "must be equal to 1 or 3. Besides, when the kernel size is 1, " + << "the stride must be 1 if the compatiblity is equal to 70."; + return; + } + framework::Tensor cpu_output_base; framework::Tensor cpu_sum_base; framework::Tensor cpu_sum_of_square_base; @@ -277,15 +288,17 @@ class CudnnNormConvolutionTester { &cpu_filter_nchw_); // transpoes for filter, NCHW -> NHWC TransposeNchwToNhwc(cpu_filter_nchw_, &cpu_filter_nhwc_); - InitRandomTensor({batch_size_, height_, width_, output_channels_}, - &cpu_output_grad_); + InitRandomTensor( + {batch_size_, out_height_, out_width_, output_channels_}, + &cpu_output_grad_); } void BaselineForward(const platform::CUDADeviceContext &ctx, framework::Tensor *cpu_output_base, framework::Tensor *cpu_sum_base, framework::Tensor *cpu_sum_of_square_base) { - ComputeConv2DForward(ctx, cpu_input_, cpu_filter_nchw_, cpu_output_base); + ComputeConv2DForward(ctx, cpu_input_, cpu_filter_nchw_, cpu_output_base, + stride_, padding_); ComputeSumAndSquareSum(*cpu_output_base, cpu_sum_base, cpu_sum_of_square_base); } @@ -293,10 +306,9 @@ class CudnnNormConvolutionTester { void BaselineBackward(const platform::CUDADeviceContext &ctx, framework::Tensor *cpu_input_grad_base, framework::Tensor *cpu_filter_grad_base) { - ComputeConv2DBackward(ctx, cpu_input_, cpu_filter_nchw_, - cpu_output_grad_, cpu_input_grad_base, - cpu_filter_grad_base, stride_, padding_, - dilation_); + ComputeConv2DBackward(ctx, cpu_input_, cpu_filter_nchw_, cpu_output_grad_, + cpu_input_grad_base, cpu_filter_grad_base, stride_, + padding_, dilation_); } // get forward results of cudnn_norm_conv @@ -316,7 +328,7 @@ class CudnnNormConvolutionTester { T *input_ptr = input.data(); T *filter_ptr = filter_nhwc.data(); T *output_ptr = output.mutable_data( - {batch_size_, height_, width_, output_channels_}, place); + {batch_size_, out_height_, out_width_, output_channels_}, place); float *sum_ptr = sum.mutable_data({1, 1, 1, output_channels_}, place); float *sum_of_square_ptr = @@ -369,10 +381,25 @@ class CudnnNormConvolutionTester { TensorCopySync(filter_grad, platform::CPUPlace(), cpu_filter_grad); } + bool Support(const platform::CUDADeviceContext &ctx) { + if (ctx.GetComputeCapability() == 70) { + if ((kernel_size_ == 3) || 
((kernel_size_ == 1) && (stride_ == 1))) { + return true; + } + } else if (ctx.GetComputeCapability() > 70) { + if ((kernel_size_ == 3) || (kernel_size_ == 1)) { + return true; + } + } + return false; + } + private: int batch_size_; int height_; int width_; + int out_height_; + int out_width_; int input_channels_; int output_channels_; int kernel_size_; @@ -437,3 +464,19 @@ TEST(CudnnNormConvFp16, K1S1O4) { test.CheckForward(1e-3, true); test.CheckBackward(1e-3, true); } + +// test for fp16, kernel = 1, stride = 2, output_channels = input_channels * 4 +TEST(CudnnNormConvFp16, K1S2O4) { + int batch_size = 4; + int height = 8; + int width = 8; + int input_channels = 32; + int output_channels = 128; + int kernel_size = 1; + int stride = 2; + CudnnNormConvolutionTester test( + batch_size, height, width, input_channels, output_channels, kernel_size, + stride); + test.CheckForward(1e-3, true); + test.CheckBackward(1e-3); +} From 14393876fca754330fe68e7c244a8d81d863b5a9 Mon Sep 17 00:00:00 2001 From: jakpiase <62569058+jakpiase@users.noreply.github.com> Date: Mon, 11 Oct 2021 13:43:07 +0200 Subject: [PATCH 105/298] added missing bf16 ops (#36291) --- .../framework/ir/graph_pattern_detector.cc | 37 ++++++++++++++----- 1 file changed, 28 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index 4150d0ca555c9d..449849762cb101 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -2263,15 +2263,34 @@ PDNode *patterns::QuantizePlacement::operator()( PDNode *patterns::Bfloat16Placement::operator()( const std::unordered_set &bfloat16_enabled_op_types) { std::unordered_set supported_op_types = - std::unordered_set( - {"concat", "conv2d", "conv2d_transpose", - "elementwise_add", "elementwise_mul", "fc", - "fusion_gru", "fusion_lstm", "gelu", - "layer_norm", "matmul", "matmul_v2", - "pool2d", "prelu", "relu", - "reshape2", "softmax", "split", - "squeeze", "squeeze2", "sum", - "transpose2"}); + std::unordered_set({"cast", + "clip", + "concat", + "conv2d", + "conv2d_transpose", + "elementwise_add", + "elementwise_mul", + "expand_v2", + "fc", + "fusion_gru", + "fusion_lstm", + "gelu", + "layer_norm", + "matmul", + "matmul_v2", + "pool2d", + "prelu", + "relu", + "reshape2", + "scale", + "sigmoid", + "slice", + "softmax", + "split", + "squeeze", + "squeeze2", + "sum", + "transpose2"}); if (!bfloat16_enabled_op_types.empty()) { supported_op_types = bfloat16_enabled_op_types; } From 85b77232768b53ee3db2f86653eeeedccbf570d1 Mon Sep 17 00:00:00 2001 From: Liu-xiandong <85323580+Liu-xiandong@users.noreply.github.com> Date: Mon, 11 Oct 2021 19:53:53 +0800 Subject: [PATCH 106/298] Add nn.functional.sparse_attention and some test cases, test=develop (#35757) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add paddle.nn.functional.sparse_attention API. This PR mainly adds a Python-layer wrapper around the sparse_attention functionality; the main OP implementation is in #PR35676. Corresponding unit tests are also added for the wrapped Python API. --- paddle/fluid/operators/CMakeLists.txt | 2 +- .../fluid/tests/unittests/CMakeLists.txt | 5 + .../unittests/test_sparse_attention_op.py | 151 +++++++++++++++--- python/paddle/nn/functional/__init__.py | 3 + .../paddle/nn/functional/sparse_attention.py | 144 +++++++++++++++++ 5 files changed, 285 insertions(+), 20 deletions(-) create mode 100644 python/paddle/nn/functional/sparse_attention.py diff --git a/paddle/fluid/operators/CMakeLists.txt
b/paddle/fluid/operators/CMakeLists.txt index c487313f91c588..b910b4ec73901b 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -94,7 +94,7 @@ if (WITH_GPU OR WITH_ROCM) endif() op_library(sync_batch_norm_op) file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(sync_batch_norm);\n") - if ((NOT WIN32) AND (NOT WITH_ROCM) AND (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_LESS 11.2) ) + if ((NOT WIN32) AND (NOT WITH_ROCM) AND (NOT PADDLE_WITH_ARM) AND (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_LESS 11.2) ) op_library(sparse_attention_op) file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(sparse_attention);\n") endif() diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 0c2731bc45258d..9d6a1d00cff604 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -464,6 +464,11 @@ list(REMOVE_ITEM TEST_OPS test_imperative_static_runner_while) # disable this unittest temporarily list(REMOVE_ITEM TEST_OPS test_imperative_data_loader_exception) +# disable sparse_attention which not in suitable env +if ( (NOT WITH_GPU) OR (WIN32) OR (PADDLE_WITH_ARM) OR (WITH_ROCM) ) + list(REMOVE_ITEM TEST_OPS test_sparse_attention_op) +endif() + if (APPLE OR WIN32) list(REMOVE_ITEM TEST_OPS test_dataset) list(REMOVE_ITEM TEST_OPS test_dataset_dataloader) diff --git a/python/paddle/fluid/tests/unittests/test_sparse_attention_op.py b/python/paddle/fluid/tests/unittests/test_sparse_attention_op.py index 48401fb55ef3f5..5134b885f33072 100644 --- a/python/paddle/fluid/tests/unittests/test_sparse_attention_op.py +++ b/python/paddle/fluid/tests/unittests/test_sparse_attention_op.py @@ -16,10 +16,13 @@ import numpy as np from op_test import OpTest import paddle.fluid.core as core +from paddle.static import Program, program_guard import paddle +import paddle.fluid as fluid +import paddle.fluid.framework as framework +import paddle.nn.functional as F import os import re -import platform def get_cuda_version(): @@ -34,22 +37,6 @@ def get_cuda_version(): return -1 -def get_linux_platform(): - if platform.system().lower() == 'windows': - return 0 - elif platform.system().lower() == 'linux': - return 1 - else: - return -1 - - -def get_suitable_env(): - if get_cuda_version() >= 11020 and get_linux_platform() == 1: - return True - else: - return False - - def softmax(x): max = np.max(x, axis=1, keepdims=True) e_x = np.exp(x - max) @@ -141,8 +128,9 @@ def init_csr_format(batch_size, num_heads, rows, blocksize): @unittest.skipIf( - not core.is_compiled_with_cuda() or get_suitable_env() == False, - "core is not compiled with CUDA and cuda version need >= 11.2 in windows") + not core.is_compiled_with_cuda() or get_cuda_version() < 11020, + "core is not compiled with CUDA and cuda version need larger than or equal to 11.2" +) class TestSparseAttentionOp(OpTest): def config(self): self.shape = (1, 1, 16, 8) @@ -201,5 +189,130 @@ def config(self): self.dtype = "float64" +@unittest.skipIf( + not core.is_compiled_with_cuda() or get_cuda_version() < 11020, + "core is not compiled with CUDA and cuda version need larger than or equal to 11.2" +) +class TestSparseAttentionAPI(unittest.TestCase): + def setUp(self): + self.place = paddle.CUDAPlace(0) + self.shape = (1, 1, 8, 4) + self.blocksize = 2 + self.dtype = 'float64' + + def test_static_graph(self): + paddle.enable_static() + with paddle.static.program_guard(paddle.static.Program()): + Q = paddle.static.data(name="Q", 
shape=self.shape, dtype=self.dtype) + K = paddle.static.data(name="K", shape=self.shape, dtype=self.dtype) + V = paddle.static.data(name="V", shape=self.shape, dtype=self.dtype) + + batch_size, num_heads, rows = self.shape[0], self.shape[ + 1], self.shape[2] + block_num = rows / self.blocksize + block_last = rows % self.blocksize + sparse_nnz_num = block_num * self.blocksize * self.blocksize + block_last * block_last + offset_shape = (batch_size, num_heads, rows + 1) + columns_shape = (batch_size, num_heads, int(sparse_nnz_num)) + + offset = paddle.static.data( + name="Offset", shape=offset_shape, dtype="int32") + columns = paddle.static.data( + name="Columns", shape=columns_shape, dtype="int32") + Out = F.sparse_attention(Q, K, V, offset, columns) + + Q_np = np.random.random(self.shape).astype(self.dtype) + K_np = np.random.random(self.shape).astype(self.dtype) + V_np = np.random.random(self.shape).astype(self.dtype) + offset_np, columns_np = init_csr_format( + self.shape[0], self.shape[1], self.shape[2], self.blocksize) + offset_np = offset_np.astype('int32') + columns_np = columns_np.astype('int32') + + exe = fluid.Executor(self.place) + fetches_result = exe.run(feed={ + "Q": Q_np, + "K": K_np, + "V": V_np, + "Offset": offset_np, + "Columns": columns_np + }, + fetch_list=[Out]) + expected_result, __, __ = ref_batch_sparse_attention( + Q_np, K_np, V_np, offset_np, columns_np) + + self.assertTrue( + np.allclose( + fetches_result, expected_result, atol=1e-5)) + + def test_dygraph(self): + paddle.disable_static() + offset, columns = init_csr_format(self.shape[0], self.shape[1], + self.shape[2], self.blocksize) + offset = offset.astype('int32') + columns = columns.astype('int32') + query = np.random.random(self.shape).astype(self.dtype) + key = np.random.random(self.shape).astype(self.dtype) + value = np.random.random(self.shape).astype(self.dtype) + + paddle_query = paddle.to_tensor(query, place=self.place) + paddle_key = paddle.to_tensor(key, place=self.place) + paddle_value = paddle.to_tensor(value, place=self.place) + paddle_offset = paddle.to_tensor(offset, place=self.place) + paddle_colunmns = paddle.to_tensor(columns, place=self.place) + + paddle_result = F.sparse_attention(paddle_query, paddle_key, + paddle_value, paddle_offset, + paddle_colunmns) + + numpy_result, __, __ = ref_batch_sparse_attention(query, key, value, + offset, columns) + numpy_result = numpy_result.astype(self.dtype) + + self.assertTrue( + np.allclose( + paddle_result.numpy(), numpy_result, atol=1e-5)) + + +class TestSparseAttentionAPITestFloat(TestSparseAttentionAPI): + def setUp(self): + self.place = paddle.CUDAPlace(0) + self.shape = (2, 2, 8, 4) + self.blocksize = 2 + self.dtype = 'float32' + + +class TestSparseAttentionAPITestShape1(TestSparseAttentionAPI): + def setUp(self): + self.place = paddle.CUDAPlace(0) + self.shape = (2, 2, 64, 32) + self.blocksize = 2 + self.dtype = 'float64' + + +class TestSparseAttentionAPITestShape2(TestSparseAttentionAPI): + def setUp(self): + self.place = paddle.CUDAPlace(0) + self.shape = (2, 1, 64, 32) + self.blocksize = 2 + self.dtype = 'float64' + + +class TestSparseAttentionAPITestShape3(TestSparseAttentionAPI): + def setUp(self): + self.place = paddle.CUDAPlace(0) + self.shape = (4, 4, 128, 32) + self.blocksize = 8 + self.dtype = 'float64' + + +class TestSparseAttentionAPITestShape4(TestSparseAttentionAPI): + def setUp(self): + self.place = paddle.CUDAPlace(0) + self.shape = (3, 3, 35, 15) + self.blocksize = 3 + self.dtype = 'float64' + + if __name__ == '__main__': 
unittest.main() diff --git a/python/paddle/nn/functional/__init__.py b/python/paddle/nn/functional/__init__.py index 7965b362b9c55a..4151f25b94aff2 100644 --- a/python/paddle/nn/functional/__init__.py +++ b/python/paddle/nn/functional/__init__.py @@ -112,6 +112,8 @@ from ...fluid.layers import gather_tree # noqa: F401 from ...fluid.layers import temporal_shift # noqa: F401 +from .sparse_attention import sparse_attention + __all__ = [ #noqa 'conv1d', 'conv1d_transpose', @@ -207,4 +209,5 @@ 'layer_norm', 'instance_norm', 'class_center_sample', + 'sparse_attention', ] diff --git a/python/paddle/nn/functional/sparse_attention.py b/python/paddle/nn/functional/sparse_attention.py new file mode 100644 index 00000000000000..f57669f11457f6 --- /dev/null +++ b/python/paddle/nn/functional/sparse_attention.py @@ -0,0 +1,144 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import warnings +import paddle +from ...fluid.framework import in_dygraph_mode, default_main_program +from paddle.fluid.layer_helper import LayerHelper +from ...fluid.framework import in_dygraph_mode +from paddle import _C_ops + + +def sparse_attention(query, + key, + value, + sparse_csr_offset, + sparse_csr_columns, + name=None): + r""" + This operator sparsify the Attention matrix in Transformer module + to achieve the effect of reducing memory consumption and computation. + The sparse layout is expressed in CSR format and contains two parameters, + ``offset`` and ``columns``. + + .. math:: + + result=softmax(\frac{ Q * K^T }{\sqrt{d}}) * V + + where : ``Q``, ``K``, and ``V`` represent the three input parameters of the attention module. + The dimensions of the three parameters are the same. + ``d`` represents the size of the last dimension of the three parameters. + + Parameters: + query(Tensor): The query tensor in the Attention module. + It's a 4-D tensor with a shape of + :math:`[batch\_size, num\_heads, seq\_len, head\_dim]`. + The dtype can be ``float32`` and ``float64``. + key(Tensor): The key tensor in the Attention module. + It's a 4-D tensor with a shape of + :math:`[batch\_size, num\_heads, seq\_len, head\_dim]`. + The dtype can be ``float32`` and ``float64``. + value(Tensor): The value tensor in the Attention module. + It's a 4-D tensor with a shape of + :math:`[batch\_size, num\_heads, seq\_len, head\_dim]`. + The dtype can be ``float32`` and ``float64``. + sparse_csr_offset(Tensor): The sparsity feature in the Attention module + is expressed in the CSR format, and the offset represents + the number of non-zero elements in each row of the matrix. + It's a 3-D tensor with a shape of + :math:`[batch\_size, num\_heads, seq\_len + 1]`. + The dtype should be ``int32``. + sparse_csr_columns(Tensor): The sparsity feature in the Attention module + is expressed in the CSR format, and the columns represent + the column index values of non-zero elements in the matrix. + It's a 3-D tensor with a shape of + :math:`[batch\_size, num\_heads, sparse\_nnz]`. 
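As a note on the CSR layout described above: for a single (batch, head) pair, offset acts as a row pointer and columns lists the kept key positions row by row. A small, hedged numpy sketch of how a dense 0/1 attention mask maps to those two arrays (this helper is illustrative and not part of the API):

    import numpy as np

    def dense_mask_to_csr(mask):
        # mask: [seq_len, seq_len] array of 0/1 for one batch and one head
        seq_len = mask.shape[0]
        offset = np.zeros(seq_len + 1, dtype=np.int32)
        columns = []
        for i in range(seq_len):
            cols = np.nonzero(mask[i])[0]
            columns.extend(cols.tolist())
            offset[i + 1] = offset[i] + len(cols)  # running non-zero count
        return offset, np.asarray(columns, dtype=np.int32)

Stacking the per-head results gives the [batch_size, num_heads, seq_len + 1] and [batch_size, num_heads, sparse_nnz] shapes listed above.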
+ The dtype should be ``int32``. + name(str, optional): The default value is None. Normally there is no need for user + to set this property. For more information, please refer to + :ref:`api_guide_Name`. + + Returns: + A Tensor which refers to the result in the Attention module. + It's a 4-D tensor with a shape of + :math:`[batch\_size, num\_heads, seq\_len, head\_dim]`. + The dtype can be ``float32`` and ``float64``. + + Examples: + .. code-block:: python + + # required: skiptest + import paddle + import numpy as np + + query_data = np.array([[[[0, 1,], [2, 3], + [ 0, 1], [2, 3]]]]).astype("float32") + key_data = np.array([[[[0, 1,], [2, 3], + [ 0, 1], [2, 3]]]]).astype("float32") + value_data = np.array([[[[0, 1,], [2, 3], + [ 0, 1], [2, 3]]]]).astype("float32") + sparse_csr_offset_data = np.array([[[0, 2, + 4, 6, 8]]]).astype("int32") + sparse_csr_columns_data = np.array([[[0, 1, + 0, 1, 2, 3, 2, 3]]]).astype("int32") + print(query_data.shape) + # (1, 1, 4, 2) + print(sparse_csr_offset_data.shape) + # (1, 1, 5) + print(sparse_csr_columns_data.shape) + # (1, 1, 8) + paddle.disable_static() + query = paddle.to_tensor(query_data, stop_gradient=False, + place=paddle.CUDAPlace(0)) + key = paddle.to_tensor(key_data, stop_gradient=False, + place=paddle.CUDAPlace(0)) + value = paddle.to_tensor(value_data, stop_gradient=False, + place=paddle.CUDAPlace(0)) + offset = paddle.to_tensor(sparse_csr_offset_data, stop_gradient=False, + place=paddle.CUDAPlace(0)) + columns = paddle.to_tensor(sparse_csr_columns_data, stop_gradient=False, + place=paddle.CUDAPlace(0)) + output = paddle.nn.functional.sparse_attention(query, key, + value, offset, columns) + print(output) + + # [[[[1.60885942, 2.60885954], + # [1.99830270, 2.99830270], + # [1.60885942, 2.60885954], + # [1.99830270, 2.99830270]]]] + """ + if in_dygraph_mode(): + result_attention, result_sdd, result_softmax = _C_ops.sparse_attention( + query, key, value, sparse_csr_offset, sparse_csr_columns) + return result_attention + + helper = LayerHelper('sparse_attention', **locals()) + dtype = helper.input_dtype(input_param_name='Q') + out = helper.create_variable_for_type_inference(dtype) + result_sdd = helper.create_variable_for_type_inference(dtype) + result_softmax = helper.create_variable_for_type_inference(dtype) + inputs = { + 'Q': query, + 'K': key, + 'V': value, + 'Offset': sparse_csr_offset, + 'Columns': sparse_csr_columns + } + outputs = { + 'Out': out, + 'SparseDotSdd': result_sdd, + 'Softmax': result_softmax + } + helper.append_op(type='sparse_attention', inputs=inputs, outputs=outputs) + return out From 7b45a46e13fe057ca12a001dac7b8d6d24d9f211 Mon Sep 17 00:00:00 2001 From: Zeng Jinle <32832641+sneaxiy@users.noreply.github.com> Date: Mon, 11 Oct 2021 19:59:16 +0800 Subject: [PATCH 107/298] Add FLAGS_allreduce_record_one_event to remove event waiting number (#36263) * add FLAGS_allreduce_record_one_event * add more comments * fix ut * improve coverage * fix ut, improve coverage --- .../details/computation_op_handle.cc | 8 +- .../details/fused_all_reduce_op_handle.cc | 85 +++++++++++++++++++ .../details/fused_all_reduce_op_handle.h | 7 ++ paddle/fluid/platform/flags.cc | 17 ++++ .../unittests/test_dist_mnist_fleetapi.py | 6 +- 5 files changed, 120 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/framework/details/computation_op_handle.cc b/paddle/fluid/framework/details/computation_op_handle.cc index 2256b826ed501f..60b8461668f6fa 100644 --- a/paddle/fluid/framework/details/computation_op_handle.cc +++ 
b/paddle/fluid/framework/details/computation_op_handle.cc @@ -16,6 +16,8 @@ #include +DECLARE_bool(allreduce_record_one_event); + namespace paddle { namespace framework { namespace details { @@ -31,11 +33,13 @@ ComputationOpHandle::ComputationOpHandle(ir::Node *node, Scope *scope, scope_idx_(scope_idx) {} void ComputationOpHandle::RunImpl() { - WaitInputVarGenerated(place_); + if (!FLAGS_allreduce_record_one_event) { + WaitInputVarGenerated(place_); + } auto run_func = [this]() { op_->Run(*local_exec_scopes_[0], place_); }; - if (is_lock_and_record_event_free_) { + if (is_lock_and_record_event_free_ || FLAGS_allreduce_record_one_event) { run_func(); } else { this->RunAndRecordEvent(run_func); diff --git a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc index 8f45c364476a75..94507140a81d61 100644 --- a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc @@ -19,6 +19,8 @@ #include "paddle/fluid/platform/profiler.h" DEFINE_bool(skip_fused_all_reduce_check, false, ""); +DECLARE_bool(allreduce_record_one_event); + namespace paddle { namespace framework { namespace details { @@ -48,11 +50,80 @@ FusedAllReduceOpHandle::FusedAllReduceOpHandle( num_of_all_reduce_(num_of_all_reduce) {} #endif +FusedAllReduceOpHandle::~FusedAllReduceOpHandle() { +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + auto destroy_event = [](gpuEvent_t event) { + if (event == nullptr) return; +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_CUDA_SUCCESS(hipEventDestroy(event)); +#else + PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(event)); +#endif + }; + destroy_event(start_event_); + destroy_event(end_event_); +#endif +} + void FusedAllReduceOpHandle::RunImpl() { platform::RecordEvent record_event(Name()); VLOG(4) << this->DebugString(); +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + if (FLAGS_allreduce_record_one_event && start_event_ == nullptr) { + VLOG(10) << "FLAGS_allreduce_record_one_event=true"; + PADDLE_ENFORCE_EQ(use_hierarchical_allreduce_, false, + platform::errors::Unimplemented( + "The hierarchical allreduce does not support " + "FLAGS_allreduce_record_one_event=true")); + PADDLE_ENFORCE_EQ(places_.size(), 1, + platform::errors::Unimplemented( + "FLAGS_allreduce_record_one_event=true is only valid " + "when using one GPU device per process.")); + PADDLE_ENFORCE_EQ(platform::is_gpu_place(places_[0]), true, + platform::errors::Unimplemented( + "FLAGS_allreduce_record_one_event=true is only valid " + "when using GPU device.")); + auto create_event = [](gpuEvent_t *event) { + if (*event) return; +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_CUDA_SUCCESS( + hipEventCreateWithFlags(event, hipEventDisableTiming)); +#else + PADDLE_ENFORCE_CUDA_SUCCESS( + cudaEventCreateWithFlags(event, cudaEventDisableTiming)); +#endif + }; + create_event(&start_event_); + create_event(&end_event_); + } + + gpuStream_t nccl_stream{nullptr}; + gpuStream_t compute_stream{nullptr}; + + if (FLAGS_allreduce_record_one_event) { + auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, places_[0]); + compute_stream = + platform::DeviceContextPool::Instance().GetByPlace(gpu_place)->stream(); + auto flat_nccl_ctxs = nccl_ctxs_->GetFlatCtx(run_order_); + auto &nccl_ctx = flat_nccl_ctxs->at(gpu_place.device); + nccl_stream = nccl_ctx.stream(); +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_CUDA_SUCCESS(hipEventRecord(start_event_, compute_stream)); + PADDLE_ENFORCE_CUDA_SUCCESS( + 
hipStreamWaitEvent(nccl_stream, start_event_, 0)); +#else + PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(start_event_, compute_stream)); + PADDLE_ENFORCE_CUDA_SUCCESS( + cudaStreamWaitEvent(nccl_stream, start_event_, 0)); +#endif + } else { + WaitInputVarGenerated(); + } +#else WaitInputVarGenerated(); +#endif + // The input: grad0(dev0), grad0(dev1), grad1(dev0), grad1(dev1)... // The output: grad0(dev0), grad0(dev1), grad1(dev0), grad1(dev1)... auto in_var_handles = DynamicCast(this->Inputs()); @@ -94,6 +165,20 @@ void FusedAllReduceOpHandle::RunImpl() { } else { FusedAllReduceFunc(in_var_handles, out_var_handles); } + +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + if (FLAGS_allreduce_record_one_event) { +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_CUDA_SUCCESS(hipEventRecord(end_event_, nccl_stream)); + PADDLE_ENFORCE_CUDA_SUCCESS( + hipStreamWaitEvent(compute_stream, end_event_, 0)); +#else + PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(end_event_, nccl_stream)); + PADDLE_ENFORCE_CUDA_SUCCESS( + cudaStreamWaitEvent(compute_stream, end_event_, 0)); +#endif + } +#endif } void FusedAllReduceOpHandle::FusedAllReduceFunc( diff --git a/paddle/fluid/framework/details/fused_all_reduce_op_handle.h b/paddle/fluid/framework/details/fused_all_reduce_op_handle.h index d22dc0a421ac0e..8473700867ce32 100644 --- a/paddle/fluid/framework/details/fused_all_reduce_op_handle.h +++ b/paddle/fluid/framework/details/fused_all_reduce_op_handle.h @@ -67,12 +67,19 @@ struct FusedAllReduceOpHandle : public AllReduceOpHandle { #endif std::string Name() const override; + ~FusedAllReduceOpHandle(); + protected: void RunImpl() override; private: size_t num_of_all_reduce_; +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + gpuEvent_t start_event_{nullptr}; + gpuEvent_t end_event_{nullptr}; +#endif + // Check the dtype of the input void GetDTypeAndNumel( const std::vector> &g_tensor, diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc index 18636f6f842785..dd65d743fad31a 100644 --- a/paddle/fluid/platform/flags.cc +++ b/paddle/fluid/platform/flags.cc @@ -682,6 +682,23 @@ PADDLE_DEFINE_EXPORTED_bool( "It controls whether to apply IR pass to program when using Fleet APIs"); /** + * Distributed related FLAG + * Name: FLAGS_allreduce_record_one_event + * Since Version: 2.2.0 + * Value Range: bool, default=false + * Example: FLAGS_allreduce_record_one_event=true makes the allreduce + * operations would only wait one event instead of multiple events. + * Note: Make the allreduce operations would only wait one event instead of + * multiple events. Currently, only fuse allreduce supports this. + * Otherwise, the precision may be wrong. + */ +PADDLE_DEFINE_EXPORTED_bool(allreduce_record_one_event, false, + "It controls whether the allreduce operations " + "would only wait one event instead of multiple " + "events. Currently, only fuse allreduce supports " + "this. 
Otherwise, the precision may be wrong."); + +/* * CINN related FLAG * Name: FLAGS_use_cinn * Since Version: 2.3 diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist_fleetapi.py b/python/paddle/fluid/tests/unittests/test_dist_mnist_fleetapi.py index 34abc5b45531a9..3b15b06b5efa8a 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_mnist_fleetapi.py +++ b/python/paddle/fluid/tests/unittests/test_dist_mnist_fleetapi.py @@ -32,7 +32,11 @@ def _setup_config(self): def test_dist_train(self): import paddle.fluid as fluid if fluid.core.is_compiled_with_cuda(): - self.check_with_place("dist_mnist.py", delta=1e-5) + self.check_with_place( + "dist_mnist.py", + delta=1e-5, + check_error_log=True, + need_envs={'FLAGS_allreduce_record_one_event': '1'}) class FleetCollectiveTest(unittest.TestCase): From 339cb1917eb8efd8d190d3490b1aadf1f2d1a615 Mon Sep 17 00:00:00 2001 From: jakpiase <62569058+jakpiase@users.noreply.github.com> Date: Mon, 11 Oct 2021 14:11:41 +0200 Subject: [PATCH 108/298] fix for matmul_v2 6D x 2D (#36342) --- .../operators/mkldnn/matmul_v2_mkldnn_op.cc | 8 +++---- .../mkldnn/test_matmul_v2_mkldnn_op.py | 21 ++++++++++++++++++- 2 files changed, 24 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc index 57a3c385593160..c332b9194164ea 100644 --- a/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc @@ -148,8 +148,8 @@ class MatMulV2MKLDNNKernel if (x_dims.size() == 1) { x_bd_dims[x_bd_dims.size() - 1] = x_dims[0]; } else if (x_dims.size() == 2) { - x_bd_dims[2] = x_dims[1]; - x_bd_dims[1] = x_dims[0]; + x_bd_dims[x_bd_dims.size() - 1] = x_dims[1]; + x_bd_dims[x_bd_dims.size() - 2] = x_dims[0]; } else { for (size_t i = 0; i < x_dims.size(); ++i) { x_bd_dims[i] = x_dims[i]; @@ -158,8 +158,8 @@ class MatMulV2MKLDNNKernel if (y_dims.size() == 1) { y_bd_dims[x_bd_dims.size() - 2] = y_dims[0]; } else if (y_dims.size() == 2) { - y_bd_dims[2] = y_dims[1]; - y_bd_dims[1] = y_dims[0]; + y_bd_dims[y_bd_dims.size() - 1] = y_dims[1]; + y_bd_dims[y_bd_dims.size() - 2] = y_dims[0]; } else { for (size_t i = 0; i < y_dims.size(); ++i) { y_bd_dims[i] = y_dims[i]; diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_v2_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_v2_mkldnn_op.py index 5cc6651bb0ec8e..994d78126bda58 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_v2_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_v2_mkldnn_op.py @@ -235,6 +235,22 @@ def config(self): self.trans_y = True +class TestMatMulV2MatrixXMatrix6Dx2DOneDNNOp(TestMatMulV2VectorXVectorOneDNNOp): + def config(self): + self.x_shape = (1, 1, 2, 1, 8, 9) + self.y_shape = (9, 12) + self.trans_x = False + self.trans_y = False + + +class TestMatMulV2MatrixXMatrix2Dx5DOneDNNOp(TestMatMulV2VectorXVectorOneDNNOp): + def config(self): + self.x_shape = (20, 5) + self.y_shape = (1, 2, 1, 5, 11) + self.trans_x = False + self.trans_y = False + + # BF16 TESTS def create_bf16_test_class(parent): @OpTestTool.skip_if_not_cpu_bf16() @@ -274,7 +290,8 @@ def calculate_grads(self): 2: [1, 0], 3: [0, 2, 1], 4: [0, 1, 3, 2], - 5: [0, 1, 2, 4, 3] + 5: [0, 1, 2, 4, 3], + 6: [0, 1, 2, 3, 5, 4] } # expand vector so it will be a valid matrix for multiplication @@ -370,6 +387,8 @@ def calculate_grads(self): create_bf16_test_class(TestMatMulV2MatrixXMatrixTransposeXTransposeYOneDNNOp) 
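The matmul_v2 oneDNN fix above comes down to how a 2-D operand is promoted into the broadcasted batch shape: its two dimensions must be written into the last two slots of the dims vector rather than fixed indices 1 and 2, which is what broke the 6-D x 2-D and 2-D x 5-D cases. A hedged Python sketch of the promotion rule (function name is illustrative):

    def promote_2d_for_batched_matmul(shape_2d, ndims):
        # e.g. a (9, 12) matrix against a 6-D tensor becomes [1, 1, 1, 1, 9, 12]
        dims = [1] * ndims
        dims[-2] = shape_2d[0]
        dims[-1] = shape_2d[1]
        return dims

    assert promote_2d_for_batched_matmul((9, 12), 6) == [1, 1, 1, 1, 9, 12]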
create_bf16_test_class(TestMatMulV2MatrixXMatrixTransposeY2OneDNNOp) create_bf16_test_class(TestMatMulV2MatrixXMatrix5DTranposeYOneDNNOp) +create_bf16_test_class(TestMatMulV2MatrixXMatrix6Dx2DOneDNNOp) +create_bf16_test_class(TestMatMulV2MatrixXMatrix2Dx5DOneDNNOp) if __name__ == "__main__": paddle.enable_static() From e5b4dd7386486610a183460e88e21b8899bd1d55 Mon Sep 17 00:00:00 2001 From: danleifeng <52735331+danleifeng@users.noreply.github.com> Date: Mon, 11 Oct 2021 20:47:08 +0800 Subject: [PATCH 109/298] [heterps] add fuse_allreduce (#35131) * heterps:add fuse_allreduce op; test=develop * add program_mode in minimize for pslib mode;test=develop --- python/paddle/distributed/fleet/utils/fs.py | 13 +- .../fleet/parameter_server/pslib/__init__.py | 13 +- python/paddle/fluid/transpiler/collective.py | 267 +++++++++++++++++- 3 files changed, 284 insertions(+), 9 deletions(-) diff --git a/python/paddle/distributed/fleet/utils/fs.py b/python/paddle/distributed/fleet/utils/fs.py index d3f84d50ac8f9f..f56580f8ca2fe6 100644 --- a/python/paddle/distributed/fleet/utils/fs.py +++ b/python/paddle/distributed/fleet/utils/fs.py @@ -468,10 +468,17 @@ def __init__( self._bd_err_re = re.compile( r'\s?responseErrorMsg\s?\:.*, errorCode\:\s?[0-9]+, path\:') - def _run_cmd(self, cmd, redirect_stderr=False): + def _run_cmd(self, cmd, redirect_stderr=False, retry_times=5): exe_cmd = "{} -{}".format(self._base_cmd, cmd) - ret, output = core.shell_execute_cmd(exe_cmd, 0, 0, redirect_stderr) - ret = int(ret) + ret = 0 + output = None + retry_sleep_second = 3 + for x in range(retry_times + 1): + ret, output = core.shell_execute_cmd(exe_cmd, 0, 0, redirect_stderr) + ret = int(ret) + if ret == 0: + break + time.sleep(retry_sleep_second) if ret == 134: raise FSShellCmdAborted(cmd) return ret, output.splitlines() diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py index d245ce222ca6cf..78af7fd65dccbb 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py @@ -1091,7 +1091,8 @@ def minimize(self, scopes=None, startup_programs=None, parameter_list=None, - no_grad_set=None): + no_grad_set=None, + program_mode="all_reduce"): """ minimize a program through loss, loss can be a list in DistributedOptimizer. Note that in parameter server mode, a worker will not get anything about optimize_os @@ -1105,6 +1106,7 @@ def minimize(self, in `parameter_list`. parameter_list (list): list of Variables to update. no_grad_set (set|None): set of Variables should be ignored. + program_mode (str|"all_reduce"): grad action for grogram when use_ps_gpu. Returns: tuple: (optimize_ops, params_grads) which are, list of operators appended; and list of (param, grad) Variables pair for optimization. 
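Right after this docstring, the patch guards the new program_mode value before transpiling. A standalone restatement of that check, kept deliberately minimal (the helper name is illustrative; the accepted values and the error come from the diff below):

    VALID_PROGRAM_MODES = ("all_reduce", "fuse_all_reduce", "all_gather")

    def check_program_mode(program_mode="all_reduce"):
        # only these grad-aggregation modes are accepted when use_ps_gpu is on
        if program_mode not in VALID_PROGRAM_MODES:
            raise ValueError("You should set program_mode in [ all_reduce, "
                             "fuse_all_reduce, all_gather ]")
        return program_mode

    assert check_program_mode("fuse_all_reduce") == "fuse_all_reduce"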
@@ -1139,12 +1141,17 @@ def minimize(self, if opt_info["use_ps_gpu"]: from paddle.fluid.transpiler.collective import MultiThread # check start program - + if program_mode not in [ + "all_reduce", "fuse_all_reduce", "all_gather" + ]: + raise ValueError("You should set program_mode in [ all_reduce, \ + fuse_all_reduce, all_gather ]") env = self.get_dist_env() if not isinstance(losses, list): startup_programs = [startup_programs] for i in range(0, len(startup_programs)): - t = MultiThread() + + t = MultiThread(trans_mode=program_mode) start_program = startup_programs[i] main_program = programs[i] t.transpile( diff --git a/python/paddle/fluid/transpiler/collective.py b/python/paddle/fluid/transpiler/collective.py index ec8602ec7e6726..ea88a89e68224c 100644 --- a/python/paddle/fluid/transpiler/collective.py +++ b/python/paddle/fluid/transpiler/collective.py @@ -65,7 +65,7 @@ def transpile(self, startup_program, main_program, rank, endpoints, self.main_program = default_main_program() self.nranks = len(endpoints) - if self.nranks == 1 and self.mode != "single_process_multi_thread": + if self.nranks == 1 and self.mode != "single_process_multi_thread" and self.mode != "box": raise ValueError('the number of endpoints must > 1') if rank < 0: @@ -441,9 +441,14 @@ class MultiThread(GradAllReduce): ''' ''' - def __init__(self, nrings=1): + def __init__(self, nrings=1, trans_mode="all_reduce"): GradAllReduce.__init__(self, nrings) - self.mode = "single_process_multi_thread" + self.mode = "box" + self.trans_mode = trans_mode + self.fuse_grad_size_in_num = 128 + gpu_nums = os.getenv("FLAGS_selected_gpus", + "0,1,2,3,4,5,6,7,8").split(",") + self.gpu_num = len(gpu_nums) def _transpile_startup_program(self): if len(self.endpoints) > 1: @@ -460,3 +465,259 @@ def _transpile_startup_program(self): print("begin to _transpile_startup_program for single-node") block = self.startup_program.global_block() block.append_op(type='c_comm_init_all', attrs={'ring_id': 0}) + + def _transpile_main_program(self): + self._insert_scale_loss_grad_ops() + if self.trans_mode == "all_gather": + print("begin to transpile in all-gather mode") + self.allgather_ranks = self.nranks * self.gpu_num + self._insert_allgather_ops() + self._update_adam_ops() + elif self.trans_mode == "fuse_all_reduce": + print("begin to transpile in fuse all-reduce mode") + self._insert_fuse_allreduce_ops() + else: + print("begin to transpile in all-reduce mode") + self._insert_allreduce_ops() + + def _insert_allgather_ops(self): + """ + insert allgather op to the main_program + """ + block = self.main_program.global_block() + ring_id = -1 + grad = None + for idx, op in reversed(list(enumerate(block.ops))): + if self._is_backward_op(op) and \ + self.op_role_var_key in op.attr_names: + op_role_var = op.all_attrs()[self.op_role_var_key] + if len(op_role_var) == 0: + continue + assert len(op_role_var) % 2 == 0 + + offset = idx + for i in range(0, len(op_role_var), 2): + param = block.vars[op_role_var[i]] + new_grad_var = block.create_var( + name=op_role_var[i] + "_allgather", + shape=[self.allgather_ranks] + list(param.shape), + persistable=False, + dtype=core.VarDesc.VarType.FP32, + stop_gradient=True) + grad = block.vars[op_role_var[i + 1]] + if param.is_distributed: # no need to care: used in PLSC + continue + + if offset == idx: + offset += 1 + block._insert_op( + offset, + type='c_sync_calc_stream', + inputs={'X': grad}, + outputs={'Out': grad}, + attrs={self.op_role_key: OpRole.Backward}) + offset += 1 + + # As we search ops reversedly, we should insert 
c_allgather + # op in the same way to keep the ring_id alternate + ring_id = (ring_id + 1) % self.nrings + block._insert_op( + offset, + type='c_allgather', + inputs={'X': grad}, + outputs={'Out': new_grad_var}, + attrs={ + 'nranks': self.allgather_ranks, + 'ring_id': ring_id, + self.op_role_key: OpRole.Backward + }) + + if grad is None: + return + + for idx, op in enumerate(block.ops): + if self._is_optimizer_op(op): + for ring_id in range(self.nrings): + block._insert_op( + idx + ring_id, + type='c_sync_comm_stream', + inputs={'X': grad}, + outputs={'Out': grad}, + attrs={ + 'ring_id': ring_id, + self.op_role_key: OpRole.Backward + }) + break + + def _update_adam_ops(self): + """ + remove the original adam op, and add new adam ops + """ + block = self.main_program.global_block() + + for idx, op in reversed(list(enumerate(block.ops))): + if self._is_optimizer_op(op): + offset = idx + if op.type != 'adam' and op.type != 'lamb': # filter out scale op + continue + param_name = op.input("Param")[0] + inputs = { + "Param": block.vars[op.input("Param")[0]], + "LearningRate": block.vars[op.input("LearningRate")[0]], + "Moment1": block.vars[op.input("Moment1")[0]], + "Moment2": block.vars[op.input("Moment2")[0]], + "Beta1Pow": block.vars[op.input("Beta1Pow")[0]], + "Beta2Pow": block.vars[op.input("Beta2Pow")[0]] + } + outputs = { + "ParamOut": block.vars[op.output("ParamOut")[0]], + "Moment1Out": block.vars[op.output("Moment1Out")[0]], + "Moment2Out": block.vars[op.output("Moment2Out")[0]], + "Beta1PowOut": block.vars[op.output("Beta1PowOut")[0]], + "Beta2PowOut": block.vars[op.output("Beta2PowOut")[0]] + } + attrs = { + "epsilon": op.attr('epsilon'), + "beta1": op.attr('beta1'), + "beta2": op.attr('beta2'), + "lazy_mode": op.attr('lazy_mode'), + "min_row_size_to_use_multithread": + op.attr('min_row_size_to_use_multithread') + } + split_vars = [ + block.create_var( + name=param_name + "_" + str(i), + shape=block.vars[op.input("Param")[0]].shape, + persistable=False, + dtype=core.VarDesc.VarType.FP32, + stop_gradient=True) for i in range(self.allgather_ranks) + ] + block._insert_op( + offset, + type="split", + inputs={ + 'X': block.vars[op.input("Param")[0] + "_allgather"] + }, + outputs={'Out': split_vars}, + attrs={'num': self.allgather_ranks, + 'axis': 0}) + offset += 1 + + for i in range(self.allgather_ranks): + inputs["Grad"] = split_vars[i] + block._insert_op( + offset, + type=op.type, + inputs=inputs, + outputs=outputs, + attrs=attrs) + offset += 1 + # remove the original adam op + block._remove_op(offset) + + def _insert_fuse_allreduce_ops(self): + """ + insert coalesce_tensor and all reduce ops + """ + block = self.main_program.global_block() + ring_id = 0 % self.nrings + grad = None + param_grads = [] + # find all grad params + for op in reversed(block.ops): + if self._is_backward_op(op) and \ + self.op_role_var_key in op.attr_names: + op_role_var = op.all_attrs()[self.op_role_var_key] + if len(op_role_var) == 0: + continue + assert len(op_role_var) % 2 == 0, "vars need to be one param var followed by one grad var, " \ + "but got odd number of vars" + for i in range(0, len(op_role_var), 2): + param_name = op_role_var[i] + param = block.var(param_name) + grad_name = op_role_var[i + 1] + grad = block.var(grad_name) + if param.is_distributed: + continue + param_grads.append(grad) + if grad is None: + return + + segments = [] + last_dtype = None + # split the grad based on dtype and fused size + for var in param_grads: + if len(segments) == 0 \ + or len(segments[-1]) == 
self.fuse_grad_size_in_num \ + or var.dtype != last_dtype: + segments.append([var]) + last_dtype = var.dtype + else: + segments[-1].append(var) + + fused_vars = [] + for idx, op in enumerate(block.ops): + if self._is_optimizer_op(op): + for segment in segments: + # insert coalesce tensor + tmp_var = block.create_var( + name=unique_name.generate('FusedOutput_{}'.format( + segment[0].name)), + dtype=segment[0].dtype, + persistable=False, + stop_gradient=True) + fused_vars.append(tmp_var) + block._insert_op( + idx, + type="coalesce_tensor", + inputs={"Input": segment}, + outputs={"Output": segment, + "FusedOutput": tmp_var}, + attrs={ + "copy_data": True, + "use_align": True, + "dtype": segment[0].dtype, + self.op_role_key: OpRole.Backward + }) + break + + # insert the allreduce_sum op + for idx, op in enumerate(block.ops): + if self._is_optimizer_op(op): + for fused_var in fused_vars: + block._insert_op( + idx, + type='c_allreduce_sum', + inputs={'X': fused_var}, + outputs={'Out': fused_var}, + attrs={ + 'ring_id': ring_id, + 'use_calc_stream': False, + self.op_role_key: OpRole.Backward + }) + block._insert_op( + idx, + type='c_sync_calc_stream', + inputs={'X': fused_var}, + outputs={'Out': fused_var}, + attrs={self.op_role_key: OpRole.Backward}) + break + + if len(fused_vars) == 0: + block._sync_with_cpp() + return + + # insert the sync comm op + for idx, op in enumerate(block.ops): + if self._is_optimizer_op(op): + block._insert_op( + idx, + type='c_sync_comm_stream', + inputs={'X': fused_vars[0]}, + outputs={'Out': fused_vars[0]}, + attrs={ + 'ring_id': ring_id, + self.op_role_key: OpRole.Backward + }) + break + block._sync_with_cpp() From 6d353aa524770279a9b216e011d6623b7be0ea35 Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Mon, 11 Oct 2021 20:59:49 +0800 Subject: [PATCH 110/298] refine auto_growth allocator (#35732) * do not use alignedAllocator when cuda has alignment * update test * fix error during multiple process --- .../memory/allocation/aligned_allocator.cc | 1 + .../memory/allocation/allocator_facade.cc | 36 ++++++++++++++++++- .../auto_growth_best_fit_allocator.cc | 15 ++++---- .../auto_growth_best_fit_allocator_test.cc | 14 +++++--- 4 files changed, 55 insertions(+), 11 deletions(-) diff --git a/paddle/fluid/memory/allocation/aligned_allocator.cc b/paddle/fluid/memory/allocation/aligned_allocator.cc index 1d89918bfebf6a..f0b7f1a4b0d9e7 100644 --- a/paddle/fluid/memory/allocation/aligned_allocator.cc +++ b/paddle/fluid/memory/allocation/aligned_allocator.cc @@ -20,6 +20,7 @@ namespace paddle { namespace memory { namespace allocation { +// For memory address alignment class AlignedAllocation : public Allocation { public: AlignedAllocation(AllocationPtr underlying_allocation, size_t offset) diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 0388e2d13afb0d..281902f3a2b12a 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -23,6 +23,7 @@ #ifdef PADDLE_WITH_ASCEND_CL #include "paddle/fluid/memory/allocation/npu_pinned_allocator.h" #endif +#include "paddle/fluid/memory/allocation/aligned_allocator.h" #include "paddle/fluid/memory/allocation/retry_allocator.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" @@ -201,6 +202,8 @@ class AllocatorFacadePrivate { inline const std::shared_ptr& GetAllocator( const platform::Place& place, size_t size) { + VLOG(4) << "GetAllocator" + << " " << place << " " << size; 
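The allocator rework that follows hinges on alignment: requested sizes are always rounded up to a multiple of the allocator's alignment, and the extra AlignedAllocator wrapper is only kept when the device's guaranteed address alignment (textureAlignment) is smaller than GpuMinChunkSize. A small sketch of the rounding rule, written in Python for brevity (this mirrors the intent of AlignedSize, it is not a copy of the C++ helper):

    def aligned_size(size, alignment):
        # round size up to the next multiple of alignment
        remainder = size % alignment
        return size if remainder == 0 else size + alignment - remainder

    assert aligned_size(1, 256) == 256
    assert aligned_size(512, 256) == 512
    assert aligned_size(513, 256) == 768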
const auto& allocators = (size > 0 ? (UNLIKELY(FLAGS_use_system_allocator) ? system_allocators_ : GetAllocatorMap()) @@ -256,8 +259,39 @@ class AllocatorFacadePrivate { void InitAutoGrowthCUDAAllocator(platform::CUDAPlace p, bool allow_free_idle_chunk) { auto cuda_allocator = std::make_shared(p); + auto alignment = platform::GpuMinChunkSize(); + bool need_addr_align = true; + // NOTE: sometimes, since cuda runtime can not be forked, calling any cuda + // API in that case may got cuda error(3), i.e., + // cudaErrorInitializationError. And, the CUDAAllocator is only initialized + // but not really used. + // Here, the try-catch block is added to handle the case that + // GetDeviceProperties() may failed in the multiple process(for example, in + // dataloader with num_worker > 0) + try { + const auto& prop = platform::GetDeviceProperties(p.GetDeviceId()); + need_addr_align = prop.textureAlignment < alignment; + VLOG(4) << "GetDeviceProperties ok, textureAlignment: " + << prop.textureAlignment + << ", set need_addr_align=" << need_addr_align; + } catch (...) { + need_addr_align = true; + VLOG(4) << "GetDeviceProperties failed, set need_addr_align=true"; + } + // The address returned is aligned already, + // ref: + // https://stackoverflow.com/questions/14082964/cuda-alignment-256bytes-seriously/14083295#14083295 + std::shared_ptr underlying_allocator{nullptr}; + if (need_addr_align) { + VLOG(10) << "use AlignedAllocator with alignment: " << alignment; + underlying_allocator = + std::make_shared(underlying_allocator, alignment); + } else { + VLOG(10) << "not use AlignedAllocator with alignment: " << alignment; + underlying_allocator = cuda_allocator; + } allocators_[p] = std::make_shared( - cuda_allocator, platform::GpuMinChunkSize(), allow_free_idle_chunk); + underlying_allocator, alignment, 0, allow_free_idle_chunk); } #endif diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc index f36d589f907fb4..9f34f5198a1796 100644 --- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc @@ -40,14 +40,14 @@ namespace allocation { AutoGrowthBestFitAllocator::AutoGrowthBestFitAllocator( const std::shared_ptr &underlying_allocator, size_t alignment, size_t chunk_size, bool allow_free_idle_chunk) - : underlying_allocator_( - std::make_shared(underlying_allocator, alignment)), + : underlying_allocator_(underlying_allocator), alignment_(alignment), chunk_size_(std::max(AlignedSize(chunk_size, alignment), alignment)), allow_free_idle_chunk_(allow_free_idle_chunk) {} -Allocation *AutoGrowthBestFitAllocator::AllocateImpl(size_t size) { - size = AlignedSize(size, alignment_); +Allocation *AutoGrowthBestFitAllocator::AllocateImpl(size_t unaligned_size) { + size_t size = AlignedSize(unaligned_size, alignment_); + VLOG(10) << "Allocate " << unaligned_size << " bytes, aligned to " << size; std::lock_guard guard(spinlock_); auto iter = free_blocks_.lower_bound(std::make_pair(size, nullptr)); @@ -57,6 +57,8 @@ Allocation *AutoGrowthBestFitAllocator::AllocateImpl(size_t size) { free_blocks_.erase(iter); auto *chunk = block_it->chunk_; size_t remaining_size = block_it->size_ - size; + VLOG(10) << "Allocate " << size << " bytes from chunk size " + << block_it->size_ << ", remaining " << remaining_size; if (remaining_size == 0) { block_it->is_free_ = false; } else { @@ -95,13 +97,14 @@ Allocation *AutoGrowthBestFitAllocator::AllocateImpl(size_t 
size) { } blocks.emplace_back(p + remaining_size, size, false, chunk); block_it = --(blocks.end()); - VLOG(2) << "Not found and reallocate " << realloc_size << ", and remaining " - << remaining_size; + VLOG(2) << "Not found and reallocate " << realloc_size << "(" + << static_cast(p) << "), and remaining " << remaining_size; } return new BlockAllocation(block_it); } void AutoGrowthBestFitAllocator::FreeImpl(Allocation *allocation) { + VLOG(10) << "Free " << allocation->size() << " bytes"; std::lock_guard guard(spinlock_); auto block_it = static_cast(allocation)->block_it_; auto &blocks = block_it->chunk_->blocks_; diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc index 6f2591c8b15c8e..926af8292d2e86 100644 --- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc +++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc @@ -12,10 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h" - #include +#include "paddle/fluid/memory/allocation/aligned_allocator.h" +#include "paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h" + #include "gtest/gtest.h" DECLARE_bool(free_idle_chunk); @@ -50,10 +51,13 @@ static void TestFreeIdleChunk(bool free_idle_chunk, FLAGS_free_idle_chunk = free_idle_chunk; FLAGS_free_when_no_cache_hit = free_when_no_cache_hit; auto recorded_allocator = std::make_shared(); + size_t alignment = 4096; size_t memory_size = 8192; + auto underlying_allocator = + std::make_shared(recorded_allocator, alignment); auto ag_allocator = std::make_shared( - recorded_allocator, alignment); + underlying_allocator, alignment); for (size_t i = 0; i < 10; ++i) { auto allocation = ag_allocator->Allocate(memory_size); @@ -131,8 +135,10 @@ static void TestFreeWhenNoCacheHit(bool free_when_no_cache_hit) { auto underlying_allocator = std::make_shared(memory_capacity); + auto aligned_allocator = + std::make_shared(underlying_allocator, alignment); auto ag_allocator = std::make_shared( - underlying_allocator, alignment); + aligned_allocator, alignment); ag_allocator->Allocate(allocate_size[0]); ASSERT_EQ(underlying_allocator->AllocatedSize(), From 2a75b44727173dd4317adb61648f27bfbedbeecc Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Tue, 12 Oct 2021 10:03:57 +0800 Subject: [PATCH 111/298] Fix stop_gradient in RunProgramOp (#36339) * Fix stop_gradient in RunProgramOp * fix reference --- paddle/fluid/operators/run_program_op.h | 26 +++++++--- .../tests/unittests/test_run_program_op.py | 48 +++++++++++++++++++ 2 files changed, 67 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/operators/run_program_op.h b/paddle/fluid/operators/run_program_op.h index ac352876e7871d..04e4dc62b039b1 100644 --- a/paddle/fluid/operators/run_program_op.h +++ b/paddle/fluid/operators/run_program_op.h @@ -142,10 +142,15 @@ static void ShareVarsIntoScope(const std::vector &vars, static void ShareVarsFromScope(const std::vector &vars, const std::vector &var_names, + const BlockDesc &global_block, framework::Scope *scope) { for (size_t i = 0; i < vars.size(); ++i) { + // NOTE: In case of setting out_tmp.stop_gradient = True in model code, all + // parameters before generating out_tmp have no @GRAD, it will raise error + // because we can't findthem in scope. 
So we skip sharing these vars or + // var@GRAD if they don't appear in global block. if (var_names[i] == framework::kEmptyVarName || - var_names[i] == "Fake_var") { + var_names[i] == "Fake_var" || !global_block.HasVar(var_names[i])) { VLOG(2) << "find variable name is " << var_names[i] << ", skip it!"; continue; } @@ -214,8 +219,10 @@ class RunProgramOpKernel : public framework::OpKernel { details::ShareVarsIntoScope(input_vars, input_var_names, &scope); details::ShareVarsIntoScope(param_vars, param_names, &scope); + auto *global_block = ctx.Attr("global_block"); + if (end_op_index > start_op_index) { - auto *program = ctx.Attr("global_block")->Program(); + auto *program = global_block->Program(); auto cache_info = framework::GetExecutorInfoFromCache( *program, ctx.GetPlace(), start_op_index, end_op_index, /*is_grad=*/false, program_id, &scope); @@ -240,8 +247,10 @@ class RunProgramOpKernel : public framework::OpKernel { parallel_executor->RunWithoutFetch(skip_eager_delete_vars); } // Step 4. Get Output - details::ShareVarsFromScope(output_vars, output_var_names, &scope); - details::ShareVarsFromScope(dout_vars, dout_var_names, &scope); + details::ShareVarsFromScope(output_vars, output_var_names, *global_block, + &scope); + details::ShareVarsFromScope(dout_vars, dout_var_names, *global_block, + &scope); // Debug info: scope info when run end VLOG(3) << framework::GenScopeTreeDebugInfo(out_scope_vec->front()); @@ -307,10 +316,11 @@ class RunProgramGradOpKernel : public framework::OpKernel { "least one sub scope.")); auto &scope = *(global_inner_scope->kids().front()); + auto *global_block = ctx.Attr("global_block"); if (end_op_index > start_op_index) { // Step 2. prepare executor and scope - auto *program = ctx.Attr("global_block")->Program(); + auto *program = global_block->Program(); auto cache_info = framework::GetExecutorInfoFromCache( *program, ctx.GetPlace(), start_op_index, end_op_index, /*is_grad*/ true, program_id, &scope); @@ -341,8 +351,10 @@ class RunProgramGradOpKernel : public framework::OpKernel { } // Step 4. get outputs - details::ShareVarsFromScope(input_grad_vars, input_grad_var_names, &scope); - details::ShareVarsFromScope(param_grad_vars, param_grad_names, &scope); + details::ShareVarsFromScope(input_grad_vars, input_grad_var_names, + *global_block, &scope); + details::ShareVarsFromScope(param_grad_vars, param_grad_names, + *global_block, &scope); // Step5. 
drop current scope global_inner_scope->DeleteScope(&scope); diff --git a/python/paddle/fluid/tests/unittests/test_run_program_op.py b/python/paddle/fluid/tests/unittests/test_run_program_op.py index b3d0845a4fbbc1..33b32a6632c9e3 100644 --- a/python/paddle/fluid/tests/unittests/test_run_program_op.py +++ b/python/paddle/fluid/tests/unittests/test_run_program_op.py @@ -343,5 +343,53 @@ def build_model(self): return fwd_op_num +class Net(paddle.nn.Layer): + def __init__(self): + super(Net, self).__init__() + self.fc1 = paddle.nn.Linear(10, 10) + self.fc2 = paddle.nn.Linear(10, 1) + + def forward(self, x): + out = self.fc1(x) + out.stop_gradient = True + out = self.fc2(out) + return out + + +class TestParametersWithStopGradient(unittest.TestCase): + def setUp(self): + self.seed = 2021 + self.iter = 5 + + def train(self, to_static): + # prepare env + paddle.seed(self.seed) + + net = Net() + if to_static: + net = paddle.jit.to_static(net) + sgd = paddle.optimizer.SGD(0.01, parameters=net.parameters()) + + for i in range(self.iter): + x = paddle.rand([4, 10]) + out = net(x) + loss = paddle.mean(out) + + loss.backward() + sgd.minimize(loss) + net.clear_gradients() + + return loss + + def test_stop_gradient(self): + paddle.disable_static() + + dy_loss = self.train(to_static=False) + st_loss = self.train(to_static=True) + self.assertEqual(dy_loss[0], st_loss[0]) + + paddle.enable_static() + + if __name__ == "__main__": unittest.main() From 0594d2a7f086cc64b58f01aeb0299cc06c683825 Mon Sep 17 00:00:00 2001 From: Zeng Jinle <32832641+sneaxiy@users.noreply.github.com> Date: Tue, 12 Oct 2021 10:05:52 +0800 Subject: [PATCH 112/298] Revert "refine case when thread_num = 1 (#36201)" (#36347) This reverts commit 7e60cc63c33f0c17df36b0ee52ae50a3d04a6697. --- .../fast_threaded_ssa_graph_executor.cc | 20 +++---------------- 1 file changed, 3 insertions(+), 17 deletions(-) diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc index eb027d7c2f636a..75998e4582e2bc 100644 --- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc @@ -47,16 +47,7 @@ FastThreadedSSAGraphExecutor::FastThreadedSSAGraphExecutor( << "Change thread number to 1 because the toposort order is unique"; strategy_.num_threads_ = 1; } - if (strategy_.num_threads_ > 1) { - pool_.reset(new ::ThreadPool(strategy.num_threads_)); - } else { - auto nodes = ir::TopologySortOperations(*graph_); - traced_ops_.clear(); - traced_ops_.reserve(nodes.size()); - for (auto *node : nodes) { - traced_ops_.push_back(&node->Wrapper()); - } - } + pool_.reset(new ::ThreadPool(strategy.num_threads_)); for (auto &op : ir::FilterByNodeWrapper(*graph_)) { int dep = static_cast(op->NotReadyInputSize()); op_deps_.emplace(op, dep); @@ -239,7 +230,7 @@ void FastThreadedSSAGraphExecutor::RunOpAsync( OpHandleBase *op, const std::shared_ptr> &complete_q) { ++remaining_; - auto func = [=] { + this->pool_->enqueue([=] { std::deque op_queue; op_queue.push_front(op); @@ -298,12 +289,7 @@ void FastThreadedSSAGraphExecutor::RunOpAsync( } --remaining_; complete_q->Push(complete); - }; - if (pool_) { - pool_->enqueue(func); - } else { - func(); - } + }); } void FastThreadedSSAGraphExecutor::PrepareAtomicOpDeps() { From ec148cab5be5e7298203d2cd5c294b41c0622d8f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?LJQ=E2=9D=A4=EF=B8=8F?= <33169170+lijiaqi0612@users.noreply.github.com> Date: Tue, 12 Oct 2021 10:29:03 
+0800 Subject: [PATCH 113/298] fft: modify sample code result (#36325) --- python/paddle/tensor/fft.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/python/paddle/tensor/fft.py b/python/paddle/tensor/fft.py index f7990e3f89107b..20fd143589fa4b 100644 --- a/python/paddle/tensor/fft.py +++ b/python/paddle/tensor/fft.py @@ -339,7 +339,7 @@ def irfft(x, n=None, axis=-1, norm="backward", name=None): xp = paddle.to_tensor(x) irfft_xp = paddle.fft.irfft(xp).numpy() print(irfft_xp) - # [0. 0. 0. 4.] + # [0. 1. 0. 0.] """ return fft_c2r(x, n, axis, norm, forward=False, name=name) @@ -477,7 +477,7 @@ def fftn(x, s=None, axes=None, norm="backward", name=None): import numpy as np import paddle - x = x = np.mgrid[:4, :4, :4][1] + x = np.mgrid[:4, :4, :4][1] xp = paddle.to_tensor(x) fftn_xp = paddle.fft.fftn(xp, axes=(1, 2)).numpy() print(fftn_xp) @@ -631,9 +631,9 @@ def rfftn(x, s=None, axes=None, norm="backward", name=None): # use axes(2, 0) print(paddle.fft.rfftn(x, axes=(2, 0))) # Tensor(shape=[2, 3, 3], dtype=complex64, place=CUDAPlace(0), stop_gradient=True, - # [[[(24+0j), 0j , 0j ], - # [0j , 0j , 0j ], - # [0j , 0j , 0j ]], + # [[[(8+0j), 0j , 0j ], + # [(8+0j), 0j , 0j ], + # [(8+0j), 0j , 0j ]], # # [[0j , 0j , 0j ], # [0j , 0j , 0j ], @@ -1267,9 +1267,8 @@ def fftshift(x, axes=None, name=None): import paddle x = np.array([3, 1, 2, 2, 3], dtype=float) - scalar_temp = 0.3 n = x.size - fftfreq_xp = paddle.fft.fftfreq(n, d=scalar_temp) + fftfreq_xp = paddle.fft.fftfreq(n, d=0.3) res = paddle.fft.fftshift(fftfreq_xp).numpy() print(res) # [-1.3333334 -0.6666667 0. 0.6666667 1.3333334] @@ -1311,9 +1310,8 @@ def ifftshift(x, axes=None, name=None): import paddle x = np.array([3, 1, 2, 2, 3], dtype=float) - scalar_temp = 0.3 n = x.size - fftfreq_xp = paddle.fft.fftfreq(n, d=scalar_temp) + fftfreq_xp = paddle.fft.fftfreq(n, d=0.3) res = paddle.fft.ifftshift(fftfreq_xp).numpy() print(res) # [ 1.3333334 -1.3333334 -0.6666667 0. 
0.6666667] From d247cf17d11e2ee32921c0b321bafb28d7a3477d Mon Sep 17 00:00:00 2001 From: Haohongxiang <86215757+haohongxiang@users.noreply.github.com> Date: Tue, 12 Oct 2021 10:59:47 +0800 Subject: [PATCH 114/298] =?UTF-8?q?fix=20bugs=20in=20mp=5Flayers=E3=80=81p?= =?UTF-8?q?p=5Flayers=20and=20HybridParallelClipGrad=20(#36144)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix calling bug of HybridParallelClipGrad * fix bugs of HybridParallelClipGrad * add unittest of pp with HybridParallelClipGrad * fix bugs in mp_layers.py * update * fix bugs in pp_layers.py * update --- .../hybrid_parallel_optimizer.py | 36 ++++++++++++------- .../parallel_layers/mp_layers.py | 8 ++--- .../parallel_layers/pp_layers.py | 7 ++++ .../unittests/hybrid_parallel_pp_alexnet.py | 17 ++++----- .../unittests/hybrid_parallel_pp_clip_grad.py | 35 ++++++++++++++++++ ...test_parallel_dygraph_pipeline_parallel.py | 3 ++ 6 files changed, 81 insertions(+), 25 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/hybrid_parallel_pp_clip_grad.py diff --git a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py index 76e326ce20d7cb..6cd875905864bd 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py @@ -52,6 +52,7 @@ def _dygraph_clip(self, params_grads): params_and_grads = [] sum_square_list_dist = [] sum_square_list_not_dist = [] + for p, g in params_grads: if g is None: continue @@ -64,29 +65,38 @@ def _dygraph_clip(self, params_grads): square = layers.square(merge_grad) sum_square = layers.reduce_sum(square) - if p.is_distributed: - sum_square_list_dist.append(sum_square) - else: - sum_square_list_not_dist.append(sum_square) + not_shared_enable = (not hasattr(p, 'is_firstly_shared')) or ( + hasattr(p, 'is_firstly_shared') and + getattr(p, 'is_firstly_shared', True)) - # all parameters have been filterd out - if len(sum_square_list_dist) + len(sum_square_list_not_dist) == 0: - return params_grads + if not_shared_enable: + if p.is_distributed: + sum_square_list_dist.append(sum_square) + else: + sum_square_list_not_dist.append(sum_square) global_norm_var_dist = layers.concat(sum_square_list_dist) if len( sum_square_list_dist) != 0 else layers.concat( [paddle.to_tensor([0.])]) global_norm_var_dist = layers.reduce_sum(global_norm_var_dist) + global_norm_var_not_dist = layers.concat( sum_square_list_not_dist) if len( sum_square_list_not_dist) != 0 else layers.concat( [paddle.to_tensor([0.])]) global_norm_var_not_dist = layers.reduce_sum(global_norm_var_not_dist) - # add all reduce to get global norm of distributed params_and_grads in world size - # all reduce is not needed while getting global norm of non-distributed params_and_grads - paddle.distributed.all_reduce( - global_norm_var_dist, group=self._hcg.get_check_parallel_group()) + # add all reduce to get global norm of distributed params_and_grads + if self._hcg.get_model_parallel_world_size() > 1: + paddle.distributed.all_reduce( + global_norm_var_dist, + group=self._hcg.get_check_parallel_group()) + + # add all reduce to get global norm of non-distributed params_and_grads in groups of pp + if self._hcg.get_pipe_parallel_world_size() > 1: + paddle.distributed.all_reduce( + global_norm_var_not_dist, + 
group=self._hcg.get_pipe_parallel_group()) # In Sharding mode, param and grad is mapping different rank in optimizer. # ClipGradByGlobalNorm need allreduce to get globol norm @@ -143,8 +153,8 @@ def __init__(self, optimizer, hcg, strategy): if isinstance(self._inner_opt._grad_clip, ClipGradByGlobalNorm) and not self._use_dp_mode: - logger.warning("using ClipGradByGlobalNorm in TensorParallel, the origin " \ - "optmizer'grad clip will be changed.") + logger.warning("While using ClipGradByGlobalNorm in TensorParallel, PipelineParallel " \ + "or Sharding, the grad clip of original optimizer will be changed.") if self._sharding_enable: # change sharding inner_optimizer's _grad_clip diff --git a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/mp_layers.py b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/mp_layers.py index 2555d73462b780..2ce8cf7bdeb74e 100644 --- a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/mp_layers.py +++ b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/mp_layers.py @@ -70,7 +70,7 @@ def __init__(self, dtype=self._dtype, is_bias=False) - self.weight.is_distributed = True + self.weight.is_distributed = True if self.is_mp else False def forward(self, x): if self.is_mp: @@ -135,7 +135,7 @@ def __init__(self, dtype=self._dtype, is_bias=False) - self.weight.is_distributed = True + self.weight.is_distributed = True if self.is_mp else False if has_bias: # initialize bias to zero like Megatron @@ -144,7 +144,7 @@ def __init__(self, attr=paddle.nn.initializer.Constant(value=0.0), dtype=self._dtype, is_bias=True) - self.bias.is_distributed = True + self.bias.is_distributed = True if self.is_mp else False else: self.bias = None @@ -212,7 +212,7 @@ def __init__(self, dtype=self._dtype, is_bias=False) - self.weight.is_distributed = True + self.weight.is_distributed = True if self.is_mp else False if has_bias: self.bias = self.create_parameter( diff --git a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/pp_layers.py b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/pp_layers.py index db6fc964895ffc..9920bbd400c709 100755 --- a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/pp_layers.py +++ b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/pp_layers.py @@ -261,6 +261,10 @@ def _synchronize_shared_weights(self): src=min(comm['ranks']), group=comm['group']) + for param in comm['layer'].parameters(): + if self.global_rank != min(comm['ranks']): + setattr(param, 'is_firstly_shared', False) + def allreduce_shared_weight_gradients(self): for key, comm in self.shared_comm.items(): param = getattr(self.shared_layers[key], comm['weight_attr']) @@ -316,6 +320,9 @@ def _build_layer(self): self.shared_layers[layer.layer_name] = layer.build_layer() self.shared_weight_attrs[ layer.layer_name] = layer.shared_weight_attr + for param in self.shared_layers[ + layer.layer_name].parameters(): + setattr(param, "is_firstly_shared", True) if layer.forward_func is None: self.run_function.append(self.shared_layers[ diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_alexnet.py b/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_alexnet.py index 912849ffbeb71c..71e873b0e2f7c9 100644 --- a/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_alexnet.py +++ b/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_alexnet.py @@ -53,6 +53,13 @@ def setUp(self): } fleet.init(is_collective=True, strategy=strategy) + def build_optimizer(self, model): + scheduler = 
paddle.optimizer.lr.PiecewiseDecay( + boundaries=[2], values=[0.001, 0.002], verbose=True) + optimizer = paddle.optimizer.SGD(learning_rate=scheduler, + parameters=model.parameters()) + return scheduler, optimizer + def test_pp_model(self): hcg = fleet.get_hybrid_communicate_group() word_size = hcg.get_model_parallel_world_size() @@ -63,10 +70,7 @@ def test_pp_model(self): #construct model a model_a = AlexNet(10) - scheduler_a = paddle.optimizer.lr.PiecewiseDecay( - boundaries=[2], values=[0.001, 0.002], verbose=True) - optimizer_a = paddle.optimizer.SGD(learning_rate=scheduler_a, - parameters=model_a.parameters()) + scheduler_a, optimizer_a = self.build_optimizer(model_a) param_len = len(model_a.parameters()) @@ -76,10 +80,7 @@ def test_pp_model(self): # construct model b model_b = AlexNetPipeDesc(num_stages=self.pipeline_parallel_size) - scheduler_b = paddle.optimizer.lr.PiecewiseDecay( - boundaries=[2], values=[0.001, 0.002], verbose=True) - optimizer_b = paddle.optimizer.SGD(learning_rate=scheduler_b, - parameters=model_b.parameters()) + scheduler_b, optimizer_b = self.build_optimizer(model_b) model_b = fleet.distributed_model(model_b) optimizer_b = fleet.distributed_optimizer(optimizer_b) diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_clip_grad.py b/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_clip_grad.py new file mode 100644 index 00000000000000..de980f3c3f787e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_clip_grad.py @@ -0,0 +1,35 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
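(For context on what the new test below exercises: ClipGradByGlobalNorm rescales every gradient by clip_norm / max(global_norm, clip_norm), and the hybrid-parallel variant patched above additionally all-reduces the summed squares across the model- and pipeline-parallel groups before the square root is taken. A rough standalone sketch of the single-process math, with plain Python lists standing in for gradient tensors, not the Paddle implementation:

    import math

    def clip_by_global_norm(grads, clip_norm=0.5):
        # grads: list of flat lists of floats standing in for gradient tensors
        global_norm = math.sqrt(sum(v * v for g in grads for v in g))
        scale = clip_norm / max(global_norm, clip_norm)
        return [[v * scale for v in g] for g in grads]

    print(clip_by_global_norm([[3.0, 4.0]]))  # norm 5.0 -> rescaled to norm 0.5
)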
+ +from __future__ import division +from __future__ import print_function + +import paddle +import unittest +from hybrid_parallel_pp_alexnet import TestDistPPTraning + + +class TestPPClipGrad(TestDistPPTraning): + def build_optimizer(self, model): + grad_clip = paddle.nn.ClipGradByGlobalNorm(0.5) + scheduler = paddle.optimizer.lr.PiecewiseDecay( + boundaries=[2], values=[0.001, 0.002], verbose=True) + optimizer = paddle.optimizer.SGD(learning_rate=scheduler, + grad_clip=grad_clip, + parameters=model.parameters()) + return scheduler, optimizer + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_pipeline_parallel.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_pipeline_parallel.py index 7a4f7f9fbd62bd..f54aa1bb6e5561 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_pipeline_parallel.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_pipeline_parallel.py @@ -42,6 +42,9 @@ def test_hybrid_parallel_save_load(self): def test_hybrid_parallel_recompute(self): self.run_mnist_2gpu('hybrid_parallel_pp_recompute.py') + def test_hybrid_parallel_pp_clip_grad(self): + self.run_mnist_2gpu('hybrid_parallel_pp_clip_grad.py') + if __name__ == "__main__": unittest.main() From e275e423043e9df51f0e969ffc81e0dc1562aa01 Mon Sep 17 00:00:00 2001 From: JingZhuangzhuang <75348594+JZZ-NOTE@users.noreply.github.com> Date: Mon, 11 Oct 2021 22:13:17 -0500 Subject: [PATCH 115/298] Add pool2d test convert (#36338) --- .../inference/tensorrt/convert/pool2d_op.cc | 27 ++++++++++++ paddle/fluid/inference/tensorrt/op_teller.cc | 41 ++++++++++--------- .../ir/inference/test_trt_convert_pool2d.py | 30 +++++++++++--- 3 files changed, 73 insertions(+), 25 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc index 1898f28c73ad0b..733a8f64ae5dba 100644 --- a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc @@ -107,6 +107,9 @@ class Pool2dOpConverter : public OpConverter { plugin_pool_type = plugin::PoolPlugin::PoolType::avg; } + if (padding_algorithm == "VALID") { + std::fill(paddings.begin(), paddings.end(), 0); + } nvinfer1::DimsHW nv_ksize(ksize[0], ksize[1]); nvinfer1::DimsHW nv_strides(strides[0], strides[1]); nvinfer1::DimsHW nv_paddings(paddings[0], paddings[1]); @@ -123,6 +126,30 @@ class Pool2dOpConverter : public OpConverter { if (engine_->with_dynamic_shape()) { if (!adaptive && !global_pooling && !ceil_mode) { + auto *pool_layer = TRT_ENGINE_ADD_LAYER(engine_, Pooling, *input1, + nv_pool_type, nv_ksize); + pool_layer->setStride(nv_strides); + pool_layer->setPadding(nv_paddings); + pool_layer->setAverageCountExcludesPadding(exclusive); + if (padding_algorithm == "SAME") { + pool_layer->setPaddingMode(nvinfer1::PaddingMode::kSAME_UPPER); + } + layer = pool_layer; + } else if (!adaptive && !global_pooling && ceil_mode) { + nvinfer1::DimsHW pre_pad(0, 0); + nvinfer1::DimsHW post_pad(0, 0); + // If ceil mode is true, we will pad the appropriate size to the input. + DealCeilMode(input_shape, ksize, strides, paddings, &pre_pad, &post_pad, + input_dims); + auto *pad_layer = TRT_ENGINE_ADD_LAYER( + engine_, Padding, *const_cast(input1), pre_pad, + post_pad); + PADDLE_ENFORCE_NOT_NULL( + pad_layer, platform::errors::Fatal( + "Pad layer in poolOp converter could not be " + "created. 
The pointer to pad layer is `NULL`.")); + input1 = pad_layer->getOutput(0); + auto *pool_layer = TRT_ENGINE_ADD_LAYER(engine_, Pooling, *input1, nv_pool_type, nv_ksize); pool_layer->setStride(nv_strides); diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 7a70ceda60c1fb..ef50aee48e2eb8 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -174,22 +174,8 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, if (op_type == "pool2d") { std::vector paddings = BOOST_GET_CONST(std::vector, desc.GetAttr("paddings")); - if (paddings.size() > 2) return false; - if (desc.HasAttr("exclusive")) { - if (BOOST_GET_CONST(bool, desc.GetAttr("exclusive"))) { - std::vector ksize = - BOOST_GET_CONST(std::vector, desc.GetAttr("ksize")); - for (size_t i = 0; i < ksize.size(); i++) { - if (ksize[i] <= paddings[i]) { - VLOG(3) << "the padding size should be less than the filter size " - "for exclusive-counting pooling."; - return false; - } - } - } - } - if (desc.HasAttr("ceil_mode")) { - if (BOOST_GET_CONST(bool, desc.GetAttr("ceil_mode"))) return false; + if (paddings.size() > 2) { + return false; } if (desc.Input("X").size() != 1) { VLOG(3) << "TRT Pool2d expect 1 input, but got " @@ -211,15 +197,32 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, << pool_type << " pool type."; return false; } + if (pool_type == "avg") { + if (desc.HasAttr("global_pooling")) { + if (!BOOST_GET_CONST(bool, desc.GetAttr("global_pooling"))) { + if (desc.HasAttr("exclusive")) { + if (BOOST_GET_CONST(bool, desc.GetAttr("exclusive"))) { + std::vector ksize = + BOOST_GET_CONST(std::vector, desc.GetAttr("ksize")); + for (size_t i = 0; i < ksize.size(); i++) { + if (ksize[i] <= paddings[i]) { + VLOG(3) << "the padding size should be less than the " + "filter size " + "for exclusive-counting pooling."; + return false; + } + } + } + } + } + } + } } } if (op_type == "conv2d" || op_type == "conv2d_transpose" || op_type == "conv2d_fusion" || op_type == "depthwise_conv2d" || op_type == "depthwise_conv2d_transpose") { - std::vector paddings = - BOOST_GET_CONST(std::vector, desc.GetAttr("paddings")); - if (desc.Input("Input").size() != 1) { VLOG(3) << "TRT Conv2d expect 1 input, but got " << desc.Input("Input").size() << " input."; diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_pool2d.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_pool2d.py index 3e923b1bd89d60..9ec2f83fa5ba0a 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_pool2d.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_pool2d.py @@ -21,9 +21,22 @@ class TrtConvertPool2dTest(TrtLayerAutoScanTest): - def is_program_valid(self, program_config: ProgramConfig) -> bool: + def is_paddings_valid(self, program_config: ProgramConfig) -> bool: + exclusive = program_config.ops[0].attrs['exclusive'] + paddings = program_config.ops[0].attrs['paddings'] + ksize = program_config.ops[0].attrs['ksize'] + pooling_type = program_config.ops[0].attrs['pooling_type'] + global_pooling = program_config.ops[0].attrs['global_pooling'] + if global_pooling == False: + if pooling_type == 'avg': + for index in range(len(ksize)): + if ksize[index] <= paddings[index]: + return False return True + def is_program_valid(self, program_config: ProgramConfig) -> bool: + return self.is_paddings_valid(program_config) + def 
sample_program_configs(self): self.trt_param.workspace_size = 1073741824 @@ -34,7 +47,7 @@ def generate_weight1(attrs: List[Dict[str, Any]]): return np.random.random([24, 3, 3, 3]).astype(np.float32) for strides in [[1, 1], [2, 2], [1, 2]]: - for paddings in [[0, 2], [0, 3], [1, 2, 3, 4]]: + for paddings in [[0, 2], [0, 3], [0, 1, 2, 3]]: for pooling_type in ['max', 'avg']: for padding_algotithm in ['EXPLICIT', 'SAME', 'VAILD']: for ksize in [[2, 3], [3, 3]]: @@ -43,7 +56,6 @@ def generate_weight1(attrs: List[Dict[str, Any]]): for exclusive in [True, False]: for adaptive in [True, False]: for ceil_mode in [True, False]: - self.paddings = paddings dics = [{ "pooling_type": @@ -102,9 +114,6 @@ def clear_dynamic_shape(): self.dynamic_shape.opt_input_shape = {} def generate_trt_nodes_num(attrs, dynamic_shape): - if self.paddings == [0, 3] or attrs[0][ - 'global_pooling'] == True or attrs[0]['ceil_mode'] == True: - return 0, 3 return 1, 2 attrs = [ @@ -139,6 +148,15 @@ def teller1(program_config, predictor_config): self.add_skip_case(teller1, SkipReasons.TRT_NOT_IMPLEMENTED, "4-dims paddings are not support for trt now.") + def teller2(program_config, predictor_config): + if program_config.ops[0].attrs['global_pooling'] == True: + return True + return False + + self.add_skip_case( + teller2, SkipReasons.TRT_NOT_IMPLEMENTED, + "It is not support that global_pooling is true for trt now.") + def test(self): self.add_skip_trt_case() self.run_test() From 8cc7146d1c53000888b4f6f063aed7db8c9ff922 Mon Sep 17 00:00:00 2001 From: Qi Li Date: Tue, 12 Oct 2021 11:16:31 +0800 Subject: [PATCH 116/298] [NPU] add int64 kernel for slice, test=develop (#36328) * [NPU] add int64 kernel for scale and slice, test=develop * remove int64 for scale, test=develop --- paddle/fluid/operators/scale_op_npu.cc | 5 +- paddle/fluid/operators/slice_op_npu.cc | 39 +++++------ .../tests/unittests/npu/test_slice_op_npu.py | 64 +++++++++++++++++++ 3 files changed, 80 insertions(+), 28 deletions(-) diff --git a/paddle/fluid/operators/scale_op_npu.cc b/paddle/fluid/operators/scale_op_npu.cc index 23817190208693..744a9b137f622e 100644 --- a/paddle/fluid/operators/scale_op_npu.cc +++ b/paddle/fluid/operators/scale_op_npu.cc @@ -12,11 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include -#include - -#include "paddle/fluid/operators/npu_op_runner.h" #include "paddle/fluid/operators/scale_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/slice_op_npu.cc b/paddle/fluid/operators/slice_op_npu.cc index f8bf46da4a6383..52351a98bce37d 100644 --- a/paddle/fluid/operators/slice_op_npu.cc +++ b/paddle/fluid/operators/slice_op_npu.cc @@ -12,18 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the Licnse. 
*/ -#include -#include - -#include "paddle/fluid/framework/ddim.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/npu_op_runner.h" #include "paddle/fluid/operators/slice_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" namespace paddle { namespace operators { using Tensor = framework::Tensor; +using NPUDeviceContext = platform::NPUDeviceContext; void UpdateAttr(const framework::DDim& in_dims, const std::vector axes, const std::vector starts, const std::vector ends, @@ -54,7 +50,7 @@ void UpdateAttr(const framework::DDim& in_dims, const std::vector axes, } } -template +template class SliceNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -128,17 +124,14 @@ class SliceNPUKernel : public framework::OpKernel { UpdateAttr(in_dims, axes, starts, ends, &offsets, &size); + auto stream = ctx.template device_context().stream(); const auto& runner = NpuOpRunner("SliceD", {*input}, {*out}, {{"offsets", offsets}, {"size", size}}); - - auto stream = - ctx.template device_context() - .stream(); runner.Run(stream); } }; -template +template class SliceGradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -221,15 +214,13 @@ class SliceGradNPUKernel : public framework::OpKernel { namespace ops = paddle::operators; -REGISTER_OP_NPU_KERNEL( - slice, ops::SliceNPUKernel, - ops::SliceNPUKernel, - ops::SliceNPUKernel); - -REGISTER_OP_NPU_KERNEL( - slice_grad, - ops::SliceGradNPUKernel, - ops::SliceGradNPUKernel, - ops::SliceGradNPUKernel); +REGISTER_OP_NPU_KERNEL(slice, ops::SliceNPUKernel, + ops::SliceNPUKernel, +#ifdef PADDLE_WITH_ASCEND_INT64 + ops::SliceNPUKernel, +#endif + ops::SliceNPUKernel); + +REGISTER_OP_NPU_KERNEL(slice_grad, ops::SliceGradNPUKernel, + ops::SliceGradNPUKernel, + ops::SliceGradNPUKernel); diff --git a/python/paddle/fluid/tests/unittests/npu/test_slice_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_slice_op_npu.py index 055c3015f82f5a..611691109e187b 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_slice_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_slice_op_npu.py @@ -527,5 +527,69 @@ def init_dtype(self): self.dtype = np.float16 +class TestSliceOpInt64(OpTest): + def set_npu(self): + self.__class__.use_npu = True + self.place = paddle.NPUPlace(0) + + def setUp(self): + self.op_type = "slice" + self.set_npu() + self.init_dtype() + self.config() + self.inputs = {'Input': self.input} + self.outputs = {'Out': self.out} + self.attrs = { + 'axes': self.axes, + 'starts': self.starts, + 'ends': self.ends, + 'infer_flags': self.infer_flags + } + + def config(self): + self.input = np.random.randint( + 100, size=(3, 4, 5, 6)).astype(self.dtype) + self.starts = [1, 0, 2] + self.ends = [3, 3, 4] + self.axes = [0, 1, 2] + self.infer_flags = [1, 1, 1] + self.out = self.input[1:3, 0:3, 2:4, :] + + def init_dtype(self): + self.dtype = np.int64 + + def test_check_output(self): + self.check_output_with_place(self.place) + + +class TestSliceOpTensorInt64(TestSliceOpInt64): + def setUp(self): + self.op_type = "slice" + self.set_npu() + self.init_dtype() + self.config() + self.inputs = { + 'Input': self.input, + 'StartsTensor': self.starts, + 'EndsTensor': self.ends + } + self.outputs = {'Out': self.out} + self.attrs = { + 'axes': self.axes, + 'starts': [-1, -1, -1], + 'ends': [-1, -1, -1], + 'infer_flags': self.infer_flags + } + + def config(self): + self.input = np.random.randint( + 100, 
size=(3, 4, 5, 6)).astype(self.dtype) + self.starts = np.array([1, 0, 2]).astype('int32') + self.ends = np.array([3, 3, 4]).astype('int32') + self.axes = [0, 1, 2] + self.infer_flags = [-1, -1, -1] + self.out = self.input[1:3, 0:3, 2:4, :] + + if __name__ == '__main__': unittest.main() From 1d660eb6767b990f8a5760e7b766a880f88d2d03 Mon Sep 17 00:00:00 2001 From: HydrogenSulfate <490868991@qq.com> Date: Mon, 11 Oct 2021 17:42:25 +0800 Subject: [PATCH 117/298] Fix the bug when axis is specified and weight is provided --- .../unittests/test_cross_entropy_loss.py | 48 +++++++++++++++++++ python/paddle/nn/functional/loss.py | 46 +++++++++++------- 2 files changed, 78 insertions(+), 16 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py b/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py index d2eae1cce5bcb7..6a0d955040f353 100644 --- a/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py +++ b/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py @@ -1175,6 +1175,54 @@ def test_cross_entropy_loss_2d_with_weight_none(self): self.assertTrue(np.allclose(static_ret, expected)) self.assertTrue(np.allclose(dy_ret_value, expected)) + def test_cross_entropy_loss_2d_with_weight_axis_change_mean(self): + input_np = np.random.random(size=(2, 3, 2, 2)).astype(self.dtype) #NCHW + label_np = np.random.randint( + 0, 3, size=(2, 2, 2)).astype(np.int64) #NHW + weight_np = np.random.random(size=(3, )).astype(self.dtype) #C + + paddle.enable_static() + prog = fluid.Program() + startup_prog = fluid.Program() + place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda( + ) else fluid.CPUPlace() + with fluid.program_guard(prog, startup_prog): + input = fluid.data( + name='input', shape=[2, 3, 2, 2], dtype=self.dtype) + label = fluid.data(name='label', shape=[2, 2, 2], dtype='int64') + weight = fluid.data(name='weight', shape=[3], dtype=self.dtype) + cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss( + weight=weight, reduction='mean', axis=1) + # specify the class channels to axis 1 + ret = cross_entropy_loss(input, label) + + exe = fluid.Executor(place) + static_ret = exe.run(prog, + feed={ + 'input': input_np, + 'label': label_np, + "weight": weight_np + }, + fetch_list=[ret]) + + self.assertIsNotNone(static_ret) + with fluid.dygraph.guard(): + cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss( + weight=fluid.dygraph.to_variable(weight_np), reduction='mean') + dy_ret = cross_entropy_loss( + fluid.dygraph.to_variable(input_np), + fluid.dygraph.to_variable(label_np)) + dy_ret_value = dy_ret.numpy() + self.assertIsNotNone(dy_ret_value) + expected = cross_entropy_loss_2d( + np.transpose(input_np, [0, 2, 3, 1]), + label_np, + weight=weight_np, + reduction='mean')[0] + self.assertTrue(np.allclose(static_ret, dy_ret_value)) + self.assertTrue(np.allclose(static_ret, expected)) + self.assertTrue(np.allclose(dy_ret_value, expected)) + def test_cross_entropy_loss_2d_with_weight_mean_ignore_exceedlabel(self): N = 4 C = 3 diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index da2d010c323b58..f4e8711a231e4e 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -1700,19 +1700,26 @@ def cross_entropy(input, out = _C_ops.elementwise_mul(out, weight_gather_reshape) else: - if input.shape[-1] != weight.shape[-1]: + if input.shape[axis] != weight.shape[-1]: raise ValueError( - "input's class_dimension({}) must equal to \ - weight's class_dimension({}) \ - when weight is provided" 
- .format(input.shape[-1], weight.shape[-1])) + "input's class_dimension({}) must equal to " + "weight's class_dimension({}) " + "when weight is provided"\ + .format(input.shape[axis], weight.shape[-1])) ignore_weight_mask = paddle.cast((label != ignore_index), out.dtype) if ignore_weight_mask.ndim > 1 and ignore_weight_mask.shape[ - -1] == 1: - ignore_weight_mask.squeeze_(-1) - weight_gather = _C_ops.gather_nd(weight, valid_label) + axis] == 1: + ignore_weight_mask.squeeze_(axis) + if axis != -1: + temp_perm = list(range(axis % valid_label.ndim)) \ + + list(range((axis + 1) % valid_label.ndim, valid_label.ndim)) \ + + [axis%valid_label.ndim] + weight_gather = _C_ops.gather_nd( + weight, valid_label.transpose(temp_perm)) + else: + weight_gather = _C_ops.gather_nd(weight, valid_label) weight_gather = _C_ops.elementwise_mul(weight_gather, ignore_weight_mask) input_shape = list(label.shape) @@ -1807,20 +1814,27 @@ def cross_entropy(input, weight_gather_reshape = reshape(weight_gather, shape=out_shape) out = paddle.cast(out, weight_gather_reshape.dtype) else: - if input.shape[-1] != weight.shape[-1]: - raise ValueError("input's class_dimension({}) must equal to "\ - "weight's class_dimension({}) "\ - "when weight is provided" - .format(input.shape[-1], weight.shape[-1])) + if input.shape[axis] != weight.shape[-1]: + raise ValueError("input's class_dimension({}) must equal to " + "weight's class_dimension({}) " + "when weight is provided"\ + .format(input.shape[axis], weight.shape[-1])) valid_label = paddle.where(label == ignore_index, paddle.zeros_like(label), label) ignore_weight_mask = paddle.cast((label != ignore_index), input.dtype) if ignore_weight_mask.ndim > 1 and ignore_weight_mask.shape[ - -1] == 1: - ignore_weight_mask = paddle.squeeze(ignore_weight_mask, -1) - weight_gather = paddle.gather_nd(weight, valid_label) + axis] == 1: + ignore_weight_mask = paddle.squeeze(ignore_weight_mask, axis) + if axis != -1: + temp_perm = list(range(axis % valid_label.ndim)) \ + + list(range((axis + 1) % valid_label.ndim, valid_label.ndim)) \ + + [axis % valid_label.ndim] + weight_gather = paddle.gather_nd( + weight, paddle.transpose(valid_label, temp_perm)) + else: + weight_gather = paddle.gather_nd(weight, valid_label) weight_gather = paddle.multiply(weight_gather, ignore_weight_mask) input_shape = list(label.shape) From 8c2fbc3138ff4e17c451cabe605f7f22571d6aaf Mon Sep 17 00:00:00 2001 From: HydrogenSulfate <490868991@qq.com> Date: Mon, 11 Oct 2021 18:35:39 +0800 Subject: [PATCH 118/298] Update loss.py --- python/paddle/nn/functional/loss.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index f4e8711a231e4e..f8e03e476d7f0c 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -1668,12 +1668,12 @@ def cross_entropy(input, format(invalid_label[0], 0)) # TODO: Temporarily use paddle.nonzero instead of paddle.max # to detect and find out possible illegal label values - if len(paddle.nonzero(valid_label >= input.shape[-1])) > 0: + if len(paddle.nonzero(valid_label >= input.shape[axis])) > 0: invalid_label = paddle.gather_nd( - valid_label, paddle.nonzero(valid_label >= input.shape[-1])) + valid_label, paddle.nonzero(valid_label >= input.shape[axis])) raise ValueError( "Target({}) is out of class_dimension's upper bound({})". 
- format(invalid_label[0], input.shape[-1] - 1)) + format(invalid_label[0], input.shape[axis] - 1)) _, out = _C_ops.softmax_with_cross_entropy( input, label, 'soft_label', soft_label, 'ignore_index', From 53dc0143377552418f1c4db39c5a388a75fd52f8 Mon Sep 17 00:00:00 2001 From: HydrogenSulfate <490868991@qq.com> Date: Mon, 11 Oct 2021 20:36:26 +0800 Subject: [PATCH 119/298] Update loss.py --- python/paddle/nn/functional/loss.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index f8e03e476d7f0c..5bb317cf3e7466 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -1712,9 +1712,9 @@ def cross_entropy(input, if ignore_weight_mask.ndim > 1 and ignore_weight_mask.shape[ axis] == 1: ignore_weight_mask.squeeze_(axis) - if axis != -1: + if axis != -1 and axis != valid_label.ndim - 1: temp_perm = list(range(axis % valid_label.ndim)) \ - + list(range((axis + 1) % valid_label.ndim, valid_label.ndim)) \ + + list(range((axis % valid_label.ndim + 1) , valid_label.ndim)) \ + [axis%valid_label.ndim] weight_gather = _C_ops.gather_nd( weight, valid_label.transpose(temp_perm)) From 3675f25df2d176e558a6d6f3179e0879b6f7c9a6 Mon Sep 17 00:00:00 2001 From: HydrogenSulfate <490868991@qq.com> Date: Mon, 11 Oct 2021 21:10:55 +0800 Subject: [PATCH 120/298] Update loss.py --- python/paddle/nn/functional/loss.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index 5bb317cf3e7466..eb043c005663a7 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -1670,7 +1670,8 @@ def cross_entropy(input, # to detect and find out possible illegal label values if len(paddle.nonzero(valid_label >= input.shape[axis])) > 0: invalid_label = paddle.gather_nd( - valid_label, paddle.nonzero(valid_label >= input.shape[axis])) + valid_label, + paddle.nonzero(valid_label >= input.shape[axis])) raise ValueError( "Target({}) is out of class_dimension's upper bound({})". 
format(invalid_label[0], input.shape[axis] - 1)) From 6cd41cec2146da2f5008a42e972a4627a4deb26d Mon Sep 17 00:00:00 2001 From: HydrogenSulfate <490868991@qq.com> Date: Mon, 11 Oct 2021 22:15:05 +0800 Subject: [PATCH 121/298] Update loss.py --- python/paddle/nn/functional/loss.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index eb043c005663a7..38d4da17cbefa4 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -1712,11 +1712,12 @@ def cross_entropy(input, out.dtype) if ignore_weight_mask.ndim > 1 and ignore_weight_mask.shape[ axis] == 1: - ignore_weight_mask.squeeze_(axis) + # TODO: Temporarily use squeeze instead of squeeze_ + ignore_weight_mask = paddle.squeeze(ignore_weight_mask, axis) if axis != -1 and axis != valid_label.ndim - 1: temp_perm = list(range(axis % valid_label.ndim)) \ + list(range((axis % valid_label.ndim + 1) , valid_label.ndim)) \ - + [axis%valid_label.ndim] + + [axis % valid_label.ndim] weight_gather = _C_ops.gather_nd( weight, valid_label.transpose(temp_perm)) else: @@ -1828,9 +1829,9 @@ def cross_entropy(input, if ignore_weight_mask.ndim > 1 and ignore_weight_mask.shape[ axis] == 1: ignore_weight_mask = paddle.squeeze(ignore_weight_mask, axis) - if axis != -1: + if axis != -1 and axis != valid_label.ndim - 1: temp_perm = list(range(axis % valid_label.ndim)) \ - + list(range((axis + 1) % valid_label.ndim, valid_label.ndim)) \ + + list(range((axis % valid_label.ndim + 1), valid_label.ndim)) \ + [axis % valid_label.ndim] weight_gather = paddle.gather_nd( weight, paddle.transpose(valid_label, temp_perm)) From a4246b90646101f8dd7734d2d8ee5ce8106b67a8 Mon Sep 17 00:00:00 2001 From: HydrogenSulfate <490868991@qq.com> Date: Mon, 11 Oct 2021 23:13:41 +0800 Subject: [PATCH 122/298] Update test_cross_entropy_loss.py --- python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py b/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py index 6a0d955040f353..c4be262e93029c 100644 --- a/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py +++ b/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py @@ -1208,7 +1208,7 @@ def test_cross_entropy_loss_2d_with_weight_axis_change_mean(self): self.assertIsNotNone(static_ret) with fluid.dygraph.guard(): cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss( - weight=fluid.dygraph.to_variable(weight_np), reduction='mean') + weight=fluid.dygraph.to_variable(weight_np), reduction='mean', axis=1) dy_ret = cross_entropy_loss( fluid.dygraph.to_variable(input_np), fluid.dygraph.to_variable(label_np)) From 59841e6f324e3a0fe49b047bdff1e425a67497fb Mon Sep 17 00:00:00 2001 From: HydrogenSulfate <490868991@qq.com> Date: Mon, 11 Oct 2021 23:44:26 +0800 Subject: [PATCH 123/298] Update test_cross_entropy_loss.py --- .../paddle/fluid/tests/unittests/test_cross_entropy_loss.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py b/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py index c4be262e93029c..d3ed76e34a614d 100644 --- a/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py +++ b/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py @@ -1208,7 +1208,9 @@ def test_cross_entropy_loss_2d_with_weight_axis_change_mean(self): 
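(For reference, the quantity the weighted, axis-specified tests in this series check can be written in a few lines of numpy: move the class axis to the end, gather the per-class weight for each target, and divide the weighted sum of negative log-probabilities by the sum of the gathered weights, which is the reduction='mean' convention. This is only an illustrative sketch, not Paddle's implementation:

    import numpy as np

    def weighted_ce_mean(logits, label, weight, axis):
        logits = np.moveaxis(logits, axis, -1)       # class axis last
        z = logits - logits.max(axis=-1, keepdims=True)
        logp = z - np.log(np.exp(z).sum(axis=-1, keepdims=True))
        picked = np.take_along_axis(logp, label[..., None], axis=-1)[..., 0]
        w = weight[label]                             # per-sample class weight
        return -(w * picked).sum() / w.sum()

    # shapes as in the NCHW test case: input (2, 3, 2, 2), label (2, 2, 2), axis=1
    loss = weighted_ce_mean(np.random.rand(2, 3, 2, 2),
                            np.random.randint(0, 3, (2, 2, 2)),
                            np.random.rand(3), axis=1)
)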
self.assertIsNotNone(static_ret) with fluid.dygraph.guard(): cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss( - weight=fluid.dygraph.to_variable(weight_np), reduction='mean', axis=1) + weight=fluid.dygraph.to_variable(weight_np), + reduction='mean', + axis=1) dy_ret = cross_entropy_loss( fluid.dygraph.to_variable(input_np), fluid.dygraph.to_variable(label_np)) From f77083bbbc6f559bebee42ec12d42a37472dc8c4 Mon Sep 17 00:00:00 2001 From: HydrogenSulfate <490868991@qq.com> Date: Mon, 11 Oct 2021 23:45:13 +0800 Subject: [PATCH 124/298] Update loss.py --- python/paddle/nn/functional/loss.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index 38d4da17cbefa4..b1db45ad506695 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -1713,7 +1713,8 @@ def cross_entropy(input, if ignore_weight_mask.ndim > 1 and ignore_weight_mask.shape[ axis] == 1: # TODO: Temporarily use squeeze instead of squeeze_ - ignore_weight_mask = paddle.squeeze(ignore_weight_mask, axis) + ignore_weight_mask = paddle.squeeze(ignore_weight_mask, + axis) if axis != -1 and axis != valid_label.ndim - 1: temp_perm = list(range(axis % valid_label.ndim)) \ + list(range((axis % valid_label.ndim + 1) , valid_label.ndim)) \ From b3f6eedb77925c28a193eaedb858220b9417c5ca Mon Sep 17 00:00:00 2001 From: Zeng Jinle <32832641+sneaxiy@users.noreply.github.com> Date: Tue, 12 Oct 2021 12:55:02 +0800 Subject: [PATCH 125/298] refine LarsOptimizer (#36351) --- python/paddle/fluid/optimizer.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 24076e82b0365d..4625d7ea89b25e 100755 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -2047,11 +2047,15 @@ def _create_accumulators(self, block, parameters): def _append_optimize_op(self, block, param_and_grad): assert isinstance(block, framework.Block) _lars_weight_decay = self._lars_weight_decay + _lars_coeff = self._lars_coeff param_name = param_and_grad[0].name + is_excluded = False if len(self._exclude_from_weight_decay) > 0: for name in self._exclude_from_weight_decay: if name in param_name: _lars_weight_decay = 0.0 + _lars_coeff = 0.0 + is_excluded = True break velocity_acc = self._get_accumulator(self._velocity_acc_str, @@ -2065,7 +2069,7 @@ def _append_optimize_op(self, block, param_and_grad): attrs = { "mu": self._momentum, - "lars_coeff": self._lars_coeff, + "lars_coeff": _lars_coeff, "lars_weight_decay": _lars_weight_decay, "multi_precision": find_master, "rescale_grad": self._rescale_grad @@ -2086,7 +2090,7 @@ def _append_optimize_op(self, block, param_and_grad): # create the momentum optimize op momentum_op = block.append_op( - type=self.type, + type='momentum' if is_excluded else self.type, inputs=inputs, outputs=outputs, attrs=attrs, From 09778f464956a450491d5ade3ef79586d61403ca Mon Sep 17 00:00:00 2001 From: Qi Li Date: Tue, 12 Oct 2021 13:31:12 +0800 Subject: [PATCH 126/298] [NPU] fix elementwise_mul to support broadcast, test=develop (#36258) * [NPU] fix elementwise_mul to support broadcast, test=develop * remove debug files, test=develop * add axis support, test=develop --- .../elementwise/elementwise_mul_op_npu.cc | 132 ++++++--- .../npu/test_elementwise_mul_op_npu.py | 274 +++++++++++------- 2 files changed, 258 insertions(+), 148 deletions(-) diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op_npu.cc 
b/paddle/fluid/operators/elementwise/elementwise_mul_op_npu.cc index 47aa7e2521f76a..b2030ad21e8d1f 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op_npu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op_npu.cc @@ -12,67 +12,127 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#ifdef PADDLE_WITH_ASCEND_CL -#include -#include - #include "paddle/fluid/operators/elementwise/elementwise_mul_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_npu.h" #include "paddle/fluid/operators/npu_op_runner.h" namespace paddle { namespace operators { using Tensor = framework::Tensor; - -template +using NPUDeviceContext = platform::NPUDeviceContext; + +template +static void ReduceDims(const framework::ExecutionContext& ctx, + const aclrtStream& stream, const int axis, + const framework::DDim& ddims, + const framework::DDim& brd_ddims, const Tensor& in, + Tensor* out) { + std::vector axes; + int64_t brd_size = brd_ddims.size(); + int64_t org_size = ddims.size(); + // int64_t diff = brd_dims.size() - dims.size(); + for (int64_t i = 0; i < brd_size; ++i) { + if (i < axis || i >= org_size + axis) { + axes.push_back(i); + continue; + } + if (brd_ddims[i] > ddims[i - axis]) { + axes.push_back(i); + } + } + // LOG(INFO) << "axes = " << framework::make_ddim(axes).to_str(); + out->mutable_data(ctx.GetPlace()); + const auto& runner = NpuOpRunner("ReduceSumD", {in}, {*out}, + {{"axes", axes}, {"keep_dims", false}}); + runner.Run(stream); +} + +template class ElementwiseMulNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { + auto& dev_ctx = ctx.template device_context(); auto* x = ctx.Input("X"); auto* y = ctx.Input("Y"); - auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + int axis = ctx.Attr("axis"); + + bool direct_compute = false; + auto x_dims = x->dims(); + auto y_dims = y->dims(); + axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis); + if (x_dims.size() >= y_dims.size()) { + direct_compute = x_dims.size() == (y_dims.size() + axis); + } else { + direct_compute = y_dims.size() == (x_dims.size() + axis); + } - auto place = ctx.GetPlace(); - - out->mutable_data(place); - - auto stream = - ctx.template device_context() - .stream(); + auto stream = ctx.template device_context().stream(); - const auto& runner = NpuOpRunner("Mul", {*x, *y}, {*out}, {}); - runner.Run(stream); + if (direct_compute) { + const auto& runner = NpuOpRunner("Mul", {*x, *y}, {*out}, {}); + runner.Run(stream); + } else { + Tensor trans_x, trans_y; + NpuElementWiseOpBroadcast(dev_ctx, x, y, axis, &trans_x, &trans_y); + const auto& runner = NpuOpRunner("Mul", {trans_x, trans_y}, {*out}, {}); + runner.Run(stream); + } } }; -template +template class ElementwiseMulGradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { + auto& dev_ctx = ctx.template device_context(); auto* x = ctx.Input("X"); auto* y = ctx.Input("Y"); auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); auto* dy = ctx.Output(framework::GradVarName("Y")); + int axis = ctx.Attr("axis"); - auto place = ctx.GetPlace(); + axis = (axis == -1 ? 
std::abs(x->dims().size() - y->dims().size()) : axis); + auto stream = ctx.template device_context().stream(); - auto stream = - ctx.template device_context() - .stream(); + Tensor trans_x, trans_y; + NpuElementWiseOpBroadcast(dev_ctx, x, y, axis, &trans_x, &trans_y); if (dx) { - dx->mutable_data(place); - const auto& runner_dx = NpuOpRunner("Mul", {*dout, *y}, {*dx}, {}); - runner_dx.Run(stream); + if (dx->dims() == dout->dims()) { + dx->mutable_data(ctx.GetPlace()); + const auto& runner_dx = NpuOpRunner("Mul", {*dout, trans_y}, {*dx}, {}); + runner_dx.Run(stream); + } else { + Tensor dx_temp(x->type()); + dx_temp.Resize(trans_x.dims()); + dx_temp.mutable_data(ctx.GetPlace()); + const auto& runner_dx = + NpuOpRunner("Mul", {*dout, trans_y}, {dx_temp}, {}); + runner_dx.Run(stream); + ReduceDims(ctx, stream, axis, dx->dims(), trans_x.dims(), dx_temp, + dx); + } } - if (dy) { - dy->mutable_data(place); - const auto& runner_dy = NpuOpRunner("Mul", {*x, *dout}, {*dy}, {}); - runner_dy.Run(stream); + if (dy->dims() == dout->dims()) { + dy->mutable_data(ctx.GetPlace()); + const auto& runner_dy = NpuOpRunner("Mul", {trans_x, *dout}, {*dy}, {}); + runner_dy.Run(stream); + } else { + Tensor dy_temp(y->type()); + dy_temp.Resize(trans_y.dims()); + dy_temp.mutable_data(ctx.GetPlace()); + const auto& runner_dy = + NpuOpRunner("Mul", {trans_x, *dout}, {dy_temp}, {}); + runner_dy.Run(stream); + ReduceDims(ctx, stream, axis, dy->dims(), trans_y.dims(), dy_temp, + dy); + } } } }; @@ -82,15 +142,9 @@ class ElementwiseMulGradNPUKernel : public framework::OpKernel { namespace ops = paddle::operators; -REGISTER_OP_NPU_KERNEL( - elementwise_mul, - ops::ElementwiseMulNPUKernel, - ops::ElementwiseMulNPUKernel); +REGISTER_OP_NPU_KERNEL(elementwise_mul, ops::ElementwiseMulNPUKernel, + ops::ElementwiseMulNPUKernel); REGISTER_OP_NPU_KERNEL( - elementwise_mul_grad, - ops::ElementwiseMulGradNPUKernel, - ops::ElementwiseMulGradNPUKernel); -#endif + elementwise_mul_grad, ops::ElementwiseMulGradNPUKernel, + ops::ElementwiseMulGradNPUKernel); diff --git a/python/paddle/fluid/tests/unittests/npu/test_elementwise_mul_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_elementwise_mul_op_npu.py index ea94661e8a51e6..92bbc9f536d133 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_elementwise_mul_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_elementwise_mul_op_npu.py @@ -18,147 +18,203 @@ import unittest import sys sys.path.append("..") -from op_test import OpTest +from op_test import OpTest, skip_check_grad_ci import paddle import paddle.fluid as fluid paddle.enable_static() -SEED = 2021 -class TestElementwiseMul(OpTest): +class ElementwiseMulOp(OpTest): + def set_npu(self): + self.__class__.use_npu = True + self.place = paddle.NPUPlace(0) + def setUp(self): self.set_npu() self.op_type = "elementwise_mul" - self.place = paddle.NPUPlace(0) - + self.dtype = np.float32 + self.axis = -1 self.init_dtype() - np.random.seed(SEED) - x = np.random.uniform(1, 2, [11, 17]).astype(self.dtype) - y = np.random.uniform(1, 2, [11, 17]).astype(self.dtype) - out = np.multiply(x, y) + self.init_input_output() + self.init_axis() self.inputs = { - 'X': OpTest.np_dtype_to_fluid_dtype(x), - 'Y': OpTest.np_dtype_to_fluid_dtype(y) + 'X': OpTest.np_dtype_to_fluid_dtype(self.x), + 'Y': OpTest.np_dtype_to_fluid_dtype(self.y) } - self.attrs = {} - self.outputs = {'Out': out} + self.outputs = {'Out': self.out} + self.attrs = {'axis': self.axis} - def set_npu(self): - self.__class__.use_npu = True + def test_check_output(self): 
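(The gradient kernel above follows the usual rule for a broadcasted multiply: each input's gradient is the elementwise product with the incoming gradient, reduce-summed over the axes that were broadcast, which is what the ReduceDims helper computes. A minimal numpy illustration using the (10, 2, 11) and (10, 1, 11) shapes exercised by the broadcast_4 case later in this test file:

    import numpy as np

    x = np.random.rand(10, 2, 11)
    y = np.random.rand(10, 1, 11)                 # broadcast along axis 1
    dout = np.random.rand(10, 2, 11)              # upstream gradient of x * y

    dx = dout * y                                 # already has x's shape
    dy = (dout * x).sum(axis=1, keepdims=True)    # collapse the broadcast axis
)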
+ self.check_output_with_place(self.place) + + def test_check_grad_normal(self): + self.check_grad_with_place(self.place, ['X', 'Y'], 'Out') + + def test_check_grad_ingore_x(self): + self.check_grad_with_place( + self.place, ['Y'], 'Out', no_grad_set=set("X")) + + def test_check_grad_ingore_y(self): + self.check_grad_with_place( + self.place, ['X'], 'Out', no_grad_set=set('Y')) + + def init_input_output(self): + self.x = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype) + self.y = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype) + self.out = np.multiply(self.x, self.y) def init_dtype(self): - self.dtype = np.float32 + pass - def test_check_output(self): - self.check_output_with_place(self.place) + def init_axis(self): + pass - # TODO(ascendrc): Mul grad test - # def test_check_grad(self): - # if self.dtype == np.float16: - # return - # self.check_grad(['X'], 'Out') - # + +@skip_check_grad_ci( + reason="[skip shape check] Use y_shape(1) to test broadcast.") +class TestElementwiseMulOp_scalar(ElementwiseMulOp): + def setUp(self): + self.set_npu() + self.op_type = "elementwise_mul" + self.inputs = { + 'X': np.random.rand(10, 3, 4).astype(np.float32), + 'Y': np.random.rand(1).astype(np.float32) + } + self.outputs = {'Out': self.inputs['X'] * self.inputs['Y']} -class TestElementwiseMulFp16(OpTest): +class TestElementwiseMulOp_Vector(ElementwiseMulOp): def setUp(self): self.set_npu() self.op_type = "elementwise_mul" - self.place = paddle.NPUPlace(0) + self.inputs = { + 'X': np.random.random((100, )).astype("float32"), + 'Y': np.random.random((100, )).astype("float32") + } + self.outputs = {'Out': np.multiply(self.inputs['X'], self.inputs['Y'])} - self.init_dtype() - np.random.seed(SEED) - x = np.random.uniform(1, 2, [3, 4]).astype(self.dtype) - y = np.random.uniform(1, 2, [3, 4]).astype(self.dtype) - out = np.multiply(x, y) +class TestElementwiseMulOp_broadcast_0(ElementwiseMulOp): + def init_input_output(self): + self.x = np.random.rand(100, 2, 3).astype(self.dtype) + self.y = np.random.rand(100).astype(self.dtype) + self.out = self.x * self.y.reshape(100, 1, 1) + + def init_axis(self): + self.axis = 0 + + +class TestElementwiseMulOp_broadcast_1(ElementwiseMulOp): + def setUp(self): + self.set_npu() + self.op_type = "elementwise_mul" self.inputs = { - 'X': OpTest.np_dtype_to_fluid_dtype(x), - 'Y': OpTest.np_dtype_to_fluid_dtype(y) + 'X': np.random.rand(2, 100, 3).astype(np.float32), + 'Y': np.random.rand(100).astype(np.float32) + } + + self.attrs = {'axis': 1} + self.outputs = { + 'Out': self.inputs['X'] * self.inputs['Y'].reshape(1, 100, 1) + } + + +class TestElementwiseMulOp_broadcast_2(ElementwiseMulOp): + def setUp(self): + self.set_npu() + self.op_type = "elementwise_mul" + self.inputs = { + 'X': np.random.rand(2, 3, 100).astype(np.float32), + 'Y': np.random.rand(100).astype(np.float32) + } + + self.outputs = { + 'Out': self.inputs['X'] * self.inputs['Y'].reshape(1, 1, 100) } - self.attrs = {} - self.outputs = {'Out': out} - def set_npu(self): - self.__class__.use_npu = True - self.__class__.no_need_check_grad = True +class TestElementwiseMulOp_broadcast_3(ElementwiseMulOp): + def setUp(self): + self.set_npu() + self.op_type = "elementwise_mul" + self.inputs = { + 'X': np.random.rand(2, 10, 12, 3).astype(np.float32), + 'Y': np.random.rand(10, 12).astype(np.float32) + } + + self.attrs = {'axis': 1} + self.outputs = { + 'Out': self.inputs['X'] * self.inputs['Y'].reshape(1, 10, 12, 1) + } + + +class TestElementwiseMulOp_broadcast_4(ElementwiseMulOp): + def setUp(self): + 
self.set_npu() + self.op_type = "elementwise_mul" + self.inputs = { + 'X': np.random.rand(10, 2, 11).astype(np.float32), + 'Y': np.random.rand(10, 1, 11).astype(np.float32) + } + self.outputs = {'Out': self.inputs['X'] * self.inputs['Y']} + + +class TestElementwiseMulOp_broadcast_5(ElementwiseMulOp): + def setUp(self): + self.set_npu() + self.op_type = "elementwise_mul" + self.inputs = { + 'X': np.random.rand(10, 4, 2, 3).astype(np.float32), + 'Y': np.random.rand(10, 4, 1, 3).astype(np.float32) + } + self.outputs = {'Out': self.inputs['X'] * self.inputs['Y']} + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "paddle is not compiled with NPU") +class TestElementwiseMulOpFp16(ElementwiseMulOp): def init_dtype(self): self.dtype = np.float16 - def test_check_output(self): - self.check_output_with_place(self.place, atol=1e-5) - - -class TestElementwiseMulNet(unittest.TestCase): - def _test(self, run_npu=True): - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = SEED - startup_prog.random_seed = SEED - np.random.seed(SEED) - - a_np = np.random.random(size=(32, 32)).astype('float32') - b_np = np.random.random(size=(32, 32)).astype('float32') - c_np = np.random.random(size=(32, 32)).astype('float32') - d_np = np.random.random(size=(32, 32)).astype('float32') - label_np = np.random.randint(2, size=(32, 1)).astype('int64') - - with paddle.static.program_guard(main_prog, startup_prog): - a = paddle.static.data(name="a", shape=[32, 32], dtype='float32') - b = paddle.static.data(name="b", shape=[32, 32], dtype='float32') - c = paddle.static.data(name="c", shape=[32, 32], dtype='float32') - d = paddle.static.data(name="d", shape=[32, 32], dtype='float32') - label = paddle.static.data( - name="label", shape=[32, 1], dtype='int64') - - e = paddle.multiply(a, b) - f = paddle.multiply(c, d) - f.stop_gradient = True - g = paddle.multiply(e, f) - - fc_1 = fluid.layers.fc(input=g, size=128) - prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax') - - cost = fluid.layers.cross_entropy(input=prediction, label=label) - loss = fluid.layers.reduce_mean(cost) - sgd = fluid.optimizer.SGD(learning_rate=0.01) - sgd.minimize(loss) - - if run_npu: - place = paddle.NPUPlace(0) - else: - place = paddle.CPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - print("Start run on {}".format(place)) - for epoch in range(100): - - pred_res, loss_res = exe.run(main_prog, - feed={ - "a": a_np, - "b": b_np, - "c": c_np, - "d": d_np, - "label": label_np - }, - fetch_list=[prediction, loss]) - if epoch % 10 == 0: - print("Epoch {} | Prediction[0]: {}, Loss: {}".format( - epoch, pred_res[0], loss_res)) - - return pred_res, loss_res - - def test_npu(self): - cpu_pred, cpu_loss = self._test(False) - npu_pred, npu_loss = self._test(True) - - self.assertTrue(np.allclose(npu_pred, cpu_pred)) - self.assertTrue(np.allclose(npu_loss, cpu_loss)) + +class TestElementwiseMulOp_commonuse_1(ElementwiseMulOp): + def setUp(self): + self.set_npu() + self.op_type = "elementwise_mul" + self.inputs = { + 'X': np.random.rand(2, 3, 100).astype(np.float32), + 'Y': np.random.rand(1, 1, 100).astype(np.float32) + } + self.outputs = {'Out': self.inputs['X'] * self.inputs['Y']} + + +class TestElementwiseMulOp_commonuse_2(ElementwiseMulOp): + def setUp(self): + self.set_npu() + self.op_type = "elementwise_mul" + self.inputs = { + 'X': np.random.rand(30, 3, 1, 5).astype(np.float32), + 'Y': np.random.rand(30, 1, 4, 1).astype(np.float32) + } + self.outputs = {'Out': 
self.inputs['X'] * self.inputs['Y']} + + +class TestElementwiseMulOp_xsize_lessthan_ysize(ElementwiseMulOp): + def setUp(self): + self.set_npu() + self.op_type = "elementwise_mul" + self.inputs = { + 'X': np.random.rand(10, 10).astype(np.float32), + 'Y': np.random.rand(2, 2, 10, 10).astype(np.float32) + } + + self.attrs = {'axis': 2} + + self.outputs = { + 'Out': self.inputs['X'].reshape(1, 1, 10, 10) * self.inputs['Y'] + } if __name__ == '__main__': From 1e1aa1977bf15f53ab2a7c115e3ca258797c0de6 Mon Sep 17 00:00:00 2001 From: Tongxin Bai Date: Tue, 12 Oct 2021 16:09:13 +0800 Subject: [PATCH 127/298] [Autograd.functional] VJP and JVP (#36020) * autograd.functional passed pylint checker. * autograd.functional: fix import errors. * autograd.functional: fixed unit tests. * autograd.functional minor format change --- python/paddle/autograd/__init__.py | 2 +- python/paddle/autograd/functional.py | 248 ++++++++++++++- .../tests/unittests/autograd/test_vjp_jvp.py | 294 ++++++++++++++++++ 3 files changed, 533 insertions(+), 11 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/autograd/test_vjp_jvp.py diff --git a/python/paddle/autograd/__init__.py b/python/paddle/autograd/__init__.py index f4a0122759dc5d..cffc18e95e5ab3 100644 --- a/python/paddle/autograd/__init__.py +++ b/python/paddle/autograd/__init__.py @@ -18,6 +18,6 @@ from .py_layer import PyLayer, PyLayerContext # noqa: F401 from ..framework import set_grad_enabled # noqa: F401 from ..fluid.dygraph.base import no_grad_ as no_grad # noqa: F401 -from .functional import jacobian, hessian # noqa: F401 +from .functional import vjp, jvp, jacobian, hessian # noqa: F401 __all__ = ['backward', 'PyLayer', 'PyLayerContext'] diff --git a/python/paddle/autograd/functional.py b/python/paddle/autograd/functional.py index a5665631c937f8..688e04335ebb70 100644 --- a/python/paddle/autograd/functional.py +++ b/python/paddle/autograd/functional.py @@ -12,9 +12,239 @@ # See the License for the specific language governing permissions and # limitations under the License. -from paddle.fluid import framework -from .utils import _check_tensors, _stack_tensor_or_return_none, _replace_none_with_zero_tensor +import contextlib import paddle +from ..fluid import framework +from ..fluid.dygraph import grad +from ..nn.initializer import assign +from ..tensor import reshape, zeros_like, to_tensor +from .utils import _check_tensors, _stack_tensor_or_return_none, _replace_none_with_zero_tensor + + +def to_tensorlist(tl): + if not isinstance(tl, list): + if isinstance(tl, tuple): + tl = list(tl) + else: + tl = [tl] + for t in tl: + assert isinstance(t, paddle.Tensor) or t is None, ( + f'{t} is expected to be paddle.Tensor or None, but found {type(t)}.' + ) + return tl + + +@contextlib.contextmanager +def gradient_scope(*var_lists, create_graph=False, allow_unused=False): + def grad_fn(ys, xs, v, create_graph=create_graph): + assert len(ys) == len(v), ( + f'`v` is expected to be of the same size as the output. 
' + f'Here the output is {ys}, and `v` is {v}.') + if allow_unused: + ys = [ + to_tensor( + [0.0], stop_gradient=False) if y is None else y for y in ys + ] + return grad( + ys, xs, v, create_graph=create_graph, allow_unused=allow_unused) + + def return_fn(out): + if isinstance(out, paddle.Tensor): + if not create_graph: + out = out.detach() + return out + if isinstance(out, list): + return list(return_fn(x) for x in out) + elif isinstance(out, tuple): + return tuple(return_fn(x) for x in out) + else: + assert out is None + return out + + def process(vl): + out = [] + # If v is treated as constant in the outer scope, its gradient is guaranteed + # not to be taken beyond this scope. Within this scope, however, v's gradient + # may be computed. We only need to detach v in this case. + # Otherwise, v's gradient is valid, and is subject to update beyond this scope. + # In this case we must not confuse the gradient in the outer scope with the + # inner one's. Moreover, we need to make sure that the result from the inner + # scope can flow back to the outer scope. This can be satisfied by extending + # the original variable with a duplication operation v1 = v so that v still + # maintains the complete lineage. + for v in vl: + if v is None: + out.append(v) + continue + if create_graph and not v.stop_gradient: + v = assign(v) + else: + v = v.detach() + v.stop_gradient = False + out.append(v) + return out + + try: + var_lists = [process(vl) for vl in var_lists] + bundle = var_lists + [grad_fn, return_fn] + yield bundle + finally: + pass + + +@framework.dygraph_only +def vjp(func, inputs, v=None, create_graph=False, allow_unused=False): + r"""Computes the Vector-Jacobian product, a functional form of + reverse mode automatic differentiation. + + Args: + func(Callable): `func` takes as input a tensor or a list + of tensors and returns a tensor or a list of tensors. + inputs(list[Tensor]|Tensor): used as positional arguments + to evaluate `func`. `inputs` is accepted as one tensor + or a list of tensors. + v(list[Tensor]|Tensor, optional): the cotangent vector + invovled in the VJP computation. `v` matches the size + and shape of `func`'s output. Default value is None + and in this case is equivalent to all ones the same size + of `func`'s output. + create_graph(bool, optional): if `True`, gradients can + be evaluated on the results. If `False`, taking gradients + on the results is invalid. Default value is False. + allow_unused(bool, optional): In case that some Tensors of + `inputs` do not contribute to the computation of the output. + If `allow_unused` is False, an error will be raised, + Otherwise, the gradients of the said inputs are returned + None. Default value is False. + + Returns: + output(tuple): + func_out: the output of `func(inputs)` + vjp(list[Tensor]|Tensor): the pullback results of `v` on `func` + + Examples: + .. 
code-block:: python + + def func(x): + return paddle.matmul(x, x) + + x = paddle.ones(shape=[2, 2], dtype='float32') + output, inputs_grad = vjp(func, x) + print(inputs_grad) + # [Tensor(shape=[2, 2], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # [[4., 4.], + # [4., 4.]])] + + v = paddle.to_tensor([[1.0, 0.0], [0.0, 0.0]]) + output, inputs_grad = vjp(func, x, v) + print(inputs_grad) + # [Tensor(shape=[2, 2], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # [[2., 1.], + # [1., 0.]])] + + output, inputs_grad = vjp(func, x, v, create_graph=True) + print(inputs_grad) + # [Tensor(shape=[2, 2], dtype=float32, place=CUDAPlace(0), stop_gradient=False, + # [[2., 1.], + # [1., 0.]])] + + y = paddle.ones(shape=[2, 2], dtype='float32') + def func_unused(x, y): + return paddle.matmul(x, x) + + output, inputs_grad = vjp(func, [x, y], v) + # ValueError: (InvalidArgument) The 1-th input does not appear in the backward graph. + # Please check the input variable or set allow_unused=True to get None result. + # [Hint: Expected allow_unused_ == true, but received allow_unused_:0 != true:1.] + + output, inputs_grad = vjp(func, [x, y], v, allow_unused=True) + print(inputs_grad) + # [Tensor(shape=[2, 2], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # [[2., 1.], + # [1., 0.]]), None] + """ + xs, v = to_tensorlist(inputs), to_tensorlist(v) + + with gradient_scope( + xs, v, create_graph=create_graph, + allow_unused=allow_unused) as [xs, v, grad_fn, return_fn]: + outputs = func(*xs) + ys = to_tensorlist(outputs) + grads = grad_fn(ys, xs, v) + outputs, grads = return_fn(outputs), return_fn(grads) + + return outputs, grads + + +@framework.dygraph_only +def jvp(func, inputs, v=None, create_graph=False, allow_unused=False): + r""" + Computes the Jacobian-Vector product for a function at the given + inputs and a vector in the tangent space induced by the inputs. + + .. note:: + **This API is ONLY available in imperative mode.** + + Args: + func(Callable): `func` takes as input a tensor or a list + of tensors and returns a tensor or a list of tensors. + inputs(list[Tensor]|Tensor): used as positional arguments + to evaluate `func`. `inputs` is accepted as one tensor + or a list of tensors. + v(list[Tensor]|Tensor, optional): the tangent vector + invovled in the JVP computation. `v` matches the size + and shape of `inputs`. `v` is Optional if `func` returns + a single tensor. Default value is None and in this case + is equivalent to all ones the same size of `inputs`. + create_graph(bool, optional): if `True`, gradients can + be evaluated on the results. If `False`, taking gradients + on the results is invalid. Default value is False. + allow_unused(bool, optional): In case that some Tensors of + `inputs` do not contribute to the computation of the output. + If `allow_unused` is False, an error will be raised, + Otherwise, the gradients of the said inputs are returned + None. Default value is False. + + Returns: + output(tuple): + func_out: the output of `func(inputs)` + jvp(list[Tensor]|Tensor): the pullback results of `v` on `func` + + Examples: + .. 
code-block:: python + + def func(x): + return paddle.matmul(x, x) + + x = paddle.ones(shape=[2, 2], dtype='float32') + + output, inputs_grad = jvp(func, x) + print(inputs_grad) + # [Tensor(shape=[2, 2], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # [[2., 2.], + # [2., 2.]])] + + v = paddle.to_tensor([[1.0, 0.0], [0.0, 0.0]]) + output, inputs_grad = vjp(func, x, v) + print(inputs_grad) + # [Tensor(shape=[2, 2], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # [[1., 1.], + # [0., 0.]])] + + """ + xs, v = to_tensorlist(inputs), to_tensorlist(v) + + with gradient_scope( + xs, v, create_graph=create_graph, + allow_unused=allow_unused) as [xs, v, grad_fn, return_fn]: + outputs = func(*xs) + ys = to_tensorlist(outputs) + ys_grad = [zeros_like(y) for y in ys] + xs_grad = grad_fn(ys, xs, ys_grad, create_graph=True) + ys_grad = grad_fn(xs_grad, ys_grad, v) + outputs, ys_grad = return_fn(outputs), return_fn(ys_grad) + + return outputs, ys_grad @framework.dygraph_only @@ -60,7 +290,7 @@ def jacobian(func, inputs, create_graph=False, allow_unused=False): def func(x): return paddle.matmul(x, x) - + x = paddle.ones(shape=[2, 2], dtype='float32') x.stop_gradient = False jacobian = paddle.autograd.jacobian(func, x) @@ -78,7 +308,7 @@ def func(x): def func(x, y): return paddle.matmul(x, y) - + x = paddle.ones(shape=[2, 2], dtype='float32') y = paddle.ones(shape=[2, 2], dtype='float32') * 2 x.stop_gradient = False @@ -131,14 +361,12 @@ def func(x, y): outputs = _check_tensors(func(*inputs), "outputs") fin_size = len(inputs) fout_size = len(outputs) - flat_outputs = tuple( - paddle.reshape( - output, shape=[-1]) for output in outputs) + flat_outputs = tuple(reshape(output, shape=[-1]) for output in outputs) jacobian = tuple() for i, flat_output in enumerate(flat_outputs): jac_i = list([] for _ in range(fin_size)) for k in range(len(flat_output)): - row_k = paddle.grad( + row_k = grad( flat_output[k], inputs, create_graph=create_graph, @@ -146,7 +374,7 @@ def func(x, y): allow_unused=allow_unused) for j in range(fin_size): jac_i[j].append( - paddle.reshape( + reshape( row_k[j], shape=[-1]) if isinstance(row_k[j], paddle.Tensor) else None) jacobian += (tuple( @@ -273,7 +501,7 @@ def func(x, y): ], "The function to compute Hessian matrix should return a Tensor with a single element" def jac_func(*ins): - grad_inputs = paddle.grad( + grad_inputs = grad( outputs, ins, create_graph=True, diff --git a/python/paddle/fluid/tests/unittests/autograd/test_vjp_jvp.py b/python/paddle/fluid/tests/unittests/autograd/test_vjp_jvp.py new file mode 100644 index 00000000000000..86331d36a3ca82 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/autograd/test_vjp_jvp.py @@ -0,0 +1,294 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
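# --- Editor's note (illustration, not part of this patch) -------------------
# The `jvp` added to functional.py above is not a separate forward-mode engine:
# its body builds the JVP out of two reverse-mode passes (a "double-VJP" trick).
# A VJP is first taken with a dummy cotangent u that requires grad, keeping the
# graph (create_graph=True), so the result g = J^T u stays differentiable and
# linear in u; differentiating g with respect to u in the direction v then
# yields J v. A minimal standalone sketch of that idea, using only the public
# paddle.grad API (the name jvp_sketch and its single-input signature are
# assumptions made for this illustration, not Paddle code):
import paddle

def jvp_sketch(func, x, v):
    # x: a paddle.Tensor with stop_gradient=False; v: a tangent with x's shape.
    y = func(x)
    u = paddle.zeros_like(y)   # dummy cotangent
    u.stop_gradient = False
    # First reverse pass: g = J^T u, kept differentiable w.r.t. u.
    g = paddle.grad(y, x, grad_outputs=u, create_graph=True)[0]
    # Second reverse pass: d(J^T u)/du contracted with v equals J v.
    jv = paddle.grad(g, u, grad_outputs=v)[0]
    return y, jv
# For the same func, x and v this is expected to agree with the jvp() defined
# above; the TestJVP cases in the test file below compare jvp- and vjp-built
# Jacobians in the same spirit.
# -----------------------------------------------------------------------------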
+ +import unittest +import paddle + +from paddle.autograd.functional import vjp, jvp, to_tensorlist +from paddle import grad, ones_like, zeros_like + + +def reduce(x): + return paddle.sum(x) + + +def reduce_dim(x): + return paddle.sum(x, axis=0) + + +def matmul(x, y): + return paddle.matmul(x, y) + + +def mul(x, y): + return x * y + + +def pow(x, y): + return paddle.pow(x, y) + + +def o2(x, y): + return paddle.multiply(x, y), paddle.matmul(x, y.t()) + + +def unuse(x, y): + return paddle.sum(x) + + +def nested(x): + def inner(y): + return x * y + + return inner + + +def make_v(f, inputs): + outputs = to_tensorlist(f(*inputs)) + return [ones_like(x) for x in outputs] + + +class TestAutogradFunctional(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.RAW_INPUTS = { + 'a': [1.0], + 'b': [1.0, 2.0], + 'c': [3.0, 4.0], + 'd': [[2.0], [3.0]], + 'A': [[1.0, 2.0], [2.0, 3.0], [3.0, 4.0]], + 'B': [[1.0, 2.0, 3.0], [2.0, 3.0, 4.0]], + } + + def setUp(self): + pass + + def gen_input(self, inp, stop_gradient=False): + if isinstance(inp, paddle.Tensor): + return inp + return paddle.to_tensor( + self.RAW_INPUTS[inp], stop_gradient=stop_gradient) + + def gen_inputs(self, inputs): + if isinstance(inputs, list): + inputs = [self.gen_input(x) for x in inputs] + else: + inputs = [self.gen_input(inputs)] + return inputs + + def gen_test_pairs(self, + func, + inputs, + v=None, + create_graph=False, + allow_unused=False): + def vjp_test(): + nonlocal v + xs = self.gen_inputs(inputs) + if v is not None: + v = self.gen_inputs(v) + outputs, inputs_grad = vjp(func, + xs, + v, + create_graph=create_graph, + allow_unused=allow_unused) + else: + outputs, inputs_grad = vjp(func, + xs, + create_graph=create_graph, + allow_unused=allow_unused) + return outputs, inputs_grad + + def grad_test(): + nonlocal v + xs = self.gen_inputs(inputs) + if v is not None: + v = self.gen_inputs(v) + outputs = func(*xs) + if v is not None: + inputs_grad = grad( + outputs, + xs, + v, + create_graph=create_graph, + allow_unused=allow_unused) + else: + inputs_grad = grad( + outputs, + xs, + create_graph=create_graph, + allow_unused=allow_unused) + return outputs, inputs_grad + + return vjp_test, grad_test + + def gen_jvp_tests(self, + func, + inputs, + v=None, + create_graph=False, + allow_unused=False): + def jvp_test(): + nonlocal v + xs = self.gen_inputs(inputs) + if v is not None: + v = self.gen_inputs(v) + outputs, outputs_grad = jvp(func, + xs, + v, + create_graph=create_graph, + allow_unused=allow_unused) + else: + outputs, outputs_grad = jvp(func, + xs, + create_graph=create_graph, + allow_unused=allow_unused) + return outputs, outputs_grad + + return jvp_test + + def check_results(self, ref, res): + type_error = 'Result is different than expected in shape or type' + value_error = 'Result is different than expected values' + if ref is None: + self.assertTrue(res is None, type_error) + elif isinstance(ref, paddle.Tensor): + self.assertTrue(isinstance(res, paddle.Tensor), type_error) + self.assertTrue(paddle.allclose(res, ref), value_error) + else: + self.assertTrue(len(res) == len(ref), type_error) + for i in range(len(ref)): + self.check_results(ref[i], res[i]) + return True + + +class TestVJP(TestAutogradFunctional): + def test_vjp_i1o1_no_create_graph(self): + test_cases = [ + [reduce, 'A'], #noqa + [reduce_dim, 'A'], #noqa + ] #noqa + for f, inputs in test_cases: + vjp, grad = self.gen_test_pairs(f, inputs) + vjp_result, grad_result = vjp(), grad() + self.check_results(grad_result, vjp_result) + + def 
test_vjp_i2o1_no_create_graph(self): + test_cases = [ + [matmul, ['A', 'B']], #noqa + [mul, ['b', 'c']], #noqa + ] #noqa + for f, inputs in test_cases: + vjp, grad = self.gen_test_pairs(f, inputs) + vjp_result, grad_result = vjp(), grad() + self.check_results(grad_result, vjp_result) + + def test_vjp_i2o2_no_create_graph(self): + test_cases = [ + [o2, ['A', 'A']], #noqa + ] #noqa + for f, inputs in test_cases: + inputs = self.gen_inputs(inputs) + v = make_v(f, inputs) + vjp, grad = self.gen_test_pairs(f, inputs, v=v) + vjp_result, grad_result = vjp(), grad() + self.check_results(grad_result, vjp_result) + + def test_vjp_nested_no_create_graph(self): + x = self.gen_input('a') + test_cases = [ + [nested(x), 'a'], #noqa + ] + for f, inputs in test_cases: + vjp, grad = self.gen_test_pairs(f, inputs) + vjp_result, grad_result = vjp(), grad() + self.check_results(grad_result, vjp_result) + + def test_vjp_aliased_input_no_create_graph(self): + x = self.gen_input('a') + ref = self.gen_test_pairs(nested(x), 'a')[0] + aliased = self.gen_test_pairs(nested(x), x)[0] + ref_result, aliased_result = ref(), aliased() + self.check_results(ref_result, aliased_result) + + def test_vjp_allowunused_no_create_graph(self): + x, y = self.gen_input('A'), self.gen_input('a') + vjp, grad = self.gen_test_pairs(unuse, [x, y], allow_unused=True) + vjp_result, grad_result = vjp(), grad() + self.check_results(grad_result, vjp_result) + + +def jac(grad_fn, f, inputs): + assert grad_fn in [vjp, jvp] + if grad_fn is jvp: + vs = [zeros_like(x) for x in inputs] + else: + outputs = f(*inputs) + if isinstance(outputs, paddle.Tensor): + outputs = [outputs] + vs = [zeros_like(y) for y in outputs] + JJ_cols = [] + for i, v in enumerate(vs): + v = v.flatten() + for j in range(len(v)): + _v = zeros_like(v).detach() + _v[j] = 1.0 + _v = _v.reshape(vs[i].shape) + _vs = vs.copy() + _vs[i] = _v + _, grads = grad_fn(f, inputs, vs) + d_outs = paddle.concat([d_out.flatten() for d_out in grads]) + JJ_cols.append(d_outs) + # JJ is the fully unrolled jacobian + JJ = paddle.stack(JJ_cols) + if grad_fn is vjp: + JJ = JJ.t() + return JJ + + +class TestJVP(TestAutogradFunctional): + def test_jvp_i1o1_no_create_graph(self): + test_cases = [ + [reduce, 'A'], #noqa + [reduce_dim, 'A'], #noqa + ] #noqa + for f, inputs in test_cases: + inputs = self.gen_inputs(inputs) + forward_jac = jac(jvp, f, inputs) + reverse_jac = jac(vjp, f, inputs) + self.check_results(forward_jac, reverse_jac) + + def test_jvp_i2o1_no_create_graph(self): + test_cases = [ #noqa + [matmul, ['A', 'B']], #noqa + ] #noqa + for f, inputs in test_cases: + inputs = self.gen_inputs(inputs) + forward_jac = jac(jvp, f, inputs) + reverse_jac = jac(vjp, f, inputs) + self.check_results(forward_jac, reverse_jac) + + def test_jvp_i2o2_no_create_graph(self): + test_cases = [ #noqa + [o2, ['A', 'A']], #noqa + ] #noqa + for f, inputs in test_cases: + inputs = self.gen_inputs(inputs) + forward_jac = jac(jvp, f, inputs) + reverse_jac = jac(vjp, f, inputs) + self.check_results(forward_jac, reverse_jac) + + +if __name__ == "__main__": + unittest.main() From 021add6121766d5b9c4629446486d5c3eb057fea Mon Sep 17 00:00:00 2001 From: Jacek Czaja Date: Tue, 12 Oct 2021 10:55:13 +0200 Subject: [PATCH 128/298] remove not needed log (#36348) --- log | Bin 2816 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 log diff --git a/log b/log deleted file mode 100644 index c02e10686b5fbcc78a8591519aaa3389dac63a56..0000000000000000000000000000000000000000 GIT binary patch literal 0 
HcmV?d00001 literal 2816 zcmds3U2oeq6y39b#f5-jGORjC+XWlj1V|&dE9Sb{>LA+_L5s9agdzo!vg5V=@4b}l zN{;O`8L$DHUnG;_<+Ngtf*xLf+~HQ<9(CS(yv%TQ-B~Xgn9-=ByB4rcm7iO!M^x;DvU^LS8x# z69JN=1(`9`kGkC#o$oI1L5vXT%rf49mYUvfFFgkLHd1G%K4-PLAKV&v)Y#kbcK8{! z+vbjhgD@uyWWTFcn z1IDRkxmX%|Lr9v+94c8q9w%^olEAa437kCtyr19J{l`Bw0D3)`92JanC61=5l>BHD z2uJyi;#$)RPk-L&d69=b0WbZk5E_BN7#;cg{@U0jvmCu@xNCM_vFs*n!zru{^D@s@ zw6HRlUM>O~_no5!L*L!O<7b^-rkHc^?$=>D8vTMIDPc$E0*RD*Hm>+9%88O02{#@1 zEUv*}7U-GO0_sNs8&(Lp405!D*}x|2Z)segL5}fICTQGVxiAM8q z?Z?rzHJB&0!7AObf5RW^)_Z`tK`XlvP3ZWnLQ?~HCvsy~C=cgS(dm z)ri-&-iA3c%o{k^VW)ur12U#VsmJ*PM~`O-JWuYa_R)_xep*BZ6SSnw W5 Date: Tue, 12 Oct 2021 17:25:28 +0800 Subject: [PATCH 129/298] delete remove_static_file() function in error.py (#36153) * change time to remove static tempfile * delete remove_static_file() function --- .../fluid/dygraph/dygraph_to_static/error.py | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/error.py b/python/paddle/fluid/dygraph/dygraph_to_static/error.py index 2a975bf00d1d26..273961e27efba2 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/error.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/error.py @@ -54,27 +54,9 @@ def attach_error_data(error, in_runtime=False): setattr(error, ERROR_DATA, error_data) - remove_static_file() return error -def remove_static_file(): - """ - Removes temporary files created during the transformation of dygraph to static graph. - """ - del_files = set() - for loc in global_origin_info_map: - static_filepath = loc[0] - del_files.add(static_filepath) - - filename, extension = os.path.splitext(static_filepath) - del_files.add(filename + ".pyc") - - for filepath in del_files: - if os.path.exists(filepath): - os.remove(filepath) - - class TraceBackFrame(OriginInfo): """ Traceback frame information. 
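# --- Editor's note (illustration, not part of this patch) -------------------
# After the error.py change above, attach_error_data() only decorates the
# exception with an ErrorData object under the ERROR_DATA attribute and returns
# it; deleting the temporary .py/.pyc files produced by the dygraph-to-static
# transform is no longer a side effect of error handling (per the commit
# message, the time of that cleanup was moved elsewhere). A hedged sketch of
# the resulting call pattern (the wrapper name and run_static_program are
# illustrative assumptions, not Paddle code):
from paddle.fluid.dygraph.dygraph_to_static.error import attach_error_data

def run_with_error_annotation(run_static_program, *args):
    try:
        return run_static_program(*args)
    except Exception as e:
        # Only annotates the error; the generated files are no longer
        # deleted inside attach_error_data.
        raise attach_error_data(e, in_runtime=True)
# -----------------------------------------------------------------------------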
From 6920afeb5edadf836a7f7da30bba6efbb6380f05 Mon Sep 17 00:00:00 2001 From: Zhou Wei <1183042833@qq.com> Date: Tue, 12 Oct 2021 17:32:59 +0800 Subject: [PATCH 130/298] fix windows bug that python virtual env can't find python executable (#36227) --- python/paddle/dataset/image.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/python/paddle/dataset/image.py b/python/paddle/dataset/image.py index 4fd7dc0d37ff8f..c36213282c59ce 100644 --- a/python/paddle/dataset/image.py +++ b/python/paddle/dataset/image.py @@ -39,10 +39,12 @@ if six.PY3: import subprocess import sys - if sys.platform == 'win32': - interpreter = sys.exec_prefix + "\\" + "python.exe" - else: - interpreter = sys.executable + import os + interpreter = sys.executable + # Note(zhouwei): if use Python/C 'PyRun_SimpleString', 'sys.executable' + # will be the C++ execubable on Windows + if sys.platform == 'win32' and 'python.exe' not in interpreter: + interpreter = sys.exec_prefix + os.sep + 'python.exe' import_cv2_proc = subprocess.Popen( [interpreter, "-c", "import cv2"], stdout=subprocess.PIPE, From 5f1eb839f9de416476fc70c13b6457cfee1c831d Mon Sep 17 00:00:00 2001 From: Aganlengzi Date: Tue, 12 Oct 2021 18:19:37 +0800 Subject: [PATCH 131/298] [NPU] concat supports dtype int64 for model deepfm (#36327) * [NPU] modify for model deepfm * [NPU] unit test delete precision control * [NPU] add more unit test * revert elementwise_mul related modification * [NPU] add more unit tests for concat --- paddle/fluid/operators/concat_op_npu.cc | 6 + .../tests/unittests/npu/test_concat_op_npu.py | 171 ++++++++++++++---- 2 files changed, 145 insertions(+), 32 deletions(-) diff --git a/paddle/fluid/operators/concat_op_npu.cc b/paddle/fluid/operators/concat_op_npu.cc index d242c9f8c3fbd5..109007d737c156 100644 --- a/paddle/fluid/operators/concat_op_npu.cc +++ b/paddle/fluid/operators/concat_op_npu.cc @@ -122,8 +122,14 @@ namespace ops = paddle::operators; REGISTER_OP_NPU_KERNEL(concat, ops::ConcatNPUKernel, ops::ConcatNPUKernel, +#ifdef PADDLE_WITH_ASCEND_INT64 + ops::ConcatNPUKernel, +#endif ops::ConcatNPUKernel); REGISTER_OP_NPU_KERNEL(concat_grad, ops::ConcatGradNPUKernel, ops::ConcatGradNPUKernel, +#ifdef PADDLE_WITH_ASCEND_INT64 + ops::ConcatGradNPUKernel, +#endif ops::ConcatGradNPUKernel); diff --git a/python/paddle/fluid/tests/unittests/npu/test_concat_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_concat_op_npu.py index 8f11d00ccabf67..f9eecefdfb2376 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_concat_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_concat_op_npu.py @@ -18,7 +18,7 @@ import unittest import sys sys.path.append("..") -from op_test import OpTest +from op_test import OpTest, skip_check_grad_ci import paddle import paddle.fluid as fluid @@ -26,7 +26,7 @@ SEED = 2021 -class TestConcat(OpTest): +class TestConcatOp(OpTest): def setUp(self): self.set_npu() self.op_type = "concat" @@ -56,54 +56,161 @@ def init_dtype(self): def test_check_output(self): self.check_output_with_place(self.place) + def test_check_grad(self): + self.check_grad_with_place(self.place, ['x0', 'x2'], 'Out') + self.check_grad_with_place(self.place, ['x1'], 'Out') + self.check_grad_with_place(self.place, ['x2'], 'Out') + def init_test_data(self): self.x0 = np.random.random((1, 4, 50)).astype(self.dtype) self.x1 = np.random.random((2, 4, 50)).astype(self.dtype) self.x2 = np.random.random((3, 4, 50)).astype(self.dtype) self.axis = 0 + +class TestConcatOp2(TestConcatOp): + def init_test_data(self): + 
self.x0 = np.random.random((2, 3, 4, 5)).astype(self.dtype) + self.x1 = np.random.random((2, 3, 4, 5)).astype(self.dtype) + self.x2 = np.random.random((2, 3, 4, 5)).astype(self.dtype) + self.axis = 1 + + +@skip_check_grad_ci( + reason="The function 'check_grad' for large inputs is too slow.") +class TestConcatOp3(TestConcatOp): + def init_test_data(self): + self.x0 = np.random.random((1, 256, 170, 256)).astype(self.dtype) + self.x1 = np.random.random((1, 128, 170, 256)).astype(self.dtype) + self.x2 = np.random.random((1, 128, 170, 256)).astype(self.dtype) + self.axis = 1 + def test_check_grad(self): - self.check_grad_with_place(self.place, ['x0', 'x2'], 'Out') - self.check_grad_with_place(self.place, ['x1'], 'Out') - self.check_grad_with_place(self.place, ['x2'], 'Out') + pass + + +@skip_check_grad_ci( + reason="This test will meet fetch error when there is a null grad. The detailed information is in PR#17015." +) +class TestConcatOp4(TestConcatOp): + def init_test_data(self): + self.x0 = np.random.random((2, 3, 4, 5)).astype(self.dtype) + self.x1 = np.random.random((2, 3, 4, 5)).astype(self.dtype) + self.x2 = np.random.random((0, 3, 4, 5)).astype(self.dtype) + self.axis = 0 + + def test_check_grad(self): + pass + + +class TestConcatOp5(TestConcatOp): + def init_test_data(self): + self.x0 = np.random.random((5, 1, 4, 5)).astype(self.dtype) + self.x1 = np.random.random((5, 2, 4, 5)).astype(self.dtype) + self.x2 = np.random.random((5, 3, 4, 5)).astype(self.dtype) + self.axis = -3 + + +#----------------Concat Fp16---------------- +def create_test_fp16(parent): + class TestConcatFp16(parent): + def init_dtype(self): + self.dtype = np.float16 + + cls_name = "{0}_{1}".format(parent.__name__, "Fp16") + TestConcatFp16.__name__ = cls_name + globals()[cls_name] = TestConcatFp16 + + +create_test_fp16(TestConcatOp) +create_test_fp16(TestConcatOp2) +create_test_fp16(TestConcatOp3) +create_test_fp16(TestConcatOp4) +create_test_fp16(TestConcatOp5) + + +#----------------Concat Int64---------------- +def create_test_int64(parent): + class TestConcatInt64(parent): + def init_dtype(self): + self.dtype = np.int64 + def test_check_grad(self): + pass + + cls_name = "{0}_{1}".format(parent.__name__, "Int64") + TestConcatInt64.__name__ = cls_name + globals()[cls_name] = TestConcatInt64 + + +create_test_int64(TestConcatOp) +create_test_int64(TestConcatOp2) +create_test_int64(TestConcatOp3) +create_test_int64(TestConcatOp4) +create_test_int64(TestConcatOp5) + + +class TestConcatAPIWithLoDTensorArray(unittest.TestCase): + """ + Test concat api when the input(x) is a LoDTensorArray. 
+ """ -class TestConcatFP16(OpTest): def setUp(self): self.set_npu() - self.op_type = "concat" self.place = paddle.NPUPlace(0) - self.init_dtype() - self.init_test_data() - - self.inputs = {'X': [('x0', self.x0), ('x1', self.x1), ('x2', self.x2)]} - self.attrs = {'axis': self.axis} - if self.axis < 0: - self.actual_axis = self.axis + len(self.x0.shape) - self.actual_axis = self.actual_axis if self.actual_axis > 0 else 0 + self.axis = 1 + self.iter_num = 3 + self.input_shape = [2, 3] + self.x = np.random.random(self.input_shape).astype("float32") + + def set_program(self, use_fluid_api): + paddle.enable_static() + if use_fluid_api: + self.program = fluid.Program() + with fluid.program_guard(self.program): + input = fluid.layers.assign(self.x) + tensor_array = fluid.layers.create_array(dtype='float32') + zero = fluid.layers.fill_constant( + shape=[1], value=0, dtype="int64") + + for i in range(self.iter_num): + fluid.layers.array_write(input, zero + i, tensor_array) + + self.out_var = fluid.layers.concat(tensor_array, axis=self.axis) else: - self.actual_axis = self.axis + self.program = paddle.static.Program() + with paddle.static.program_guard(self.program): + input = paddle.assign(self.x) + tensor_array = fluid.layers.create_array( + dtype='float32' + ) # Api create_array is not supported in paddle 2.0 yet. + zero = paddle.zeros(shape=[1], dtype="int64") - self.outputs = { - 'Out': np.concatenate( - (self.x0, self.x1, self.x2), axis=self.actual_axis) - } + for i in range(self.iter_num): + # Api array_write is not supported in paddle 2.0 yet. + fluid.layers.array_write(input, zero + i, tensor_array) + + self.out_var = paddle.concat(tensor_array, axis=self.axis) def set_npu(self): self.__class__.use_npu = True - self.__class__.no_need_check_grad = True - - def init_dtype(self): - self.dtype = np.float16 - def test_check_output(self): - self.check_output_with_place(self.place) - - def init_test_data(self): - self.x0 = np.random.random((1, 4, 50)).astype(self.dtype) - self.x1 = np.random.random((2, 4, 50)).astype(self.dtype) - self.x2 = np.random.random((3, 4, 50)).astype(self.dtype) - self.axis = 0 + def test_fluid_api(self): + self._run_static_mode(use_fluid_api=True) + + def test_paddle_api(self): + self._run_static_mode(use_fluid_api=False) + + def _run_static_mode(self, use_fluid_api): + self.set_program(use_fluid_api) + self.assertTrue(self.out_var.shape[self.axis] == -1) + exe = fluid.Executor(self.place) + res = exe.run(self.program, fetch_list=self.out_var) + self.assertTrue( + np.array_equal( + res[0], + np.concatenate( + [self.x] * self.iter_num, axis=self.axis))) if __name__ == '__main__': From fba355fbc04ee2cacbc527fbd5e52c25a721e53b Mon Sep 17 00:00:00 2001 From: wawltor Date: Tue, 12 Oct 2021 19:57:22 +0800 Subject: [PATCH 132/298] change the paddle.mm to matmul_v2 (#35770) * change the paddle.mm to matmul_v2 * update the code for the mm * update the document for the mm --- python/paddle/tensor/math.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index 14a925ef3e285d..f5f0b5ed0873c1 100755 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -923,8 +923,6 @@ def mm(input, mat2, name=None): nontransposed, the prepended or appended dimension :math:`1` will be removed after matrix multiplication. - This op does not support broadcasting. See paddle.matmul. - Args: input (Tensor): The input tensor which is a Tensor. mat2 (Tensor): The input tensor which is a Tensor. 
@@ -949,9 +947,7 @@ def mm(input, mat2, name=None): """ if in_dygraph_mode(): - out = _varbase_creator(dtype=input.dtype) - _C_ops.matmul(input, mat2, out) - return out + return _C_ops.matmul_v2(input, mat2) def __check_input(x, y): var_names = {'x': x, 'y': y} @@ -991,7 +987,7 @@ def __check_input(x, y): helper = LayerHelper('mm', **locals()) out = helper.create_variable_for_type_inference(dtype=input.dtype) helper.append_op( - type='matmul', inputs={'X': input, + type='matmul_v2', inputs={'X': input, 'Y': mat2}, outputs={'Out': out}) return out From 3e2dec5b837397d2e8ecc006e302512c26adba9c Mon Sep 17 00:00:00 2001 From: Zhang Zheng <32410583+ZzSean@users.noreply.github.com> Date: Tue, 12 Oct 2021 21:46:37 +0800 Subject: [PATCH 133/298] Change the input param of fusion op interface from pointer to tensor (#36349) --- .../operators/fused/cudnn_bn_add_relu_test.cc | 64 ++++--------- .../fused/cudnn_bn_stats_finalize.cu.h | 24 +++-- .../operators/fused/cudnn_norm_conv.cu.h | 94 +++++++++++++++---- .../operators/fused/cudnn_norm_conv_test.cc | 61 ++++-------- .../fused/cudnn_scale_bias_add_relu.cu.h | 40 ++++++-- 5 files changed, 161 insertions(+), 122 deletions(-) diff --git a/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc b/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc index 837bca6c2cf4e3..709d69214c603f 100644 --- a/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc +++ b/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc @@ -536,32 +536,20 @@ class CudnnBNAddReluTester { bn_bias->Resize({1, 1, 1, channels_}); // input - float *sum_ptr = sum->data(); - float *sum_of_square_ptr = sum_of_square->data(); - float *bn_scale_ptr = bn_scale->data(); - float *bn_bias_ptr = bn_bias->data(); - mean->Resize({1, 1, 1, channels_}); var->Resize({1, 1, 1, channels_}); // output - float *mean_ptr = mean->data(); - float *var_ptr = var->data(); - float *saved_mean_ptr = - saved_mean->mutable_data({1, 1, 1, channels_}, place); - float *saved_var_ptr = - saved_var->mutable_data({1, 1, 1, channels_}, place); - T *equiv_scale_ptr = - equiv_scale->mutable_data({1, 1, 1, channels_}, place); - T *equiv_bias_ptr = - equiv_bias->mutable_data({1, 1, 1, channels_}, place); + equiv_scale->Resize({1, 1, 1, channels_}); + equiv_bias->Resize({1, 1, 1, channels_}); + saved_mean->Resize({1, 1, 1, channels_}); + saved_var->Resize({1, 1, 1, channels_}); auto param_shape = framework::vectorize(bn_scale->dims()); op::CudnnBNStatsFinalize bn_op(ctx, param_shape); - bn_op.Forward(ctx, sum_ptr, sum_of_square_ptr, bn_scale_ptr, bn_bias_ptr, - saved_mean_ptr, saved_var_ptr, mean_ptr, var_ptr, - equiv_scale_ptr, equiv_bias_ptr, eps_, momentum_, ele_count_, - true); + bn_op.Forward(ctx, *sum, *sum_of_square, *bn_scale, *bn_bias, saved_mean, + saved_var, mean, var, equiv_scale, equiv_bias, eps_, + momentum_, ele_count_, true); } // Get forward results of CudnnBNStatsFinalize + CudnnScaleBiasAddRelu @@ -627,21 +615,13 @@ class CudnnBNAddReluTester { &saved_var_z, &equiv_scale_z, &equiv_bias_z); } - T *x_ptr = x.data(); - T *z_ptr = (fuse_add_ || has_shortcut_) ? z.data() : nullptr; - T *equiv_scale_x_ptr = equiv_scale_x.data(); - T *equiv_bias_x_ptr = equiv_bias_x.data(); - T *equiv_scale_z_ptr = has_shortcut_ ? equiv_scale_z.data() : nullptr; - T *equiv_bias_z_ptr = has_shortcut_ ? 
equiv_bias_z.data() : nullptr; - T *y_ptr = - y.mutable_data({batch_size_, height_, width_, channels_}, place); + y.Resize(framework::make_ddim({batch_size_, height_, width_, channels_})); int c = channels_; int64_t nhw = ele_count_; int32_t c_int32_elems = ((c + 63) & ~63) / 32; int32_t nhw_int32_elems = (nhw + 31) & ~31; - int32_t *bitmask_ptr = bitmask.mutable_data( - {nhw_int32_elems, c_int32_elems, 1}, place); + bitmask.Resize(framework::make_ddim({nhw_int32_elems, c_int32_elems, 1})); auto data_shape = framework::vectorize(x.dims()); auto param_shape = framework::vectorize(bn_scale_x.dims()); @@ -651,8 +631,8 @@ class CudnnBNAddReluTester { op::CudnnScaleBiasAddRelu sbar_op(ctx, act_type_, fuse_add_, has_shortcut_, data_shape, param_shape, bitmask_shape); - sbar_op.Forward(ctx, x_ptr, equiv_scale_x_ptr, equiv_bias_x_ptr, y_ptr, - bitmask_ptr, z_ptr, equiv_scale_z_ptr, equiv_bias_z_ptr); + sbar_op.Forward(ctx, x, equiv_scale_x, equiv_bias_x, z, equiv_scale_z, + equiv_bias_z, &y, &bitmask); TensorCopySync(mean_x, platform::CPUPlace(), cpu_mean_x); TensorCopySync(var_x, platform::CPUPlace(), cpu_var_x); @@ -697,19 +677,10 @@ class CudnnBNAddReluTester { saved_mean.Resize({1, 1, 1, channels_}); saved_var.Resize({1, 1, 1, channels_}); - T *dy_ptr = dy.data(); - T *x_ptr = x.data(); - float *bn_scale_ptr = bn_scale.data(); - float *bn_bias_ptr = bn_bias.data(); - float *saved_mean_ptr = saved_mean.data(); - float *saved_var_ptr = saved_var.data(); - int32_t *bitmask_ptr = bitmask.data(); - T *dx_ptr = - dx.mutable_data({batch_size_, height_, width_, channels_}, place); - T *dz_ptr = - dz.mutable_data({batch_size_, height_, width_, channels_}, place); - float *dscale_ptr = dscale.mutable_data({1, 1, 1, channels_}, place); - float *dbias_ptr = dbias.mutable_data({1, 1, 1, channels_}, place); + dx.Resize(framework::make_ddim({batch_size_, height_, width_, channels_})); + dz.Resize(framework::make_ddim({batch_size_, height_, width_, channels_})); + dscale.Resize(framework::make_ddim({1, 1, 1, channels_})); + dbias.Resize(framework::make_ddim({1, 1, 1, channels_})); auto data_shape = framework::vectorize(x.dims()); auto param_shape = framework::vectorize(bn_scale.dims()); @@ -718,9 +689,8 @@ class CudnnBNAddReluTester { std::string act_type = "relu"; op::CudnnScaleBiasAddRelu sbar_op(ctx, act_type, true, false, data_shape, param_shape, bitmask_shape); - sbar_op.Backward(ctx, dy_ptr, x_ptr, bn_scale_ptr, bn_bias_ptr, - saved_mean_ptr, saved_var_ptr, bitmask_ptr, dx_ptr, dz_ptr, - dscale_ptr, dbias_ptr, eps_); + sbar_op.Backward(ctx, dy, x, bn_scale, bn_bias, saved_mean, saved_var, + bitmask, &dx, &dz, &dscale, &dbias, eps_); TensorCopySync(dx, platform::CPUPlace(), cpu_dx); TensorCopySync(dz, platform::CPUPlace(), cpu_dz); diff --git a/paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h b/paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h index 7d4b24cd4fc3de..dc703f9a822b5b 100644 --- a/paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h +++ b/paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h @@ -68,12 +68,13 @@ class CudnnBNStatsFinalize { } ~CudnnBNStatsFinalize() {} - void Forward(const platform::CUDADeviceContext &ctx, float *sum_ptr, - float *sum_of_squares_ptr, float *scale_ptr, float *bias_ptr, - float *saved_mean_ptr, float *saved_invstd_ptr, - float *running_mean_ptr, float *running_var_ptr, - T *equiv_scale_ptr, T *equiv_bias_ptr, double eps, - float momentum, int64_t ele_count, bool is_train) { + void Forward(const platform::CUDADeviceContext &ctx, const 
Tensor &sum, + const Tensor &sum_of_squares, const Tensor &scale, + const Tensor &bias, Tensor *saved_mean, Tensor *saved_invstd, + Tensor *running_mean, Tensor *running_var, Tensor *equiv_scale, + Tensor *equiv_bias, double eps, float momentum, + int64_t ele_count, bool is_train) { + auto place = ctx.GetPlace(); if (is_train) { TrainInit(ctx); } else { @@ -82,6 +83,17 @@ class CudnnBNStatsFinalize { auto &op = is_train ? train_op_ : inference_op_; // Set variant_param for both inference_op_ and train_op_ + float *sum_ptr = const_cast(sum.data()); + float *sum_of_squares_ptr = + const_cast(sum_of_squares.data()); + float *scale_ptr = const_cast(scale.data()); + float *bias_ptr = const_cast(bias.data()); + float *saved_mean_ptr = saved_mean->mutable_data(place); + float *saved_invstd_ptr = saved_invstd->mutable_data(place); + float *running_mean_ptr = running_mean->mutable_data(place); + float *running_var_ptr = running_var->mutable_data(place); + T *equiv_scale_ptr = equiv_scale->mutable_data(place); + T *equiv_bias_ptr = equiv_bias->mutable_data(place); op.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_SCALE, scale_ptr); op.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_BIAS, bias_ptr); op.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_RUNNING_MEAN, running_mean_ptr); diff --git a/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h b/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h index 1a73281cb8dc64..9b9328a5ca6208 100644 --- a/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h +++ b/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h @@ -38,7 +38,8 @@ struct NormConvolutionArgs { compute_type = platform::CudnnDataType::type; } - void Set(const std::vector &input_shape, + void Set(const platform::CUDADeviceContext &ctx, + const std::vector &input_shape, const std::vector &filter_shape, const std::vector &output_shape, int padding, int stride, int dilation, int group) { @@ -61,12 +62,33 @@ struct NormConvolutionArgs { "The filter_shape is expected to store as nhwc, and " "h = w = 1 or 3. But recieved filter_shape is [%s].", framework::make_ddim(filter_shape))); + PADDLE_ENFORCE_EQ((filter_shape[0] % 32 == 0 && filter_shape[3] % 8 == 0), + true, + platform::errors::InvalidArgument( + "The input channel is expected to be multiple of 8, " + "and the output channel is expected to be multiple " + "of 32. But recieved input channel is %d, output " + "channel is %d.", + filter_shape[3], filter_shape[0])); PADDLE_ENFORCE_EQ( output_shape.size(), 4U, platform::errors::InvalidArgument( "The size of output_shape is expected to 4. But recieved " "filter_shape's size is %d, filter_shape is [%s].", output_shape.size(), framework::make_ddim(output_shape))); + is_support = IsSupport(ctx, filter_shape, stride, dilation, group); + PADDLE_ENFORCE_EQ( + is_support, true, + platform::errors::InvalidArgument( + "Current test is only supported in the platforms with " + "compatiblity greater than or equal to 70 and the kernel size " + "must be equal to 1 or 3. When the kernel size is 1, " + "the stride must be 1 if the compatiblity is equal to 70. " + "Besides, the dilation and group must be equal to 1. 
But recieved " + "compatiblity is %d, kernel size is %d, stride is %d, " + "dilation is %d, group is %d", + ctx.GetComputeCapability(), filter_shape[1], stride, dilation, + group)); for (size_t i = 0; i < input_shape.size(); ++i) { in_dims.push_back(input_shape[i]); @@ -89,6 +111,25 @@ struct NormConvolutionArgs { conv_desc.set(dtype, paddings, strides, dilations, false, group); } + bool IsSupport(const platform::CUDADeviceContext &ctx, + const std::vector &filter_shape, int stride, int dilation, + int group) { + int kernel_size = filter_shape[1]; + if (dilation != 1 || group != 1) { + return false; + } + if (ctx.GetComputeCapability() == 70) { + if ((kernel_size == 3) || ((kernel_size == 1) && (stride == 1))) { + return true; + } + } else if (ctx.GetComputeCapability() > 70) { + if ((kernel_size == 3) || (kernel_size == 1)) { + return true; + } + } + return false; + } + cudnnDataType_t dtype; cudnnTensorFormat_t format; cudnnDataType_t compute_type; @@ -104,6 +145,8 @@ struct NormConvolutionArgs { platform::TensorDescriptor out_desc; platform::TensorDescriptor out_stats_desc; platform::ConvolutionDescriptor conv_desc; + + bool is_support; }; template @@ -115,15 +158,16 @@ class CudnnNormConvolution { const std::vector &output_shape, const int &padding, const int &stride, const int &dilation, const int &group) { - args_.Set(input_shape, filter_shape, output_shape, padding, stride, + args_.Set(ctx, input_shape, filter_shape, output_shape, padding, stride, dilation, group); } ~CudnnNormConvolution() {} - void Forward(const platform::CUDADeviceContext &ctx, T *input_ptr, - T *filter_ptr, T *output_ptr, float *sum_ptr, - float *sum_of_squares_ptr) { + void Forward(const platform::CUDADeviceContext &ctx, const Tensor &input, + const Tensor &filter, Tensor *output, Tensor *sum, + Tensor *sum_of_squares) { auto cudnn_handle = ctx.cudnn_handle(); + auto place = ctx.GetPlace(); CudnnFusionOp *fwd_op = GetForwardOp(ctx); size_t workspace_size = RoundUp( @@ -132,12 +176,17 @@ class CudnnNormConvolution { // Set variant_param // input ptr + T *input_ptr = const_cast(input.data()); + T *filter_ptr = const_cast(filter.data()); fwd_op->SetOpVariantParamAttrPtr(CUDNN_PTR_XDATA, input_ptr); fwd_op->SetOpVariantParamAttrPtr(CUDNN_PTR_WDATA, filter_ptr); fwd_op->SetOpVariantParamAttrPtr( CUDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES, &workspace_size); // output ptr + T *output_ptr = output->mutable_data(place); + float *sum_ptr = sum->mutable_data(place); + float *sum_of_squares_ptr = sum_of_squares->mutable_data(place); fwd_op->SetOpVariantParamAttrPtr(CUDNN_PTR_YDATA, output_ptr); fwd_op->SetOpVariantParamAttrPtr(CUDNN_PTR_YSUM, sum_ptr); fwd_op->SetOpVariantParamAttrPtr(CUDNN_PTR_YSQSUM, sum_of_squares_ptr); @@ -209,28 +258,34 @@ class CudnnNormConvolutionGrad { const std::vector &output_shape, const int &padding, const int &stride, const int &dilation, const int &group) { - args_.Set(input_shape, filter_shape, output_shape, padding, stride, + args_.Set(ctx, input_shape, filter_shape, output_shape, padding, stride, dilation, group); dgrad_algo_ = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1; } ~CudnnNormConvolutionGrad() {} - void Backward(const platform::CUDADeviceContext &ctx, T *input_ptr, - T *output_grad_ptr, T *filter_ptr, T *input_grad_ptr, - T *filter_grad_ptr, bool use_addto = false) { - if (filter_grad_ptr) { - BackwardFilter(ctx, input_ptr, output_grad_ptr, filter_ptr, - filter_grad_ptr); + void Backward(const platform::CUDADeviceContext &ctx, const Tensor &input, + const Tensor &filter, const Tensor 
&output_grad, + Tensor *input_grad, Tensor *filter_grad, + bool use_addto = false) { + auto place = ctx.GetPlace(); + T *input_ptr = const_cast(input.data()); + T *filter_ptr = const_cast(filter.data()); + T *output_grad_ptr = const_cast(output_grad.data()); + + if (filter_grad) { + T *filter_grad_ptr = filter_grad->mutable_data(place); + BackwardFilter(ctx, output_grad_ptr, input_ptr, filter_grad_ptr); } - if (input_grad_ptr) { - BackwardData(ctx, input_ptr, output_grad_ptr, filter_ptr, input_grad_ptr, - use_addto); + if (input_grad) { + T *input_grad_ptr = input_grad->mutable_data(place); + BackwardData(ctx, output_grad_ptr, filter_ptr, input_grad_ptr, use_addto); } } private: - void BackwardFilter(const platform::CUDADeviceContext &ctx, T *input_ptr, - T *output_grad_ptr, T *filter_ptr, T *filter_grad_ptr) { + void BackwardFilter(const platform::CUDADeviceContext &ctx, + T *output_grad_ptr, T *input_ptr, T *filter_grad_ptr) { auto cudnn_handle = ctx.cudnn_handle(); CudnnFusionOp *wgrad_op = GetBackwardFilterOp(ctx); @@ -255,9 +310,8 @@ class CudnnNormConvolutionGrad { workspace_size); } - void BackwardData(const platform::CUDADeviceContext &ctx, T *input_ptr, - T *output_grad_ptr, T *filter_ptr, T *input_grad_ptr, - bool use_addto = false) { + void BackwardData(const platform::CUDADeviceContext &ctx, T *output_grad_ptr, + T *filter_ptr, T *input_grad_ptr, bool use_addto = false) { auto cudnn_handle = ctx.cudnn_handle(); size_t workspace_size = GetWorkspaceSizeBwdData(ctx); diff --git a/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc b/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc index 4c14029b99c69c..23983d447e4788 100644 --- a/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc +++ b/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc @@ -229,15 +229,6 @@ class CudnnNormConvolutionTester { platform::DeviceContextPool::Instance().Get( platform::CUDAPlace(0))); - if (!Support(*ctx)) { - LOG(INFO) - << "Current test is only supported in the platforms with " - << "compatiblity greater than or equal to 70 and the kernel size " - << "must be equal to 1 or 3. 
Besides, when the kernel size is 1, " - << "the stride must be 1 if the compatiblity is equal to 70."; - return; - } - framework::Tensor cpu_output_base; framework::Tensor cpu_sum_base; framework::Tensor cpu_sum_of_square_base; @@ -325,14 +316,10 @@ class CudnnNormConvolutionTester { TensorCopySync(cpu_input_, place, &input); TensorCopySync(cpu_filter_nhwc_, place, &filter_nhwc); - T *input_ptr = input.data(); - T *filter_ptr = filter_nhwc.data(); - T *output_ptr = output.mutable_data( - {batch_size_, out_height_, out_width_, output_channels_}, place); - float *sum_ptr = - sum.mutable_data({1, 1, 1, output_channels_}, place); - float *sum_of_square_ptr = - sum_of_square.mutable_data({1, 1, 1, output_channels_}, place); + output.Resize(framework::make_ddim( + {batch_size_, out_height_, out_width_, output_channels_})); + sum.Resize(framework::make_ddim({1, 1, 1, output_channels_})); + sum_of_square.Resize(framework::make_ddim({1, 1, 1, output_channels_})); auto input_shape = framework::vectorize(input.dims()); auto filter_shape = framework::vectorize(filter_nhwc.dims()); @@ -340,8 +327,7 @@ class CudnnNormConvolutionTester { op::CudnnNormConvolution conv_op(ctx, input_shape, filter_shape, output_shape, padding_, stride_, dilation_, group_); - conv_op.Forward(ctx, input_ptr, filter_ptr, output_ptr, sum_ptr, - sum_of_square_ptr); + conv_op.Forward(ctx, input, filter_nhwc, &output, &sum, &sum_of_square); TensorCopySync(output, platform::CPUPlace(), cpu_output); TensorCopySync(sum, platform::CPUPlace(), cpu_sum); @@ -362,11 +348,8 @@ class CudnnNormConvolutionTester { TensorCopySync(cpu_filter_nhwc_, place, &filter_nhwc); TensorCopySync(cpu_output_grad_, place, &output_grad); - T *input_ptr = input.data(); - T *filter_ptr = filter_nhwc.data(); - T *output_grad_ptr = output_grad.data(); - T *input_grad_ptr = input_grad.mutable_data(input.dims(), place); - T *filter_grad_ptr = filter_grad.mutable_data(filter_nhwc.dims(), place); + input_grad.Resize(input.dims()); + filter_grad.Resize(filter_nhwc.dims()); auto input_shape = framework::vectorize(input.dims()); auto filter_shape = framework::vectorize(filter_nhwc.dims()); @@ -374,26 +357,13 @@ class CudnnNormConvolutionTester { op::CudnnNormConvolutionGrad conv_grad_op(ctx, input_shape, filter_shape, output_shape, padding_, stride_, dilation_, group_); - conv_grad_op.Backward(ctx, input_ptr, output_grad_ptr, filter_ptr, - input_grad_ptr, filter_grad_ptr); + conv_grad_op.Backward(ctx, input, filter_nhwc, output_grad, &input_grad, + &filter_grad); TensorCopySync(input_grad, platform::CPUPlace(), cpu_input_grad); TensorCopySync(filter_grad, platform::CPUPlace(), cpu_filter_grad); } - bool Support(const platform::CUDADeviceContext &ctx) { - if (ctx.GetComputeCapability() == 70) { - if ((kernel_size_ == 3) || ((kernel_size_ == 1) && (stride_ == 1))) { - return true; - } - } else if (ctx.GetComputeCapability() > 70) { - if ((kernel_size_ == 3) || (kernel_size_ == 1)) { - return true; - } - } - return false; - } - private: int batch_size_; int height_; @@ -477,6 +447,15 @@ TEST(CudnnNormConvFp16, K1S2O4) { CudnnNormConvolutionTester test( batch_size, height, width, input_channels, output_channels, kernel_size, stride); - test.CheckForward(1e-3, true); - test.CheckBackward(1e-3); + platform::CUDADeviceContext *ctx = static_cast( + platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0))); + + if (ctx->GetComputeCapability() <= 70) { + ASSERT_THROW(test.CheckForward(1e-3, true), + paddle::platform::EnforceNotMet); + 
ASSERT_THROW(test.CheckBackward(1e-3), paddle::platform::EnforceNotMet); + } else { + ASSERT_NO_THROW(test.CheckForward(1e-3, true)); + ASSERT_NO_THROW(test.CheckBackward(1e-3)); + } } diff --git a/paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h b/paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h index 2fdb3635e2e149..b48c964d264add 100644 --- a/paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h +++ b/paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h @@ -107,25 +107,33 @@ class CudnnScaleBiasAddRelu { ~CudnnScaleBiasAddRelu() {} - void Forward(const platform::CUDADeviceContext &ctx, T *x_ptr, T *x_scale_ptr, - T *x_bias_ptr, T *out_ptr, int32_t *bitmask_ptr, - T *z_ptr = nullptr, T *z_scale_ptr = nullptr, - T *z_bias_ptr = nullptr) { + void Forward(const platform::CUDADeviceContext &ctx, const Tensor &x, + const Tensor &x_scale, const Tensor &x_bias, const Tensor &z, + const Tensor &z_scale, const Tensor &z_bias, Tensor *out, + Tensor *bitmask) { ForwardInit(ctx); auto handle = ctx.cudnn_handle(); + auto place = ctx.GetPlace(); auto workspace_handle = ctx.cudnn_workspace_handle(); fwd_workspace_byte_ = fwd_op_.GetWorkspaceSizeInBytes(handle); // Set variant_param // input ptr + T *x_ptr = const_cast(x.data()); + T *x_scale_ptr = const_cast(x_scale.data()); + T *x_bias_ptr = const_cast(x_bias.data()); fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_XDATA, x_ptr); fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_EQSCALE, x_scale_ptr); fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_EQBIAS, x_bias_ptr); if (has_shortcut_) { + T *z_ptr = const_cast(z.data()); + T *z_scale_ptr = const_cast(z_scale.data()); + T *z_bias_ptr = const_cast(z_bias.data()); fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_ZDATA, z_ptr); fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_Z_EQSCALE, z_scale_ptr); fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_Z_EQBIAS, z_bias_ptr); } else { if (fused_add_) { + T *z_ptr = const_cast(z.data()); fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_ZDATA, z_ptr); } } @@ -134,6 +142,8 @@ class CudnnScaleBiasAddRelu { CUDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES, &fwd_workspace_byte_); // output ptr + T *out_ptr = out->mutable_data(place); + int32_t *bitmask_ptr = bitmask->mutable_data(place); fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_YDATA, out_ptr); fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_ACTIVATION_BITMASK, bitmask_ptr); @@ -147,16 +157,30 @@ class CudnnScaleBiasAddRelu { fwd_workspace_byte_); } - void Backward(const platform::CUDADeviceContext &ctx, T *dy_ptr, T *x_ptr, - float *scale_ptr, float *bias_ptr, float *saved_mean_ptr, - float *saved_invstd_ptr, int32_t *bitmask_ptr, T *dx_ptr, - T *dz_ptr, float *dscale_ptr, float *dbias_ptr, double eps) { + void Backward(const platform::CUDADeviceContext &ctx, const Tensor &dy, + const Tensor &x, const Tensor &scale, const Tensor &bias, + const Tensor &saved_mean, const Tensor &saved_invstd, + const Tensor &bitmask, Tensor *dx, Tensor *dz, Tensor *dscale, + Tensor *dbias, double eps) { BackwardInit(ctx); auto handle = ctx.cudnn_handle(); + auto place = ctx.GetPlace(); auto workspace_handle = ctx.cudnn_workspace_handle(); bwd_workspace_byte_ = bwd_op_.GetWorkspaceSizeInBytes(handle); // Set variant_param // input ptr + T *dy_ptr = const_cast(dy.data()); + T *x_ptr = const_cast(x.data()); + float *scale_ptr = const_cast(scale.data()); + float *bias_ptr = const_cast(bias.data()); + float *saved_mean_ptr = const_cast(saved_mean.data()); + float *saved_invstd_ptr = const_cast(saved_invstd.data()); + int32_t *bitmask_ptr 
= const_cast(bitmask.data()); + T *dx_ptr = dx->mutable_data(place); + T *dz_ptr = dz ? dz->mutable_data(place) : nullptr; + float *dscale_ptr = dscale ? dscale->mutable_data(place) : nullptr; + float *dbias_ptr = dbias ? dbias->mutable_data(place) : nullptr; + bwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_XDATA, x_ptr); bwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_DYDATA, dy_ptr); bwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_SCALE, scale_ptr); From 033a73c376eef67c8f7da91e713b94982d1b477a Mon Sep 17 00:00:00 2001 From: Zeng Jinle <32832641+sneaxiy@users.noreply.github.com> Date: Tue, 12 Oct 2021 22:00:10 +0800 Subject: [PATCH 134/298] Revert "refine LarsOptimizer (#36351)" (#36369) This reverts commit b3f6eedb77925c28a193eaedb858220b9417c5ca. --- python/paddle/fluid/optimizer.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 4625d7ea89b25e..24076e82b0365d 100755 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -2047,15 +2047,11 @@ def _create_accumulators(self, block, parameters): def _append_optimize_op(self, block, param_and_grad): assert isinstance(block, framework.Block) _lars_weight_decay = self._lars_weight_decay - _lars_coeff = self._lars_coeff param_name = param_and_grad[0].name - is_excluded = False if len(self._exclude_from_weight_decay) > 0: for name in self._exclude_from_weight_decay: if name in param_name: _lars_weight_decay = 0.0 - _lars_coeff = 0.0 - is_excluded = True break velocity_acc = self._get_accumulator(self._velocity_acc_str, @@ -2069,7 +2065,7 @@ def _append_optimize_op(self, block, param_and_grad): attrs = { "mu": self._momentum, - "lars_coeff": _lars_coeff, + "lars_coeff": self._lars_coeff, "lars_weight_decay": _lars_weight_decay, "multi_precision": find_master, "rescale_grad": self._rescale_grad @@ -2090,7 +2086,7 @@ def _append_optimize_op(self, block, param_and_grad): # create the momentum optimize op momentum_op = block.append_op( - type='momentum' if is_excluded else self.type, + type=self.type, inputs=inputs, outputs=outputs, attrs=attrs, From 3c2bdaa8ceaa7ad725ebc7faead6cf7f29aaa40a Mon Sep 17 00:00:00 2001 From: levi131 <83750468+levi131@users.noreply.github.com> Date: Wed, 13 Oct 2021 10:59:51 +0800 Subject: [PATCH 135/298] unify usage of tuple and list (#36368) * modify format * modify format --- python/paddle/autograd/functional.py | 81 ++++++++----------- python/paddle/autograd/utils.py | 24 +++--- python/paddle/fluid/dygraph/base.py | 2 +- .../tests/unittests/autograd/test_vjp_jvp.py | 4 +- .../fluid/tests/unittests/autograd/utils.py | 14 ++-- 5 files changed, 56 insertions(+), 69 deletions(-) diff --git a/python/paddle/autograd/functional.py b/python/paddle/autograd/functional.py index 688e04335ebb70..4d7fcd733cdb0b 100644 --- a/python/paddle/autograd/functional.py +++ b/python/paddle/autograd/functional.py @@ -18,20 +18,7 @@ from ..fluid.dygraph import grad from ..nn.initializer import assign from ..tensor import reshape, zeros_like, to_tensor -from .utils import _check_tensors, _stack_tensor_or_return_none, _replace_none_with_zero_tensor - - -def to_tensorlist(tl): - if not isinstance(tl, list): - if isinstance(tl, tuple): - tl = list(tl) - else: - tl = [tl] - for t in tl: - assert isinstance(t, paddle.Tensor) or t is None, ( - f'{t} is expected to be paddle.Tensor or None, but found {type(t)}.' 
- ) - return tl +from .utils import _tensors, _stack_tensor_or_return_none, _replace_none_with_zero_tensor @contextlib.contextmanager @@ -98,19 +85,19 @@ def vjp(func, inputs, v=None, create_graph=False, allow_unused=False): reverse mode automatic differentiation. Args: - func(Callable): `func` takes as input a tensor or a list - of tensors and returns a tensor or a list of tensors. - inputs(list[Tensor]|Tensor): used as positional arguments - to evaluate `func`. `inputs` is accepted as one tensor - or a list of tensors. - v(list[Tensor]|Tensor, optional): the cotangent vector - invovled in the VJP computation. `v` matches the size - and shape of `func`'s output. Default value is None + func(Callable): `func` takes as input a tensor or a list/tuple + of tensors and returns a tensor or a list/tuple of tensors. + inputs(list[Tensor]|tuple[Tensor]|Tensor): used as positional + arguments to evaluate `func`. `inputs` is accepted as one + tensor or a list of tensors. + v(list[Tensor]|tuple[Tensor]|Tensor|None, optional): the + cotangent vector invovled in the VJP computation. `v` matches + the size and shape of `func`'s output. Default value is None and in this case is equivalent to all ones the same size of `func`'s output. - create_graph(bool, optional): if `True`, gradients can - be evaluated on the results. If `False`, taking gradients - on the results is invalid. Default value is False. + create_graph(bool, optional): if `True`, gradients can be + evaluated on the results. If `False`, taking gradients on + the results is invalid. Default value is False. allow_unused(bool, optional): In case that some Tensors of `inputs` do not contribute to the computation of the output. If `allow_unused` is False, an error will be raised, @@ -119,8 +106,9 @@ def vjp(func, inputs, v=None, create_graph=False, allow_unused=False): Returns: output(tuple): - func_out: the output of `func(inputs)` - vjp(list[Tensor]|Tensor): the pullback results of `v` on `func` + func_out(list[Tensor]|tuple[Tensor]|Tensor): the output of + `func(inputs)` + vjp(list[Tensor]): the pullback results of `v` on `func` Examples: .. code-block:: python @@ -163,13 +151,13 @@ def func_unused(x, y): # [[2., 1.], # [1., 0.]]), None] """ - xs, v = to_tensorlist(inputs), to_tensorlist(v) + xs, v = _tensors(inputs, "inputs"), _tensors(v, "v") with gradient_scope( xs, v, create_graph=create_graph, allow_unused=allow_unused) as [xs, v, grad_fn, return_fn]: outputs = func(*xs) - ys = to_tensorlist(outputs) + ys = _tensors(outputs, "outputs") grads = grad_fn(ys, xs, v) outputs, grads = return_fn(outputs), return_fn(grads) @@ -186,16 +174,16 @@ def jvp(func, inputs, v=None, create_graph=False, allow_unused=False): **This API is ONLY available in imperative mode.** Args: - func(Callable): `func` takes as input a tensor or a list - of tensors and returns a tensor or a list of tensors. - inputs(list[Tensor]|Tensor): used as positional arguments - to evaluate `func`. `inputs` is accepted as one tensor - or a list of tensors. - v(list[Tensor]|Tensor, optional): the tangent vector - invovled in the JVP computation. `v` matches the size - and shape of `inputs`. `v` is Optional if `func` returns - a single tensor. Default value is None and in this case - is equivalent to all ones the same size of `inputs`. + func(Callable): `func` takes as input a tensor or a list/tuple + of tensors and returns a tensor or a list/tuple of tensors. + inputs(list[Tensor]|tuple[Tensor]|Tensor): used as positional + arguments to evaluate `func`. 
`inputs` is accepted as one + tensor or a list/tuple of tensors. + v(list[Tensor]|tuple[Tensor]|Tensor|None, optional): the + tangent vector invovled in the JVP computation. `v` matches + the size and shape of `inputs`. `v` is Optional if `func` + returns a single tensor. Default value is None and in this + case is equivalent to all ones the same size of `inputs`. create_graph(bool, optional): if `True`, gradients can be evaluated on the results. If `False`, taking gradients on the results is invalid. Default value is False. @@ -207,8 +195,9 @@ def jvp(func, inputs, v=None, create_graph=False, allow_unused=False): Returns: output(tuple): - func_out: the output of `func(inputs)` - jvp(list[Tensor]|Tensor): the pullback results of `v` on `func` + func_out(list[Tensor]|tuple[Tensor]|Tensor): the output of + `func(inputs)` + jvp(list[Tensor]): the pullback results of `v` on `func` Examples: .. code-block:: python @@ -232,13 +221,13 @@ def func(x): # [0., 0.]])] """ - xs, v = to_tensorlist(inputs), to_tensorlist(v) + xs, v = _tensors(inputs, "inputs"), _tensors(v, "v") with gradient_scope( xs, v, create_graph=create_graph, allow_unused=allow_unused) as [xs, v, grad_fn, return_fn]: outputs = func(*xs) - ys = to_tensorlist(outputs) + ys = _tensors(outputs, "outputs") ys_grad = [zeros_like(y) for y in ys] xs_grad = grad_fn(ys, xs, ys_grad, create_graph=True) ys_grad = grad_fn(xs_grad, ys_grad, v) @@ -357,8 +346,8 @@ def func(x, y): # [0., 0., 0., 2.]]), None)) ''' - inputs = _check_tensors(inputs, "inputs") - outputs = _check_tensors(func(*inputs), "outputs") + inputs = _tensors(inputs, "inputs") + outputs = _tensors(func(*inputs), "outputs") fin_size = len(inputs) fout_size = len(outputs) flat_outputs = tuple(reshape(output, shape=[-1]) for output in outputs) @@ -494,7 +483,7 @@ def func(x, y): # [0., 1., 1., 2.]]), None), (None, None)) ''' - inputs = _check_tensors(inputs, "inputs") + inputs = _tensors(inputs, "inputs") outputs = func(*inputs) assert isinstance(outputs, paddle.Tensor) and outputs.shape == [ 1 diff --git a/python/paddle/autograd/utils.py b/python/paddle/autograd/utils.py index d437f7d82d3611..81fe19c1688c12 100644 --- a/python/paddle/autograd/utils.py +++ b/python/paddle/autograd/utils.py @@ -15,22 +15,20 @@ import paddle -def _check_tensors(in_out_list, name): - assert in_out_list is not None, "{} should not be None".format(name) - - if isinstance(in_out_list, (list, tuple)): - assert len(in_out_list) > 0, "{} connot be empyt".format(name) - for each_var in in_out_list: +def _tensors(ts, name): + if isinstance(ts, (list, tuple)): + assert len(ts) > 0, "{} connot be empty".format(name) + for each_t in ts: assert isinstance( - each_var, - paddle.Tensor), "Elements of {} must be paddle.Tensor".format( - name) - return list(in_out_list) + each_t, paddle.Tensor + ) or each_t is None, "Elements of {} must be paddle.Tensor or None".format( + name) + return list(ts) else: assert isinstance( - in_out_list, - paddle.Tensor), "{} must be Tensor or list of Tensor".format(name) - return [in_out_list] + ts, paddle.Tensor + ) or ts is None, "{} must be Tensor or list of Tensor".format(name) + return [ts] def _stack_tensor_or_return_none(origin_list): diff --git a/python/paddle/fluid/dygraph/base.py b/python/paddle/fluid/dygraph/base.py index 18052fa7d4da85..460831f8745b31 100644 --- a/python/paddle/fluid/dygraph/base.py +++ b/python/paddle/fluid/dygraph/base.py @@ -456,7 +456,7 @@ def grad(outputs, the Tensors whose gradients are not needed to compute. Default None. 
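[Illustrative note, not part of the committed diff for PATCH 135] The `_tensors` helper above is what lets `vjp` and `jvp` take a bare Tensor, a list, or a tuple for `inputs`, while gradients always come back as a list. A minimal dygraph sketch reusing the shapes from the docstring examples; it assumes a Paddle build that already contains this patch:

    import paddle
    from paddle.autograd.functional import vjp, jvp

    def func(x):
        return paddle.matmul(x, x)

    x = paddle.ones(shape=[2, 2], dtype='float32')

    # a bare Tensor, a list, or a tuple are all accepted for `inputs`
    out, x_grad = vjp(func, x)
    out, x_grad = vjp(func, [x])
    out, x_grad = vjp(func, (x,))

    # jvp follows the same convention; with v=None an all-ones tangent is used
    out, x_tangent = jvp(func, x)
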
Returns: - tuple: a tuple of Tensors, whose length is the same as the Tensor number + list: a list of Tensors, whose length is the same as the Tensor number inside `inputs`, and the i-th returned Tensor is the sum of gradients of `outputs` with respect to the i-th `inputs`. diff --git a/python/paddle/fluid/tests/unittests/autograd/test_vjp_jvp.py b/python/paddle/fluid/tests/unittests/autograd/test_vjp_jvp.py index 86331d36a3ca82..f3680ab2a62238 100644 --- a/python/paddle/fluid/tests/unittests/autograd/test_vjp_jvp.py +++ b/python/paddle/fluid/tests/unittests/autograd/test_vjp_jvp.py @@ -15,7 +15,7 @@ import unittest import paddle -from paddle.autograd.functional import vjp, jvp, to_tensorlist +from paddle.autograd.functional import vjp, jvp, _tensors from paddle import grad, ones_like, zeros_like @@ -55,7 +55,7 @@ def inner(y): def make_v(f, inputs): - outputs = to_tensorlist(f(*inputs)) + outputs = _tensors(f(*inputs), "outputs") return [ones_like(x) for x in outputs] diff --git a/python/paddle/fluid/tests/unittests/autograd/utils.py b/python/paddle/fluid/tests/unittests/autograd/utils.py index 0aadef4a809f3f..3087e932051d8e 100644 --- a/python/paddle/fluid/tests/unittests/autograd/utils.py +++ b/python/paddle/fluid/tests/unittests/autograd/utils.py @@ -14,7 +14,7 @@ import numpy as np import paddle -from paddle.autograd.functional import _check_tensors +from paddle.autograd.functional import _tensors def _product(t): @@ -42,8 +42,8 @@ def _set_item(t, idx, value): def _compute_numerical_jacobian(func, xs, delta, np_dtype): - xs = _check_tensors(xs, "xs") - ys = _check_tensors(func(*xs), "ys") + xs = _tensors(xs, "xs") + ys = _tensors(func(*xs), "ys") fin_size = len(xs) fout_size = len(ys) jacobian = list([] for _ in range(fout_size)) @@ -59,11 +59,11 @@ def _compute_numerical_jacobian(func, xs, delta, np_dtype): orig = _get_item(xs[j], q) x_pos = orig + delta xs[j] = _set_item(xs[j], q, x_pos) - ys_pos = _check_tensors(func(*xs), "ys_pos") + ys_pos = _tensors(func(*xs), "ys_pos") x_neg = orig - delta xs[j] = _set_item(xs[j], q, x_neg) - ys_neg = _check_tensors(func(*xs), "ys_neg") + ys_neg = _tensors(func(*xs), "ys_neg") xs[j] = _set_item(xs[j], q, orig) @@ -76,8 +76,8 @@ def _compute_numerical_jacobian(func, xs, delta, np_dtype): def _compute_numerical_hessian(func, xs, delta, np_dtype): - xs = _check_tensors(xs, "xs") - ys = _check_tensors(func(*xs), "ys") + xs = _tensors(xs, "xs") + ys = _tensors(func(*xs), "ys") fin_size = len(xs) hessian = list([] for _ in range(fin_size)) for i in range(fin_size): From 90457d8c49671ba2194912d38a8d00a1dcccc593 Mon Sep 17 00:00:00 2001 From: From00 Date: Wed, 13 Oct 2021 11:09:12 +0800 Subject: [PATCH 136/298] Set NIGHTLY tag for 'tensordot' UT (#36354) --- python/paddle/fluid/tests/unittests/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 9d6a1d00cff604..33cd236a7d0943 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -1060,3 +1060,4 @@ endif() set_tests_properties(test_inplace_addto_strategy PROPERTIES TIMEOUT 120) set_tests_properties(test_eigvals_op PROPERTIES TIMEOUT 400) set_tests_properties(test_tensordot PROPERTIES TIMEOUT 1000) +set_tests_properties(test_tensordot PROPERTIES LABELS "RUN_TYPE=NIGHTLY") From caa2003ab82904e2cb3ff4337cd0b94b41539421 Mon Sep 17 00:00:00 2001 From: fuqianya Date: Wed, 13 Oct 2021 11:12:34 +0800 Subject: [PATCH 
137/298] [PaddlePaddle Hackathon] add AlexNet (#36058) * add alexnet --- python/paddle/tests/test_pretrained_model.py | 4 +- python/paddle/tests/test_vision_models.py | 4 +- python/paddle/vision/__init__.py | 2 + python/paddle/vision/models/__init__.py | 6 +- python/paddle/vision/models/alexnet.py | 192 +++++++++++++++++++ 5 files changed, 205 insertions(+), 3 deletions(-) create mode 100644 python/paddle/vision/models/alexnet.py diff --git a/python/paddle/tests/test_pretrained_model.py b/python/paddle/tests/test_pretrained_model.py index b24b51555c5819..fba1435c75e9c2 100644 --- a/python/paddle/tests/test_pretrained_model.py +++ b/python/paddle/tests/test_pretrained_model.py @@ -52,7 +52,9 @@ def infer(self, arch): np.testing.assert_allclose(res['dygraph'], res['static']) def test_models(self): - arches = ['mobilenet_v1', 'mobilenet_v2', 'resnet18', 'vgg16'] + arches = [ + 'mobilenet_v1', 'mobilenet_v2', 'resnet18', 'vgg16', 'alexnet' + ] for arch in arches: self.infer(arch) diff --git a/python/paddle/tests/test_vision_models.py b/python/paddle/tests/test_vision_models.py index a25a8f373c29c4..ea42c22e289ede 100644 --- a/python/paddle/tests/test_vision_models.py +++ b/python/paddle/tests/test_vision_models.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import unittest import numpy as np @@ -71,6 +70,9 @@ def test_resnet101(self): def test_resnet152(self): self.models_infer('resnet152') + def test_alexnet(self): + self.models_infer('alexnet') + def test_vgg16_num_classes(self): vgg16 = models.__dict__['vgg16'](pretrained=False, num_classes=10) diff --git a/python/paddle/vision/__init__.py b/python/paddle/vision/__init__.py index 76393865ded04a..b8ac548a966636 100644 --- a/python/paddle/vision/__init__.py +++ b/python/paddle/vision/__init__.py @@ -44,6 +44,8 @@ from .models import vgg16 # noqa: F401 from .models import vgg19 # noqa: F401 from .models import LeNet # noqa: F401 +from .models import AlexNet # noqa: F401 +from .models import alexnet # noqa: F401 from .transforms import BaseTransform # noqa: F401 from .transforms import Compose # noqa: F401 from .transforms import Resize # noqa: F401 diff --git a/python/paddle/vision/models/__init__.py b/python/paddle/vision/models/__init__.py index d38f3b1722ee8c..b85333614637f0 100644 --- a/python/paddle/vision/models/__init__.py +++ b/python/paddle/vision/models/__init__.py @@ -28,6 +28,8 @@ from .vgg import vgg16 # noqa: F401 from .vgg import vgg19 # noqa: F401 from .lenet import LeNet # noqa: F401 +from .alexnet import AlexNet # noqa: F401 +from .alexnet import alexnet # noqa: F401 __all__ = [ #noqa 'ResNet', @@ -45,5 +47,7 @@ 'mobilenet_v1', 'MobileNetV2', 'mobilenet_v2', - 'LeNet' + 'LeNet', + 'AlexNet', + 'alexnet' ] diff --git a/python/paddle/vision/models/alexnet.py b/python/paddle/vision/models/alexnet.py new file mode 100644 index 00000000000000..1d36ef37b6ced7 --- /dev/null +++ b/python/paddle/vision/models/alexnet.py @@ -0,0 +1,192 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from paddle.nn import Linear, Dropout, ReLU +from paddle.nn import Conv2D, MaxPool2D +from paddle.nn.initializer import Uniform +from paddle.fluid.param_attr import ParamAttr +from paddle.utils.download import get_weights_path_from_url + +model_urls = { + "alexnet": ( + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/AlexNet_pretrained.pdparams", + "7f0f9f737132e02732d75a1459d98a43", ) +} + +__all__ = [] + + +class ConvPoolLayer(nn.Layer): + def __init__(self, + input_channels, + output_channels, + filter_size, + stride, + padding, + stdv, + groups=1, + act=None): + super(ConvPoolLayer, self).__init__() + + self.relu = ReLU() if act == "relu" else None + + self._conv = Conv2D( + in_channels=input_channels, + out_channels=output_channels, + kernel_size=filter_size, + stride=stride, + padding=padding, + groups=groups, + weight_attr=ParamAttr(initializer=Uniform(-stdv, stdv)), + bias_attr=ParamAttr(initializer=Uniform(-stdv, stdv))) + self._pool = MaxPool2D(kernel_size=3, stride=2, padding=0) + + def forward(self, inputs): + x = self._conv(inputs) + if self.relu is not None: + x = self.relu(x) + x = self._pool(x) + return x + + +class AlexNet(nn.Layer): + """AlexNet model from + `"ImageNet Classification with Deep Convolutional Neural Networks" + `_ + + Args: + num_classes (int): Output dim of last fc layer. Default: 1000. + + Examples: + .. 
code-block:: python + + from paddle.vision.models import AlexNet + + alexnet = AlexNet() + + """ + + def __init__(self, num_classes=1000): + super(AlexNet, self).__init__() + self.num_classes = num_classes + stdv = 1.0 / math.sqrt(3 * 11 * 11) + self._conv1 = ConvPoolLayer(3, 64, 11, 4, 2, stdv, act="relu") + stdv = 1.0 / math.sqrt(64 * 5 * 5) + self._conv2 = ConvPoolLayer(64, 192, 5, 1, 2, stdv, act="relu") + stdv = 1.0 / math.sqrt(192 * 3 * 3) + self._conv3 = Conv2D( + 192, + 384, + 3, + stride=1, + padding=1, + weight_attr=ParamAttr(initializer=Uniform(-stdv, stdv)), + bias_attr=ParamAttr(initializer=Uniform(-stdv, stdv))) + stdv = 1.0 / math.sqrt(384 * 3 * 3) + self._conv4 = Conv2D( + 384, + 256, + 3, + stride=1, + padding=1, + weight_attr=ParamAttr(initializer=Uniform(-stdv, stdv)), + bias_attr=ParamAttr(initializer=Uniform(-stdv, stdv))) + stdv = 1.0 / math.sqrt(256 * 3 * 3) + self._conv5 = ConvPoolLayer(256, 256, 3, 1, 1, stdv, act="relu") + + if self.num_classes > 0: + stdv = 1.0 / math.sqrt(256 * 6 * 6) + self._drop1 = Dropout(p=0.5, mode="downscale_in_infer") + self._fc6 = Linear( + in_features=256 * 6 * 6, + out_features=4096, + weight_attr=ParamAttr(initializer=Uniform(-stdv, stdv)), + bias_attr=ParamAttr(initializer=Uniform(-stdv, stdv))) + + self._drop2 = Dropout(p=0.5, mode="downscale_in_infer") + self._fc7 = Linear( + in_features=4096, + out_features=4096, + weight_attr=ParamAttr(initializer=Uniform(-stdv, stdv)), + bias_attr=ParamAttr(initializer=Uniform(-stdv, stdv))) + self._fc8 = Linear( + in_features=4096, + out_features=num_classes, + weight_attr=ParamAttr(initializer=Uniform(-stdv, stdv)), + bias_attr=ParamAttr(initializer=Uniform(-stdv, stdv))) + + def forward(self, inputs): + x = self._conv1(inputs) + x = self._conv2(x) + x = self._conv3(x) + x = F.relu(x) + x = self._conv4(x) + x = F.relu(x) + x = self._conv5(x) + + if self.num_classes > 0: + x = paddle.flatten(x, start_axis=1, stop_axis=-1) + x = self._drop1(x) + x = self._fc6(x) + x = F.relu(x) + x = self._drop2(x) + x = self._fc7(x) + x = F.relu(x) + x = self._fc8(x) + + return x + + +def _alexnet(arch, pretrained, **kwargs): + model = AlexNet(**kwargs) + + if pretrained: + assert arch in model_urls, "{} model do not have a pretrained model now, you should set pretrained=False".format( + arch) + weight_path = get_weights_path_from_url(model_urls[arch][0], + model_urls[arch][1]) + + param = paddle.load(weight_path) + model.load_dict(param) + + return model + + +def alexnet(pretrained=False, **kwargs): + """AlexNet model + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet. Default: False. + + Examples: + .. 
code-block:: python + + from paddle.vision.models import alexnet + + # build model + model = alexnet() + + # build model and load imagenet pretrained weight + # model = alexnet(pretrained=True) + """ + return _alexnet('alexnet', pretrained, **kwargs) From d7858c997b88c73c4fb0bb94db378578fd7e7f07 Mon Sep 17 00:00:00 2001 From: Wangzheee <634486483@qq.com> Date: Wed, 13 Oct 2021 11:29:58 +0800 Subject: [PATCH 138/298] [PaddleInference] Pass: add int8 flag for op (#36042) * add_int_pass * add_int8_flag_pass * add_int8_flag_pass * fix CMakeLists.txt * fix test_trt_fc_fuse_quant_dequant_pass.py * fix python/paddle/fluid/tests/unittests/ir/inference/test_trt_fc_fuse_quant_dequant_pass.py * fix test_trt_fc_fuse_quant_dequant_pass.py --- paddle/fluid/framework/ir/CMakeLists.txt | 1 + .../framework/ir/add_support_int8_pass.cc | 54 +++++++++++++++++++ .../framework/ir/add_support_int8_pass.h | 36 +++++++++++++ .../framework/ir/graph_pattern_detector.cc | 23 ++++++++ .../framework/ir/graph_pattern_detector.h | 12 +++++ .../inference/api/paddle_pass_builder.cc | 5 +- paddle/fluid/inference/tensorrt/op_teller.cc | 6 ++- .../test_trt_fc_fuse_quant_dequant_pass.py | 13 +++-- 8 files changed, 140 insertions(+), 10 deletions(-) create mode 100644 paddle/fluid/framework/ir/add_support_int8_pass.cc create mode 100644 paddle/fluid/framework/ir/add_support_int8_pass.h diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 6f5f27400752dd..a2e9fc3a3d9ac5 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -97,6 +97,7 @@ pass_library(multihead_matmul_fuse_pass inference) pass_library(adaptive_pool2d_convert_global_pass inference) pass_library(unsqueeze2_eltwise_fuse_pass inference) pass_library(layer_norm_fuse_pass inference) +pass_library(add_support_int8_pass inference) pass_library(generate_pass DEPS pass_desc_proto) target_link_libraries(generate_pass pass_desc_proto) if(WITH_GPU OR WITH_ROCM) diff --git a/paddle/fluid/framework/ir/add_support_int8_pass.cc b/paddle/fluid/framework/ir/add_support_int8_pass.cc new file mode 100644 index 00000000000000..d157d2e934acea --- /dev/null +++ b/paddle/fluid/framework/ir/add_support_int8_pass.cc @@ -0,0 +1,54 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/framework/ir/add_support_int8_pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +#define GET_IR_NODE(node__) GET_IR_NODE_FROM_SUBGRAPH(node__, node__, pattern); +#define GET_NODES \ + GET_IR_NODE(prev_op); \ + GET_IR_NODE(prev_out); \ + GET_IR_NODE(quant_op); \ + GET_IR_NODE(quant_out); + +void AddSupportInt8Pass::ApplyImpl(ir::Graph* graph) const { + const std::string pattern_name = "add_support_int8"; + FusePassBase::Init(pattern_name, graph); + + GraphPatternDetector gpd; + + patterns::AddSupportInt8 pattern(gpd.mutable_pattern(), pattern_name); + pattern(); + int found_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + GET_NODES; + if (prev_op->Op()->HasAttr("out_threshold") && + quant_op->Op()->HasAttr("out_threshold")) { + quant_op->Op()->SetAttr("support_int8", true); + } + found_count++; + }; + gpd(graph, handler); + AddStatis(found_count); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(add_support_int8_pass, paddle::framework::ir::AddSupportInt8Pass); diff --git a/paddle/fluid/framework/ir/add_support_int8_pass.h b/paddle/fluid/framework/ir/add_support_int8_pass.h new file mode 100644 index 00000000000000..372250d60169d3 --- /dev/null +++ b/paddle/fluid/framework/ir/add_support_int8_pass.h @@ -0,0 +1,36 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" + +namespace paddle { +namespace framework { +namespace ir { + +class Graph; + +class AddSupportInt8Pass : public FusePassBase { + public: + AddSupportInt8Pass() {} + virtual ~AddSupportInt8Pass() {} + + protected: + void ApplyImpl(ir::Graph* graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index 449849762cb101..695da372d18f3e 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -2986,6 +2986,29 @@ PDNode *patterns::LayerNorm::operator()() { return shift_out; } +// Add support int8 flag +PDNode *patterns::AddSupportInt8::operator()() { + auto prev_op = + pattern->NewNode(prev_op_repr()) + ->assert_is_op() + ->assert_more([&](Node *node) { + return node->Op()->HasAttr("out_threshold") ? true : false; + }); + auto prev_out = pattern->NewNode(prev_out_repr())->assert_is_var(); + auto quant_op = + pattern->NewNode(quant_op_repr()) + ->assert_is_op() + ->assert_more([&](Node *node) { + return node->Op()->HasAttr("out_threshold") ? 
true : false; + }); + auto quant_out = + pattern->NewNode(quant_out_repr())->assert_is_var()->AsOutput(); + prev_op->LinksTo({prev_out}); + prev_out->LinksTo({quant_op}); + quant_op->LinksTo({quant_out}); + return quant_out; +} + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index 40c3e4f59bf262..4afb7dfd4991b0 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -1682,6 +1682,18 @@ struct LayerNorm : public PatternBase { PATTERN_DECL_NODE(shift_out); }; +// Add support int8 flag +struct AddSupportInt8 : public PatternBase { + AddSupportInt8(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "Add_support_int8") {} + + PDNode* operator()(); + PATTERN_DECL_NODE(prev_op); + PATTERN_DECL_NODE(prev_out); + PATTERN_DECL_NODE(quant_op); + PATTERN_DECL_NODE(quant_out); +}; + } // namespace patterns // Link two ir::Nodes from each other. diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index 704fbb2b95c892..47e9c1fd202a05 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -96,8 +96,9 @@ const std::vector kTRTSubgraphPasses({ "map_matmul_to_mul_pass", // "fc_fuse_pass", // "conv_elementwise_add_fuse_pass", // - "tensorrt_subgraph_pass", // - "conv_bn_fuse_pass", // + "add_support_int8_pass", + "tensorrt_subgraph_pass", // + "conv_bn_fuse_pass", // #if CUDNN_VERSION >= 7100 // To run conv_fusion, the version of cudnn must be // guaranteed at least v7 // cudnn8.0 has memory leak problem in conv + eltwise + act, so we diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index ef50aee48e2eb8..59368a299c59e2 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -59,6 +59,8 @@ struct SimpleOpTypeSetTeller : public Teller { #if CUDA_VERSION >= 10020 teller_set.insert("reshape"); teller_set.insert("reshape2"); + int8_teller_set.insert("reshape"); + int8_teller_set.insert("reshape2"); #endif } @@ -91,7 +93,9 @@ struct SimpleOpTypeSetTeller : public Teller { "scale", "elementwise_mul", "conv2d_transpose", - "hard_swish"}; + "hard_swish", + "transpose", + "transpose2"}; std::unordered_set teller_set{"mul", "matmul", "conv2d", diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_fc_fuse_quant_dequant_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_fc_fuse_quant_dequant_pass.py index 114fa6478f8a6f..9e1991ae1ae305 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_fc_fuse_quant_dequant_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_fc_fuse_quant_dequant_pass.py @@ -86,15 +86,14 @@ def network(): self.data = fluid.data( name='data', shape=[1, 28, 28], dtype='float32') self.label = fluid.data(name='label', shape=[1, 1], dtype='int64') - label_shape = fluid.layers.reshape(self.label, shape=[1, 1, 1]) fc_out = fluid.layers.fc(input=self.data, size=28, num_flatten_dims=2, bias_attr=False, act=None) - c_out = fluid.layers.reshape(fc_out, shape=[1, 1, 784]) + c_out = fluid.layers.reshape(fc_out, shape=[0, 784]) result = fluid.layers.relu(c_out) - loss = fluid.layers.cross_entropy(input=result, label=label_shape) + loss = 
fluid.layers.cross_entropy(input=result, label=self.label) avg_loss = fluid.layers.mean(loss) return avg_loss, result @@ -119,11 +118,11 @@ def network(): self.dynamic_shape_params = FCQuantDequantFusePassTRTDims3Cols2Test.DynamicShapeParam( { 'data': [1, 28, 28], - 'reshape2_1.tmp_0': [1, 1, 784] + 'reshape2_0.tmp_0': [1, 784] }, {'data': [4, 28, 28], - 'reshape2_1.tmp_0': [4, 1, 784]}, - {'data': [1, 28, 28], - 'reshape2_1.tmp_0': [1, 1, 784]}, False) + 'reshape2_0.tmp_0': + [4, 784]}, {'data': [1, 28, 28], + 'reshape2_0.tmp_0': [1, 784]}, False) self.activation_quantize_type = 'moving_average_abs_max' self.weight_quantize_type = 'channel_wise_abs_max' From 2c44ee7e8033d6abef02ed492c07caa154402193 Mon Sep 17 00:00:00 2001 From: Jiabin Yang Date: Wed, 13 Oct 2021 13:37:55 +0800 Subject: [PATCH 139/298] [New Feature] Support triple grad in Paddle (#36187) * native commit for triple grad of sigmod * Updated unittests files * init functional jacobian api * Updated trible_test func * Updated gradient_checker & test_script * finish test with dtype float32 * add float64 test case * polish code * use atol=1e-5 with dtype float64 * fix for ci * set timeout for test_jacobian * fix dygraph grad to support high differential * polish API docstring * Updated gradient checker and some related files * fix double grad strip error for high differential * fix double grad strip error for high differential * Add Sigmoid triple grad tests * fix dygraph double grad dtype error when calling for high differential senario * Updated triple grad teses func * Use np.random to initialize ddx * Updated triple_grad_check func * add todo for gradient checker and refine some comments * remove additional code * add test for warnging in backward.py * format python code Co-authored-by: veyron95 Co-authored-by: levi131 --- paddle/fluid/operators/activation_op.cc | 107 +++++++++++++- paddle/fluid/operators/activation_op.cu | 9 ++ paddle/fluid/operators/activation_op.h | 133 ++++++++++++++++-- python/paddle/fluid/backward.py | 9 +- .../fluid/tests/unittests/gradient_checker.py | 117 ++++++++++++++- .../unittests/test_activation_nn_grad.py | 22 +++ ...test_backward_infer_var_data_type_shape.py | 40 ++++++ 7 files changed, 417 insertions(+), 20 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/test_backward_infer_var_data_type_shape.py diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index 5a498e617a4ff4..ac98e49b1c205e 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -77,12 +77,12 @@ class ActivationGradOpMaker : public framework::SingleGradOpMaker { FLAGS_use_mkldnn || (op->HasAttr("use_mkldnn") && BOOST_GET_CONST(bool, op->GetAttr("use_mkldnn")))) { - op->SetInput("X", this->Input("X")); + op->SetInput("X", this->Input("X")); // x } if (static_cast(kDepValue) & static_cast(ActBwdOpFwdDeps::kDepOut)) { - op->SetInput("Out", this->Output("Out")); + op->SetInput("Out", this->Output("Out")); // out } } }; @@ -767,6 +767,10 @@ class ActivationOpDoubleGrad : public framework::OperatorWithKernel { ctx->ShareDim("Out", "DDOut"); ctx->ShareLoD("Out", "DDOut"); } + if (ctx->HasOutput("DOutNew")) { + ctx->ShareDim("Out", "DOutNew"); + ctx->ShareLoD("Out", "DOutNew"); + } } } @@ -804,6 +808,45 @@ class ActivationOpDoubleGrad2 : public framework::OperatorWithKernel { } }; +template +class ActivationOpTripleGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + 
void InferShape(framework::InferShapeContext* ctx) const override { + if (static_cast(kDepValue) & static_cast(kDepX)) { + if (ctx->HasOutput("DX")) { + ctx->ShareDim("X", "DX"); + ctx->ShareLoD("X", "DX"); + } + if (ctx->HasOutput("DDOut")) { + ctx->ShareDim("X", "DDOut"); + ctx->ShareLoD("X", "DDOut"); + } + } + if (static_cast(kDepValue) & static_cast(kDepOut)) { + if (ctx->HasOutput("D_DOut")) { + ctx->ShareDim("Out", "D_DOut"); + ctx->ShareLoD("Out", "D_DOut"); + } + if (ctx->HasOutput("D_OutNew")) { + ctx->ShareDim("Out", "D_OutNew"); + ctx->ShareLoD("Out", "D_OutNew"); + } + if (ctx->HasOutput("D_DDx")) { + ctx->ShareDim("DDX", "D_DDx"); + ctx->ShareLoD("DDX", "D_DDx"); + } + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return GetKernelType(ctx, *this, "DDX"); + } +}; + template class SigmoidDoubleGradMaker : public ::paddle::framework::SingleGradOpMaker { @@ -825,6 +868,36 @@ class SigmoidDoubleGradMaker } }; +template +class SigmoidTripleGradMaker + : public ::paddle::framework::SingleGradOpMaker { + public: + using ::paddle::framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType("sigmoid_triple_grad"); + // Out, DDX, DOut, D_DDOut, D_DOut_New // input + // D_OutNew, D_DOut, D_DDx // output + // input1: Out + op->SetInput("Out", this->Input("Out")); + // input2: ddx + op->SetInput("DDX", this->Input("DDX")); + // input3: dout + op->SetInput("DOut", this->Input("DOut")); + // input4: d_ddout + op->SetInput("D_DDOut", this->OutputGrad("DDOut")); + // input5: d_dout_new + op->SetInput("D_DOut_New", this->OutputGrad("DOutNew")); + op->SetAttrMap(this->Attrs()); + + // output: d_dOut, d_OutNew, d_ddx + op->SetOutput("D_OutNew", this->InputGrad("Out")); + op->SetOutput("D_DOut", this->InputGrad("DOut")); + op->SetOutput("D_DDx", this->InputGrad("DDX")); + } +}; + template class TanhDoubleGradMaker : public ::paddle::framework::SingleGradOpMaker { public: @@ -995,10 +1068,12 @@ class LogDoubleGradMaker : public ::paddle::framework::SingleGradOpMaker { }; DECLARE_INPLACE_OP_INFERER(ActivationGradOpInplaceInferer, - {framework::GradVarName("Out"), - framework::GradVarName("X")}); + {framework::GradVarName("Out"), // dout + framework::GradVarName("X")}); // dx DECLARE_INPLACE_OP_INFERER(ActivationDoubleGradOpInplaceInferer, {"DDX", "DDOut"}); +DECLARE_INPLACE_OP_INFERER(ActivationTripleGradOpInplaceInferer, + {"DDX", "D_DOut"}); template class PowGradOpMaker : public framework::SingleGradOpMaker { @@ -1121,13 +1196,21 @@ REGISTER_OPERATOR( REGISTER_OPERATOR(sigmoid_grad, ops::ActivationOpGrad, ops::ActivationGradOpInplaceInferer, ops::SigmoidDoubleGradMaker, - ops::SigmoidDoubleGradMaker) + ops::SigmoidDoubleGradMaker); // 3. Register Sigmoid DoubleGrad Operator REGISTER_OPERATOR( sigmoid_grad_grad, - ops::ActivationOpDoubleGrad::FwdDeps()>, - ops::ActivationDoubleGradOpInplaceInferer); + ops::ActivationOpDoubleGrad::FwdDeps()>, + ops::ActivationDoubleGradOpInplaceInferer, + ops::SigmoidTripleGradMaker, + ops::SigmoidTripleGradMaker); + +// 4. 
Register Sigmoid TripleGrad Operator +REGISTER_OPERATOR(sigmoid_triple_grad, + ops::ActivationOpTripleGrad< + ops::SigmoidTripleGradFunctor::FwdDeps()>, + ops::ActivationTripleGradOpInplaceInferer); // Register Sigmoid/GradSigmoid Kernels REGISTER_ACTIVATION_CPU_KERNEL(sigmoid, Sigmoid, SigmoidFunctor, @@ -1143,6 +1226,16 @@ REGISTER_OP_CPU_KERNEL( ops::SigmoidDoubleGradKernel>); +// Register TripleGrad Kernel +REGISTER_OP_CPU_KERNEL( + sigmoid_triple_grad, + ops::SigmoidTripleGradKernel>, + ops::SigmoidTripleGradKernel>, + ops::SigmoidTripleGradKernel>); + /* ========================================================================== */ /* ========================== tanh register ============================= */ diff --git a/paddle/fluid/operators/activation_op.cu b/paddle/fluid/operators/activation_op.cu index 72f10bf19e733a..f330f2d7e87ba7 100644 --- a/paddle/fluid/operators/activation_op.cu +++ b/paddle/fluid/operators/activation_op.cu @@ -1398,6 +1398,15 @@ REGISTER_OP_CUDA_KERNEL( ops::SigmoidGradGradFunctor>, ops::SigmoidDoubleGradKernel>); + +REGISTER_OP_CUDA_KERNEL( + sigmoid_triple_grad, + ops::SigmoidTripleGradKernel>, + ops::SigmoidTripleGradKernel>, + ops::SigmoidTripleGradKernel>); /* ========================================================================== */ /* =========================== tanh register ============================ */ diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h index 57ea97f746246b..4f26cb095c5a72 100644 --- a/paddle/fluid/operators/activation_op.h +++ b/paddle/fluid/operators/activation_op.h @@ -24,12 +24,13 @@ limitations under the License. */ #define _USE_MATH_DEFINES #endif +#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/float16.h" - #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" #endif @@ -282,19 +283,77 @@ struct SigmoidGradGradFunctor : public BaseActivationFunctor { auto dout = framework::EigenVector::Flatten( GET_DATA_SAFELY(dOut, "Input", "DOut", "SigmoidGradGrad")); auto dout_new = framework::EigenVector::Flatten( - GET_DATA_SAFELY(dOutNew, "Output", "DOutNew", "SquareGradGrad")); + GET_DATA_SAFELY(dOutNew, "Output", "DOutNew", "SigmoidGradGrad")); dout_new.device(*d) = (static_cast(1) - static_cast(2) * out) * dout * ddx; } if (ddOut) { auto ddout = framework::EigenVector::Flatten( - GET_DATA_SAFELY(ddOut, "Output", "DDOut", "SquareGradGrad")); + GET_DATA_SAFELY(ddOut, "Output", "DDOut", "SigmoidGradGrad")); ddout.device(*d) = (static_cast(1) - out) * out * ddx; } } static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } }; +/* + Out + DOut D_Dout + DDx -> SigmoidTripleGrad -> D_DDx + D_DDout d_OutNew + D_Dout_new + + D_Dout = (1-2*Out)*DDx*D_Dout_new + D_DDx = (1-Out)*Out*D_DDout + (1-2*Out)*DOut*D_Dout_new + D_OutNew = (DDx-2*Out*DDx)*D_DDout - 2*DOut*DDx*D_Dout_new + + Out, DDX, DOut, D_DDOut, D_DOut_New // input + D_OutNew, D_DOut, D_DDx // output +*/ +template +struct SigmoidTripleGradFunctor : public BaseActivationFunctor { + template + void operator()(const Device& dev, const framework::Tensor* Out, + const framework::Tensor* ddX, const framework::Tensor* dOut, + const framework::Tensor* d_DDOut, + const framework::Tensor* d_dOut_New, + framework::Tensor* d_d_Out, framework::Tensor* d_Out_New, + framework::Tensor* d_DDx) const { + auto* d 
= dev.eigen_device(); + auto ddx = framework::EigenVector::Flatten( + GET_DATA_SAFELY(ddX, "Input", "DDX", "SigmoidTripleGrad")); + auto out = framework::EigenVector::Flatten( + GET_DATA_SAFELY(Out, "Input", "Out", "SigmoidTripleGrad")); + auto dout = framework::EigenVector::Flatten( + GET_DATA_SAFELY(dOut, "Input", "DOut", "SigmoidTripleGrad")); + auto d_ddOut = framework::EigenVector::Flatten( + GET_DATA_SAFELY(d_DDOut, "Input", "D_DDOut", "SigmoidTripleGrad")); + auto d_dOutNew = framework::EigenVector::Flatten(GET_DATA_SAFELY( + d_dOut_New, "Input", "D_DOut_New", "SigmoidTripleGrad")); + + if (d_Out_New) { + auto d_OutNew = framework::EigenVector::Flatten(GET_DATA_SAFELY( + d_Out_New, "Output", "D_OutNew", "SigmoidTripleGrad")); + d_OutNew.device(*d) = (ddx - static_cast(2) * out * ddx) * d_ddOut - + static_cast(2) * dout * ddx * d_dOutNew; + } + if (d_d_Out) { + auto d_dOut = framework::EigenVector::Flatten( + GET_DATA_SAFELY(d_d_Out, "Output", "D_DOut", "SigmoidTripleGrad")); + d_dOut.device(*d) = + (static_cast(1) - static_cast(2) * out) * ddx * d_dOutNew; + } + if (d_DDx) { + auto d_ddx = framework::EigenVector::Flatten( + GET_DATA_SAFELY(d_DDx, "Output", "D_DDx", "SigmoidTripleGrad")); + d_ddx.device(*d) = + (static_cast(1) - out) * out * d_ddOut + + (static_cast(1) - static_cast(2) * out) * dout * d_dOutNew; + } + } + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } +}; + // silu(x) = x / (1 + exp(-x)) template struct SiluFunctor : public BaseActivationFunctor { @@ -465,13 +524,13 @@ struct TanhGradGradFunctor : public BaseActivationFunctor { auto dout = framework::EigenVector::Flatten( GET_DATA_SAFELY(dOut, "Input", "DOut", "TanhGradGrad")); auto dout_new = framework::EigenVector::Flatten( - GET_DATA_SAFELY(dOutNew, "Output", "DOutNew", "SquareGradGrad")); + GET_DATA_SAFELY(dOutNew, "Output", "DOutNew", "TanhGradGrad")); dout_new.device(*d) = static_cast(-1) * dout * static_cast(2) * out * ddx; } if (ddOut) { auto ddout = framework::EigenVector::Flatten( - GET_DATA_SAFELY(ddOut, "Output", "DDOut", "SquareGradGrad")); + GET_DATA_SAFELY(ddOut, "Output", "DDOut", "TanhGradGrad")); ddout.device(*d) = (static_cast(1) - out * out) * ddx; } } @@ -1856,7 +1915,6 @@ class SigmoidDoubleGradKernel framework::Tensor *dOutNew, *ddOut; Out = ddX = dOut = nullptr; dOutNew = ddOut = nullptr; - // extract ddx(input) and out(input) ddX = ctx.Input("DDX"); Out = ctx.Input("Out"); @@ -1868,20 +1926,15 @@ class SigmoidDoubleGradKernel Out, platform::errors::NotFound( "Cannot get input Variable Out, variable name = %s", ctx.InputName("Out"))); - // set output ddout ddOut = ctx.Output("DDOut"); - // extract dOut(intput) dOut = ctx.Input("DOut"); PADDLE_ENFORCE_NOT_NULL( dOut, platform::errors::NotFound( "Cannot get input Variable dOut, variable name = %s", ctx.InputName("DOut"))); - - // set output dout_new dOutNew = ctx.Output("DOutNew"); - if (dOutNew) dOutNew->mutable_data(Out->dims(), ctx.GetPlace()); if (ddOut) ddOut->mutable_data(Out->dims(), ctx.GetPlace()); auto& place = ctx.template device_context(); @@ -1890,6 +1943,64 @@ class SigmoidDoubleGradKernel } }; +// Out, DDX, DOut, D_DDOut, D_DOut_New // input +// D_OutNew, D_DOut, D_DDx // output +template +class SigmoidTripleGradKernel + : public framework::OpKernel { + public: + using T = typename Functor::ELEMENT_TYPE; + void Compute(const framework::ExecutionContext& ctx) const override { + const framework::Tensor *Out, *ddX, *dOut, *d_ddOut, *d_dOutNew; + framework::Tensor *d_OutNew, *d_dOut, *d_ddx; + Out = ddX = dOut = 
d_ddOut = d_dOutNew = nullptr; + d_OutNew = d_dOut = d_ddx = nullptr; + + // extract ddx(input), out(input), dOut(input), d_ddOut(input), + // d_dOutNew(input) + ddX = ctx.Input("DDX"); + Out = ctx.Input("Out"); + dOut = ctx.Input("DOut"); + d_ddOut = ctx.Input("D_DDOut"); + d_dOutNew = ctx.Input("D_DOut_New"); + + PADDLE_ENFORCE_NOT_NULL( + ddX, platform::errors::NotFound( + "Cannot get input Variable ddX, variable name = %s", + ctx.InputName("DDX"))); + PADDLE_ENFORCE_NOT_NULL( + Out, platform::errors::NotFound( + "Cannot get input Variable Out, variable name = %s", + ctx.InputName("Out"))); + PADDLE_ENFORCE_NOT_NULL( + dOut, platform::errors::NotFound( + "Cannot get input Variable dOut, variable name = %s", + ctx.InputName("DOut"))); + PADDLE_ENFORCE_NOT_NULL( + d_ddOut, platform::errors::NotFound( + "Cannot get input Variable d_ddOut, variable name = %s", + ctx.InputName("D_DDOut"))); + PADDLE_ENFORCE_NOT_NULL( + d_dOutNew, + platform::errors::NotFound( + "Cannot get input Variable d_dOutNew, variable name = %s", + ctx.InputName("D_DOutNew"))); + + // set output d_OutNew、d_dOut、d_ddx + d_dOut = ctx.Output("D_DOut"); + d_OutNew = ctx.Output("D_OutNew"); + d_ddx = ctx.Output("D_DDx"); + + if (d_dOut) d_dOut->mutable_data(Out->dims(), ctx.GetPlace()); + if (d_OutNew) d_OutNew->mutable_data(Out->dims(), ctx.GetPlace()); + if (d_ddx) d_ddx->mutable_data(ddX->dims(), ctx.GetPlace()); + auto& place = ctx.template device_context(); + Functor functor; + functor(place, Out, ddX, dOut, d_ddOut, d_dOutNew, // input + d_dOut, d_OutNew, d_ddx); // output + } +}; + template class TanhDoubleGradKernel : public framework::OpKernel { diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py index 7aa3c888f2ad18..7ab060be6df291 100755 --- a/python/paddle/fluid/backward.py +++ b/python/paddle/fluid/backward.py @@ -27,6 +27,7 @@ from . import log_helper import paddle.fluid from .data_feeder import check_type +import warnings __all__ = [ 'append_backward', 'gradients', @@ -371,6 +372,10 @@ def _infer_var_data_type_shape_(grad_var_name, block): grad_var.set_dtype(fwd_var.dtype()) grad_var.set_shape(fwd_var.shape()) else: + # TODO(jiabin): Maybe we should not to this to cause some unexpected error on dtype + warnings.warn( + "Set grad var: {} dtype to default FP32, since we can't find its related forward var". 
+ format(grad_var_name)) grad_var.set_dtype(core.VarDesc.VarType.FP32) @@ -408,7 +413,9 @@ def _strip_grad_suffix_(name): """ name = cpt.to_text(name) pos = name.find(core.grad_var_suffix()) - return name[:pos] if pos != -1 else name + new_name = name[:pos] if pos != -1 else name + new_pos = name.rfind('grad/') + return new_name[new_pos + 5:] if new_pos != -1 else new_name def _append_grad_suffix_(name): diff --git a/python/paddle/fluid/tests/unittests/gradient_checker.py b/python/paddle/fluid/tests/unittests/gradient_checker.py index 633fea17103858..01aa2fd9efa4fb 100644 --- a/python/paddle/fluid/tests/unittests/gradient_checker.py +++ b/python/paddle/fluid/tests/unittests/gradient_checker.py @@ -309,7 +309,7 @@ def fail_test(msg): _compute_analytical_jacobian(prog, clone_x, clone_y, place, scope)) for i, (x_idx, - y_idx) in enumerate(product(* [range(len(x)), range(len(y))])): + y_idx) in enumerate(product(*[range(len(x)), range(len(y))])): a = analytical[y_idx][x_idx] n = numerical[x_idx][y_idx] if not np.allclose(a, n, rtol, atol): @@ -391,3 +391,118 @@ def double_grad_check(x, x_init += y_grads_init grad_check(x, target_grads, x_init, place, program, eps, atol, rtol) + + +# TODO(jiabin): We currently support only triple grad check here, extend this to support +# higher order differenciation later. + + +# check triple grad and two outputs of the triple Kernel +def triple_grad_check(x, + y, + x_init=None, + y_grads=None, + x_grads_grads=None, + place=None, + program=None, + eps=1e-6, + atol=1e-5, + rtol=1e-3, + raise_exception=True): + """ + Check triple gradients. This function will append backward to the + program before third order gradient check. + + Args: + x (Variable|list[Variable]): input variables to the program. + y (Variable|list[Variable]): output variables to the program. + x_init (numpy.array|list[numpy.array]|None): the init value for input x. + y_grads (numpy.array|list[numpy.array]|None): the gradients with respect to y. + x_grads_grads (numpy.array|list[numpy.array]|None): the gradients with respect to your input. + place (fluid.CPUPlace or fluid.CUDAPlace): the device. + program (Program|None): a Program with forward pass. + If None, use fluid.default_main_program(). + eps (float): perturbation for finite differences. + atol (float): absolute tolerance. + rtol (float): relative tolerance. + raise_exception (bool): whether to raise an exception if + the check fails. Default is True. + Returns: + True if all differences satisfy numpy.allclose condition. 
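[Illustrative note, not part of the committed diff for PATCH 139] A condensed sketch of how `triple_grad_check` is driven; it mirrors the `TestSigmoidTripleGradCheck` case added further below and assumes it runs from the source tree (where `gradient_checker` under python/paddle/fluid/tests/unittests is importable) in static-graph mode with fresh programs:

    import numpy as np
    import paddle
    import paddle.fluid as fluid
    import paddle.fluid.layers as layers
    import gradient_checker  # python/paddle/fluid/tests/unittests/gradient_checker.py

    paddle.enable_static()
    main, startup = fluid.Program(), fluid.Program()
    with fluid.program_guard(main, startup):
        x = layers.data('x', [2, 3, 7, 9], False, dtype='float64')
        x.persistable = True
        y = layers.sigmoid(x)
        x_arr = np.random.random([2, 3, 7, 9]).astype('float64')
        # keep inputs away from regions where finite differences are unstable
        x_arr[np.abs(x_arr) < 0.005] = 0.002
        # checks third-order gradients of sigmoid against finite differences
        gradient_checker.triple_grad_check(
            [x], y, x_init=x_arr, place=fluid.CPUPlace(), eps=0.0005)
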
+ """ + # check input arguments + x = _as_list(x) + for v in x: + v.stop_gradient = False + v.persistable = True + y = _as_list(y) + + if program is None: + program = fluid.default_main_program() + + if y_grads is None: + scope = fluid.executor.global_scope() + y_grads = [] + y_grads_init = [] + for yi in y: + dyi_name = _append_grad_suffix_(yi.name) + np_type = dtype_to_np_dtype(yi.dtype) + dy = program.global_block().create_var( + name=dyi_name, shape=yi.shape, dtype=np_type, persistable=True) + dy.stop_gradient = False + v = np.random.random(size=yi.shape).astype(np_type) + set_var_in_scope(scope, place, dyi_name, v) + y_grads.append(dy) + y_grads_init.append(v) + else: + y_grads = _as_list(y_grads) + y_grads_init = [ + var_to_np_array_in_scope(scope, place, v.name) for v in y_grads + ] + + # append first order grads + target_grads = fluid.gradients(y, x, y_grads) + + if x_grads_grads is None: + scope = fluid.executor.global_scope() + x_grads_grads = [] + x_grads_grads_init = [] + for dxi in target_grads: + ddxi_name = _append_grad_suffix_(dxi.name) + np_type = dtype_to_np_dtype(dxi.dtype) + ddx = program.global_block().create_var( + name=ddxi_name, + shape=dxi.shape, + dtype=np_type, + persistable=True) + ddx.stop_gradient = False + v = np.random.random(size=dxi.shape).astype(np_type) + set_var_in_scope(scope, place, ddxi_name, v) + x_grads_grads.append(ddx) + x_grads_grads_init.append(v) + else: + x_grads_grads = _as_list(x_grads_grads) + x_grads_grads_init = [ + var_to_np_array_in_scope(scope, place, v.name) + for v in x_grads_grads + ] + # append second order grads + target_grads_grads = fluid.gradients(target_grads, x, x_grads_grads) + + x += y_grads + x_init = _as_list(x_init) + x_init += y_grads_init + + x += x_grads_grads + x_init += x_grads_grads_init + + # x <=> [x, dout, ddx] + grad_check( + x=x, + y=target_grads_grads, + x_init=x_init, + place=place, + program=program, + eps=eps, + atol=atol, + rtol=rtol) diff --git a/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py b/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py index 81b3e9bf34887e..8f3353d1155f6f 100644 --- a/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py +++ b/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py @@ -26,6 +26,28 @@ from decorator_helper import prog_scope +class TestSigmoidTripleGradCheck(unittest.TestCase): + @prog_scope() + def func(self, place): + shape = [2, 3, 7, 9] + eps = 0.0005 + dtype = np.float64 + x = layers.data('x', shape, False, dtype=dtype) + x.persistable = True + y = layers.sigmoid(x) + x_arr = np.random.random(shape).astype(dtype) + x_arr[np.abs(x_arr) < 0.005] = 0.002 + gradient_checker.triple_grad_check( + [x], y, x_init=x_arr, place=place, eps=eps) + + def test_grad(self): + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + for p in places: + self.func(p) + + class TestSigmoidDoubleGradCheck(unittest.TestCase): @prog_scope() def func(self, place): diff --git a/python/paddle/fluid/tests/unittests/test_backward_infer_var_data_type_shape.py b/python/paddle/fluid/tests/unittests/test_backward_infer_var_data_type_shape.py new file mode 100644 index 00000000000000..a0cd6fca573392 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_backward_infer_var_data_type_shape.py @@ -0,0 +1,40 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
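[Illustrative note, not part of the committed diff for PATCH 139] On the dygraph side, the intent of the "high differential" fixes is that repeated `paddle.grad` calls with `create_graph=True` can reach the new `sigmoid_triple_grad` kernel. The tests added here only exercise the static gradient checker, so the following is a sketch of the intended API rather than a verified example:

    import paddle

    x = paddle.rand([4], dtype='float64')
    x.stop_gradient = False
    y = paddle.nn.functional.sigmoid(x)

    (dx,) = paddle.grad(y, x, create_graph=True)    # first order: sigmoid_grad
    (ddx,) = paddle.grad(dx, x, create_graph=True)  # second order: sigmoid_grad_grad
    (dddx,) = paddle.grad(ddx, x)                   # third order: sigmoid_triple_grad
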
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +from decorator_helper import prog_scope +import unittest +import paddle.fluid as fluid +import numpy as np +import paddle +import warnings + + +class TestBackwardInferVarDataTypeShape(unittest.TestCase): + def test_backward_infer_var_data_type_shape(self): + paddle.enable_static() + program = fluid.default_main_program() + dy = program.global_block().create_var( + name="Tmp@GRAD", shape=[1, 1], dtype=np.float32, persistable=True) + # invoke warning + fluid.backward._infer_var_data_type_shape_("Tmp@GRAD", + program.global_block()) + res = False + with warnings.catch_warnings(): + res = True + self.assertTrue(res) + + +if __name__ == '__main__': + unittest.main() From e051bba0056053303071caa51849fa9a514015a4 Mon Sep 17 00:00:00 2001 From: Huihuang Zheng Date: Wed, 13 Oct 2021 14:37:03 +0800 Subject: [PATCH 140/298] Remove RunFromCinn in PE because We Will Call CinnRunner in Compute of SubgraphOp (#36385) Remove RunFromCinn method in PE because We Will Call CinnRunner in Compute method of SubgraphOp --- paddle/fluid/framework/parallel_executor.cc | 36 --------------------- paddle/fluid/framework/parallel_executor.h | 5 --- paddle/fluid/pybind/pybind.cc | 12 ------- python/paddle/fluid/executor.py | 14 ++------ 4 files changed, 2 insertions(+), 65 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 3b80e9c78677d1..d19ac0b65f4d1e 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -34,7 +34,6 @@ limitations under the License. */ #include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h" #include "paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h" #include "paddle/fluid/framework/ir/multi_devices_graph_pass/set_reader_device_info_utils.h" -#include "paddle/fluid/framework/paddle2cinn/cinn_runner.h" #include "paddle/fluid/framework/variable_helper.h" #include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" #include "paddle/fluid/platform/event.h" @@ -44,7 +43,6 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/cuda_device_guard.h" #endif -DECLARE_bool(use_cinn); DECLARE_double(eager_delete_tensor_gb); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) @@ -945,40 +943,6 @@ void ParallelExecutor::RunWithoutFetch( member_->executor_->Run(/*fetch_tensors*/ {}, /*return_merged*/ false); } -FetchResultType ParallelExecutor::RunFromCinn( - const std::unordered_map &feed_tensors, - const std::vector &fetch_names) { - // Feed tensor to scope, now only support 1 scope - // TODO(zhhsplendid): handle multiple scope - size_t scope_id = 0; - std::map cinn_input_tensors; - for (auto &name_tensor_pair : feed_tensors) { - bool is_persistable = member_->IsPersistable(name_tensor_pair.first); - if (!is_persistable) { - member_->SetSkipMemoryReuse(scope_id, name_tensor_pair.first); - } - Scope *feed_scope = is_persistable ? member_->local_scopes_[scope_id] - : member_->local_exec_scopes_[scope_id]; - Variable *feed_var = feed_scope->Var(name_tensor_pair.first); - LoDTensor *trg = feed_var->GetMutable(); - trg->ShareDataWith(name_tensor_pair.second); - trg->set_lod(name_tensor_pair.second.lod()); - - cinn_input_tensors[name_tensor_pair.first] = trg; - } - - // TODO(zhhsplendid): get correct API after CINN API is ready - // now only return empty fetch result; - std::shared_ptr cinn_runner = - paddle2cinn::CinnRunner::GetInstance(); - - cinn_runner->Run(Graph(), member_->local_exec_scopes_[scope_id], - &cinn_input_tensors); - - paddle::framework::FetchResultType fetches = FetchList(fetch_names.size()); - return fetches; -} - void ParallelExecutor::SkipMemoryReuse( size_t scope_idx, const std::vector &skip_vars) { for (auto &var_name : skip_vars) { diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index f908ce3f013937..78774f04896389 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -14,7 +14,6 @@ limitations under the License. */ #pragma once -#include #include #include #include @@ -93,10 +92,6 @@ class ParallelExecutor { void RunWithoutFetch(const std::vector &skip_eager_vars); - FetchResultType RunFromCinn( - const std::unordered_map &feed_tensors, - const std::vector &fetch_names); - void ResetOpHandleScopeMapOfGraphs( const std::unordered_map &scope_map); diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 80350abb4fe219..f58c2a5db381c7 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -3293,18 +3293,6 @@ All parameter, weight, gradient are variables in Paddle. 
BOOST_GET(paddle::framework::FetchUnmergedList, ret))); } }) - .def("run_from_cinn", - [](ParallelExecutor &self, - const std::unordered_map &feed_tensors, - const std::vector &fetch_names) -> py::object { - paddle::framework::FetchResultType ret; - { - pybind11::gil_scoped_release release; - ret = self.RunFromCinn(feed_tensors, fetch_names); - } - return py::cast( - std::move(BOOST_GET(paddle::framework::FetchList, ret))); - }) .def("device_count", &ParallelExecutor::DeviceCount); BindFleetWrapper(&m); diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index bea5b29ecafa65..17f8a7291ad8ff 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -24,7 +24,7 @@ import six from .data_feeder import convert_dtype from .framework import Program, default_main_program, Variable, Operator -from .framework import convert_np_dtype_to_dtype_, get_flags +from .framework import convert_np_dtype_to_dtype_ from . import core from . import unique_name from . import compiler @@ -1016,17 +1016,8 @@ def _run_parallel(self, program, scope, feed, fetch_list, fetch_var_name, if need_check_feed: check_feed_shape_type(var, feed_tensor, exe.device_count()) feed_tensor_dict[feed_name] = feed_tensor + exe.feed_and_split_tensor_into_local_scopes(feed_tensor_dict) - #TODO(zhhsplendid): handle other feed data format case for CINN - use_cinn = get_flags("FLAGS_use_cinn")["FLAGS_use_cinn"] - if use_cinn: - fetch_var_names = list(map(_to_name_str, fetch_list)) - fetch_tensors = exe.run_from_cinn( - feed_tensor_dict, fetch_var_names)._move_to_list() - return as_numpy( - fetch_tensors) if return_numpy else fetch_tensors - else: - exe.feed_and_split_tensor_into_local_scopes(feed_tensor_dict) elif isinstance(feed, list) or isinstance(feed, tuple): res = list() for i, each in enumerate(feed): @@ -1047,7 +1038,6 @@ def _run_parallel(self, program, scope, feed, fetch_list, fetch_var_name, res_dict[feed_name] = tensor res.append(res_dict) - use_cinn = get_flags("FLAGS_use_cinn")["FLAGS_use_cinn"] exe.feed_tensors_into_local_scopes(res) if hasattr(program._program, 'lr_sheduler'): From 59e425cd2d8f2fdc331cc79e6c33726dfeec3249 Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Wed, 13 Oct 2021 14:39:42 +0800 Subject: [PATCH 141/298] [Amp] refine code of amp level (#36362) * refine amp level * fix typo * update tracer._amp_level --- paddle/fluid/imperative/amp_auto_cast.cc | 13 +++++++++- paddle/fluid/imperative/amp_auto_cast.h | 24 +++++++++---------- paddle/fluid/imperative/tracer.cc | 4 ++-- paddle/fluid/imperative/tracer.h | 9 ++++--- paddle/fluid/pybind/imperative.cc | 11 +++++++-- .../fleet/meta_parallel/pp_utils/utils.py | 2 +- .../distributed/fleet/utils/recompute.py | 2 +- python/paddle/fluid/dygraph/amp/auto_cast.py | 10 ++++---- 8 files changed, 49 insertions(+), 26 deletions(-) diff --git a/paddle/fluid/imperative/amp_auto_cast.cc b/paddle/fluid/imperative/amp_auto_cast.cc index 48e5e430b136a5..b0d86f6db9f960 100644 --- a/paddle/fluid/imperative/amp_auto_cast.cc +++ b/paddle/fluid/imperative/amp_auto_cast.cc @@ -24,6 +24,17 @@ namespace imperative { class VarBase; +AutoCastGuard::AutoCastGuard(std::shared_ptr tracer, AmpLevel level) + : tracer_(tracer) { + pre_amp_level_ = tracer_->GetAmpLevel(); + + if (pre_amp_level_ != level) { + tracer_->SetAmpLevel(level); + } +} + +AutoCastGuard::~AutoCastGuard() { tracer_->SetAmpLevel(pre_amp_level_); } + AmpOperators::AmpOperators() : allow_ops_(new std::unordered_set()), block_ops_(new std::unordered_set()), @@ -117,7 +128,7 @@ 
static inline std::shared_ptr CastToType( imperative::NameVarBaseMap outs = {{"Out", {out}}}; { - AutoCastGuard guard(tracer, 0); + AutoCastGuard guard(tracer, AmpLevel::O0); tracer->TraceOp("cast", ins, outs, std::move(attrs)); } diff --git a/paddle/fluid/imperative/amp_auto_cast.h b/paddle/fluid/imperative/amp_auto_cast.h index 79bc83a777aa90..903e2652888d85 100644 --- a/paddle/fluid/imperative/amp_auto_cast.h +++ b/paddle/fluid/imperative/amp_auto_cast.h @@ -19,15 +19,22 @@ #include #include -#include "paddle/fluid/imperative/tracer.h" #include "paddle/fluid/imperative/type_defs.h" namespace paddle { namespace imperative { -// Singleton implementation with C++ 11 +// NOTE(zhiqiu): only O1 and O2 are valid now +enum class AmpLevel { + O0 = 0, // fp32 + O1, // amp, mixed fp32-fp16 + O2, // almost fp16 + O3, // fp16 +}; + class Tracer; +// Singleton implementation with C++ 11 class AmpOperators { public: ~AmpOperators(); @@ -63,16 +70,9 @@ std::ostream& operator<<(std::ostream& os, AmpOperators& ops); // NOTE(zhiqiu): AutoCastGuard is used for RAII. class AutoCastGuard { public: - AutoCastGuard(std::shared_ptr tracer, int guard_level) - : tracer_(tracer) { - pre_amp_level_ = tracer_->AMPLevel(); - - if (pre_amp_level_ != guard_level) { - tracer_->SetAMPLevel(guard_level); - } - } + AutoCastGuard(std::shared_ptr tracer, AmpLevel guard_level); - ~AutoCastGuard() { tracer_->SetAMPLevel(pre_amp_level_); } + ~AutoCastGuard(); // forbid copy and operator= AutoCastGuard(const AutoCastGuard& guard) = delete; @@ -80,7 +80,7 @@ class AutoCastGuard { private: std::shared_ptr tracer_; - int pre_amp_level_; + AmpLevel pre_amp_level_; }; NameVarBaseMap AutoCastInputs(const std::string& op_type, diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index 49e079c58caf3c..0f363d0ea1bff8 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -176,10 +176,10 @@ void Tracer::TraceOp(const std::string& type, const NameVarBaseMap& ins, : attr_checker->GetDefaultAttrMap(); NameVarBaseMap new_ins = ins; - if (amp_level_ == 1) { + if (amp_level_ == AmpLevel::O1) { VLOG(5) << "Auto mixed precision run operator: " << type; new_ins = AutoCastInputs(type, ins); - } else if (amp_level_ == 2) { + } else if (amp_level_ == AmpLevel::O2) { VLOG(5) << "Pure fp16 run operator: " << type; new_ins = CastPureFp16Inputs(type, ins); } diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index e77623d7a46092..418b2069b5bb62 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -23,6 +23,7 @@ #include #include "ThreadPool.h" #include "paddle/fluid/framework/garbage_collector.h" +#include "paddle/fluid/imperative/amp_auto_cast.h" #include "paddle/fluid/imperative/basic_engine.h" #include "paddle/fluid/imperative/jit/program_desc_tracer.h" #include "paddle/fluid/imperative/layer.h" @@ -31,6 +32,8 @@ namespace paddle { namespace imperative { +enum class AmpLevel; + using GarbageCollectorMap = std::map>; @@ -105,9 +108,9 @@ class Tracer { void SetHasGrad(bool has_grad) { has_grad_ = has_grad; } - void SetAMPLevel(int level) { amp_level_ = level; } + void SetAmpLevel(AmpLevel level) { amp_level_ = level; } - int AMPLevel() const { return amp_level_; } + AmpLevel GetAmpLevel() const { return amp_level_; } paddle::framework::GarbageCollector* MutableGarbageCollectorIfNotExists( const platform::Place& place); @@ -120,7 +123,7 @@ class Tracer { platform::Place expected_place_; GarbageCollectorMap gcs_; static 
thread_local bool has_grad_; - int amp_level_{0}; + AmpLevel amp_level_{AmpLevel::O0}; }; // To access static variable current_tracer diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 5aae05db8cc58c..2e22ee90133a86 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -1940,6 +1940,13 @@ void BindImperative(py::module *m_ptr) { &imperative::jit::ProgramDescTracer::CreateProgramDesc) .def("reset", &imperative::jit::ProgramDescTracer::Reset); + py::enum_(m, "AmpLevel", py::arithmetic()) + .value("O0", paddle::imperative::AmpLevel::O0) + .value("O1", paddle::imperative::AmpLevel::O1) + .value("O2", paddle::imperative::AmpLevel::O2) + .value("O3", paddle::imperative::AmpLevel::O3) + .export_values(); + py::class_>( m, "Tracer", R"DOC()DOC") .def("__init__", @@ -1947,8 +1954,8 @@ void BindImperative(py::module *m_ptr) { .def_property("_enable_program_desc_tracing", &imperative::Tracer::IsProgramDescTracingEnabled, &imperative::Tracer::SetEnableProgramDescTracing) - .def_property("_amp_level", &imperative::Tracer::AMPLevel, - &imperative::Tracer::SetAMPLevel) + .def_property("_amp_level", &imperative::Tracer::GetAmpLevel, + &imperative::Tracer::SetAmpLevel) .def_property("_has_grad", &imperative::Tracer::HasGrad, &imperative::Tracer::SetHasGrad) .def_property( diff --git a/python/paddle/distributed/fleet/meta_parallel/pp_utils/utils.py b/python/paddle/distributed/fleet/meta_parallel/pp_utils/utils.py index b29b0b3e275574..08266096548c4a 100644 --- a/python/paddle/distributed/fleet/meta_parallel/pp_utils/utils.py +++ b/python/paddle/distributed/fleet/meta_parallel/pp_utils/utils.py @@ -198,7 +198,7 @@ def forward(ctx, run_function, all_outputs, *args): # TODO support AMP tracer = framework._dygraph_tracer() - if tracer._amp_level == 0: + if tracer._amp_level == core.AmpLevel.O0: ctx.is_fw_autocast = False else: ctx.is_fw_autocast = True diff --git a/python/paddle/distributed/fleet/utils/recompute.py b/python/paddle/distributed/fleet/utils/recompute.py index 302877e51fe01d..56a64049b16e15 100755 --- a/python/paddle/distributed/fleet/utils/recompute.py +++ b/python/paddle/distributed/fleet/utils/recompute.py @@ -98,7 +98,7 @@ def forward(ctx, run_function, preserve_rng_state, *args): # TODO support AMP tracer = framework._dygraph_tracer() - if tracer._amp_level == 0: + if tracer._amp_level == core.AmpLevel.O0: ctx.is_fw_autocast = False else: ctx.is_fw_autocast = True diff --git a/python/paddle/fluid/dygraph/amp/auto_cast.py b/python/paddle/fluid/dygraph/amp/auto_cast.py index 0d02a383c1bb80..d218e6b7490d9c 100644 --- a/python/paddle/fluid/dygraph/amp/auto_cast.py +++ b/python/paddle/fluid/dygraph/amp/auto_cast.py @@ -24,6 +24,8 @@ import operator import types +AMP_LEVEL = core.AmpLevel + __all__ = ['amp_guard', 'amp_decorate'] # The set of ops that support fp16 calculation and are considered numerically- @@ -108,7 +110,7 @@ def _in_amp_guard(): """ tracer = _dygraph_tracer() if tracer: - if tracer._amp_level == 1: + if tracer._amp_level == core.AmpLevel.O1: return True else: return False @@ -251,11 +253,11 @@ def amp_guard(enable=True, enable = False if level == 'O1': - amp_level = 1 + amp_level = AMP_LEVEL.O1 _white_list = WHITE_LIST _black_list = BLACK_LIST else: - amp_level = 2 + amp_level = AMP_LEVEL.O2 _white_list = PURE_FP16_WHITE_LIST _black_list = PURE_FP16_BLACK_LIST @@ -264,7 +266,7 @@ def amp_guard(enable=True, custom_black_list, level) if not enable: - amp_level = 0 + amp_level = AMP_LEVEL.O0 if tracer: # enable 
auto_cast From bf748f245eb74ffc86e44853fa9ebad7c858b015 Mon Sep 17 00:00:00 2001 From: Jacek Czaja Date: Wed, 13 Oct 2021 08:40:20 +0200 Subject: [PATCH 142/298] Implemented LRU based cache clearing (#36290) - Lint - Merge with develop - lint --- .../fluid/operators/mkldnn/conv_mkldnn_op.cc | 49 ++++---- .../mkldnn/conv_transpose_mkldnn_op.cc | 33 +++--- .../operators/mkldnn/quantize_mkldnn_op.cc | 105 ++++++------------ paddle/fluid/platform/device_context.cc | 63 +++++++---- paddle/fluid/platform/device_context.h | 15 ++- paddle/fluid/platform/mkldnn_reuse.h | 17 +-- 6 files changed, 136 insertions(+), 146 deletions(-) diff --git a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc index cce835e6bc0354..84c989f64e46c0 100644 --- a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc @@ -78,7 +78,8 @@ class ConvMKLDNNHandlerT mkldnn::convolution_backward_weights>( dev_ctx, mkldnn_engine, cpu_place, platform::CreateKey(dev_ctx, framework::vectorize(input->dims()), - unique_name)) { + unique_name)), + is_test_(ctx.Attr("is_test")) { if (!this->isCached()) { PADDLE_ENFORCE_EQ( input->layout(), framework::DataLayout::kMKLDNN, @@ -159,7 +160,6 @@ class ConvMKLDNNHandlerT framework::slice_ddim(filter_dims, 2, filter_dims.size()); const auto ksize = framework::vectorize(filter_data_dims); - const bool is_test = ctx.Attr("is_test"); auto strides_temp = ctx.Attr>("strides"); std::vector strides(begin(strides_temp), end(strides_temp)); @@ -214,9 +214,8 @@ class ConvMKLDNNHandlerT const auto dst_md = platform::MKLDNNMemDesc( dst_tz, platform::MKLDNNGetDataType(), chosen_memory_format); - const auto fwd_prop_kind = is_test ? mkldnn::prop_kind::forward_inference - : mkldnn::prop_kind::forward_training; - + const auto fwd_prop_kind = is_test_ ? 
mkldnn::prop_kind::forward_inference + : mkldnn::prop_kind::forward_training; float sum_scale = 1.0f; std::vector output_shift_scale; if (platform::is_int8()) @@ -261,7 +260,8 @@ class ConvMKLDNNHandlerT mkldnn::convolution_backward_weights>( dev_ctx, dev_ctx.GetEngine(), cpu_place, platform::CreateKey(dev_ctx, framework::vectorize(in->dims()), - unique_name)) { + unique_name)), + is_test_(false) { if (!this->isBwdCached()) { PADDLE_ENFORCE_EQ( in->layout(), framework::DataLayout::kMKLDNN, @@ -291,7 +291,7 @@ class ConvMKLDNNHandlerT "Wrong format set for output_grad tensor")); PADDLE_ENFORCE_EQ( - ctx.Attr("is_test"), false, + is_test_, false, platform::errors::InvalidArgument( "is_test attribute should be set to False in training phase.")); @@ -557,13 +557,14 @@ class ConvMKLDNNHandlerT framework::vectorize(in_mem->dims()), platform::MKLDNNGetDataType(), in_mem->format()); return this->AcquireMemoryWithReorder( - user_mem_md, mem_md, platform::to_void_cast(in_mem_data), key_mem); + user_mem_md, mem_md, platform::to_void_cast(in_mem_data), key_mem, + is_test_); } else { const std::string target_key_suffix{key_mem_target}; const auto target_mem_p = this->AcquireMemory(target_key_suffix); user_mem_p->set_data_handle(platform::to_void_cast(in_mem_data)); if (user_mem_p != target_mem_p) { - this->AcquireReorder(user_mem_p, target_mem_p, key_mem); + this->AcquireReorder(user_mem_p, target_mem_p); } return target_mem_p; } @@ -571,12 +572,11 @@ class ConvMKLDNNHandlerT std::shared_ptr AcquireWeightsMemoryWithReorder( const framework::Tensor* filter, const int groups, const bool is_conv3d, - const bool is_test, const std::vector& scale_data = {1.0f}, - int mask = 0) { + const std::vector& scale_data = {1.0f}, int mask = 0) { // This is workaround to make execution faster, delete // if statement after including md inside Tensor auto weights_mem_p = this->AcquireMemory("@weights_mem_p_target"); - if (is_test && weights_mem_p) { + if (is_test_ && weights_mem_p) { return weights_mem_p; } else { const K* filter_data = filter->data(); @@ -589,16 +589,16 @@ class ConvMKLDNNHandlerT return this->AcquireMemoryWithReorder( user_src_md, this->fwd_pd_->weights_desc(), - platform::to_void_cast(filter_data), "@weights_mem_p", is_test, {}, - scale_data, mask); + platform::to_void_cast(filter_data), "@weights_mem_p", is_test_, + {}, scale_data, mask); } } std::shared_ptr AcquireBiasMemoryWithReorder( - const framework::Tensor* bias, const bool is_test, + const framework::Tensor* bias, const std::vector& scale_data = {1.0f}, int mask = 0) { auto bias_mem_p = this->AcquireMemory("@bias_mem_p_target"); - if (is_test && bias_mem_p) { + if (is_test_ && bias_mem_p) { return bias_mem_p; } else { const K* bias_data = bias->data(); @@ -608,7 +608,7 @@ class ConvMKLDNNHandlerT return this->AcquireMemoryWithReorder( user_bias_md, this->fwd_pd_->bias_desc(), - platform::to_void_cast(bias_data), "@bias_mem_p", is_test, {}, + platform::to_void_cast(bias_data), "@bias_mem_p", is_test_, {}, scale_data, mask); } } @@ -641,7 +641,7 @@ class ConvMKLDNNHandlerT platform::GetMKLDNNFormat(this->fwd_pd_->dst_desc())) { auto residual_memory_p = this->AcquireResidualMemory(residual_param); dst_memory_p = this->template AcquireDstMemory(output); - this->AcquireReorder(residual_memory_p, dst_memory_p, "@residual_dst"); + this->AcquireReorder(residual_memory_p, dst_memory_p); } else { // Changing ShareDataWith to TensorCopy results in performance drop // on ResNet architectures @@ -651,6 +651,9 @@ class ConvMKLDNNHandlerT } return 
dst_memory_p; } + + private: + const bool is_test_; }; } // anonymous namespace @@ -695,7 +698,6 @@ class ConvMKLDNNOpKernel : public framework::OpKernel { ctx.template device_context(); const auto& mkldnn_engine = dev_ctx.GetEngine(); - const bool is_test = ctx.Attr("is_test"); const bool is_conv3d = ctx.Attr>("strides").size() == 3U; const bool fuse_residual_conn = ctx.Attr("fuse_residual_connection"); @@ -712,7 +714,7 @@ class ConvMKLDNNOpKernel : public framework::OpKernel { auto src_memory_p = handler.AcquireSrcMemoryWithReorder(input); auto weights_memory_p = handler.AcquireWeightsMemoryWithReorder( - filter, ctx.Attr("groups"), is_conv3d, is_test); + filter, ctx.Attr("groups"), is_conv3d); std::shared_ptr dst_memory_p; if (fuse_residual_conn) { @@ -731,7 +733,7 @@ class ConvMKLDNNOpKernel : public framework::OpKernel { {MKLDNN_ARG_DST, *dst_memory_p}}; if (bias) { - auto bias_memory_p = handler.AcquireBiasMemoryWithReorder(bias, is_test); + auto bias_memory_p = handler.AcquireBiasMemoryWithReorder(bias); args.insert({MKLDNN_ARG_BIAS, *bias_memory_p}); } @@ -783,11 +785,10 @@ class ConvMKLDNNOpKernel : public framework::OpKernel { ctx.Attr>("Scale_weights"); const bool is_multi_channel = scale_weights_data.size() > 1; const int& groups = ctx.Attr("groups"); - const bool& is_test = ctx.Attr("is_test"); int mask_reorder = is_multi_channel ? ((groups != 1) ? (1 << 1) + (1 << 0) : 1 << 0) : 0; auto weights_memory_p = handler.AcquireWeightsMemoryWithReorder( - filter, groups, false, is_test, scale_weights_data, mask_reorder); + filter, groups, false, scale_weights_data, mask_reorder); std::shared_ptr dst_memory_p; if (fuse_residual_conn) { @@ -822,7 +823,7 @@ class ConvMKLDNNOpKernel : public framework::OpKernel { handler.get_int8_bias_scales(ctx); auto bias_memory_p = handler.AcquireBiasMemoryWithReorder( - bias, is_test, scale_bias_data, mask_reorder); + bias, scale_bias_data, mask_reorder); args.insert({MKLDNN_ARG_BIAS, *bias_memory_p}); } diff --git a/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc index 8d43e9f0dca44f..4c374d72c046fc 100644 --- a/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc @@ -51,10 +51,10 @@ class ConvTransposeMKLDNNHandlerT : platform::MKLDNNHandlerT( dev_ctx, mkldnn_engine, cpu_place, platform::CreateKey(dev_ctx, framework::vectorize(input->dims()), - unique_name)) { + unique_name)), + is_test_(ctx.Attr("is_test")) { if (!this->isCached()) { - const bool is_test = ctx.Attr("is_test"); - PADDLE_ENFORCE_EQ(is_test, true, + PADDLE_ENFORCE_EQ(is_test_, true, platform::errors::InvalidArgument( "ConvTransposeMKLDNN works only for inference. " "The attribute \'is_test\' value should be set to " @@ -169,8 +169,8 @@ class ConvTransposeMKLDNNHandlerT const mkldnn::primitive_attr conv_trans_attr = CreatePostOps(fuse_activation, fuse_alpha, fuse_beta); - auto fwd_prop_kind = is_test ? mkldnn::prop_kind::forward_inference - : mkldnn::prop_kind::forward_training; + auto fwd_prop_kind = is_test_ ? 
mkldnn::prop_kind::forward_inference + : mkldnn::prop_kind::forward_training; if (bias) { std::vector bias_tz = framework::vectorize(bias->dims()); const auto bias_md = @@ -231,18 +231,18 @@ class ConvTransposeMKLDNNHandlerT const auto target_src_mem_p = this->AcquireMemory(target_key_suffix); user_src_mem_p->set_data_handle(platform::to_void_cast(input_data)); if (user_src_mem_p != target_src_mem_p) { - this->AcquireReorder(user_src_mem_p, target_src_mem_p, "@src_mem_p"); + this->AcquireReorder(user_src_mem_p, target_src_mem_p); } return target_src_mem_p; } } std::shared_ptr AcquireWeightsMemoryWithReorder( - const framework::Tensor* filter, const int& groups, const bool& is_test) { + const framework::Tensor* filter, const int& groups) { // This is workaround to make execution faster, delete // if statement after including md inside Tensor auto weights_mem_p = this->AcquireMemory("@weights_mem_p_target"); - if (is_test && weights_mem_p) { + if (is_test_ && weights_mem_p) { return weights_mem_p; } else { const K* filter_data = filter->data(); @@ -277,15 +277,15 @@ class ConvTransposeMKLDNNHandlerT return this->template AcquireMemoryWithReorder( user_src_md, this->fwd_pd_->weights_desc(), - platform::to_void_cast(filter_data), "@weights_mem_p", is_test, + platform::to_void_cast(filter_data), "@weights_mem_p", is_test_, iohw2oihw_reorder); } } std::shared_ptr AcquireBiasMemoryWithReorder( - const framework::Tensor* bias, const bool& is_test) { + const framework::Tensor* bias) { auto bias_mem_p = this->AcquireMemory("@bias_mem_p_target"); - if (is_test && bias_mem_p) { + if (is_test_ && bias_mem_p) { return bias_mem_p; } else { const K* bias_data = bias->data(); @@ -294,9 +294,12 @@ class ConvTransposeMKLDNNHandlerT MKLDNNMemoryFormat::x); return this->AcquireMemoryWithReorder( user_bias_md, this->fwd_pd_->bias_desc(), - platform::to_void_cast(bias_data), "@bias_mem_p", is_test); + platform::to_void_cast(bias_data), "@bias_mem_p", is_test_); } } + + private: + const bool is_test_; }; template @@ -325,8 +328,6 @@ class ConvTransposeMKLDNNOpKernel : public framework::OpKernel { ctx.template device_context(); const auto& mkldnn_engine = dev_ctx.GetEngine(); - const bool is_test = ctx.Attr("is_test"); - const auto* input = ctx.Input("Input"); const auto* filter = ctx.Input("Filter"); const auto* bias = @@ -340,7 +341,7 @@ class ConvTransposeMKLDNNOpKernel : public framework::OpKernel { output, unique_name); auto src_memory_p = handler.AcquireSrcMemoryWithReorder(input); auto weights_memory_p = handler.AcquireWeightsMemoryWithReorder( - filter, ctx.Attr("groups"), is_test); + filter, ctx.Attr("groups")); std::shared_ptr dst_memory_p = handler.template AcquireDstMemory(output); @@ -352,7 +353,7 @@ class ConvTransposeMKLDNNOpKernel : public framework::OpKernel { {MKLDNN_ARG_DST, *dst_memory_p}}; if (bias) { - auto bias_memory_p = handler.AcquireBiasMemoryWithReorder(bias, is_test); + auto bias_memory_p = handler.AcquireBiasMemoryWithReorder(bias); args.insert({MKLDNN_ARG_BIAS, *bias_memory_p}); } auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); diff --git a/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc index 819c0d15505ca9..815af4eaaf1b37 100644 --- a/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc @@ -64,81 +64,46 @@ class QuantOpKernel : public framework::OpKernel { bool is_negative_input = ctx.Attr("is_negative_input"); bool bfloat16 = ctx.Attr("bfloat16"); - 
std::string key = - platform::CreateKey(dev_ctx, src_tz, scale_data, scale_shift, - is_negative_input, ctx.OutputName("Output")); - key = platform::ExtendKeyWithThreadInfoIfNeeded(dev_ctx, key); - - const std::string key_prim = key + "@r"; - const std::string key_src_mem = key + "@s"; - const std::string key_dst_mem = key + "@d"; - + // TODO(jczaja): Refactor with Acquire API std::shared_ptr src_memory; std::shared_ptr dst_memory; std::shared_ptr reorder_p; - reorder_p = std::static_pointer_cast(dev_ctx.GetBlob(key_prim)); - - if (reorder_p == nullptr) { - std::string out_layout = ctx.Attr("output_format"); - MKLDNNMemoryFormat out_format = - platform::data_format_to_memory_format(out_layout); - mkldnn::primitive_attr attri; - int mask = 0; - attri.set_output_scales(mask, {scale_data}); - - if (with_shift) { - mkldnn::post_ops post_operations; - post_operations.append_sum(); - attri.set_post_ops(post_operations); - uint8_t* output_data = output->mutable_data(ctx.GetPlace()); - // memset casts scale_shift to unsigned char (uint8_t) internally - std::memset(output_data, scale_shift, output->numel()); - } - - auto src_md = platform::MKLDNNMemDesc({src_tz}, memory::data_type::f32, - input->format()); - src_memory = std::make_shared( - src_md, engine, to_void_cast(input_data)); - - std::shared_ptr dst_md; - if (bfloat16) { - platform::SetDstMemoryQuantized( - ctx, output, dst_tz, engine, dst_md, dst_memory, out_format); - } else if (is_negative_input && !with_shift) { - platform::SetDstMemoryQuantized(ctx, output, dst_tz, engine, - dst_md, dst_memory, out_format); - } else { - platform::SetDstMemoryQuantized( - ctx, output, dst_tz, engine, dst_md, dst_memory, out_format); - } - auto reorder_pd = std::shared_ptr( - new reorder::primitive_desc(*src_memory, *dst_memory, attri)); - reorder_p = std::shared_ptr(new reorder(*reorder_pd)); - - dev_ctx.SetBlob(key_prim, reorder_p); - dev_ctx.SetBlob(key_src_mem, src_memory); - dev_ctx.SetBlob(key_dst_mem, dst_memory); + + std::string out_layout = ctx.Attr("output_format"); + MKLDNNMemoryFormat out_format = + platform::data_format_to_memory_format(out_layout); + mkldnn::primitive_attr attri; + int mask = 0; + attri.set_output_scales(mask, {scale_data}); + + if (with_shift) { + mkldnn::post_ops post_operations; + post_operations.append_sum(); + attri.set_post_ops(post_operations); + uint8_t* output_data = output->mutable_data(ctx.GetPlace()); + // memset casts scale_shift to unsigned char (uint8_t) internally + std::memset(output_data, scale_shift, output->numel()); + } + + auto src_md = platform::MKLDNNMemDesc({src_tz}, memory::data_type::f32, + input->format()); + src_memory = std::make_shared(src_md, engine, + to_void_cast(input_data)); + + std::shared_ptr dst_md; + if (bfloat16) { + platform::SetDstMemoryQuantized( + ctx, output, dst_tz, engine, dst_md, dst_memory, out_format); + } else if (is_negative_input && !with_shift) { + platform::SetDstMemoryQuantized(ctx, output, dst_tz, engine, + dst_md, dst_memory, out_format); } else { - src_memory = std::static_pointer_cast( - dev_ctx.GetBlob(key_src_mem)); - src_memory->set_data_handle(to_void_cast(input_data)); - - dst_memory = std::static_pointer_cast( - dev_ctx.GetBlob(key_dst_mem)); - auto place = ctx.GetPlace(); - - if (bfloat16) { - dst_memory->set_data_handle( - output->mutable_data(place)); - } else if (with_shift || !is_negative_input) { - uint8_t* output_data = output->mutable_data(ctx.GetPlace()); - if (with_shift) std::memset(output_data, scale_shift, output->numel()); - 
dst_memory->set_data_handle(output_data); - } else { - dst_memory->set_data_handle( - output->mutable_data(ctx.GetPlace())); - } + platform::SetDstMemoryQuantized(ctx, output, dst_tz, engine, + dst_md, dst_memory, out_format); } + auto reorder_pd = std::shared_ptr( + new reorder::primitive_desc(*src_memory, *dst_memory, attri)); + reorder_p = std::shared_ptr(new reorder(*reorder_pd)); auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); { diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 587ad5f37e55e5..8c81db8c26b0be 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -11,6 +11,12 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/platform/device_context.h" #include +#include +#ifdef _WIN32 +#include +#else +#include +#endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/memory/allocation/cuda_device_context_allocator.h" @@ -666,7 +672,7 @@ void MKLDNNDeviceContext::ResetBlobMap(void* ptr) { // of this executor for (auto& s : *p_exec_items_) { for (auto& v : (*s.second)[ptr]) { - (v.first)->erase(v.second); + (v.first)->second.erase(v.second); } s.second->erase(ptr); } @@ -677,12 +683,27 @@ void MKLDNNDeviceContext::ResetBlobMap(void* ptr) { } } -void MKLDNNDeviceContext::RemoveShapeEntriesWithExecutor(void) const { - p_exec_items_->erase(p_exec_items_->begin()); +std::string MKLDNNDeviceContext::PickLeastUsedShape( + BlobPtr_t sb) const { + auto ancient_one = sb->begin(); + for (auto v = std::next(sb->begin()); v != sb->end(); ++v) { + if (v->second->first < ancient_one->second->first) { + ancient_one = v; + } + } + VLOG(2) << "num_shapes: " << sb->size() + << ", remove all blobs of shape: " << ancient_one->first; + return ancient_one->first; +} + +void MKLDNNDeviceContext::RemoveShapeEntriesWithExecutor( + std::string shape_to_be_removed) const { + p_exec_items_->erase(shape_to_be_removed); } -void MKLDNNDeviceContext::LinkEntryWithExecutor(BlobPtr_t pblob, - KeyBlob::iterator it) const { +void MKLDNNDeviceContext::LinkEntryWithExecutor( + BlobPtr_t> pblob, + KeyBlob::iterator it) const { // Take current input shape from TLS // Take current executor addess from TLS // and for this executor's items add the one defined with arguments @@ -719,7 +740,7 @@ void MKLDNNDeviceContext::SetBlob(const std::string& name, BlobPtr_t data) const { BlobMap* pMap = p_blobmap_.get(); BlobPtr_t sBlob = nullptr; - BlobPtr_t pBlob = nullptr; + BlobPtr_t> pBlob = nullptr; int sid = tls().get_cur_mkldnn_session_id(); @@ -748,22 +769,24 @@ void MKLDNNDeviceContext::SetBlob(const std::string& name, sBlob->size() && (sBlob->size() >= static_cast(tls().cur_input_shape_cache_capacity))) { - VLOG(2) << "sid=" << sid - << ", remove all blobs of shape: " << sBlob->begin()->first; - sBlob->erase(sBlob->begin()->first); - RemoveShapeEntriesWithExecutor(); + auto shape_to_be_erased = PickLeastUsedShape(sBlob); + sBlob->erase(shape_to_be_erased); + RemoveShapeEntriesWithExecutor(shape_to_be_erased); } - pBlob = std::make_shared(); + pBlob = std::make_shared>(); + pBlob->first = __rdtsc(); (*sBlob)[tls().cur_input_shape_str] = pBlob; } else { pBlob = key_it->second; + // Update time stamp + pBlob->first = __rdtsc(); } // Find Blob via name - auto blob_it = pBlob->find(name); - if (blob_it == pBlob->end()) { - auto el = - pBlob->insert(std::make_pair(name, data)); // (*pBlob)[name] = data; + auto 
blob_it = pBlob->second.find(name); + if (blob_it == pBlob->second.end()) { + auto el = pBlob->second.insert( + std::make_pair(name, data)); // (*pBlob)[name] = data; // Register new element in per executor map // to have easily erased when executor terminated LinkEntryWithExecutor(pBlob, el.first); @@ -779,7 +802,7 @@ unsigned int MKLDNNDeviceContext::GetCachedObjectsNumber(void) const { unsigned int num_entries = 0; for (auto const& l3 : *p_blobmap_) { for (auto const& l2 : *(l3.second)) { - num_entries += (l2.second)->size(); + num_entries += (l2.second->second).size(); } } return num_entries; @@ -789,7 +812,7 @@ MKLDNNDeviceContext::BlobPtr_t MKLDNNDeviceContext::GetBlob( const std::string& name) const { BlobMap* pMap = p_blobmap_.get(); BlobPtr_t sBlob = nullptr; - BlobPtr_t pBlob = nullptr; + BlobPtr_t> pBlob = nullptr; int sid = tls().get_cur_mkldnn_session_id(); @@ -813,12 +836,14 @@ MKLDNNDeviceContext::BlobPtr_t MKLDNNDeviceContext::GetBlob( pBlob = sBlob_it->second; // Find Blob via name - auto key_it = pBlob->find(name); + auto key_it = pBlob->second.find(name); - if (key_it == pBlob->end()) { + if (key_it == pBlob->second.end()) { VLOG(2) << "GetBlob sid=" << sid << ", miss blob=" << name << "\n"; return nullptr; } + // Update timestamp + sBlob_it->second->first = __rdtsc(); // TODO(windows) VLOG(2) << "GetBlob sid=" << sid << ", get blob=" << name << "\n"; // lock will be automatically released when out of scope diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 13a1040dd19df2..ee6bbbf23778db 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -757,18 +757,20 @@ class MKLDNNDeviceContext : public CPUDeviceContext { // Following three maps are used to cache MKLDNN primitives. 
// There relations are: // - BlobMap = Map - // - ShapeBlob = Map + // - ShapeBlob = Map> // - KeyBlob = Map using KeyBlob = umap_key_string_t; - using ShapeBlob = umap_key_string_t; + using ShapeBlob = umap_key_string_t>; using BlobMap = umap_value_smart_t; // Auxillary two-level structure (shape, executor) to easier control // clearing cache objects related to specific executor using ExecKey = void*; - using ExecMapCacheIterPair = std::pair, KeyBlob::iterator>; + using ExecMapCacheIterPair = + std::pair>, + KeyBlob::iterator>; using ExecMap = std::unordered_map>; using ExecShape = std::unordered_map>; @@ -779,8 +781,11 @@ class MKLDNNDeviceContext : public CPUDeviceContext { const mkldnn::engine& GetEngine() const { return tls().get_engine(); } // Register object to currently used executor's map - void LinkEntryWithExecutor(BlobPtr_t, KeyBlob::iterator) const; - void RemoveShapeEntriesWithExecutor(void) const; + void LinkEntryWithExecutor( + BlobPtr_t> pblob, + KeyBlob::iterator it) const; + void RemoveShapeEntriesWithExecutor(std::string) const; + std::string PickLeastUsedShape(BlobPtr_t sb) const; // Remove all entries from the blob map void ResetBlobMap(void* ptr); diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index 084b47bb3c7a3b..5d725307e59208 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -500,18 +500,9 @@ class MKLDNNHandlerT { } void AcquireReorder(const std::shared_ptr& user_memory_p, - const std::shared_ptr& target_memory_p, - const std::string& suffix) { - const auto key_reorder_p = key_ + suffix + "reorder_p"; - - auto reorder_p = std::static_pointer_cast( - dev_ctx_.GetBlob(key_reorder_p)); - - if (reorder_p == nullptr) { - reorder_p = - std::make_shared(*user_memory_p, *target_memory_p); - dev_ctx_.SetBlob(key_reorder_p, reorder_p); - } + const std::shared_ptr& target_memory_p) { + auto reorder_p = + std::make_shared(*user_memory_p, *target_memory_p); auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); @@ -578,6 +569,8 @@ class MKLDNNHandlerT { std::static_pointer_cast(dev_ctx_.GetBlob(user_key)); user_memory_p->set_data_handle(ptr); + // TODO(jczaja): Here we detect if reorder is cached it means it is needed + // need to change this to get rid of keys auto reorder_p = std::static_pointer_cast( dev_ctx_.GetBlob(key_reorder_p)); if (reorder_p != nullptr) { From 192e08cbff30ff2d602aec85ef1bf5b3252590e6 Mon Sep 17 00:00:00 2001 From: wenbin Date: Wed, 13 Oct 2021 14:53:07 +0800 Subject: [PATCH 143/298] pool fix (#36388) * pool fix * comments --- .../inference/tensorrt/convert/pool2d_op.cc | 48 +++++++++++++------ 1 file changed, 33 insertions(+), 15 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc index 733a8f64ae5dba..e03842db2b8274 100644 --- a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc @@ -115,6 +115,18 @@ class Pool2dOpConverter : public OpConverter { nvinfer1::DimsHW nv_paddings(paddings[0], paddings[1]); nvinfer1::ILayer *layer = nullptr; + nvinfer1::DimsHW pre_pad(0, 0); + nvinfer1::DimsHW post_pad(0, 0); + // paddle Non ceil_mode : Output size = (input size - filter size + 2 * + // padding) / stride (stride size) + 1 + // tensorrt EXPLICIT_ROUND_DOWN: O = floor((M - DK) / S) + 1 + // so if M - DK < 0 we need extra padding + if (input_shape.d[input_dims - 2] - ksize[0] + 2 * paddings[0] < 0) { + post_pad.h() = 
strides[0] - 1; + } + if (input_shape.d[input_dims - 1] - ksize[1] + 2 * paddings[1] < 0) { + post_pad.w() = strides[1] - 1; + } if (op_desc.HasAttr("enable_int8")) { #if IS_TRT_VERSION_GE(5000) @@ -126,6 +138,16 @@ class Pool2dOpConverter : public OpConverter { if (engine_->with_dynamic_shape()) { if (!adaptive && !global_pooling && !ceil_mode) { + if ((post_pad.w() > 0 || post_pad.h() > 0) && + (padding_algorithm != "SAME")) { + auto *pad_layer = TRT_ENGINE_ADD_LAYER(engine_, Padding, *input1, + pre_pad, post_pad); + PADDLE_ENFORCE_NOT_NULL( + pad_layer, platform::errors::Fatal( + "Pad layer in poolOp converter could not be " + "created. The pointer to pad layer is `NULL`.")); + input1 = pad_layer->getOutput(0); + } auto *pool_layer = TRT_ENGINE_ADD_LAYER(engine_, Pooling, *input1, nv_pool_type, nv_ksize); pool_layer->setStride(nv_strides); @@ -184,9 +206,8 @@ class Pool2dOpConverter : public OpConverter { if (global_pooling == true) { nv_ksize.d[0] = input_shape.d[input_dims - 2]; nv_ksize.d[1] = input_shape.d[input_dims - 1]; - auto *pool_layer = TRT_ENGINE_ADD_LAYER( - engine_, Pooling, *const_cast(input1), - nv_pool_type, nv_ksize); + auto *pool_layer = TRT_ENGINE_ADD_LAYER(engine_, Pooling, *input1, + nv_pool_type, nv_ksize); PADDLE_ENFORCE_NOT_NULL( pool_layer, platform::errors::Fatal( "trt pool layer in converter could not be created.")); @@ -208,28 +229,25 @@ class Pool2dOpConverter : public OpConverter { } if (!adaptive) { - // Under ceil mode, the pre_pad and post_pad are used to - // record the the padding size. In some ceil mode cases, - // we do not need padding, so we initialize the two vars to 0. - - nvinfer1::DimsHW pre_pad(0, 0); - nvinfer1::DimsHW post_pad(0, 0); if (ceil_mode) { // If ceil mode is true, we will pad the appropriate size to the input. DealCeilMode(input_shape, ksize, strides, paddings, &pre_pad, &post_pad, input_dims); - auto *pad_layer = TRT_ENGINE_ADD_LAYER( - engine_, Padding, *const_cast(input1), pre_pad, - post_pad); + } + + if ((post_pad.w() > 0 || post_pad.h() > 0) && + (padding_algorithm != "SAME")) { + auto *pad_layer = + TRT_ENGINE_ADD_LAYER(engine_, Padding, *input1, pre_pad, post_pad); PADDLE_ENFORCE_NOT_NULL( pad_layer, platform::errors::Fatal( "Pad layer in poolOp converter could not be " "created. 
The pointer to pad layer is `NULL`.")); input1 = pad_layer->getOutput(0); } - auto *pool_layer = TRT_ENGINE_ADD_LAYER( - engine_, Pooling, *const_cast(input1), - nv_pool_type, nv_ksize); + + auto *pool_layer = TRT_ENGINE_ADD_LAYER(engine_, Pooling, *input1, + nv_pool_type, nv_ksize); PADDLE_ENFORCE_NOT_NULL( pool_layer, platform::errors::Fatal( "trt pool layer in converter could not be created.")); From 817f9ef061166793bc0616540f86a9593e750c7f Mon Sep 17 00:00:00 2001 From: caozhou <48191911+Caozhou1995@users.noreply.github.com> Date: Wed, 13 Oct 2021 14:56:35 +0800 Subject: [PATCH 144/298] fix pp comm init bug (#36377) --- python/paddle/distributed/auto_parallel/reshard.py | 5 ++++- .../fluid/tests/unittests/test_auto_parallel_reshard.py | 3 +++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/python/paddle/distributed/auto_parallel/reshard.py b/python/paddle/distributed/auto_parallel/reshard.py index d66d799c6e0f91..2d54bf8a7887a3 100644 --- a/python/paddle/distributed/auto_parallel/reshard.py +++ b/python/paddle/distributed/auto_parallel/reshard.py @@ -662,7 +662,10 @@ def _concat_partitions_with_op(partition_tensor_list, tensor, partition_index, def _init_comm_for_send_recv(): - if not PROCESS_GROUP_MAP["global_group"].is_instantiate(): + if not PROCESS_GROUP_MAP: + genv = _get_global_env() + PROCESS_GROUP_MAP["global_group"] = ProcessGroup( + 0, list(range(genv.world_size))) PROCESS_GROUP_MAP["global_group"].instantiate() diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard.py index 89e9b7e817f457..da82e56d4a1518 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard.py @@ -27,6 +27,7 @@ from paddle.distributed.auto_parallel.partitioner import Partitioner from paddle.distributed.auto_parallel.completion import complete_backward_annotation from paddle.distributed.auto_parallel.reshard import reshard +from paddle.distributed.auto_parallel.process import PROCESS_GROUP_MAP paddle.enable_static() _global_parallel_strategy = None @@ -254,6 +255,8 @@ def test_mlp_pp(self): dist_main_prog, dist_startup_prog = get_dist_prog( train_program, startup_program, dist_context, rank_id) complete_backward_annotation(dist_main_prog, dist_context) + for key in list(PROCESS_GROUP_MAP.keys()): + del PROCESS_GROUP_MAP[key] reshard(dist_main_prog, dist_startup_prog, rank_id, dist_context) # check send and recv result From 85bb1a85cdb3bc9927f5047dc81e25f0d7ada844 Mon Sep 17 00:00:00 2001 From: Guoxia Wang Date: Wed, 13 Oct 2021 15:02:41 +0800 Subject: [PATCH 145/298] support auto parallel data shard (#36055) --- .../distributed/auto_parallel/parallelizer.py | 3 + .../paddle/distributed/auto_parallel/utils.py | 37 ++++ .../distributed/fleet/base/fleet_base.py | 1 + .../fluid/tests/unittests/CMakeLists.txt | 3 + .../unittests/auto_parallel_data_unshard.py | 179 ++++++++++++++++++ .../test_auto_parallel_data_unshard.py | 29 +++ 6 files changed, 252 insertions(+) create mode 100644 python/paddle/fluid/tests/unittests/auto_parallel_data_unshard.py create mode 100644 python/paddle/fluid/tests/unittests/test_auto_parallel_data_unshard.py diff --git a/python/paddle/distributed/auto_parallel/parallelizer.py b/python/paddle/distributed/auto_parallel/parallelizer.py index 2994d35ef9202a..1437dbb2f9049f 100644 --- a/python/paddle/distributed/auto_parallel/parallelizer.py +++ 
b/python/paddle/distributed/auto_parallel/parallelizer.py @@ -20,6 +20,7 @@ from .completion import complete_annotation, complete_backward_annotation from .partitioner import Partitioner from .process import get_all_process_groups +from .utils import make_data_unshard from .reshard import reshard @@ -95,6 +96,8 @@ def parallelize(self, self._remove_distributed_attrs(partitioned_main_prog) complete_backward_annotation(partitioned_main_prog, self._dist_context) + + make_data_unshard(partitioned_main_prog, partitioned_startup_prog) reshard(partitioned_main_prog, partitioned_startup_prog, rank, self._dist_context) diff --git a/python/paddle/distributed/auto_parallel/utils.py b/python/paddle/distributed/auto_parallel/utils.py index 547495fb848d1c..a81ff69918905c 100755 --- a/python/paddle/distributed/auto_parallel/utils.py +++ b/python/paddle/distributed/auto_parallel/utils.py @@ -277,3 +277,40 @@ def _linear_idx2coordinate(mesh_shape, linear_idx): # row major order return coordinate + + +def _get_unshard_dist_shape(var, dist_attr): + var_shape = var.shape + mapping = dist_attr.get_dims_mapping() + mesh = dist_attr.get_process_mesh().topology + assert len(var_shape) == len( + mapping + ), "variable shape [{}] and dim_mapping [{}] is NOT match !".format( + var_shape, mapping) + new_shape = [] + for idx in range(len(var_shape)): + if var_shape[idx] == -1 or mapping[idx] == -1: + new_shape.append(var_shape[idx]) + else: + new_shape.append(var_shape[idx] * mesh[mapping[idx]]) + + return new_shape + + +def make_data_unshard(dist_main_prog, dist_startup_prog): + from .context import get_default_distributed_context + dist_context = get_default_distributed_context() + + for var in dist_main_prog.list_vars(): + if var.is_data: + tensor_dist_attr = dist_context.get_tensor_distributed_attr_for_program( + var) + inverse_shape = _get_unshard_dist_shape(var, tensor_dist_attr) + var.desc.set_shape(inverse_shape) + dim_mapping = tensor_dist_attr.get_dims_mapping() + dim_mapping = [-1] * len(dim_mapping) + tensor_dist_attr.set_dims_mapping(dim_mapping) + dist_context.set_tensor_distributed_attr_for_program( + var, tensor_dist_attr) + var._set_attr('dim_mapping' + core.kAutoParallelSuffix(), + dim_mapping) diff --git a/python/paddle/distributed/fleet/base/fleet_base.py b/python/paddle/distributed/fleet/base/fleet_base.py index 687295b1f2c11c..544c79a0b39691 100755 --- a/python/paddle/distributed/fleet/base/fleet_base.py +++ b/python/paddle/distributed/fleet/base/fleet_base.py @@ -1423,6 +1423,7 @@ def minimize(self, auto_parallelizer = AutoParallelizer(self) optimize_ops, params_grads, dist_startup_prog, dist_main_prog = auto_parallelizer.parallelize( loss, startup_program, parameter_list, no_grad_set) + return optimize_ops, params_grads, dist_startup_prog, dist_main_prog # compile time diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 33cd236a7d0943..f883d7a80a4122 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -36,6 +36,7 @@ list(APPEND DIST_TEST_OPS test_parallel_dygraph_mp_layers) list(APPEND DIST_TEST_OPS test_hybrid_parallel_inference_helper) list(APPEND DIST_TEST_OPS test_parallel_class_center_sample) list(APPEND DIST_TEST_OPS test_parallel_margin_cross_entropy) +list(APPEND DIST_TEST_OPS test_auto_parallel_data_unshard) set(MIXED_DIST_TEST_OPS ${DIST_TEST_OPS}) #remove distribute unittests. 
list(APPEND MIXED_DIST_TEST_OPS test_dgc_op) @@ -233,6 +234,7 @@ if ((NOT WITH_GPU) AND (NOT WITH_ROCM)) LIST(REMOVE_ITEM TEST_OPS test_auto_parallel_reshard_serial) LIST(REMOVE_ITEM TEST_OPS test_auto_parallel_reshard_mppp) LIST(REMOVE_ITEM TEST_OPS test_auto_parallel_reshard_dpmppp) + LIST(REMOVE_ITEM TEST_OPS test_auto_parallel_data_unshard) elseif(WITH_GPU) if (${CUDNN_VERSION} VERSION_LESS 7100) LIST(REMOVE_ITEM TEST_OPS test_conv2d_fusion_op) @@ -1001,6 +1003,7 @@ if(WITH_DISTRIBUTE AND WITH_GPU AND WITH_NCCL) set_tests_properties(test_hybrid_parallel_inference_helper PROPERTIES TIMEOUT 120) set_tests_properties(test_parallel_class_center_sample PROPERTIES TIMEOUT 120) set_tests_properties(test_parallel_margin_cross_entropy PROPERTIES TIMEOUT 120) + set_tests_properties(test_auto_parallel_data_unshard PROPERTIES TIMEOUT 120) if(${NCCL_VERSION} VERSION_GREATER_EQUAL 2212) set_tests_properties(test_parallel_dygraph_sparse_embedding PROPERTIES TIMEOUT 120) set_tests_properties(test_parallel_dygraph_transformer PROPERTIES TIMEOUT 120) diff --git a/python/paddle/fluid/tests/unittests/auto_parallel_data_unshard.py b/python/paddle/fluid/tests/unittests/auto_parallel_data_unshard.py new file mode 100644 index 00000000000000..367d9858626845 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel_data_unshard.py @@ -0,0 +1,179 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
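
For reference, the unshard logic introduced in auto_parallel/utils.py above recovers the full data shape by multiplying every sharded dimension by the mesh size along its mapped axis; the minimal standalone sketch below (hypothetical helper name, not part of the patch) restates that rule and ties it to the [2, 8] input used in the test that follows.

def unshard_shape(sharded_shape, dims_mapping, mesh_topology):
    # A dim whose size is -1 (dynamic) or whose mapping is -1 (replicated)
    # is kept as-is; a dim mapped to mesh axis m is scaled by mesh_topology[m].
    new_shape = []
    for size, mapping in zip(sharded_shape, dims_mapping):
        if size == -1 or mapping == -1:
            new_shape.append(size)
        else:
            new_shape.append(size * mesh_topology[mapping])
    return new_shape

# A per-rank slice of shape [1, 8] with dims_mapping [0, -1] on a 2-process mesh
# unshards back to the full [2, 8] data shape used below.
assert unshard_shape([1, 8], [0, -1], [2]) == [2, 8]
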
+ +from __future__ import print_function + +import unittest + +import copy +import numpy as np +import random + +import paddle +import paddle.nn as nn +import paddle.fluid.core as core +import paddle.distributed.auto_parallel as auto +import paddle.nn.functional as F +from paddle.distributed import fleet + +paddle.enable_static() +paddle.distributed.init_parallel_env() + + +class TestDataUnshard(unittest.TestCase): + def test_dp2pp1mp1(self): + def create_model(train_program, start_program): + with paddle.static.program_guard(train_program, start_program): + + ROOT_MESH = auto.ProcessMesh([0, 1]) + MESH_0 = auto.ProcessMesh([0, 1], ROOT_MESH) + input = paddle.static.data(name='input', shape=[2, 8]) + label = paddle.static.data(name='label', shape=[2, 8]) + + weight_attr = paddle.ParamAttr( + initializer=nn.initializer.Normal( + mean=0.0, std=0.02)) + linear0 = nn.Linear(8, 8, weight_attr) + linear1 = nn.Linear(8, 8, weight_attr) + + auto.shard_tensor(input, MESH_0, dim_mapping=[0, -1]) + auto.shard_tensor(label, MESH_0, dim_mapping=[0, -1]) + auto.shard_tensor(linear0.weight, MESH_0, dim_mapping=[-1, -1]) + auto.shard_tensor(linear1.weight, MESH_0, dim_mapping=[-1, -1]) + + linear0_out = linear0(input) + gelu_out = F.gelu(linear0_out) + linear1_out = linear1(gelu_out) + error_cost = paddle.nn.functional.square_error_cost(linear1_out, + label) + loss = paddle.mean(error_cost) + return train_program, start_program, loss, input, label + + train_program = paddle.static.Program() + start_program = paddle.static.Program() + # serial program + train_program, start_program, loss, input, label = create_model( + train_program, start_program) + + dist_strategy = fleet.DistributedStrategy() + dist_strategy.semi_auto = True + fleet.init(is_collective=True, strategy=dist_strategy) + optimizer = paddle.fluid.optimizer.AdamOptimizer( + learning_rate=0.00001, + beta1=0.9, + beta2=0.999, + epsilon=1e-08, + grad_clip=None) + + optimizer = fleet.distributed_optimizer(optimizer) + _, _, distributed_startup_program, distributed_main_program = optimizer.minimize( + loss, start_program) + + worker_index = paddle.distributed.get_rank() + paddle.seed(worker_index + 2021) + random.seed(worker_index + 2021) + np.random.seed(worker_index + 2021) + + place = paddle.set_device("gpu") + exe = paddle.static.Executor(place) + exe.run(distributed_startup_program) + + input_data = np.array(range(2 * 8)).reshape([2, 8]).astype("float32") + label_data = np.random.randint(0, 10, [2, 8]).astype("float32") + + fetchs = [loss.name, 'input@RESHARD_0'] + loss_np, shard_data_np = exe.run( + distributed_main_program, + feed={"input": input_data, + "label": label_data}, + fetch_list=fetchs) + desired = input_data[worker_index].reshape(shard_data_np.shape) + np.testing.assert_allclose(shard_data_np, desired) + + def dp1pp1mp2(self): + def create_model(train_program, start_program): + with paddle.static.program_guard(train_program, start_program): + + ROOT_MESH = auto.ProcessMesh([0, 1]) + MESH_0 = auto.ProcessMesh([0, 1], ROOT_MESH) + input = paddle.static.data(name='input', shape=[8, 8]) + label = paddle.static.data(name='label', shape=[8, 8]) + + weight_attr = paddle.ParamAttr( + initializer=nn.initializer.Normal( + mean=0.0, std=0.02)) + linear0 = nn.Linear(8, 8, weight_attr) + linear1 = nn.Linear(8, 8, weight_attr) + + auto.shard_tensor(input, MESH_0, dim_mapping=[-1, -1]) + auto.shard_tensor(label, MESH_0, dim_mapping=[-1, -1]) + + auto.shard_tensor(linear0.weight, MESH_0, dim_mapping=[-1, 0]) + auto.shard_tensor(linear1.weight, 
MESH_0, dim_mapping=[0, -1]) + + linear0_out = linear0(input) + gelu_out = F.gelu(linear0_out) + + linear1_out = linear1(gelu_out) + + error_cost = paddle.nn.functional.square_error_cost(linear1_out, + label) + loss = paddle.mean(error_cost) + return train_program, start_program, loss, input, label + + train_program = paddle.static.Program() + start_program = paddle.static.Program() + # serial program + train_program, start_program, loss, input, label = create_model( + train_program, start_program) + + dist_strategy = fleet.DistributedStrategy() + dist_strategy.semi_auto = True + fleet.init(is_collective=True, strategy=dist_strategy) + optimizer = paddle.fluid.optimizer.AdamOptimizer( + learning_rate=0.00001, + beta1=0.9, + beta2=0.999, + epsilon=1e-08, + grad_clip=None) + + optimizer = fleet.distributed_optimizer(optimizer) + _, _, distributed_startup_program, distributed_main_program = optimizer.minimize( + loss, start_program) + + worker_index = paddle.distributed.get_rank() + paddle.seed(worker_index + 2021) + random.seed(worker_index + 2021) + np.random.seed(worker_index + 2021) + + place = paddle.set_device("gpu") + exe = paddle.static.Executor(place) + exe.run(distributed_startup_program) + + input_data = np.array(range(8 * 8)).reshape([8, 8]).astype("float32") + label_data = np.random.randint(0, 10, [8, 8]).astype("float32") + + fetchs = [loss.name, 'input'] + loss_np, shard_data_np = exe.run( + distributed_main_program, + feed={"input": input_data, + "label": label_data}, + fetch_list=fetchs) + + desired = input_data.reshape(shard_data_np.shape) + np.testing.assert_allclose(shard_data_np, desired) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_data_unshard.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_data_unshard.py new file mode 100644 index 00000000000000..6cc953dfdee9a6 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_data_unshard.py @@ -0,0 +1,29 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
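
As a side note on the dp1pp1mp2 case above, the same dims_mapping convention also determines each rank's local weight slice: with a 2-process mesh, dims_mapping [-1, 0] splits the [8, 8] weight of linear0 along its second axis (one column block per rank), while [0, -1] splits linear1's weight along its first axis (one row block per rank). A small illustrative sketch of the per-rank slice shape, assuming even divisibility (names are hypothetical, not part of the patch):

def local_shard_shape(full_shape, dims_mapping, mesh_topology):
    # Inverse of the unshard rule: a dim mapped to mesh axis m is divided
    # evenly across mesh_topology[m] ranks; replicated dims stay unchanged.
    return [
        size if mapping == -1 else size // mesh_topology[mapping]
        for size, mapping in zip(full_shape, dims_mapping)
    ]

assert local_shard_shape([8, 8], [-1, 0], [2]) == [8, 4]  # column-parallel linear0
assert local_shard_shape([8, 8], [0, -1], [2]) == [4, 8]  # row-parallel linear1
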
+ +from __future__ import print_function + +import unittest +import paddle.fluid as fluid + +from test_parallel_dygraph_dataparallel import TestMultipleGpus + + +class TestAutoParallelDataUnshard(TestMultipleGpus): + def test_auto_parallel_data_unshard(self): + self.run_mnist_2gpu('auto_parallel_data_unshard.py') + + +if __name__ == "__main__": + unittest.main() From 3a869cc5f68cae83cd536f1cfd46bbf2c7d7e0b0 Mon Sep 17 00:00:00 2001 From: zhangbo9674 <82555433+zhangbo9674@users.noreply.github.com> Date: Wed, 13 Oct 2021 15:56:26 +0800 Subject: [PATCH 146/298] Add fp16 for clip_by_norm & clip_by_global_norm (#36198) * add fp16 for clip_by_norm api * support ClipByGlobalNorm for fp16 in dygraph * add unittest for dygraph clipGlobalNorm * refine unittest for dygraph clipGlobalNorm for mac and windows * refine unittest * add unittest for fp64 * refine unittest for fp64 --- python/paddle/fluid/clip.py | 35 +++++- python/paddle/fluid/layers/nn.py | 2 +- .../tests/unittests/test_gradient_clip.py | 113 ++++++++++++++++++ 3 files changed, 145 insertions(+), 5 deletions(-) diff --git a/python/paddle/fluid/clip.py b/python/paddle/fluid/clip.py index 4cca41b527bc2f..293d6119e75046 100644 --- a/python/paddle/fluid/clip.py +++ b/python/paddle/fluid/clip.py @@ -436,6 +436,8 @@ def __str__(self): def _dygraph_clip(self, params_grads): params_and_grads = [] sum_square_list = [] + sum_square_list_fp16 = [] + sum_square_list_fp32 = [] for p, g in params_grads: if g is None: continue @@ -447,13 +449,36 @@ def _dygraph_clip(self, params_grads): merge_grad = layers.get_tensor_from_selected_rows(merge_grad) sum_square = _squared_l2_norm(merge_grad) - sum_square_list.append(sum_square) + if sum_square.dtype == core.VarDesc.VarType.FP16: + sum_square_list_fp16.append(sum_square) + elif sum_square.dtype == core.VarDesc.VarType.FP32: + sum_square_list_fp32.append(sum_square) + else: + sum_square_list.append(sum_square) # all parameters have been filterd out - if len(sum_square_list) == 0: + if len(sum_square_list) + len(sum_square_list_fp16) + len( + sum_square_list_fp32) == 0: return params_grads - global_norm_var = layers.concat(sum_square_list) + sum_dtype = 'float64' if len(sum_square_list) > 0 else "float32" + global_norm_var = [] + if len(sum_square_list_fp16) > 0: + global_norm_var_fp16 = layers.concat(sum_square_list_fp16) + global_norm_var_fp16 = layers.reduce_sum(global_norm_var_fp16) + global_norm_var.append(global_norm_var_fp16.astype(sum_dtype)) + if len(sum_square_list_fp32) > 0: + global_norm_var_fp32 = layers.concat(sum_square_list_fp32) + global_norm_var_fp32 = layers.reduce_sum(global_norm_var_fp32) + if sum_dtype == 'float32': + global_norm_var.append(global_norm_var_fp32) + else: + global_norm_var.append(global_norm_var_fp32.astype(sum_dtype)) + if len(sum_square_list) > 0: + global_norm_var_fp64 = layers.concat(sum_square_list) + global_norm_var_fp64 = layers.reduce_sum(global_norm_var_fp64) + global_norm_var.append(global_norm_var_fp64) + global_norm_var = layers.concat(global_norm_var) global_norm_var = layers.reduce_sum(global_norm_var) global_norm_var = layers.sqrt(global_norm_var) max_global_norm = layers.fill_constant( @@ -469,7 +494,9 @@ def _dygraph_clip(self, params_grads): params_and_grads.append((p, g)) continue # TODO(wangxi): use inplace elementwise_mul - new_grad = layers.elementwise_mul(x=g, y=clip_var) + clip_input = (clip_var.astype('float16') + if g.dtype == core.VarDesc.VarType.FP16 else clip_var) + new_grad = layers.elementwise_mul(x=g, y=clip_input) 
params_and_grads.append((p, new_grad)) return params_and_grads diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 75b0392ab6ae47..ceda304b26e895 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -12524,7 +12524,7 @@ def clip_by_norm(x, max_norm, name=None): return _C_ops.clip_by_norm(x, 'max_norm', max_norm) helper = LayerHelper("clip_by_norm", **locals()) - check_variable_and_dtype(x, 'X', ['float32'], 'clip_by_norm') + check_variable_and_dtype(x, 'X', ['float32', 'float16'], 'clip_by_norm') check_type(max_norm, 'max_norm', (float), 'clip_by_norm') if name is None: diff --git a/python/paddle/fluid/tests/unittests/test_gradient_clip.py b/python/paddle/fluid/tests/unittests/test_gradient_clip.py index e2050cf32dbddc..29735f1c89c857 100644 --- a/python/paddle/fluid/tests/unittests/test_gradient_clip.py +++ b/python/paddle/fluid/tests/unittests/test_gradient_clip.py @@ -453,5 +453,118 @@ def check_clip_result(self, loss, optimizer): "gradient clip by value has wrong results!") +class SimpleNet(paddle.nn.Layer): + def __init__(self): + super(SimpleNet, self).__init__() + self.linear = paddle.nn.Linear(5, 5) + self.batch_norm = paddle.nn.BatchNorm(5) + + def forward(self, x): + x = self.linear(x) + x = self.batch_norm(x) + return x + + +class TestDygraphGradientClipFP16(unittest.TestCase): + def test_gradient_clip(self): + if fluid.core.is_compiled_with_cuda(): + with fluid.dygraph.guard(): + paddle.seed(10) + model = SimpleNet() + sgd_optimizer = paddle.optimizer.SGD( + learning_rate=0.0, parameters=model.parameters()) + model, sgd_optimizer = paddle.amp.decorate( + models=model, optimizers=sgd_optimizer, level='O2') + scaler = paddle.amp.GradScaler(init_loss_scaling=1024) + inputs = fluid.layers.uniform_random( + [1, 5], min=-10, max=10).astype('float32') + with paddle.amp.auto_cast(level='O2'): + out = model(fluid.dygraph.to_variable(inputs)) + loss = fluid.layers.reduce_mean(out) + scaled = scaler.scale(loss) + scaled.backward() + scaler.unscale_(sgd_optimizer) + # before clip + params_grads = [] + for param in model.parameters(): + if param.stop_gradient: + continue + if param._grad_ivar() is not None: + params_grads.append((param, param._grad_ivar())) + _, grads = zip(*params_grads) + # clip grads + clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=0.8) + params_grads = clip(params_grads) + _, grads_clip = zip(*params_grads) + # param update + scaler.step(sgd_optimizer) + scaler.update() + + global_norm = 0 + for u in grads: + u = u.numpy() + global_norm += np.sum(np.power(u, 2)) + global_norm = np.sqrt(global_norm) + global_norm_clip = 0 + for v in grads_clip: + v = v.numpy() + global_norm_clip += np.sum(np.power(v, 2)) + global_norm_clip = np.sqrt(global_norm_clip) + + a = np.minimum(global_norm, 0.8) + b = global_norm_clip + self.assertTrue( + np.isclose( + a=a, b=b, rtol=1e-3, atol=1e-8), + "gradient clip by global norm has wrong results, expetcd:%f, but recieved:%f" + % (a, b)) + + +class TestDygraphGradientClipFP64(unittest.TestCase): + def test_gradient_clip(self): + with fluid.dygraph.guard(): + inputs = fluid.layers.uniform_random( + [16, 5], min=-10, max=10).astype('float64') + linear = fluid.dygraph.Linear(5, 5, dtype="float64") + out = linear(fluid.dygraph.to_variable(inputs)) + loss = fluid.layers.reduce_mean(out) + loss.backward() + # before clip + params_grads = [] + for param in linear.parameters(): + if param.stop_gradient: + continue + if param._grad_ivar() is not None: + 
params_grads.append((param, param._grad_ivar())) + _, grads = zip(*params_grads) + # clip grads + clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=0.1) + params_grads = clip(params_grads) + _, grads_clip = zip(*params_grads) + + global_norm = 0 + for u in grads: + u = u.numpy() + global_norm += np.sum(np.power(u, 2)) + global_norm = np.sqrt(global_norm) + + global_norm_clip = 0 + for v in grads_clip: + v = v.numpy() + print(v) + global_norm_clip += np.sum(np.power(v, 2)) + global_norm_clip = np.sqrt(global_norm_clip) + print(global_norm_clip) + + a = np.minimum(global_norm, 0.1) + b = global_norm_clip + + self.assertTrue( + np.isclose( + a=a, b=b, rtol=1e-6, atol=1e-8), + "gradient clip by global norm has wrong results, expetcd:%f, but recieved:%f" + % (a, b)) + + if __name__ == '__main__': unittest.main() From 9a9953d9b0b32456fdb35e2bdb94679375b694dd Mon Sep 17 00:00:00 2001 From: zhangbo9674 <82555433+zhangbo9674@users.noreply.github.com> Date: Wed, 13 Oct 2021 16:01:20 +0800 Subject: [PATCH 147/298] [AMP] add attr is_distributed for layer.to (#36221) * add attr is_distributed * refine code * refine black/white list for pure fp16 --- python/paddle/fluid/dygraph/amp/auto_cast.py | 4 ++-- python/paddle/fluid/dygraph/layers.py | 5 +++++ python/paddle/fluid/framework.py | 1 - 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/python/paddle/fluid/dygraph/amp/auto_cast.py b/python/paddle/fluid/dygraph/amp/auto_cast.py index d218e6b7490d9c..c807303621aea9 100644 --- a/python/paddle/fluid/dygraph/amp/auto_cast.py +++ b/python/paddle/fluid/dygraph/amp/auto_cast.py @@ -70,8 +70,8 @@ 'FLAGS_cudnn_batchnorm_spatial_persistent': 1, } -PURE_FP16_BLACK_LIST = {' '} -PURE_FP16_WHITE_LIST = {'lookup_table', 'lookup_table_v2'} +PURE_FP16_WHITE_LIST = {' '} +PURE_FP16_BLACK_LIST = {'lookup_table', 'lookup_table_v2'} #NOTE(zhiqiu): similar as paddle.fluid.contrib.mixed_precision.fp16_lists.AutoMixedPrecisionLists._update_list diff --git a/python/paddle/fluid/dygraph/layers.py b/python/paddle/fluid/dygraph/layers.py index 30d5ee44171f3b..e4b6bc01034268 100644 --- a/python/paddle/fluid/dygraph/layers.py +++ b/python/paddle/fluid/dygraph/layers.py @@ -1466,6 +1466,8 @@ def _apply(self, func, device, dtype, blocking): param_applied = func(param, device, dtype, blocking) assert param.is_leaf param_applied.stop_gradient = param.stop_gradient + if hasattr(param_applied, 'is_distributed'): + param_applied.is_distributed = param.is_distributed self._parameters[key] = param_applied if param.grad is not None: @@ -1475,6 +1477,9 @@ def _apply(self, func, device, dtype, blocking): grad_applied.stop_gradient = param._grad_ivar( ).stop_gradient + if hasattr(param._grad_ivar(), 'is_distributed'): + grad_applied.is_distributed = param._grad_ivar( + ).is_distributed self._parameters[key]._set_grad_ivar(grad_applied) self._parameters_transform_map[id(param)] = [param_applied, key] diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 4d90b9159470eb..c6367911b88f82 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -6153,7 +6153,6 @@ def __deepcopy__(self, memo): return new_param def _copy_to(self, device, blocking): - print("in ParamBase copy_to func") state = copy.deepcopy(self.__dict__) new_param = ParamBase(self.shape, self.dtype, **state) core.varbase_copy(self, new_param, device, blocking) From 24418479413961fd8486b87dd7a09e983cf4b0ad Mon Sep 17 00:00:00 2001 From: wuhuanzhou Date: Wed, 13 Oct 2021 17:12:56 +0800 Subject: [PATCH 148/298] 
Verify the correctness of graph rewrited by GeneratePass (#36116) Check detail PR description at https://github.com/PaddlePaddle/Paddle/pull/36116 --- paddle/fluid/framework/ir/generate_pass.cc | 117 ++++++++++- python/paddle/fluid/ir.py | 43 +++- .../unittests/ir/test_ir_generate_pass.py | 196 ++++++++++++------ 3 files changed, 275 insertions(+), 81 deletions(-) diff --git a/paddle/fluid/framework/ir/generate_pass.cc b/paddle/fluid/framework/ir/generate_pass.cc index 085298314ea3ff..b261cbeb08e3bf 100644 --- a/paddle/fluid/framework/ir/generate_pass.cc +++ b/paddle/fluid/framework/ir/generate_pass.cc @@ -21,6 +21,16 @@ namespace ir { void InitGeneratePattern(const proto::PassDesc& pass_desc, PDPattern* pattern) { const proto::BlockDesc& block = pass_desc.pattern().blocks(0); + for (const proto::VarDesc& var : block.vars()) { + PDNode* var_pdnode = pattern->NewNode(var.name())->AsInput(); + var_pdnode->assert_is_var(); + var_pdnode->assert_more([&](Node* x) { + if (VarDesc(var).GetShape() == x->Var()->GetShape()) { + return true; + } + return false; + }); + } // Traverse all operators to create subgraph. for (int index = 0; index < block.ops_size(); ++index) { const proto::OpDesc& op = block.ops(index); @@ -31,15 +41,32 @@ void InitGeneratePattern(const proto::PassDesc& pass_desc, PDPattern* pattern) { pattern->NewNode(std::to_string(index))->assert_is_op(op.type()); // Create PDNodes for inputs of current operator. for (const proto::OpDesc::Var& var : op.inputs()) { - for (const std::string& argument : var.arguments()) { + for (int n = 0; n < var.arguments_size(); ++n) { + const std::string& argument = var.arguments(n); // The input may be the output of other operator. PDNode* var_pdnode = pattern->RetrieveNode(argument); if (nullptr == var_pdnode) { var_pdnode = pattern->NewNode(argument)->AsInput(); + var_pdnode->assert_is_var(); } else if (var_pdnode->IsOutput()) { var_pdnode->AsIntermediate(); } - var_pdnode->assert_is_op_input(op.type()); + var_pdnode->assert_more([&](Node* x) { + for (auto* out : x->outputs) { + if (out->IsOp() && out->Op()->Type() == op.type()) { + const auto& inputs = out->Op()->Inputs(); + const auto& iter = inputs.find(var.parameter()); + if (inputs.end() != iter) { + if (iter->second.end() != std::find(iter->second.begin(), + iter->second.end(), + x->Name())) { + return true; + } + } + } + } + return false; + }); pattern->AddEdge(var_pdnode, op_pdnode); } } @@ -50,6 +77,24 @@ void InitGeneratePattern(const proto::PassDesc& pass_desc, PDPattern* pattern) { PDNode* var_pdnode = pattern->RetrieveNode(argument); if (nullptr == var_pdnode) { var_pdnode = pattern->NewNode(argument)->AsOutput(); + var_pdnode->assert_is_var(); + var_pdnode->assert_more([&](Node* x) { + for (Node* input : x->inputs) { + if (input && input->IsOp() && input->Op() && + input->Op()->Type() == op.type()) { + const auto& outputs = input->Op()->Outputs(); + const auto& iter = outputs.find(var.parameter()); + if (outputs.end() != iter) { + if (iter->second.end() != std::find(iter->second.begin(), + iter->second.end(), + x->Name())) { + return true; + } + } + } + } + return false; + }); } else if (var_pdnode->IsInput()) { var_pdnode->AsIntermediate(); } @@ -73,18 +118,64 @@ void InitGeneratePattern(const proto::PassDesc& pass_desc, PDPattern* pattern) { } } -GraphPatternDetector::handle_t GetGenerateRewrite( +// There are some duplicate patterns. 
+bool IsDuplicatePattern(const GraphPatternDetector::subgraph_t& subgraph, + Graph* graph) { + for (auto iter : subgraph) { + if (nullptr == graph->RetrieveNode(iter.second->id())) { + VLOG(3) << "Node [" << iter.second->Name() + << "] of subgraph has been removed. So skip this optimize."; + return true; + } + } + return false; +} + +GraphPatternDetector::handle_t GetGenerateDelete( const PDPattern& pattern, const proto::PassDesc& pass_desc) { GraphPatternDetector::handle_t handler = [&]( - const GraphPatternDetector::subgraph_t subgraph, Graph* graph) { - // There are some duplicate patterns. - for (auto iter : subgraph) { - if (nullptr == graph->RetrieveNode(iter.second->id())) { - VLOG(3) << "Node [" << iter.second->Name() - << "] of subgraph has been removed. So skip this optimize."; - return; + const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) { + if (IsDuplicatePattern(subgraph, graph)) { + return; + } + // `var_node_maps` record the mapping of variable to the pattern subgraph. + std::map var_node_maps; + for (const proto::PassDesc::VarMap& var_map : pass_desc.var_maps()) { + Node* node = subgraph.at(pattern.RetrieveNode(var_map.pattern_var())); + const auto& iter = var_node_maps.find(var_map.replace_var()); + if (var_node_maps.end() == iter) { + // first node is input + var_node_maps.insert({var_map.replace_var(), node}); + } else { + // output node + for (Node* s_node : node->outputs) { + iter->second->outputs.push_back(s_node); + std::replace(s_node->inputs.begin(), s_node->inputs.end(), node, + iter->second); + s_node->Op()->RenameInput(node->Name(), iter->second->Name()); + } } } + // Remove nodes that are intermediate. + std::unordered_set remove_nodes; + for (const std::unique_ptr& pdnode : pattern.nodes()) { + remove_nodes.emplace(subgraph.at(pdnode.get())); + } + for (auto iter : var_node_maps) { + remove_nodes.erase(iter.second); + } + GraphSafeRemoveNodes(graph, remove_nodes); + }; + return handler; +} + +GraphPatternDetector::handle_t GetGenerateRewrite( + const PDPattern& pattern, const proto::PassDesc& pass_desc) { + GraphPatternDetector::handle_t handler = [&]( + const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) { + if (IsDuplicatePattern(subgraph, graph)) { + return; + } const proto::BlockDesc& block = pass_desc.replace().blocks(0); // `var_node_maps` record the mapping of variable to the pattern subgraph. std::map var_node_maps; @@ -175,7 +266,11 @@ void GeneratePass::ApplyImpl(Graph* graph) const { for (const proto::PassDesc& pass_desc : multi_pass_desc_.pass_descs()) { GraphPatternDetector detector; InitGeneratePattern(pass_desc, detector.mutable_pattern()); - detector(graph, GetGenerateRewrite(detector.pattern(), pass_desc)); + if (pass_desc.replace().blocks(0).ops_size() == 0) { + detector(graph, GetGenerateDelete(detector.pattern(), pass_desc)); + } else { + detector(graph, GetGenerateRewrite(detector.pattern(), pass_desc)); + } // The rewrited graph needs to be verified. Current Pass should be skipped // if validation failed. Rewrite based on the original graph cannot // implement rollback operation. 
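The C++ changes above are driven from Python: a pass is declared as a (pattern, replace) pair of callables, registered under the decorating function's name, and then applied to a core.Graph. A minimal sketch of that flow, modelled on the multi_add_to_sum pass exercised in the updated unit test further below (the tensor shapes and the pass name fold_double_add_to_sum here are illustrative assumptions, not identifiers from this patch):

    import numpy as np
    import paddle
    from paddle.fluid import core, ir

    paddle.enable_static()

    # Register a pass that folds add(add(x, y), z) into a single sum op.
    @ir.RegisterPass
    def fold_double_add_to_sum():
        pattern = lambda x, y, z: paddle.add(paddle.add(x, y), z)
        replace = lambda x, y, z: paddle.add_n([x, y, z])
        return pattern, replace

    main, startup = paddle.static.Program(), paddle.static.Program()
    with paddle.static.program_guard(main, startup):
        x = paddle.static.data("x", [4, 4], "float32")
        y = paddle.static.data("y", [4, 4], "float32")
        z = paddle.static.data("z", [4, 4], "float32")
        out = paddle.add(paddle.add(x, y), z)

    # Apply the registered pass to the graph and rebuild a program from it.
    graph = core.Graph(main.desc)
    core.get_pass("fold_double_add_to_sum").apply(graph)
    rewritten = paddle.fluid.framework.IrGraph(graph).to_program()

    # The rewritten program must produce the same result as the original.
    exe = paddle.static.Executor(paddle.CPUPlace())
    exe.run(startup)
    feed = {name: np.random.random([4, 4]).astype("float32")
            for name in ("x", "y", "z")}
    before = exe.run(main, feed=feed, fetch_list=[out.name])
    after = exe.run(rewritten, feed=feed, fetch_list=[out.name])
    assert np.allclose(before, after)

When the replace callable returns its input unchanged (an empty replace subgraph, as in the transpose/transpose simplification tested below), ApplyImpl dispatches to GetGenerateDelete instead of GetGenerateRewrite and only removes the matched intermediate nodes.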
diff --git a/python/paddle/fluid/ir.py b/python/paddle/fluid/ir.py index 7e2d3df1ce1e43..3c7c8879fd420d 100644 --- a/python/paddle/fluid/ir.py +++ b/python/paddle/fluid/ir.py @@ -127,11 +127,13 @@ def apply_pass(name): class RegisterPassHelper(object): + _register_helpers = list() + def __init__(self, pass_pairs, pass_type=str(), input_specs=dict()): self._pass_type = pass_type self._pass_pairs = pass_pairs - if isinstance(input_specs, dict): - self._input_specs = input_specs + self._input_specs = input_specs + RegisterPassHelper._register_helpers.append(self) def _get_args_from_func(self, func): args = list() @@ -148,6 +150,35 @@ def _get_args_from_func(self, func): args.append(paddle.static.data(arg_name, [-1])) return args + def _prune_program_desc(self, program_desc): + block_desc = program_desc.blocks[0] + # block_desc.ClearField("vars") + for var in [ + var for var in block_desc.vars + if var.name not in self._input_specs + ]: + block_desc.vars.remove(var) + for op_desc in block_desc.ops: + default_attrs = core.get_op_attrs_default_value( + paddle.compat.to_bytes(op_desc.type)) + remove_attrs = list() + for attr in op_desc.attrs: + # attr must not in + if attr.name not in [ + "op_namescope", "op_callstack", "op_device" + ]: + attr_list_fields = attr.ListFields() + # attr format must be: name, type, value + if len(attr_list_fields) == 3: + attr_value = attr.ListFields()[-1][-1] + default_attr_value = default_attrs.get(attr.name) + # value must not default + if default_attr_value != attr_value: + continue + remove_attrs.append(attr) + for attr in remove_attrs: + op_desc.attrs.remove(attr) + def _func_to_program_desc(self, func, program_desc, is_replace=False): vars = list() program = paddle.static.Program() @@ -166,6 +197,7 @@ def _func_to_program_desc(self, func, program_desc, is_replace=False): elif isinstance(out, paddle.fluid.framework.Variable): vars.append(out.name) program_desc.ParseFromString(program.desc.serialize_to_string()) + self._prune_program_desc(program_desc) if is_replace: attrs = list() for op in program.current_block().ops: @@ -296,7 +328,7 @@ def Outputs(self): OP = OpHelper() -def RegisterPass(function=None, input_specs=None): +def RegisterPass(function=None, input_specs=dict()): """ The function decorator of Register Pass. Decorator @RegisterPass handles the function and register it into a core.Pass instance. Use name of function @@ -305,11 +337,11 @@ def RegisterPass(function=None, input_specs=None): Args: function (callable): The function with return of callable pair(s) that represents the pattern subgraph and the replace subgraph. - input_specs (dict[str, InputSpec]|None): Dict of InputSpec to specific the shape/dtype + input_specs (dict[str, InputSpec]): Dict of InputSpec to specific the shape/dtype information of Tensor. Some operators limit the shape and dtype of datas when create subgraph with Paddle APIs. So user need specify InputSpec of data to ensure create a correctly subgraph. Of course, this argument is not limited to - matching subgraph. The default is None. + matching subgraph. The default is dict(). Returns: callables: Callable pair(s). @@ -351,6 +383,7 @@ def decorated(python_func): "Return value of Pass function must be (callable, callable)." 
) helper = RegisterPassHelper(pass_pairs, pass_type, input_specs) + core.register_pass(pass_type, helper.SerializeMultiPassDesc) return python_func if inspect.isfunction(function): diff --git a/python/paddle/fluid/tests/unittests/ir/test_ir_generate_pass.py b/python/paddle/fluid/tests/unittests/ir/test_ir_generate_pass.py index 851ae21c38378f..61bd554ad2616a 100644 --- a/python/paddle/fluid/tests/unittests/ir/test_ir_generate_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/test_ir_generate_pass.py @@ -15,7 +15,7 @@ import unittest import paddle from paddle.static import InputSpec -from paddle.fluid import ir +from paddle.fluid import core, ir import numpy as np @@ -45,23 +45,37 @@ def replace(x, w, b): return list(map(create_pass_pair, [True, False])) -# add(X=add(x, y), Y=z)z => add_n(X=[x, y, z]) +# add(X=add(X=x, Y=y), Y=z) => sum(X=[x, y, z]) @ir.RegisterPass -def generate_add_n(): +def multi_add_to_sum_v1(): + pattern = lambda x, y, z: paddle.add(paddle.add(x, y), z) + replace = lambda x, y, z: paddle.add_n([x, y, z]) + return pattern, replace + + +@ir.RegisterPass +def multi_add_to_sum_v2(): def pattern(x, y, z): - return paddle.add(paddle.add(x, y), z) + ewadd1 = ir.PassDesc.OP.elementwise_add(X=x, Y=y) + ewadd2 = ir.PassDesc.OP.elementwise_add(X=ewadd1, Y=z) + return ewadd2 + + replace = lambda x, y, z: ir.PassDesc.OP.sum(X=[x, y, z]) + return pattern, replace - def replace(x, y, z): - return paddle.add_n([x, y, z]) +@ir.RegisterPass +def multi_add_to_sum_v3(): + pattern = lambda x, y, z: paddle.add(paddle.add(x, y), z) + replace = lambda x, y, z: ir.PassDesc.OP.sum(X=[x, y, z]) return pattern, replace # mul(x, y1), mul(x, y2) => slice(mul(x, concat(y1, y2))) @ir.RegisterPass(input_specs={ - 'x': InputSpec([1, 1]), - 'y1': InputSpec([1, 1]), - 'y2': InputSpec([1, 1]) + 'x': InputSpec([16, 32]), + 'y1': InputSpec([32, 12]), + 'y2': InputSpec([32, 48]) }) def generate_combine_mul_v1(): def pattern(x, y1, y2): @@ -72,8 +86,8 @@ def pattern(x, y1, y2): def replace(x, y1, y2): concat_out = paddle.concat([y1, y2], axis=-1) mul_out = paddle.matmul(x, concat_out) - out1 = paddle.slice(mul_out, axes=[1], starts=[0], ends=[1]) - out2 = paddle.slice(mul_out, axes=[1], starts=[1], ends=[2]) + out1 = paddle.slice(mul_out, axes=[1], starts=[0], ends=[12]) + out2 = paddle.slice(mul_out, axes=[1], starts=[12], ends=[60]) return out1, out2 return pattern, replace @@ -97,11 +111,22 @@ def replace(x, y1, y2): # reshape(reshape(x)) => x -@ir.RegisterPass(input_specs={'x': InputSpec([-1, 16, 16, 16])}) -def generate_simplify_inference(): +@ir.RegisterPass(input_specs={'x': InputSpec([10, 16, 16])}) +def generate_simplify_inference_v1(): def pattern(x): - transpose = paddle.transpose(x, [0, 3, 1, 2]) - return paddle.transpose(transpose, [0, 3, 1, 2]) + transpose = paddle.transpose(x, [0, 2, 1]) + return paddle.transpose(transpose, [0, 2, 1]) + + return pattern, lambda x: x + + +@ir.RegisterPass +def generate_simplify_inference_v2(): + def pattern(x): + op1 = ir.PassDesc.OP.transpose2 + op2 = ir.PassDesc.OP.transpose2 + # op2.Attr("axis").EQ(op1.Attr("axis")) + return op2(X=op1(X=x)) return pattern, lambda x: x @@ -153,46 +178,73 @@ def _check_fc_fuse_pass(pass_desc, with_relu): _check_fc_fuse_pass(multi_pass_desc.pass_descs[0], True) _check_fc_fuse_pass(multi_pass_desc.pass_descs[1], False) - def test_generate_add_n(self): - helper = ir.RegisterPassHelper([generate_add_n()]) - s = helper.SerializeMultiPassDesc() - multi_pass_desc = get_multi_pass_desc_from_str(s) - 
self.assertEqual(len(multi_pass_desc.pass_descs), 1) - pass_desc = multi_pass_desc.pass_descs[0] - self.assertEqual(len(pass_desc.var_maps), 4) - self.assertEqual(len(pass_desc.attr_maps), 0) - self.assertEqual(len(pass_desc.pattern.blocks[0].ops), 2) - self.assertEqual(len(pass_desc.replace.blocks[0].ops), 1) - pattern_op_dicts = self.convert_ops_to_op_dicts( - pass_desc.pattern.blocks[0].ops) - replace_op_dicts = self.convert_ops_to_op_dicts( - pass_desc.replace.blocks[0].ops) - self.assertEqual(len(pattern_op_dicts.get("elementwise_add", [])), 2) - self.assertEqual(len(replace_op_dicts.get("sum", [])), 1) + def check_multi_add_to_sum(self, pass_type): + program = paddle.static.Program() + startup_program = paddle.static.Program() + with paddle.static.program_guard(program, startup_program): + x = paddle.static.data("x", [10, 10, 10], "float32") + y = paddle.static.data("y", [10, 10, 10], "float32") + z = paddle.static.data("z", [10, 10, 10], "float32") + add_1 = paddle.add(paddle.add(x, y), z) + matmul_1 = paddle.matmul(add_1, z) + add_tmp = paddle.add(x, y) + add_2 = paddle.add(add_tmp, z) + matmul_2 = paddle.matmul(add_2, add_tmp) + out = paddle.add(matmul_1, matmul_2) + graph = core.Graph(program.desc) + before_node_nums = len(graph.nodes()) + core.get_pass(pass_type).apply(graph) + after_node_nums = len(graph.nodes()) + self.assertEqual(after_node_nums, before_node_nums - 2) + after_program = paddle.fluid.framework.IrGraph(graph).to_program() + executor = paddle.static.Executor(paddle.CPUPlace()) + executor.run(startup_program) + feed = { + "x": np.random.random([10, 10, 10]).astype("float32"), + "y": np.random.random([10, 10, 10]).astype("float32"), + "z": np.random.random([10, 10, 10]).astype("float32") + } + before_out = executor.run(program, feed=feed, fetch_list=[out.name]) + after_out = executor.run(after_program, + feed=feed, + fetch_list=[out.name]) + self.assertTrue(np.allclose(before_out, after_out)) + + def test_multi_add_to_sum(self): + paddle.enable_static() + self.check_multi_add_to_sum("multi_add_to_sum_v1") + self.check_multi_add_to_sum("multi_add_to_sum_v2") + self.check_multi_add_to_sum("multi_add_to_sum_v3") def test_generate_combine_mul_v1(self): - input_specs = { - 'x': InputSpec([1, 1]), - 'y1': InputSpec([1, 1]), - 'y2': InputSpec([1, 1]) + paddle.enable_static() + program = paddle.static.Program() + startup_program = paddle.static.Program() + with paddle.static.program_guard(program, startup_program): + x = paddle.static.data("x", [16, 32]) + y = paddle.static.data("y", [32, 12]) + z = paddle.static.data("z", [32, 48]) + out1 = paddle.matmul(x, y) + out2 = paddle.matmul(x, z) + graph = core.Graph(program.desc) + before_node_nums = len(graph.nodes()) + core.get_pass("generate_combine_mul_v1").apply(graph) + after_node_nums = len(graph.nodes()) + self.assertEqual(after_node_nums, before_node_nums + 4) + after_program = paddle.fluid.framework.IrGraph(graph).to_program() + executor = paddle.static.Executor(paddle.CPUPlace()) + executor.run(startup_program) + feed = { + "x": np.random.random([16, 32]).astype("float32"), + "y": np.random.random([32, 12]).astype("float32"), + "z": np.random.random([32, 48]).astype("float32") } - helper = ir.RegisterPassHelper( - [generate_combine_mul_v1()], input_specs=input_specs) - s = helper.SerializeMultiPassDesc() - multi_pass_desc = get_multi_pass_desc_from_str(s) - self.assertEqual(len(multi_pass_desc.pass_descs), 1) - pass_desc = multi_pass_desc.pass_descs[0] - self.assertEqual(len(pass_desc.var_maps), 5) - 
self.assertEqual(len(pass_desc.pattern.blocks[0].ops), 2) - self.assertEqual(len(pass_desc.replace.blocks[0].ops), 4) - pattern_op_dicts = self.convert_ops_to_op_dicts( - pass_desc.pattern.blocks[0].ops) - replace_op_dicts = self.convert_ops_to_op_dicts( - pass_desc.replace.blocks[0].ops) - self.assertEqual(len(pattern_op_dicts.get("matmul_v2", [])), 2) - self.assertEqual(len(replace_op_dicts.get("concat", [])), 1) - self.assertEqual(len(replace_op_dicts.get("matmul_v2", [])), 1) - self.assertEqual(len(replace_op_dicts.get("slice", [])), 2) + before_out1, before_out2 = executor.run( + program, feed=feed, fetch_list=[out1.name, out2.name]) + after_out1, after_out2 = executor.run( + after_program, feed=feed, fetch_list=[out1.name, out2.name]) + self.assertTrue(np.allclose(before_out1, after_out1)) + self.assertTrue(np.allclose(before_out2, after_out2)) def test_generate_combine_mul_v2(self): helper = ir.RegisterPassHelper([generate_combine_mul_v2()]) @@ -212,17 +264,31 @@ def test_generate_combine_mul_v2(self): self.assertEqual(len(replace_op_dicts.get("matmul_v2", [])), 1) self.assertEqual(len(replace_op_dicts.get("slice", [])), 2) + def check_generate_simplify_inference(self, pass_type): + paddle.enable_static() + program = paddle.static.Program() + startup_program = paddle.static.Program() + with paddle.static.program_guard(program, startup_program): + x = paddle.static.data("x", [10, 16, 16], "float32") + x1 = paddle.transpose(paddle.transpose(x, [0, 2, 1]), [0, 2, 1]) + tmp = paddle.transpose(x, [0, 2, 1]) + x2 = paddle.transpose(tmp, [0, 2, 1]) + out = paddle.add(x1, paddle.matmul(x2, tmp)) + graph = core.Graph(program.desc) + before_node_nums = len(graph.nodes()) + core.get_pass(pass_type).apply(graph) + after_node_nums = len(graph.nodes()) + self.assertEqual(after_node_nums, before_node_nums - 6) + after_program = paddle.fluid.framework.IrGraph(graph).to_program() + executor = paddle.static.Executor(paddle.CPUPlace()) + executor.run(startup_program) + feed = {"x": np.random.random([10, 16, 16]).astype("float32")} + before_out = executor.run(program, feed=feed, fetch_list=[out.name]) + after_out = executor.run(after_program, + feed=feed, + fetch_list=[out.name]) + self.assertTrue(np.allclose(before_out, after_out)) + def test_generate_simplify_inference(self): - input_specs = {'x': InputSpec([-1, 16, 16, 16])} - helper = ir.RegisterPassHelper( - [generate_simplify_inference()], input_specs=input_specs) - s = helper.SerializeMultiPassDesc() - multi_pass_desc = get_multi_pass_desc_from_str(s) - self.assertEqual(len(multi_pass_desc.pass_descs), 1) - pass_desc = multi_pass_desc.pass_descs[0] - self.assertEqual(len(pass_desc.var_maps), 2) - self.assertEqual(len(pass_desc.pattern.blocks[0].ops), 2) - self.assertEqual(len(pass_desc.replace.blocks[0].ops), 0) - pattern_op_dicts = self.convert_ops_to_op_dicts( - pass_desc.pattern.blocks[0].ops) - self.assertEqual(len(pattern_op_dicts.get("transpose2", [])), 2) + self.check_generate_simplify_inference("generate_simplify_inference_v1") + self.check_generate_simplify_inference("generate_simplify_inference_v2") From 0c31579c1c0242e184fe2dc7f8e14f4949da62a7 Mon Sep 17 00:00:00 2001 From: limingshu <61349199+JamesLim-sy@users.noreply.github.com> Date: Wed, 13 Oct 2021 18:33:10 +0800 Subject: [PATCH 149/298] Merge lars op (#35476) * A leap of try for cudaLaunchCooperativeKernel * fix bugs * Totally replace the lar cuda kernel * Fix bugs * a test for lars merge * Adding las_op_momentum infer_shape * Fix codes * use avg_numel instead of max_numel to 
acquire grid num * modify unittest files about lars op * Finally converge when merged-lars works * fix ctest files * add merged_operation kernel when cuda version is older than 11 * Fix code style * fix ctest failure * fix error * fix all ctest error and change lars compute code of cpu * fix bugs on v100. * revert python modififation about lars * revert python modification codes --- .../operators/optimizers/lars_momentum_op.cc | 140 ++++- .../operators/optimizers/lars_momentum_op.cu | 545 +++++++++++------- .../operators/optimizers/lars_momentum_op.h | 74 ++- python/paddle/fluid/optimizer.py | 2 +- .../test_fleet_lars_meta_optimizer.py | 2 +- .../fluid/tests/unittests/test_momentum_op.py | 133 +++-- 6 files changed, 594 insertions(+), 302 deletions(-) mode change 100755 => 100644 paddle/fluid/operators/optimizers/lars_momentum_op.h diff --git a/paddle/fluid/operators/optimizers/lars_momentum_op.cc b/paddle/fluid/operators/optimizers/lars_momentum_op.cc index 8f30dd5b2e68a4..65be35843bdf99 100644 --- a/paddle/fluid/operators/optimizers/lars_momentum_op.cc +++ b/paddle/fluid/operators/optimizers/lars_momentum_op.cc @@ -13,46 +13,158 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/optimizers/lars_momentum_op.h" -#include "paddle/fluid/operators/optimizers/momentum_op.h" namespace paddle { namespace operators { +class LarsMomentumOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInputs("Param"), "Input", "Param", "LarsMomentum"); + OP_INOUT_CHECK(ctx->HasInputs("Grad"), "Input", "Grad", "LarsMomentum"); + OP_INOUT_CHECK(ctx->HasInputs("Velocity"), "Input", "Velocity", + "LarsMomentum"); + OP_INOUT_CHECK(ctx->HasInputs("LearningRate"), "Input", "LearningRate", + "LarsMomentum"); + OP_INOUT_CHECK(ctx->HasOutputs("ParamOut"), "Output", "ParamOut", + "LarsMomentum"); + OP_INOUT_CHECK(ctx->HasOutputs("VelocityOut"), "Output", "VelocityOut", + "LarsMomentum"); + PADDLE_ENFORCE_EQ( + ctx->GetInputsVarType("Param").front(), + framework::proto::VarType::LOD_TENSOR, + platform::errors::InvalidArgument( + "The input var's type should be LoDTensor, but the received is %s", + ctx->GetInputsVarType("Param").front())); + + auto lr_dims = ctx->GetInputsDim("LearningRate"); + auto grad_dim = ctx->GetInputsDim("Grad"); + auto param_dim = ctx->GetInputsDim("Param"); + auto velocity_dim = ctx->GetInputsDim("Velocity"); + auto lars_weight_decays = + ctx->Attrs().Get>("lars_weight_decay"); + auto multi_precision = ctx->Attrs().Get("multi_precision"); + + PADDLE_ENFORCE_EQ( + param_dim.size(), grad_dim.size(), + platform::errors::InvalidArgument( + "Input(Param) and Input(Grad) of LarsMomentumOp should have " + "same quantity. But number of Param is [%d] and Grad is [%d].", + param_dim.size(), grad_dim.size())); + PADDLE_ENFORCE_EQ( + param_dim.size(), velocity_dim.size(), + platform::errors::InvalidArgument( + "Input(Param) and Input(Velocity) of LarsMomentumOp should " + "have same quantity. But number of Param is [%d] and Velocity " + "is [%d].", + param_dim.size(), velocity_dim.size())); + PADDLE_ENFORCE_EQ( + lars_weight_decays.size(), grad_dim.size(), + platform::errors::InvalidArgument( + "Attr(Lars_weight_decay) and " + "Input(Grad) of LarsMomentumOp should have same quantity. 
" + "But number of Lars_weight_decay is [%d] and Grad is [%d].", + lars_weight_decays.size(), grad_dim.size())); + + if (multi_precision) { + OP_INOUT_CHECK(ctx->HasInputs("MasterParam"), "Input", "MasterParam", + "LarsMomentumMultiPrecision"); + OP_INOUT_CHECK(ctx->HasOutputs("MasterParamOut"), "Output", + "MasterParamOut", "LarsMomentumMultiPrecision"); + } + for (size_t i = 0; i < lr_dims.size(); ++i) { + PADDLE_ENFORCE_EQ(framework::product(lr_dims[i]), 1, + platform::errors::InvalidArgument( + "Learning_rate should be a scalar. But Received " + "LearningRate's dim [%s]", + framework::product(lr_dims[i]))); + } + + for (size_t i = 0; i < param_dim.size(); ++i) { + PADDLE_ENFORCE_EQ(ctx->GetInputsVarType("Grad")[i], + framework::proto::VarType::LOD_TENSOR, + platform::errors::InvalidArgument( + "The Var(%s)'s type should be LoDTensor, " + "but the received is %s", + ctx->Inputs("Grad")[i].front(), + ctx->GetInputsVarType("Grad")[i])); + PADDLE_ENFORCE_EQ( + param_dim[i], grad_dim[i], + platform::errors::InvalidArgument( + "Input(Param) and Input(Grad) input of LarsMomentumOp shall " + "have same dimension. But Param`s dim is [%s] and Grad's dim " + "is [%s].", + param_dim[i], grad_dim[i])); + PADDLE_ENFORCE_EQ( + param_dim[i], velocity_dim[i], + platform::errors::InvalidArgument( + "Input(Param) and Input(Velocity) of LarsMomentumOp shall have " + "same dimension. But Param dim [%s] differs with Velocity dim " + "[%s].", + param_dim[i], velocity_dim[i])); + } + ctx->SetOutputsDim("ParamOut", param_dim); + ctx->SetOutputsDim("VelocityOut", param_dim); + if (ctx->HasOutputs("MasterParamOut")) { + ctx->SetOutputsDim("MasterParamOut", param_dim); + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto input_data_type = + OperatorWithKernel::IndicateVarDataType(ctx, "Param"); + return framework::OpKernelType(input_data_type, ctx.GetPlace()); + } +}; + class LarsMomentumOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { AddInput("Param", "(LoDTensor, default LoDTensor) " - "Input parameter that has to be updated"); + "Input parameter that has to be updated") + .AsDuplicable(); AddInput("Grad", "(LoDTensor, default LoDTensor) " - "Input gradient of the parameter"); + "Input gradient of the parameter") + .AsDuplicable(); AddInput("Velocity", "(LoDTensor, default LoDTensor) " "Input velocity (corresponding to the parameter) " - "that has to be updated"); + "that has to be updated") + .AsDuplicable(); AddInput("LearningRate", "(LoDTensor, default LoDTensor) " - "Input learning rate"); - AddInput("MasterParam", "FP32 master weight for AMP.").AsDispensable(); - + "Input learning rate") + .AsDuplicable(); + AddInput("MasterParam", "FP32 master weight for AMP.") + .AsDuplicable() + .AsDispensable(); AddOutput("ParamOut", "(LoDTensor) This output is updated parameter. " - "It shared memory with Input(Param)."); + "It shared memory with Input(Param).") + .AsDuplicable(); AddOutput("VelocityOut", "(LoDTensor) This output is updated velocity. " - "It shared memory with Input(Velocity)."); + "It shared memory with Input(Velocity).") + .AsDuplicable(); AddOutput("MasterParamOut", "The updated FP32 master weight for AMP. 
" "It shared memory with Input(MasterParam).") + .AsDuplicable() .AsDispensable(); - AddAttr("mu", "(float) Momentum coefficient"); AddAttr("lars_coeff", "(float, default 0.001) LARS coefficient.") .SetDefault(0.001); - AddAttr("lars_weight_decay", - "(float, default 0.0005) LARS weight decay") - .SetDefault(0.0005); + AddAttr>( + "lars_weight_decay", + "(std::vector, default 0.0005) LARS weight decay params") + .SetDefault({0.0005}); AddAttr("epsilon", "(float, default 0.0) epsilon to avoid Division by Zero.") .SetDefault(0.0); @@ -96,7 +208,7 @@ class LarsMomentumOpVarTypeInference : public framework::VarTypeInference { namespace ops = paddle::operators; REGISTER_OPERATOR( - lars_momentum, ops::MomentumOp, ops::LarsMomentumOpMaker, + lars_momentum, ops::LarsMomentumOp, ops::LarsMomentumOpMaker, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker, ops::LarsMomentumOpVarTypeInference); diff --git a/paddle/fluid/operators/optimizers/lars_momentum_op.cu b/paddle/fluid/operators/optimizers/lars_momentum_op.cu index 3e7023bd1260f5..caefd496978af2 100644 --- a/paddle/fluid/operators/optimizers/lars_momentum_op.cu +++ b/paddle/fluid/operators/optimizers/lars_momentum_op.cu @@ -18,18 +18,8 @@ limitations under the License. */ #include "paddle/fluid/operators/optimizers/lars_momentum_op.h" #include "paddle/fluid/platform/fast_divmod.h" -#if defined(__NVCC__) && CUDA_VERSION >= 11000 -/* Once CUDA_VERSION is beyond 11.0, cooperative_groups can be involved in - without adding --rdc=true compile flag, then L2_norm cuda kernel can be - set as a __device__ kernel rather than global kernel. On the contrary, - the compile flag shall be set in old version, which may affect the cuda - kernel performance in paddle, consequently, L2_norm kernel shall be set - as a __global__ kernel. -*/ +#if CUDA_VERSION >= 11000 #include -#define LARS_FUNCTION_FLAG __device__ -#else -#define LARS_FUNCTION_FLAG __global__ #endif #ifdef __HIPCC__ @@ -38,6 +28,8 @@ limitations under the License. */ #define LARS_BLOCK_SIZE 512 #endif +#define LARS_MAX_MERGED_OPS 150 + namespace paddle { namespace operators { @@ -53,6 +45,43 @@ __device__ __forceinline__ double Fma(double x, double y, double z) { return fma(x, y, z); } +template +class LarsThreadConfig { + public: + int grid_for_norm; + int grid_for_lars; +#if CUDA_VERSION >= 11000 + + private: + int grid_stride; + + public: + explicit LarsThreadConfig(int64_t numel, int sm_num, int num_blocks_per_sm) { + int grid = (numel + LARS_BLOCK_SIZE - 1) / LARS_BLOCK_SIZE; + grid_for_lars = + std::min(std::min(sm_num * num_blocks_per_sm, grid), LARS_BLOCK_SIZE); + grid_stride = LARS_BLOCK_SIZE * grid_for_lars; + } + + int GetRepeatTimes(int64_t numel) { + return (numel + grid_stride - 1) / grid_stride - 1; + } +#else + int repeat_times; + explicit LarsThreadConfig(const int64_t numel) { + int grid = (numel + LARS_BLOCK_SIZE - 1) / LARS_BLOCK_SIZE; + grid_for_norm = std::min(grid, LARS_BLOCK_SIZE); + const int grid_stride = grid_for_norm * LARS_BLOCK_SIZE; + repeat_times = (numel + grid_stride - 1) / grid_stride - 1; + // Determine to read 4 fp16 or float data once, but 2 double data once. + grid_for_lars = + std::is_same::value + ? 
(numel + (LARS_BLOCK_SIZE << 1) - 1) / (LARS_BLOCK_SIZE << 1) + : (numel + (LARS_BLOCK_SIZE << 2) - 1) / (LARS_BLOCK_SIZE << 2); + } +#endif +}; + template __device__ inline void VectorizeLarsUpdate( const T* __restrict__ grad, const MT* __restrict__ param, @@ -85,7 +114,6 @@ __device__ inline void VectorizeLarsUpdate( VecType grad_data = grad_vec[i]; VecMType param_data = param_vec[i]; VecMType velocity_data = velocity_vec[i]; - #pragma unroll for (int j = 0; j < VecSize; ++j) { MT grad_val = static_cast(grad_data[j]) * rescale_grad; @@ -116,41 +144,49 @@ __device__ inline void VectorizeLarsUpdate( } } +#if CUDA_VERSION >= 11000 +/* Once CUDA_VERSION is beyond 11, cooperative_groups can be involved in without + --rdc=true compile flag, then L2_norm kernel can be set with __device__ and + cooperative_groups::grid_group also can be involved. Otherwise, adding this + flag may affect much, L2_norm kernel shall be set with __global__.*/ +// TODO(limingshu): declaration of cooperative_groups wapper is invalid in host. +template +__forceinline__ __device__ void L2NormKernel( + const cooperative_groups::grid_group* cg, +#else template -LARS_FUNCTION_FLAG void L2NormKernel( +__global__ void L2NormKernel( +#endif const T* __restrict__ p_data, const T* __restrict__ g_data, - MT* __restrict__ p_buffer, MT* __restrict__ g_buffer, - const int repeat_times, const int64_t numel, const MT rescale_grad, + MT* __restrict__ p_buffer, MT* __restrict__ g_buffer, const int64_t numel, + const int repeat_times, const MT rescale_grad, const int thresh = 0, MT* __restrict__ p_n = nullptr, MT* __restrict__ g_n = nullptr) { + __shared__ MT s_buffer[2]; int tid = threadIdx.x + blockDim.x * blockIdx.x; int grid_stride = LARS_BLOCK_SIZE * gridDim.x; - const MT rescale_grad_pow = rescale_grad * rescale_grad; - __shared__ MT s_buffer[2]; + const MT rescale_pow = rescale_grad * rescale_grad; s_buffer[0] = static_cast(0); s_buffer[1] = static_cast(0); - MT p_tmp_val = static_cast(0); - MT g_tmp_val = static_cast(0); + MT p_tmp = static_cast(0); + MT g_tmp = static_cast(0); if (repeat_times == 0) { if (tid < numel) { - p_tmp_val = static_cast(p_data[tid]); - g_tmp_val = static_cast(g_data[tid]); + p_tmp = static_cast(p_data[tid]); + g_tmp = static_cast(g_data[tid]); } - s_buffer[0] += math::blockReduceSum(p_tmp_val * p_tmp_val, FINAL_MASK); - s_buffer[1] += math::blockReduceSum(g_tmp_val * g_tmp_val, FINAL_MASK); + s_buffer[0] += math::blockReduceSum(p_tmp * p_tmp, FINAL_MASK); + s_buffer[1] += math::blockReduceSum(g_tmp * g_tmp, FINAL_MASK); } else { - /* To avoid occupy too much temp buffer. Hence, slice the whole data into 2 - parts, the front of them whose quantity is excatly multiple of grid-thread - number, and this part of data is delt in for loop, the rest of data is delt - with another step to avoid visiting data address beyond bound. */ + /* Avoid occupy too much temp buffer. Slice the whole data into 2 parts, + the front of data whose quantity is excatly multiple of grid-thread + number, and delt in for loop, the rest is delt with another step. 
*/ for (int i = 0; i < repeat_times; ++i) { - p_tmp_val = static_cast(p_data[tid]); - g_tmp_val = static_cast(g_data[tid]); + p_tmp = static_cast(p_data[tid]); + g_tmp = static_cast(g_data[tid]); tid += grid_stride; - s_buffer[0] += - math::blockReduceSum(p_tmp_val * p_tmp_val, FINAL_MASK); - s_buffer[1] += - math::blockReduceSum(g_tmp_val * g_tmp_val, FINAL_MASK); + s_buffer[0] += math::blockReduceSum(p_tmp * p_tmp, FINAL_MASK); + s_buffer[1] += math::blockReduceSum(g_tmp * g_tmp, FINAL_MASK); __syncthreads(); } MT p_val = 0; @@ -168,69 +204,46 @@ LARS_FUNCTION_FLAG void L2NormKernel( p_buffer[blockIdx.x] = s_buffer[0]; g_buffer[blockIdx.x] = s_buffer[1]; } - #if CUDA_VERSION >= 11000 - // Grid sync for completely writring partial result back to gloabl memory - const cooperative_groups::grid_group cg = cooperative_groups::this_grid(); - cg.sync(); - MT p_partial_sum = threadIdx.x < gridDim.x ? p_buffer[threadIdx.x] : 0; - MT g_partial_sum = threadIdx.x < gridDim.x ? g_buffer[threadIdx.x] : 0; - *p_n = Sqrt(math::blockReduceSum(p_partial_sum, FINAL_MASK)); - *g_n = Sqrt(rescale_grad_pow * - math::blockReduceSum(g_partial_sum, FINAL_MASK)); + cg->sync(); // Grid sync for writring partial result to gloabl memory + MT p_part_sum = threadIdx.x < gridDim.x ? p_buffer[threadIdx.x] : 0; + MT g_part_sum = threadIdx.x < gridDim.x ? g_buffer[threadIdx.x] : 0; + *p_n = Sqrt(math::blockReduceSum(p_part_sum, FINAL_MASK)); + *g_n = Sqrt(rescale_pow * math::blockReduceSum(g_part_sum, FINAL_MASK)); #endif } template -__global__ void MomentumLarsKernel( +__forceinline__ __device__ void MomentumUpdate( const T* __restrict__ param, const T* __restrict__ grad, const MT* __restrict__ velocity, T* param_out, MT* velocity_out, const MT* __restrict__ master_param, MT* __restrict__ master_param_out, - const MT* __restrict__ learning_rate, MT* __restrict__ p_buffer, - MT* __restrict__ g_buffer, const MT mu, const MT lars_coeff, - const MT lars_weight_decay, const MT epsilon, const MT rescale_grad, - const int repeat_times, const int thresh, const int64_t numel) { - int tid = threadIdx.x + blockIdx.x * blockDim.x; - int grid_stride = gridDim.x * LARS_BLOCK_SIZE; -#if CUDA_VERSION >= 11000 - MT param_norm = static_cast(0); - MT grad_norm = static_cast(0); - L2NormKernel(param, grad, p_buffer, g_buffer, repeat_times, numel, - rescale_grad, ¶m_norm, &grad_norm); -#else - const MT rescale_grad_pow = rescale_grad * rescale_grad; - MT param_parital_norm = threadIdx.x < thresh ? p_buffer[threadIdx.x] : 0; - MT grad_parital_norm = threadIdx.x < thresh ? 
g_buffer[threadIdx.x] : 0; - __syncthreads(); - MT param_norm = - Sqrt(math::blockReduceSum(param_parital_norm, FINAL_MASK)); - MT grad_norm = Sqrt(rescale_grad_pow * - math::blockReduceSum(grad_parital_norm, FINAL_MASK)); -#endif - + const MT* __restrict__ learning_rate, const MT mu, + const MT lars_weight_decay, const MT lars_coeff, const MT epsilon, + const MT rescale_grad, const MT param_norm, const MT grad_norm, + const int tid, const int grid_stride, const int64_t numel, + const bool is_amp) { const MT lr = learning_rate[0]; MT local_lr = lr; if (lars_weight_decay > static_cast(0)) { local_lr = lr * lars_coeff * param_norm / - (Fma(lars_weight_decay, param_norm, grad_norm) + epsilon); + (fma(lars_weight_decay, param_norm, grad_norm) + epsilon); } - - if (master_param_out) { - VectorizeLarsUpdate(grad, master_param, velocity, param_out, - velocity_out, mu, local_lr, - lars_weight_decay, rescale_grad, tid, - grid_stride, numel, master_param_out); + if (is_amp) { + VectorizeLarsUpdate( + grad, master_param, velocity, param_out, velocity_out, mu, local_lr, + lars_weight_decay, rescale_grad, tid, grid_stride, numel, + master_param_out); } else { if (std::is_same::value || std::is_same::value) { - // As for multiple-precision, type T and MT cannot be more than fp16 or - // fp32, Then, the maximum data IO size could be set to 4. - VectorizeLarsUpdate( + /* TODO(limingshu): pointer cast may damage memory accessing for fp16 */ + VectorizeLarsUpdate( grad, reinterpret_cast(param), velocity, param_out, velocity_out, mu, local_lr, lars_weight_decay, rescale_grad, tid, grid_stride, numel); } else { - VectorizeLarsUpdate( + VectorizeLarsUpdate( grad, reinterpret_cast(param), velocity, param_out, velocity_out, mu, local_lr, lars_weight_decay, rescale_grad, tid, grid_stride, numel); @@ -238,144 +251,278 @@ __global__ void MomentumLarsKernel( } } +#if CUDA_VERSION >= 11000 +template +struct LarsParamWarpper { + int64_t numel_arr[LARS_MAX_MERGED_OPS]; + int repeat_arr[LARS_MAX_MERGED_OPS]; + const T* __restrict__ p_arr[LARS_MAX_MERGED_OPS]; + const T* __restrict__ g_arr[LARS_MAX_MERGED_OPS]; + const MT* __restrict__ v_arr[LARS_MAX_MERGED_OPS]; + const MT* __restrict__ lr_arr[LARS_MAX_MERGED_OPS]; + const MT* __restrict__ master_p_arr[LARS_MAX_MERGED_OPS]; + T* __restrict__ p_out_arr[LARS_MAX_MERGED_OPS]; + MT* __restrict__ v_out_arr[LARS_MAX_MERGED_OPS]; + MT* __restrict__ master_p_out_arr[LARS_MAX_MERGED_OPS]; + MT weight_decay_arr[LARS_MAX_MERGED_OPS]; +}; + +template +__global__ void MergedMomentumLarsKernel(LarsParamWarpper* lars_warpper, + MT* __restrict__ p_buffer, + MT* __restrict__ g_buffer, + const int op_num, const MT mu, + const MT lars_coeff, const MT epsilon, + const MT rescale_grad, + const bool is_amp) { + int grid_stride = gridDim.x * LARS_BLOCK_SIZE; + int tid = threadIdx.x + blockIdx.x * blockDim.x; + const cooperative_groups::grid_group cg = cooperative_groups::this_grid(); + for (int i = 0; i < op_num; ++i) { + int numel = lars_warpper->numel_arr[i]; + MT param_norm = static_cast(0); + MT grad_norm = static_cast(0); + L2NormKernel(&cg, lars_warpper->p_arr[i], lars_warpper->g_arr[i], + p_buffer, g_buffer, numel, lars_warpper->repeat_arr[i], + rescale_grad, 0, ¶m_norm, &grad_norm); + MomentumUpdate( + lars_warpper->p_arr[i], lars_warpper->g_arr[i], + lars_warpper->v_out_arr[i], lars_warpper->p_out_arr[i], + lars_warpper->v_out_arr[i], lars_warpper->master_p_arr[i], + lars_warpper->master_p_out_arr[i], lars_warpper->lr_arr[i], mu, + lars_warpper->weight_decay_arr[i], lars_coeff, 
epsilon, rescale_grad, + param_norm, grad_norm, tid, grid_stride, numel, is_amp); + } +} +#endif + +template +__global__ void MomentumLarsKernel( + const T* __restrict__ param, const T* __restrict__ grad, + const MT* __restrict__ velocity, T* param_out, MT* velocity_out, + const MT* __restrict__ master_param, MT* __restrict__ master_param_out, + const MT* __restrict__ learning_rate, MT* __restrict__ p_buffer, + MT* __restrict__ g_buffer, const MT mu, const MT lars_coeff, + const MT lars_weight_decay, const MT epsilon, const MT rescale_grad, + const int repeat_times, const int thresh, const int64_t numel, + const bool is_amp) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + int grid_stride = gridDim.x * LARS_BLOCK_SIZE; +#if CUDA_VERSION >= 11000 + const cooperative_groups::grid_group cg = cooperative_groups::this_grid(); + MT param_norm = static_cast(0); + MT grad_norm = static_cast(0); + L2NormKernel(&cg, param, grad, p_buffer, g_buffer, numel, repeat_times, + rescale_grad, gridDim.x, ¶m_norm, &grad_norm); +#else + const MT rescale_grad_pow = rescale_grad * rescale_grad; + MT param_part_norm = threadIdx.x < thresh ? p_buffer[threadIdx.x] : 0; + MT grad_part_norm = threadIdx.x < thresh ? g_buffer[threadIdx.x] : 0; + __syncthreads(); + MT param_norm = Sqrt(math::blockReduceSum(param_part_norm, FINAL_MASK)); + MT grad_norm = Sqrt(rescale_grad_pow * + math::blockReduceSum(grad_part_norm, FINAL_MASK)); +#endif + MomentumUpdate(param, grad, velocity, param_out, velocity_out, + master_param, master_param_out, learning_rate, mu, + lars_weight_decay, lars_coeff, epsilon, rescale_grad, + param_norm, grad_norm, tid, grid_stride, numel, is_amp); +} + +template +inline void SeparatedLarsMomentumOpCUDAKernel( + const platform::CUDADeviceContext& cuda_ctx, const T* param_data, + T* param_out_data, const MT* velocity_data, MT* velocity_out_data, + const T* grad_data, const MT* lr, MT* p_buffer, MT* g_buffer, const MT mu, + const MT lars_coeff, const MT weight_decay, const MT epsilon, + const MT rescale_grad, const int64_t numel, const MT* master_param_data, + MT* master_out_data, const bool is_amp) { + LarsThreadConfig lars_thread_config(numel); + L2NormKernel<<>>( + param_data, grad_data, p_buffer, g_buffer, numel, + lars_thread_config.repeat_times, rescale_grad); + + MomentumLarsKernel<<>>( + param_data, grad_data, velocity_data, param_out_data, velocity_out_data, + master_param_data, master_out_data, lr, p_buffer, g_buffer, mu, + lars_coeff, weight_decay, epsilon, rescale_grad, 0, + lars_thread_config.grid_for_norm, numel, is_amp); +} + template class LarsMomentumOpCUDAKernel : public framework::OpKernel { using MT = MultiPrecisionType; public: void Compute(const framework::ExecutionContext& ctx) const override { - const bool multi_precision = ctx.Attr("multi_precision"); - auto param_out = ctx.Output("ParamOut"); - auto velocity_out = ctx.Output("VelocityOut"); - auto param = ctx.Input("Param"); - auto velocity = ctx.Input("Velocity"); - auto grad = ctx.Input("Grad"); - auto learning_rate = ctx.Input("LearningRate"); - - int64_t numel = param->numel(); - int grid = (numel + LARS_BLOCK_SIZE - 1) / LARS_BLOCK_SIZE; - const framework::Tensor* master_param = nullptr; - framework::Tensor* master_param_out = nullptr; - const MT* master_param_data = nullptr; - MT* master_param_out_data = nullptr; - - if (multi_precision) { - bool has_master = - ctx.HasInput("MasterParam") && ctx.HasOutput("MasterParamOut"); - PADDLE_ENFORCE_EQ(has_master, true, - platform::errors::InvalidArgument( - "The 
Input(MasterParam) and Output(MasterParamOut) " - "should not be null when " - "the attr `multi_precision` is true")); - master_param = ctx.Input("MasterParam"); - master_param_out = ctx.Output("MasterParamOut"); - master_param_data = master_param->data(); - master_param_out_data = - master_param_out->mutable_data(ctx.GetPlace()); - } - MT mu = static_cast(ctx.Attr("mu")); - MT lars_coeff = static_cast(ctx.Attr("lars_coeff")); - MT lars_weight_decay = - static_cast(ctx.Attr("lars_weight_decay")); - MT epsilon = static_cast(ctx.Attr("epsilon")); - MT rescale_grad = static_cast(ctx.Attr("rescale_grad")); - - auto* param_data = param->data(); - auto* grad_data = grad->data(); - auto* velocity_data = velocity->data(); - auto* lr = learning_rate->data(); - auto& cuda_ctx = ctx.template device_context(); - T* param_out_data = param_out->mutable_data(ctx.GetPlace()); - MT* velocity_out_data = velocity_out->mutable_data(ctx.GetPlace()); - -#if CUDA_VERSION >= 11000 - /* - Once model trainning with lars optimizer, whose principal implementation - is achieved by following two steps: - 1. Figure out the L2 norm statistic result of grad data and param data. - 2. Update param and velocity data with usage of L2 norm statistic result. - - Orignally, these two steps were fulfilled by respective eigen function and - cuda kernel, however the overhead of eigen function occupied much ratio in - total, consequently affect the performance of lars op, make it necessary - to combine 2 steps into one cuda kernel. - Since the step1 is l2 norm statistic, grid level reduce is needed. To - achieve this and continuous calculation of step 2 in only one global - lanuch, essential basis is to control all grid-threads while running. Apart - from normal lanuch form, cuda9.0 provides `cudaLaunchCooperativeKernel` - api : - - The thread quantity shall less than pyhsical SM limited threads - - Launches a device function where thread blocks can cooperate and - synchronize as they execute. - */ - // Figure out how many blocks can be active in each sm. int num_blocks_per_sm = 0; - cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_blocks_per_sm, - MomentumLarsKernel, - LARS_BLOCK_SIZE, sizeof(MT)); + bool multi_precision = ctx.Attr("multi_precision"); + auto& cuda_ctx = ctx.template device_context(); int sm_num = cuda_ctx.GetSMCount(); - int grid_real = - std::min(std::min(sm_num * num_blocks_per_sm, grid), LARS_BLOCK_SIZE); framework::Tensor tmp_buffer_t = ctx.AllocateTmpTensor( {LARS_BLOCK_SIZE << 1}, cuda_ctx); auto* p_buffer = tmp_buffer_t.mutable_data(ctx.GetPlace()); auto* g_buffer = p_buffer + LARS_BLOCK_SIZE; - int grid_stride = LARS_BLOCK_SIZE * grid; - int repeat_times = (numel + grid_stride - 1) / grid_stride - 1; - int thresh = 0; - - // Uniform kernel parameter for cudaLaunchCooperativeKernel - void* cuda_param[] = { - reinterpret_cast(¶m_data), - reinterpret_cast(&grad_data), - reinterpret_cast(&velocity_data), - reinterpret_cast(¶m_out_data), - reinterpret_cast(&velocity_out_data), - reinterpret_cast(&master_param_data), - reinterpret_cast(&master_param_out_data), - reinterpret_cast(&lr), - reinterpret_cast(&p_buffer), - reinterpret_cast(&g_buffer), - reinterpret_cast(&mu), - reinterpret_cast(&lars_coeff), - reinterpret_cast(&lars_weight_decay), - reinterpret_cast(&epsilon), - reinterpret_cast(&rescale_grad), - reinterpret_cast(&repeat_times), - reinterpret_cast(&thresh), // Just a placeholder - reinterpret_cast(&numel)}; - // Lanuch all sm theads. 
- cudaLaunchCooperativeKernel( - reinterpret_cast(MomentumLarsKernel), grid_real, - LARS_BLOCK_SIZE, cuda_param, 0, cuda_ctx.stream()); -#else - // Determine to read 4 fp16 or float data once, but 2 double data once. - int grid_lars = - sizeof(T) < sizeof(double) - ? (numel + (LARS_BLOCK_SIZE << 2) - 1) / (LARS_BLOCK_SIZE << 2) - : (numel + (LARS_BLOCK_SIZE << 1) - 1) / (LARS_BLOCK_SIZE << 1); - int grid_norm = std::min(grid, LARS_BLOCK_SIZE); - framework::Tensor p_buffer_t = - ctx.AllocateTmpTensor( - {LARS_BLOCK_SIZE << 1}, cuda_ctx); - auto* p_buffer = p_buffer_t.mutable_data(ctx.GetPlace()); - auto* g_buffer = p_buffer + LARS_BLOCK_SIZE; - - const int grid_stride = LARS_BLOCK_SIZE * grid_norm; - const int repeat_times = (numel + grid_stride - 1) / grid_stride - 1; - - L2NormKernel<<>>( - param_data, grad_data, p_buffer, g_buffer, repeat_times, numel, - rescale_grad); + MT mu = static_cast(ctx.Attr("mu")); + MT lars_coeff = static_cast(ctx.Attr("lars_coeff")); + MT epsilon = static_cast(ctx.Attr("epsilon")); + MT rescale_grad = static_cast(ctx.Attr("rescale_grad")); - MomentumLarsKernel< - T, MT><<>>( - param_data, grad_data, velocity_data, param_out_data, velocity_out_data, - master_param_data, master_param_out_data, lr, p_buffer, g_buffer, mu, - lars_coeff, lars_weight_decay, epsilon, rescale_grad, 0, grid_norm, - numel); // 0 is just a placeholder. + auto weight_decay_arr = ctx.Attr>("lars_weight_decay"); + auto grad = ctx.MultiInput("Grad"); + auto param = ctx.MultiInput("Param"); + auto velocity = ctx.MultiInput("Velocity"); + auto param_out = ctx.MultiOutput("ParamOut"); + auto velocity_out = ctx.MultiOutput("VelocityOut"); + auto learning_rate = ctx.MultiInput("LearningRate"); + auto master_param = ctx.MultiInput("MasterParam"); + auto master_param_out = + ctx.MultiOutput("MasterParamOut"); + + int op_num = grad.size(); +#if CUDA_VERSION >= 11000 + if (op_num > 1) { + LarsParamWarpper lars_warpper; + PADDLE_ENFORCE_LT( + op_num, LARS_MAX_MERGED_OPS, + platform::errors::InvalidArgument( + "The maximum number of merged-ops supported is (%d), but" + "lars op required for trainning this model is (%d)\n", + LARS_MAX_MERGED_OPS, op_num)); + + /* Implementation of lars optimizer consists of following two steps: + 1. Figure out the L2 norm statistic result of grad data and param data. + 2. Update param and velocity with usage of L2 norm statistic result. + Step1 and step2 can be merged with api provided by nvida + cudaLaunchCooperativeKernel: + - The thread quantity shall less than pyhsical SM limited threads + - Launche as thread-block can synchronizlly execute. 
*/ + cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &num_blocks_per_sm, MergedMomentumLarsKernel, LARS_BLOCK_SIZE, + sizeof(MT) << 1); + + size_t total_numel = 0; + for (int i = 0; i < op_num; ++i) { + size_t temp_numel = param[i]->numel(); + total_numel += temp_numel; + lars_warpper.numel_arr[i] = temp_numel; + lars_warpper.p_arr[i] = param[i]->data(); + lars_warpper.g_arr[i] = grad[i]->data(); + lars_warpper.v_arr[i] = velocity[i]->data(); + lars_warpper.lr_arr[i] = learning_rate[i]->data(); + lars_warpper.p_out_arr[i] = + param_out[i]->mutable_data(ctx.GetPlace()); + lars_warpper.v_out_arr[i] = + velocity_out[i]->mutable_data(ctx.GetPlace()); + lars_warpper.weight_decay_arr[i] = static_cast(weight_decay_arr[i]); + } + int64_t avg_numel = total_numel / op_num; + LarsThreadConfig lars_thread_config(avg_numel, sm_num, + num_blocks_per_sm); + for (int i = 0; i < op_num; ++i) { + lars_warpper.repeat_arr[i] = + lars_thread_config.GetRepeatTimes(lars_warpper.numel_arr[i]); + } + if (multi_precision) { + for (int i = 0; i < op_num; ++i) { + lars_warpper.master_p_arr[i] = master_param[i]->data(); + lars_warpper.master_p_out_arr[i] = + master_param_out[i]->mutable_data(ctx.GetPlace()); + } + } + auto merged_buf = memory::Alloc(cuda_ctx, sizeof(lars_warpper)); + auto* merged_ptr = + reinterpret_cast*>(merged_buf->ptr()); + memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, cuda_ctx.GetPlace()), + reinterpret_cast(merged_ptr), platform::CPUPlace(), + reinterpret_cast(&lars_warpper), sizeof(lars_warpper), + cuda_ctx.stream()); + void* cuda_param[] = {reinterpret_cast(&merged_ptr), + reinterpret_cast(&p_buffer), + reinterpret_cast(&g_buffer), + reinterpret_cast(&op_num), + reinterpret_cast(&mu), + reinterpret_cast(&lars_coeff), + reinterpret_cast(&epsilon), + reinterpret_cast(&rescale_grad), + reinterpret_cast(&multi_precision)}; + // Lanuch all sm theads, and thead of each block synchronizedly cooperate. + cudaLaunchCooperativeKernel( + reinterpret_cast(MergedMomentumLarsKernel), + lars_thread_config.grid_for_lars, LARS_BLOCK_SIZE, cuda_param, 0, + cuda_ctx.stream()); + } else { + auto* param_data = param[0]->data(); + auto* grad_data = grad[0]->data(); + auto* velocity_data = velocity[0]->data(); + auto* lr = learning_rate[0]->data(); + auto* param_out_data = param_out[0]->mutable_data(ctx.GetPlace()); + auto* velocity_out_data = + velocity_out[0]->mutable_data(ctx.GetPlace()); + const MT* master_param_data = + multi_precision ? master_param[0]->data() : nullptr; + MT* master_param_out_data = + multi_precision + ? master_param_out[0]->mutable_data(ctx.GetPlace()) + : nullptr; + int64_t numel = param[0]->numel(); + MT lars_weight_decay = weight_decay_arr[0]; + + // Figure out how many blocks can be active in each sm. 
+ cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &num_blocks_per_sm, MomentumLarsKernel, LARS_BLOCK_SIZE, + sizeof(MT) << 1); + LarsThreadConfig lars_thread_config(numel, sm_num, + num_blocks_per_sm); + int repeat_times = lars_thread_config.GetRepeatTimes(numel); + int thresh = 0; + void* cuda_param[] = { + reinterpret_cast(¶m_data), + reinterpret_cast(&grad_data), + reinterpret_cast(&velocity_data), + reinterpret_cast(¶m_out_data), + reinterpret_cast(&velocity_out_data), + reinterpret_cast(&master_param_data), + reinterpret_cast(&master_param_out_data), + reinterpret_cast(&lr), + reinterpret_cast(&p_buffer), + reinterpret_cast(&g_buffer), + reinterpret_cast(&mu), + reinterpret_cast(&lars_coeff), + reinterpret_cast(&lars_weight_decay), + reinterpret_cast(&epsilon), + reinterpret_cast(&rescale_grad), + reinterpret_cast(&repeat_times), + reinterpret_cast(&thresh), // Just a placeholder + reinterpret_cast(&numel), + reinterpret_cast(&multi_precision)}; + // Lanuch all sm theads. + cudaLaunchCooperativeKernel( + reinterpret_cast(MomentumLarsKernel), + lars_thread_config.grid_for_lars, LARS_BLOCK_SIZE, cuda_param, 0, + cuda_ctx.stream()); + } +#else + for (int i = 0; i < op_num; ++i) { + const MT* master_param_data = + multi_precision ? master_param[i]->data() : nullptr; + MT* master_param_out_data = + multi_precision + ? master_param_out[i]->mutable_data(ctx.GetPlace()) + : nullptr; + SeparatedLarsMomentumOpCUDAKernel( + cuda_ctx, param[i]->data(), + param_out[i]->mutable_data(ctx.GetPlace()), + velocity[i]->data(), + velocity_out[i]->mutable_data(ctx.GetPlace()), grad[i]->data(), + learning_rate[i]->data(), p_buffer, g_buffer, mu, lars_coeff, + weight_decay_arr[i], epsilon, rescale_grad, param[i]->numel(), + master_param_data, master_param_out_data, multi_precision); + } #endif } }; diff --git a/paddle/fluid/operators/optimizers/lars_momentum_op.h b/paddle/fluid/operators/optimizers/lars_momentum_op.h old mode 100755 new mode 100644 index 55775bc08fb5eb..df4d7b9a0438bc --- a/paddle/fluid/operators/optimizers/lars_momentum_op.h +++ b/paddle/fluid/operators/optimizers/lars_momentum_op.h @@ -23,54 +23,48 @@ template class LarsMomentumOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto param_out = ctx.Output("ParamOut"); - auto velocity_out = ctx.Output("VelocityOut"); - auto param = ctx.Input("Param"); - auto velocity = ctx.Input("Velocity"); - auto learning_rate = ctx.Input("LearningRate"); - auto* grad_var = ctx.InputVar("Grad"); - // only support dense for now. 
- PADDLE_ENFORCE_EQ(grad_var->IsType(), true, - platform::errors::InvalidArgument( - "The Var(%s)'s type should be LoDTensor, " - "but the received is %s", - ctx.InputNames("Grad").front(), - framework::ToTypeName(grad_var->Type()))); - auto grad = ctx.Input("Grad"); - - param_out->mutable_data(ctx.GetPlace()); - velocity_out->mutable_data(ctx.GetPlace()); - + auto param_out = ctx.MultiOutput("ParamOut"); + auto velocity_out = ctx.MultiOutput("VelocityOut"); + auto param = ctx.MultiInput("Param"); + auto velocity = ctx.MultiInput("Velocity"); + auto learning_rate = ctx.MultiInput("LearningRate"); + auto grad = ctx.MultiInput("Grad"); + auto weight_decay_arr = ctx.Attr>("lars_weight_decay"); T mu = static_cast(ctx.Attr("mu")); T lars_coeff = ctx.Attr("lars_coeff"); - T lars_weight_decay = ctx.Attr("lars_weight_decay"); T epsilon = ctx.Attr("epsilon"); - auto p_out = framework::EigenVector::Flatten(*param_out); - auto v_out = framework::EigenVector::Flatten(*velocity_out); + int op_num = param.size(); + for (int i = 0; i < op_num; ++i) { + auto* lr = learning_rate[i]->data(); + T lars_weight_decay = weight_decay_arr[i]; + param_out[i]->mutable_data(ctx.GetPlace()); + velocity_out[i]->mutable_data(ctx.GetPlace()); - auto p = framework::EigenVector::Flatten(*param); - auto v = framework::EigenVector::Flatten(*velocity); - auto g = framework::EigenVector::Flatten(*grad); - auto* lr = learning_rate->data(); + auto p_out = framework::EigenVector::Flatten(*(param_out[i])); + auto v_out = framework::EigenVector::Flatten(*(velocity_out[i])); + auto p = framework::EigenVector::Flatten(*(param[i])); + auto v = framework::EigenVector::Flatten(*(velocity[i])); + auto g = framework::EigenVector::Flatten(*(grad[i])); - framework::Tensor p_norm_t, g_norm_t; - p_norm_t.Resize({1}); - g_norm_t.Resize({1}); - p_norm_t.mutable_data(ctx.GetPlace()); - g_norm_t.mutable_data(ctx.GetPlace()); - auto ep_norm = framework::EigenScalar::From(p_norm_t); - auto eg_norm = framework::EigenScalar::From(g_norm_t); + framework::Tensor p_norm_t, g_norm_t; + p_norm_t.Resize({1}); + g_norm_t.Resize({1}); + p_norm_t.mutable_data(ctx.GetPlace()); + g_norm_t.mutable_data(ctx.GetPlace()); + auto ep_norm = framework::EigenScalar::From(p_norm_t); + auto eg_norm = framework::EigenScalar::From(g_norm_t); + ep_norm = p.square().sum().sqrt(); + eg_norm = g.square().sum().sqrt(); - ep_norm = p.square().sum().sqrt(); - eg_norm = g.square().sum().sqrt(); - T local_lr = lr[0]; - if (lars_weight_decay > 0 && ep_norm(0) > 0 && eg_norm(0) > 0) { - local_lr = lr[0] * lars_coeff * ep_norm(0) / - (eg_norm(0) + lars_weight_decay * ep_norm(0) + epsilon); + T local_lr = lr[0]; + if (lars_weight_decay > 0 && ep_norm(0) > 0 && eg_norm(0) > 0) { + local_lr = lr[0] * lars_coeff * ep_norm(0) / + (eg_norm(0) + lars_weight_decay * ep_norm(0) + epsilon); + } + v_out = v * mu + local_lr * (g + lars_weight_decay * p); + p_out = p - v_out; } - v_out = v * mu + local_lr * (g + lars_weight_decay * p); - p_out = p - v_out; } }; diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 24076e82b0365d..b81862adf5e656 100755 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -2066,7 +2066,7 @@ def _append_optimize_op(self, block, param_and_grad): attrs = { "mu": self._momentum, "lars_coeff": self._lars_coeff, - "lars_weight_decay": _lars_weight_decay, + "lars_weight_decay": [_lars_weight_decay], "multi_precision": find_master, "rescale_grad": self._rescale_grad } diff --git 
a/python/paddle/fluid/tests/unittests/test_fleet_lars_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_lars_meta_optimizer.py index e4cc3682d1a24f..bee6acf732460b 100755 --- a/python/paddle/fluid/tests/unittests/test_fleet_lars_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_lars_meta_optimizer.py @@ -103,7 +103,7 @@ def test_lars_exclude_fn(self): 'op_role_var')[0] or ".b" in op.attr('op_role_var')[0]) ] for op in ops_without_wd: - self.assertEqual(op.attr('lars_weight_decay'), 0) + self.assertEqual(op.attr('lars_weight_decay')[0], 0) def test_lars_apply_with_amp(self): role = role_maker.PaddleCloudRoleMaker(is_collective=True) diff --git a/python/paddle/fluid/tests/unittests/test_momentum_op.py b/python/paddle/fluid/tests/unittests/test_momentum_op.py index b42de853c00d54..34e057a5a8a612 100644 --- a/python/paddle/fluid/tests/unittests/test_momentum_op.py +++ b/python/paddle/fluid/tests/unittests/test_momentum_op.py @@ -138,50 +138,70 @@ def test_check_output(self): "core is not compiled with CUDA") class TestLarsMomentumOpWithMP(OpTest): def setUp(self): + self.config() self.op_type = "lars_momentum" - - master_param = np.random.random((123, 321)).astype("float32") - param = master_param.astype("float16") - grad = np.random.random((123, 321)).astype("float16") - velocity = np.zeros((123, 321)).astype("float32") - learning_rate = np.array([0.001]).astype("float32") mu = 0.0001 lars_coeff = 0.001 lars_weight_decay = 0.0005 rescale_grad = 1.0 + params = [] + grads = [] + velocitys = [] + learning_rates = [] + master_params = [] + param_outs = [] + velocity_outs = [] + master_param_outs = [] + for i in range(self.params_num): + master_param = np.random.random((123, 321)).astype("float32") + param = master_param.astype("float16") + grad = np.random.random((123, 321)).astype("float16") + velocity = np.zeros((123, 321)).astype("float32") + learning_rate = np.array([0.001]).astype("float32") + + fp32_grad = grad.astype("float32") + pnorm = np.sqrt(np.square(master_param).sum()) + gnorm = np.sqrt(np.square(fp32_grad).sum()) + local_lr = learning_rate * lars_coeff * pnorm / ( + gnorm + lars_weight_decay * pnorm) + fp32_grad = fp32_grad * rescale_grad + velocity_out = mu * velocity + local_lr * ( + fp32_grad + lars_weight_decay * master_param) + p_new = master_param - velocity_out + param_out = p_new.astype("float16") + master_param_out = p_new + + params.append(("SubParam_" + str(i), param)) + grads.append(("SubGrad_" + str(i), grad)) + velocitys.append(("SubVelocity_" + str(i), velocity)) + learning_rates.append(("SubLearning_rate_" + str(i), learning_rate)) + velocity_outs.append(("SubVelocity_out_" + str(i), velocity_out)) + param_outs.append(("SubParam_out_" + str(i), param_out)) + master_params.append(("SubMasterParam_" + str(i), master_param)) + master_param_outs.append( + ("SubMasterParamOut_" + str(i), master_param_out)) + self.inputs = { - 'Param': param, - 'Grad': grad, - 'Velocity': velocity, - 'LearningRate': learning_rate, - 'MasterParam': master_param, + 'Param': params, + 'Grad': grads, + 'Velocity': velocitys, + 'LearningRate': learning_rates, + 'MasterParam': master_params, } self.attrs = { 'mu': mu, 'lars_coeff': lars_coeff, - 'lars_weight_decay': lars_weight_decay, + 'lars_weight_decay': [lars_weight_decay], 'multi_precision': True, 'rescale_grad': rescale_grad } - fp32_grad = grad.astype("float32") - pnorm = np.sqrt(np.square(master_param).sum()) - gnorm = np.sqrt(np.square(fp32_grad).sum()) - local_lr = learning_rate * lars_coeff * 
pnorm / ( - gnorm + lars_weight_decay * pnorm) - fp32_grad = fp32_grad * rescale_grad - velocity_out = mu * velocity + local_lr * (fp32_grad + lars_weight_decay - * master_param) - p_new = master_param - velocity_out - param_out = p_new.astype("float16") - master_param_out = p_new - self.outputs = { - 'ParamOut': param_out, - 'VelocityOut': velocity_out, - 'MasterParamOut': master_param_out + 'ParamOut': param_outs, + 'VelocityOut': velocity_outs, + 'MasterParamOut': master_param_outs } def test_check_output(self): @@ -191,46 +211,65 @@ def test_check_output(self): if core.is_float16_supported(place): self.check_output_with_place(place) + def config(self): + self.params_num = 1 + class TestLarsMomentumOp(OpTest): def setUp(self): + self.config() self.op_type = "lars_momentum" - - param = np.random.random((123, 321)).astype("float32") - grad = np.random.random((123, 321)).astype("float32") - velocity = np.zeros((123, 321)).astype("float32") - learning_rate = np.array([0.001]).astype("float32") mu = 0.0001 lars_coeff = 0.001 lars_weight_decay = 0.0005 + params = [] + grads = [] + velocitys = [] + param_outs = [] + velocity_outs = [] + learning_rates = [] + for i in range(self.params_num): + param = np.random.random((123, 321)).astype("float32") + grad = np.random.random((123, 321)).astype("float32") + velocity = np.zeros((123, 321)).astype("float32") + learning_rate = np.array([0.001]).astype("float32") + pnorm = np.sqrt(np.square(param).sum()) + gnorm = np.sqrt(np.square(grad).sum()) + local_lr = learning_rate * lars_coeff * pnorm / ( + gnorm + lars_weight_decay * param) + velocity_out = mu * velocity + local_lr * (grad + lars_weight_decay + * param) + param_out = param - velocity_out + + params.append(("SubParam_" + str(i), param)) + grads.append(("SubGrad_" + str(i), grad)) + velocitys.append(("SubVelocity_" + str(i), velocity)) + learning_rates.append(("SubLearning_rate_" + str(i), learning_rate)) + velocity_outs.append(("SubVelocity_out_" + str(i), velocity_out)) + param_outs.append(("SubParam_out_" + str(i), param_out)) + self.inputs = { - 'Param': param, - 'Grad': grad, - 'Velocity': velocity, - 'LearningRate': learning_rate + 'Param': params, + 'Grad': grads, + 'Velocity': velocitys, + 'LearningRate': learning_rates } self.attrs = { 'mu': mu, 'lars_coeff': lars_coeff, - 'lars_weight_decay': lars_weight_decay + 'lars_weight_decay': [lars_weight_decay] } - - pnorm = np.sqrt(np.square(param).sum()) - gnorm = np.sqrt(np.square(grad).sum()) - local_lr = learning_rate * lars_coeff * pnorm / ( - gnorm + lars_weight_decay * param) - velocity_out = mu * velocity + local_lr * (grad + lars_weight_decay * - param) - param_out = param - velocity_out - - self.outputs = {'ParamOut': param_out, 'VelocityOut': velocity_out} + self.outputs = {'ParamOut': param_outs, 'VelocityOut': velocity_outs} def test_check_output(self): paddle.enable_static() self.check_output() + def config(self): + self.params_num = 1 + class TestSparseMomentumOp(unittest.TestCase): def setUp(self): From d7064f0435ce1c35c2b57bf6fcbef6b2597c5f4f Mon Sep 17 00:00:00 2001 From: yujun <50394665+JunnYu@users.noreply.github.com> Date: Wed, 13 Oct 2021 18:43:56 +0800 Subject: [PATCH 150/298] [PaddlePaddle hackathon] + ADD CELU (#36088) * update * update * update * try make CI pass * doc typo * update doc string --- paddle/fluid/operators/activation_op.cc | 74 ++++++++++++ paddle/fluid/operators/activation_op.cu | 66 +++++++++++ paddle/fluid/operators/activation_op.h | 111 ++++++++++++++++++ .../unittests/test_activation_nn_grad.py | 27 
+++++ .../tests/unittests/test_activation_op.py | 89 ++++++++++++++ .../tests/unittests/test_imperative_layers.py | 3 + python/paddle/nn/__init__.py | 2 + python/paddle/nn/functional/__init__.py | 2 + python/paddle/nn/functional/activation.py | 44 +++++++ python/paddle/nn/layer/__init__.py | 1 + python/paddle/nn/layer/activation.py | 42 +++++++ 11 files changed, 461 insertions(+) diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index ac98e49b1c205e..3cdcfd79235596 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -560,6 +560,28 @@ Applies the following element-wise computation on the input according to } }; +class CELUOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "The input is a multi-dimensional Tensor. The data type is " + "float32 or float64."); + AddOutput("Out", + "The output is a multi-dimensional Tensor which has same " + "dimension and data type as the ``x``."); + AddAttr("alpha", "The alpha value of CELU").SetDefault(1.0f); + AddComment(R"DOC( +CELU Activation Operator. + +Applies the following element-wise computation on the input according to +https://arxiv.org/abs/1704.07483. + +$$out = \max(0, x) + \min(0, \alpha * (e^(x/\alpha) - 1))$$ + +)DOC"); + } +}; + class Relu6OpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -982,6 +1004,29 @@ class ELUDoubleGradMaker : public ::paddle::framework::SingleGradOpMaker { } }; +// celu grad: dx=dy if y>0 else dy*(x/alpha).exp() +// celu gradgrad: ddx=ddy if y>0 else ddy*(x/alpha).exp()/alpha +template +class CELUDoubleGradMaker : public ::paddle::framework::SingleGradOpMaker { + public: + using ::paddle::framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType("celu_grad_grad"); + + op->SetInput("X", this->Input("X")); + op->SetInput("DOut", this->Input(framework::GradVarName("Out"))); + // X@GRAD@GRAD: ddx + op->SetInput("DDX", this->OutputGrad(framework::GradVarName("X"))); + op->SetAttrMap(this->Attrs()); + + // Out@GRAD@GRAD: ddy + op->SetOutput("DX", this->InputGrad("X")); + op->SetOutput("DDOut", this->InputGrad(framework::GradVarName("Out"))); + } +}; + // sqrt Grad: dx = 0.5 * dy / y // sqrt GradGrad: ddy = 0.5 * ddx / y, dy = -1 * dx * ddx template @@ -1353,6 +1398,35 @@ REGISTER_OP_CPU_KERNEL( /* ========================================================================== */ +/* ======================== celu register ============================ + */ +REGISTER_OPERATOR( + celu, ops::ActivationOp, ops::CELUOpMaker, ops::ActivationOpInferVarType, + ops::ActivationGradOpMaker::FwdDeps(), + paddle::framework::OpDesc>, + ops::ActivationGradOpMaker::FwdDeps(), + paddle::imperative::OpBase>, + ops::ActFwdInplaceInferer); +REGISTER_OPERATOR(celu_grad, ops::ActivationOpGrad, + ops::ActivationGradOpInplaceInferer, + ops::CELUDoubleGradMaker, + ops::CELUDoubleGradMaker); +REGISTER_OPERATOR( + celu_grad_grad, + ops::ActivationOpDoubleGrad::FwdDeps()>, + ops::ActivationDoubleGradOpInplaceInferer); + +REGISTER_ACTIVATION_CPU_KERNEL(celu, CELU, CELUFunctor, CELUGradFunctor); +REGISTER_OP_CPU_KERNEL( + celu_grad_grad, ops::CELUDoubleGradKernel>, + ops::CELUDoubleGradKernel>, + ops::CELUDoubleGradKernel>); + +/* ========================================================================== */ + /* =========================== sqrt register ============================= */ REGISTER_OPERATOR( sqrt, 
ops::ActivationOp, ops::SqrtOpMaker, ops::ActivationOpInferVarType, diff --git a/paddle/fluid/operators/activation_op.cu b/paddle/fluid/operators/activation_op.cu index f330f2d7e87ba7..d83a63015cfe5b 100644 --- a/paddle/fluid/operators/activation_op.cu +++ b/paddle/fluid/operators/activation_op.cu @@ -1202,6 +1202,59 @@ struct CudaELUGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } }; +template +struct CudaCELUFunctor : public BaseActivationFunctor { + using CT = typename details::MPTypeTrait::Type; + CT zero = static_cast(0.0f); + CT one = static_cast(1.0f); + float alpha; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + + // celu(x) = max(0, x) + min(0, alpha * (exp(x/alpha) - 1)) + __device__ __forceinline__ T operator()(const T& arg_x) const { + CT x = static_cast(arg_x); + CT temp = static_cast(alpha) * (exp(x / static_cast(alpha)) - one); + CT res = (x > zero ? x : zero) + (temp > zero ? zero : temp); + return static_cast(res); + } +}; + +template +struct CudaCELUGradFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + MPType zero = static_cast(0.0f); + MPType one = static_cast(1.0f); + float alpha; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + + // dx = dout, if alpha > 0 and x > 0 + // dx = dout * (x/alpha).exp(), if alpha > 0 and x <= 0 + // dx = dout , if alpha < 0 and x > 0 + // dx = dout * (x/alpha).exp(), if alpha < 0 and x <=0 + __device__ __forceinline__ T operator()(const T& arg_dout, + const T& arg_x) const { + MPType dout = static_cast(arg_dout); + MPType x = static_cast(arg_x); + MPType a = static_cast(alpha); + MPType temp_a_pos = static_cast(alpha > 0.0f); + MPType temp_a_neg = static_cast(alpha <= 0.0f); + MPType temp_x_pos = static_cast(x > zero); + MPType temp_x_neg = static_cast(x <= zero); + return static_cast( + dout * + (temp_a_pos * temp_x_pos + temp_a_pos * temp_x_neg * exp(x / a) + + temp_a_neg * temp_x_pos + exp(x / a) * temp_a_neg * temp_x_neg)); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + template class ActivationCudaKernel : public framework::OpKernel { @@ -1341,6 +1394,19 @@ REGISTER_OP_CUDA_KERNEL( ops::ELUGradGradFunctor>); /* ========================================================================== */ +/* ======================== celu register ============================ */ +REGISTER_ACTIVATION_CUDA_KERNEL(celu, CELU, CudaCELUFunctor, + CudaCELUGradFunctor); + +REGISTER_OP_CUDA_KERNEL( + celu_grad_grad, ops::CELUDoubleGradKernel>, + ops::CELUDoubleGradKernel>, + ops::CELUDoubleGradKernel>); +/* ========================================================================== */ + /* =========================== relu register ============================ */ #ifdef PADDLE_WITH_HIP REGISTER_ACTIVATION_CUDA_KERNEL(relu, Relu, CudaReluFunctor, diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h index 4f26cb095c5a72..a6240c038b1100 100644 --- a/paddle/fluid/operators/activation_op.h +++ b/paddle/fluid/operators/activation_op.h @@ -1389,6 +1389,51 @@ struct ELUGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } }; +template +struct CELUFunctor : public BaseActivationFunctor { + float alpha; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + + template + void operator()(Device d, X x, Out out) const { + 
out.device(d) = + (x < static_cast(0)) + .select(static_cast(alpha) * + ((x / static_cast(alpha)).exp() - static_cast(1)), + x); + } +}; + +template +struct CELUGradFunctor : public BaseActivationFunctor { + float alpha; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + auto temp_a_pos = static_cast(alpha > 0); + auto temp_a_neg = static_cast(alpha <= 0); + auto temp_x_pos = (x > static_cast(0)).template cast(); + auto temp_x_neg = (x <= static_cast(0)).template cast(); + + // dx = dout, if alpha > 0 and x > 0 + // dx = dout * (x/alpha).exp(), if alpha > 0 and x <= 0 + // dx = dout , if alpha < 0 and x > 0 + // dx = dout * (x/alpha).exp(), if alpha < 0 and x <=0 + dx.device(d) = + dout * temp_a_pos * temp_x_pos + + dout * (x / static_cast(alpha)).exp() * temp_a_pos * temp_x_neg + + dout * temp_a_neg * temp_x_pos + + dout * (x / static_cast(alpha)).exp() * temp_a_neg * temp_x_neg; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + // FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5198 template struct PowFunctor : public BaseActivationFunctor { @@ -1775,6 +1820,45 @@ struct ELUGradGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } }; +template +struct CELUGradGradFunctor : public BaseActivationFunctor { + float alpha; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + template + void operator()(const Device& dev, const framework::Tensor* X, + const framework::Tensor* ddX, framework::Tensor* ddOut, + const framework::Tensor* dOut, framework::Tensor* dX) const { + auto* d = dev.eigen_device(); + auto ddx = framework::EigenVector::Flatten( + GET_DATA_SAFELY(ddX, "Input", "DDX", "CELUGradGrad")); + auto x = framework::EigenVector::Flatten( + GET_DATA_SAFELY(X, "Input", "X", "CELUGradGrad")); + + if (dX) { + auto dx = framework::EigenVector::Flatten( + GET_DATA_SAFELY(dX, "Output", "DX", "CELUGradGrad")); + auto dout = framework::EigenVector::Flatten( + GET_DATA_SAFELY(dOut, "Output", "DOut", "CELUGradGrad")); + dx.device(*d) = ddx * dout / static_cast(alpha) * + (x / static_cast(alpha)).exp() * + (x <= static_cast(0)).template cast(); + } + + if (ddOut) { + auto ddout = framework::EigenVector::Flatten( + GET_DATA_SAFELY(ddOut, "Output", "DDOut", "CELUGradGrad")); + ddout.device(*d) = ddx * + ((x > static_cast(0)).template cast() + + (x / static_cast(alpha)).exp() * + (x <= static_cast(0)).template cast()) + .template cast(); + } + } + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + template struct SqrtGradGradFunctor : public BaseActivationFunctor { template @@ -2107,6 +2191,33 @@ class ELUDoubleGradKernel } }; +template +class CELUDoubleGradKernel + : public framework::OpKernel { + public: + using T = typename Functor::ELEMENT_TYPE; + void Compute(const framework::ExecutionContext& ctx) const override { + const framework::Tensor *X, *ddX, *dOut; + X = ddX = dOut = nullptr; + framework::Tensor *dX, *ddOut; + dX = ddOut = nullptr; + + ExtractDoubleGradTensorWithInputDOut(ctx, &X, &ddX, &dX, &dOut, &ddOut); + + if (dX) dX->mutable_data(X->dims(), ctx.GetPlace()); + if (ddOut) ddOut->mutable_data(ctx.GetPlace()); + + auto& place = ctx.template device_context(); + + Functor functor; + auto attrs = functor.GetAttrs(); + for (auto& attr : attrs) { + *attr.second = ctx.Attr(attr.first); + } + functor(place, X, ddX, 
ddOut, dOut, dX); + } +}; + template class SqrtDoubleGradKernel : public framework::OpKernel { diff --git a/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py b/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py index 8f3353d1155f6f..c54f711c7ce129 100644 --- a/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py +++ b/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py @@ -22,6 +22,7 @@ import paddle.fluid.layers as layers import paddle.fluid.core as core import gradient_checker +import paddle.nn.functional as F from decorator_helper import prog_scope @@ -168,6 +169,32 @@ def test_grad(self): self.func(p) +class TestCELUDoubleGradCheck(unittest.TestCase): + @prog_scope() + def func(self, place): + shape = [2, 4, 4, 4] + eps = 1e-6 + alpha = 0.2 + dtype = np.float64 + SEED = 0 + + x = layers.data('x', shape, False, dtype) + x.persistable = True + + y = F.celu(x, alpha=alpha) + np.random.RandomState(SEED) + x_arr = np.random.uniform(-1, 1, shape).astype(dtype) + gradient_checker.double_grad_check( + [x], y, x_init=x_arr, place=place, eps=eps) + + def test_grad(self): + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + for p in places: + self.func(p) + + class TestSqrtDoubleGradCheck(unittest.TestCase): @prog_scope() def func(self, place): diff --git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py index 346accac01cc70..b82dd631c64890 100755 --- a/python/paddle/fluid/tests/unittests/test_activation_op.py +++ b/python/paddle/fluid/tests/unittests/test_activation_op.py @@ -1827,6 +1827,94 @@ def test_errors(self): self.elu(x_fp16) +def celu(x, alpha): + out_ref = np.maximum(0, x) + np.minimum(0, alpha * (np.exp(x / alpha) - 1)) + return out_ref.astype(x.dtype) + + +class TestCELU(TestActivation): + def setUp(self): + self.op_type = "celu" + self.init_dtype() + + np.random.seed(1024) + x = np.random.uniform(-3, 3, [10, 12]).astype(self.dtype) + alpha = 1.5 + out = celu(x, alpha) + self.inputs = {'X': x} + self.attrs = {'alpha': alpha} + self.outputs = {'Out': out} + + def test_check_grad(self): + if self.dtype == np.float16: + return + self.check_grad(['X'], 'Out') + + +class TestCELUAPI(unittest.TestCase): + # test paddle.nn.CELU, paddle.nn.functional.celu + def setUp(self): + np.random.seed(1024) + self.x_np = np.random.uniform(-3, 3, [10, 12]).astype('float32') + self.place=paddle.CUDAPlace(0) if paddle.is_compiled_with_cuda() \ + else paddle.CPUPlace() + self.executed_api() + + def executed_api(self): + self.celu = F.celu + + def test_static_api(self): + paddle.enable_static() + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.fluid.data('X', [10, 12]) + out1 = self.celu(x, 1.5) + m = paddle.nn.CELU(1.5) + out2 = m(x) + exe = paddle.static.Executor(self.place) + res = exe.run(feed={'X': self.x_np}, fetch_list=[out1, out2]) + out_ref = celu(self.x_np, 1.5) + for r in res: + self.assertEqual(np.allclose(out_ref, r), True) + + def test_dygraph_api(self): + paddle.disable_static(self.place) + x = paddle.to_tensor(self.x_np) + out1 = self.celu(x, 1.5) + x = paddle.to_tensor(self.x_np) + m = paddle.nn.CELU(1.5) + out2 = m(x) + out_ref = celu(self.x_np, 1.5) + for r in [out1, out2]: + self.assertEqual(np.allclose(out_ref, r.numpy()), True) + + out1 = self.celu(x, 0.2) + x = paddle.to_tensor(self.x_np) + m = paddle.nn.CELU(0.2) + out2 = m(x) + out_ref = celu(self.x_np, 0.2) + for r in [out1, out2]: + 
self.assertEqual(np.allclose(out_ref, r.numpy()), True) + paddle.enable_static() + + def test_errors(self): + paddle.enable_static() + with paddle.static.program_guard(paddle.static.Program()): + # The input type must be Variable. + self.assertRaises(TypeError, self.celu, 1) + # The input dtype must be float16, float32, float64. + x_int32 = paddle.fluid.data( + name='x_int32', shape=[10, 12], dtype='int32') + self.assertRaises(TypeError, self.celu, x_int32) + # The alpha must be not equal 0 + x_fp32 = paddle.fluid.data( + name='x_fp32', shape=[10, 12], dtype='float32') + self.assertRaises(ZeroDivisionError, F.celu, x_fp32, 0) + # support the input dtype is float16 + x_fp16 = paddle.fluid.data( + name='x_fp16', shape=[10, 12], dtype='float16') + self.celu(x_fp16) + + class TestELUInplaceAPI(TestELUAPI): # test paddle.nn.functional.elu_ def executed_api(self): @@ -2791,6 +2879,7 @@ def test_check_grad(self): create_test_act_fp16_class(TestRelu6) create_test_act_fp16_class(TestSoftRelu, grad_atol=0.85) create_test_act_fp16_class(TestELU) +create_test_act_fp16_class(TestCELU) create_test_act_fp16_class(TestReciprocal) create_test_act_fp16_class(TestLog) if core.is_compiled_with_rocm(): diff --git a/python/paddle/fluid/tests/unittests/test_imperative_layers.py b/python/paddle/fluid/tests/unittests/test_imperative_layers.py index dc15566f85475c..3561405ae090bd 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_layers.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_layers.py @@ -22,6 +22,9 @@ def test_layer_str(self): module = nn.ELU(0.2) self.assertEqual(str(module), 'ELU(alpha=0.2)') + module = nn.CELU(0.2) + self.assertEqual(str(module), 'CELU(alpha=0.2)') + module = nn.GELU(True) self.assertEqual(str(module), 'GELU(approximate=True)') diff --git a/python/paddle/nn/__init__.py b/python/paddle/nn/__init__.py index 98444e69d0b1b3..064052c07695de 100644 --- a/python/paddle/nn/__init__.py +++ b/python/paddle/nn/__init__.py @@ -25,6 +25,7 @@ from .clip import ClipGradByValue # noqa: F401 from .decode import BeamSearchDecoder # noqa: F401 from .decode import dynamic_decode # noqa: F401 +from .layer.activation import CELU # noqa: F401 from .layer.activation import ELU # noqa: F401 from .layer.activation import GELU # noqa: F401 from .layer.activation import Tanh # noqa: F401 @@ -185,6 +186,7 @@ def weight_norm(*args): __all__ = [ #noqa 'BatchNorm', + 'CELU', 'GroupNorm', 'LayerNorm', 'SpectralNorm', diff --git a/python/paddle/nn/functional/__init__.py b/python/paddle/nn/functional/__init__.py index 4151f25b94aff2..1af53e0826be87 100644 --- a/python/paddle/nn/functional/__init__.py +++ b/python/paddle/nn/functional/__init__.py @@ -15,6 +15,7 @@ # TODO: import all neural network related api under this directory, # including layers, linear, conv, rnn etc. +from .activation import celu # noqa: F401 from .activation import elu # noqa: F401 from .activation import elu_ # noqa: F401 from .activation import gelu # noqa: F401 @@ -115,6 +116,7 @@ from .sparse_attention import sparse_attention __all__ = [ #noqa + 'celu', 'conv1d', 'conv1d_transpose', 'conv2d', diff --git a/python/paddle/nn/functional/activation.py b/python/paddle/nn/functional/activation.py index 67be64c01cbb8f..a39c00075a3de1 100644 --- a/python/paddle/nn/functional/activation.py +++ b/python/paddle/nn/functional/activation.py @@ -31,6 +31,50 @@ __all__ = [] +def celu(x, alpha=1.0, name=None): + r""" + celu activation. + + .. 
math:: + + celu(x) = max(0, x) + min(0, \alpha * (e^{x/\alpha}-1)) + + Parameters: + x (Tensor): The input Tensor with data type float32, float64. + alpha (float, optional): The 'alpha' value of the CELU formulation. Default is 1.0. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + A Tensor with the same data type and shape as ``x`` . + + Examples: + .. code-block:: python + + import paddle + import paddle.nn.functional as F + x = paddle.to_tensor([[-1., 6.], [1., 15.6]]) + out = F.celu(x, alpha=0.2) + # [[-0.19865242, 6. ], + # [ 1. , 15.60000038]] + """ + if alpha == 0: + raise ZeroDivisionError("alpha cannot be 0 for celu") + + if in_dygraph_mode(): + return _C_ops.celu(x, 'alpha', alpha) + + check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'celu') + helper = LayerHelper("celu", **locals()) + out = helper.create_variable_for_type_inference(x.dtype) + helper.append_op( + type='celu', + inputs={'X': x}, + outputs={'Out': out}, + attrs={'alpha': alpha}) + return out + + def elu(x, alpha=1.0, name=None): r""" elu activation. diff --git a/python/paddle/nn/layer/__init__.py b/python/paddle/nn/layer/__init__.py index 074dfac5108f96..eb7535b16c6e1e 100644 --- a/python/paddle/nn/layer/__init__.py +++ b/python/paddle/nn/layer/__init__.py @@ -18,6 +18,7 @@ from . import transformer # noqa: F401 from . import container # noqa: F401 +from .activation import CELU # noqa: F401 from .activation import PReLU # noqa: F401 from .activation import ReLU # noqa: F401 from .activation import ReLU6 # noqa: F401 diff --git a/python/paddle/nn/layer/activation.py b/python/paddle/nn/layer/activation.py index abfeff0641a472..cf0ac79ca8ff6f 100644 --- a/python/paddle/nn/layer/activation.py +++ b/python/paddle/nn/layer/activation.py @@ -25,6 +25,48 @@ __all__ = [] +class CELU(Layer): + r""" + CELU Activation. + + .. math:: + + CELU(x) = max(0, x) + min(0, \alpha * (e^{x/\alpha}-1)) + + Parameters: + alpha (float, optional): The 'alpha' value of the CELU formulation. Default is 1.0. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Shape: + - input: Tensor with any shape. + - output: Tensor with the same shape as input. + + Examples: + .. code-block:: python + + import paddle + + x = paddle.to_tensor([[-1. ,6.], [1., 15.6]]) + m = paddle.nn.CELU(0.2) + out = m(x) + # [[-0.19865242, 6. ], + # [ 1. , 15.60000038]] + """ + + def __init__(self, alpha=1.0, name=None): + super(CELU, self).__init__() + self._alpha = alpha + self._name = name + + def forward(self, x): + return F.celu(x, self._alpha, self._name) + + def extra_repr(self): + name_str = ', name={}'.format(self._name) if self._name else '' + return 'alpha={}{}'.format(self._alpha, name_str) + + class ELU(Layer): r""" ELU Activation. 
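Note: the CELU operator added above computes celu(x) = max(0, x) + min(0, alpha * (exp(x/alpha) - 1)), exposed as paddle.nn.functional.celu and paddle.nn.CELU, with the backward rule dx = dout for x > 0 and dx = dout * exp(x/alpha) for x <= 0. A minimal NumPy sketch of those two rules (an illustration only, not part of the patch; it assumes nothing beyond NumPy):

    import numpy as np

    def celu_ref(x, alpha=1.0):
        # Forward rule from CELUFunctor: max(0, x) + min(0, alpha * (exp(x/alpha) - 1))
        if alpha == 0:
            raise ZeroDivisionError("alpha cannot be 0 for celu")
        return np.maximum(0.0, x) + np.minimum(0.0, alpha * (np.exp(x / alpha) - 1.0))

    def celu_grad_ref(x, dout, alpha=1.0):
        # Backward rule from CELUGradFunctor: pass dout through where x > 0,
        # scale it by exp(x/alpha) where x <= 0.
        return np.where(x > 0, dout, dout * np.exp(x / alpha))

    x = np.array([[-1.0, 6.0], [1.0, 15.6]], dtype=np.float32)
    print(celu_ref(x, alpha=0.2))                        # ~[[-0.19865242, 6.], [1., 15.6]]
    print(celu_grad_ref(x, np.ones_like(x), alpha=0.2))

The printed forward values match the docstring example above, which is also the reference that TestCELUAPI compares against with np.allclose.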
From 8fd1b6ad5590af047127cecc442b16edbd4783e4 Mon Sep 17 00:00:00 2001 From: Guoxia Wang Date: Wed, 13 Oct 2021 19:52:37 +0800 Subject: [PATCH 151/298] fix BatchNorm for fp16 (#36376) * fix BatchNorm for fp16 --- python/paddle/nn/layer/norm.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/python/paddle/nn/layer/norm.py b/python/paddle/nn/layer/norm.py index 147e7fca3ff19d..b0e0fe323437d0 100644 --- a/python/paddle/nn/layer/norm.py +++ b/python/paddle/nn/layer/norm.py @@ -564,19 +564,25 @@ def __init__(self, self._use_global_stats = use_global_stats if get_default_dtype() == 'float16': - set_default_dtype('float32') + self._dtype = 'float32' + else: + self._dtype = get_default_dtype() param_shape = [num_features] # create parameter if weight_attr == False: self.weight = self.create_parameter( - attr=None, shape=param_shape, default_initializer=Constant(1.0)) + attr=None, + shape=param_shape, + dtype=self._dtype, + default_initializer=Constant(1.0)) self.weight.stop_gradient = True else: self.weight = self.create_parameter( attr=self._weight_attr, shape=param_shape, + dtype=self._dtype, default_initializer=Constant(1.0)) self.weight.stop_gradient = self._weight_attr != None and self._weight_attr.learning_rate == 0. @@ -584,12 +590,16 @@ def __init__(self, self.bias = self.create_parameter( attr=None, shape=param_shape, + dtype=self._dtype, default_initializer=Constant(0.0), is_bias=True) self.bias.stop_gradient = True else: self.bias = self.create_parameter( - attr=self._bias_attr, shape=param_shape, is_bias=True) + attr=self._bias_attr, + shape=param_shape, + dtype=self._dtype, + is_bias=True) self.bias.stop_gradient = self._bias_attr != None and self._bias_attr.learning_rate == 0. moving_mean_name = None @@ -600,6 +610,7 @@ def __init__(self, moving_variance_name = name + "_variance" self._mean = self.create_parameter( + dtype=self._dtype, attr=ParamAttr( name=moving_mean_name, initializer=Constant(0.0), @@ -609,6 +620,7 @@ def __init__(self, self._mean.stop_gradient = True self._variance = self.create_parameter( + dtype=self._dtype, attr=ParamAttr( name=moving_variance_name, initializer=Constant(1.0), From 7f5128f4cbdeb8a2a0a9e3705a7f578cf1c08d5c Mon Sep 17 00:00:00 2001 From: Pei Yang Date: Thu, 14 Oct 2021 09:43:57 +0800 Subject: [PATCH 152/298] clean inference logs when config.DisableGlogInfo is triggered (#36356) --- paddle/fluid/framework/ir/fc_gru_fuse_pass.cc | 6 +++--- paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc | 6 +++--- paddle/fluid/framework/ir/layer_norm_fuse_pass.cc | 5 +++-- .../ir/mkldnn/batch_norm_act_fuse_pass.cc | 5 +++-- .../ir/mkldnn/fc_act_mkldnn_fuse_pass.cc | 9 +++++---- .../mkldnn/matmul_transpose_reshape_fuse_pass.cc | 10 ++++++---- .../framework/ir/mkldnn/multi_gru_fuse_pass.cc | 6 +++--- .../ir/mkldnn/multi_gru_seq_fuse_pass.cc | 6 +++--- .../reshape_transpose_matmul_mkldnn_fuse_pass.cc | 15 ++++++++------- .../framework/ir/mkldnn/scale_matmul_fuse_pass.cc | 5 +++-- .../fluid/inference/analysis/ir_pass_manager.cc | 2 ++ 11 files changed, 42 insertions(+), 33 deletions(-) diff --git a/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc b/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc index 9a43edf40ef443..52e88c6408b0e8 100644 --- a/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc +++ b/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc @@ -335,9 +335,9 @@ void FCGRUFusePass::ApplyImpl(ir::Graph* graph) const { graph, name_scope_, param_scope(), true /*with_fc_bias*/); AddStatis(fusion_count); - - string::PrettyLogDetail("--- fused %d 
pairs of fc gru patterns", - fusion_count); + if (!Has("disable_logs") || !Get("disable_logs")) + string::PrettyLogDetail("--- fused %d pairs of fc gru patterns", + fusion_count); } } // namespace ir diff --git a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc index 2e6ce1a0f73818..d72b626fc1ebcf 100644 --- a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc @@ -349,9 +349,9 @@ void FCLstmFusePass::ApplyImpl(ir::Graph* graph) const { BuildFusion(graph, name_scope_, param_scope(), true /*with_fc_bias*/); AddStatis(fusion_count); - - string::PrettyLogDetail("--- fused %d pairs of fc lstm patterns", - fusion_count); + if (!Has("disable_logs") || !Get("disable_logs")) + string::PrettyLogDetail("--- fused %d pairs of fc lstm patterns", + fusion_count); } } // namespace ir diff --git a/paddle/fluid/framework/ir/layer_norm_fuse_pass.cc b/paddle/fluid/framework/ir/layer_norm_fuse_pass.cc index 95d55834f823bf..86191587e18495 100644 --- a/paddle/fluid/framework/ir/layer_norm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/layer_norm_fuse_pass.cc @@ -351,8 +351,9 @@ void LayerNormFusePass::ApplyImpl(Graph* graph) const { gpd(graph, handler); AddStatis(found_layer_norm_count); - PrettyLogDetail("--- Fused %d subgraphs into layer_norm op.", - found_layer_norm_count); + if (!Has("disable_logs") || !Get("disable_logs")) + PrettyLogDetail("--- Fused %d subgraphs into layer_norm op.", + found_layer_norm_count); } } // namespace ir diff --git a/paddle/fluid/framework/ir/mkldnn/batch_norm_act_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/batch_norm_act_fuse_pass.cc index 3fdb87f2544036..c5bb4bf0b2fc97 100644 --- a/paddle/fluid/framework/ir/mkldnn/batch_norm_act_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/batch_norm_act_fuse_pass.cc @@ -150,8 +150,9 @@ void FuseBatchNormActOneDNNPass::FuseBatchNormAct( gpd(graph, handler); AddStatis(found_bn_act_count); - PrettyLogDetail("--- fused %d batch norm with relu activation", - found_bn_act_count); + if (!Has("disable_logs") || !Get("disable_logs")) + PrettyLogDetail("--- fused %d batch norm with relu activation", + found_bn_act_count); } } // namespace ir diff --git a/paddle/fluid/framework/ir/mkldnn/fc_act_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/fc_act_mkldnn_fuse_pass.cc index 85d308c7eb30db..093fd5ec538db1 100644 --- a/paddle/fluid/framework/ir/mkldnn/fc_act_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/fc_act_mkldnn_fuse_pass.cc @@ -68,9 +68,9 @@ void FuseFCActOneDNNPass::FuseFCAct(Graph *graph, bool approximate = BOOST_GET_CONST(bool, act_op->GetAttr("approximate")); std::string type = approximate ? 
"_tanh" : "_erf"; fc_op->SetAttr("activation_type", act_type + type); - } else + } else { fc_op->SetAttr("activation_type", act_type); - + } fc_op->SetAttr("use_mkldnn", true); fc_op->SetOutput("Out", {act_out->Name()}); @@ -82,8 +82,9 @@ void FuseFCActOneDNNPass::FuseFCAct(Graph *graph, gpd(graph, handler); AddStatis(found_fc_act_count); - PrettyLogDetail("--- fused %d fc with %s activation", found_fc_act_count, - act_type); + if (!Has("disable_logs") || !Get("disable_logs")) + PrettyLogDetail("--- fused %d fc with %s activation", found_fc_act_count, + act_type); } } // namespace ir diff --git a/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass.cc index e5bdb08fe4ab48..a61099b4986747 100644 --- a/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass.cc @@ -149,10 +149,12 @@ void MatmulTransposeReshapeMKLDNNPass::ApplyImpl(ir::Graph *graph) const { gpd(graph, handler); AddStatis(found_matmul_transpose_reshape_count); - std::stringstream msg_ss; - msg_ss << "--- Fused " << found_matmul_transpose_reshape_count - << " MatmulTransposeReshape patterns"; - paddle::string::PrettyLogDetail(msg_ss.str().c_str()); + if (!Has("disable_logs") || !Get("disable_logs")) { + std::stringstream msg_ss; + msg_ss << "--- Fused " << found_matmul_transpose_reshape_count + << " MatmulTransposeReshape patterns"; + paddle::string::PrettyLogDetail(msg_ss.str().c_str()); + } } } // namespace ir } // namespace framework diff --git a/paddle/fluid/framework/ir/mkldnn/multi_gru_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/multi_gru_fuse_pass.cc index 43c9849d5bbe3b..76a0c883c89233 100644 --- a/paddle/fluid/framework/ir/mkldnn/multi_gru_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/multi_gru_fuse_pass.cc @@ -111,9 +111,9 @@ void MultiGRUFusePass::ApplyImpl(ir::Graph* graph) const { }; gpd(graph, handler); AddStatis(fused_count); - - PrettyLogDetail("--- fused %d pairs of concatenated multi_gru ops", - fused_count); + if (!Has("disable_logs") || !Get("disable_logs")) + PrettyLogDetail("--- fused %d pairs of concatenated multi_gru ops", + fused_count); } } // namespace ir diff --git a/paddle/fluid/framework/ir/mkldnn/multi_gru_seq_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/multi_gru_seq_fuse_pass.cc index 17770d26d7de9d..7821501cc4b23c 100644 --- a/paddle/fluid/framework/ir/mkldnn/multi_gru_seq_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/multi_gru_seq_fuse_pass.cc @@ -126,9 +126,9 @@ void MultiGruSeqFusePass::ApplyImpl(ir::Graph* graph) const { }; gpd(graph, handler); AddStatis(fused_count); - - PrettyLogDetail("--- fused %d sequences of two multi_gru ops", - fused_count); + if (!Has("disable_logs") || !Get("disable_logs")) + PrettyLogDetail("--- fused %d sequences of two multi_gru ops", + fused_count); } } // namespace ir diff --git a/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.cc index 26692849d977b5..e408440f26f1c2 100644 --- a/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.cc @@ -148,13 +148,14 @@ void ReshapeTransposeMatmulMkldnnFusePass::Fuse( gpd(graph, handler); AddStatis(found_reshape_transpose_matmul_count); - - std::stringstream msg_ss; - msg_ss << "--- Fused " << 
found_reshape_transpose_matmul_count - << " ReshapeTransposeMatmulMkldnn patterns"; - if (with_reshape_xshape) msg_ss << " with reshape's xshape"; - if (with_transpose_xshape) msg_ss << " with transpose's xshape"; - string::PrettyLogDetail(msg_ss.str().c_str()); + if (!Has("disable_logs") || !Get("disable_logs")) { + std::stringstream msg_ss; + msg_ss << "--- Fused " << found_reshape_transpose_matmul_count + << " ReshapeTransposeMatmulMkldnn patterns"; + if (with_reshape_xshape) msg_ss << " with reshape's xshape"; + if (with_transpose_xshape) msg_ss << " with transpose's xshape"; + string::PrettyLogDetail(msg_ss.str().c_str()); + } } void ReshapeTransposeMatmulMkldnnFusePass::ApplyImpl(ir::Graph *graph) const { diff --git a/paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass.cc index 13f1fa50d080a3..0fc458723ffe43 100644 --- a/paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass.cc @@ -129,8 +129,9 @@ void ScaleMatmulFusePass::ApplyImpl(ir::Graph* graph) const { }; gpd(graph, handler); AddStatis(found_scale_matmul_fuse_count); - PrettyLogDetail("--- fused %d scale with matmul", - found_scale_matmul_fuse_count); + if (!Has("disable_logs") || !Get("disable_logs")) + PrettyLogDetail("--- fused %d scale with matmul", + found_scale_matmul_fuse_count); } } // namespace ir diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index 4fdd963b6abff9..d2ea6450fc011e 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -237,6 +237,8 @@ void IRPassManager::CreatePasses(Argument *argument, pass->Set("use_fc_padding", new bool(use_fc_padding)); } + pass->Set("disable_logs", new bool(disable_logs_)); + pre_pass = pass_name; passes_.emplace_back(std::move(pass)); From b857d755743b503e84a66c66b6cf8de5a70bec3e Mon Sep 17 00:00:00 2001 From: JingZhuangzhuang <75348594+JZZ-NOTE@users.noreply.github.com> Date: Wed, 13 Oct 2021 21:09:05 -0500 Subject: [PATCH 153/298] Sparsity support (#36413) * add pool2d convert test * modify error * modify error * modify error * modify error * modify error * modify error * sparsity support --- paddle/fluid/inference/tensorrt/engine.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc index 517af24f4d8a96..d075656d15747c 100644 --- a/paddle/fluid/inference/tensorrt/engine.cc +++ b/paddle/fluid/inference/tensorrt/engine.cc @@ -225,6 +225,7 @@ void TensorRTEngine::FreezeNetwork() { infer_engine_.reset(infer_builder_->buildEngineWithConfig( *network(), *infer_builder_config_)); #else + infer_builder_config_->setFlag(nvinfer1::BuilderFlag::kSPARSE_WEIGHTS); infer_ptr plan(infer_builder_->buildSerializedNetwork( *network(), *infer_builder_config_)); infer_ptr runtime(createInferRuntime(&logger_)); From 03d8304f260fcda9f73236080acab4e0a1f405ee Mon Sep 17 00:00:00 2001 From: Yuang Liu Date: Thu, 14 Oct 2021 10:33:36 +0800 Subject: [PATCH 154/298] [hybrid enhance] add flag to control the avg position for grad merge under pipeline mode (#36384) --- .../framework/distributed_strategy.proto | 4 + .../meta_optimizers/sharding_optimizer.py | 62 +++++- python/paddle/fluid/optimizer.py | 4 +- .../test_fleet_sharding_meta_optimizer.py | 195 ++++++++++++++++++ 4 files changed, 263 insertions(+), 2 deletions(-) diff --git 
a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto index e7a25de96a9471..28eebeb4d9bdc2 100644 --- a/paddle/fluid/framework/distributed_strategy.proto +++ b/paddle/fluid/framework/distributed_strategy.proto @@ -133,6 +133,10 @@ message GradientScaleConfig { // Else if sum, the gradient will accumulated among multiple // devices. optional string scale_strategy = 1 [ default = 'avg' ]; + // The avg_loss flag is used to determine the position of average + // If scale_gradient is False, it will avg the loss@Grad before grad merge. + // Otherwise, it will do grad merge firstly, then avg the grad after merging. + optional bool scale_gradient = 2 [ default = false ]; } message AsyncConfig { diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py index 18211459a4e083..8b75c57fab4074 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py @@ -18,7 +18,7 @@ from paddle.static import default_startup_program, device_guard from paddle.fluid import layers -from .common import OpRole, OP_ROLE_VAR_KEY, CollectiveHelper +from .common import OpRole, OP_ROLE_VAR_KEY, CollectiveHelper, OP_ROLE_KEY from .common import is_backward_op, is_optimizer_op, is_update_op from .meta_optimizer_base import MetaOptimizerBase from .sharding.shard import Shard, ProgramSegment @@ -193,6 +193,14 @@ def _get_hybrid_dp_mode(self): else: gm_mode = "pp_gm" gm_acc_step = strategy.pipeline_configs['accumulate_steps'] + gradient_scale_configs = strategy.gradient_scale_configs + assert gradient_scale_configs['scale_strategy'] == 'avg', \ + 'For pipeline mode, the ' 'gradient scale mode should ' \ + 'be "avg", but got {}'.format(gradient_scale_configs['scale_strategy']) + # Note (Yuang Liu): this avg_loss flag determines where to do the average op for grad merge. + # If True, will do sum firstly for gradient merge, then do scale by gm_acc_step. + # If False, will scale loss by gm_acc_step first, then do sum for gradient merge. + self.scale_gradient = gradient_scale_configs['scale_gradient'] if gm_acc_step > 1: logger.info("Gradient merge in [{}], acc step = [{}]".format( gm_mode, gm_acc_step)) @@ -241,6 +249,7 @@ def _inner_opt_minimize(self, loss, startup_program, parameter_list, 'global_ring_id': 3, 'mp_degree': self.mp_degree, 'mp_rank': global_rank % self.mp_degree, + 'scale_gradient': self.scale_gradient } main_program = loss.block.program main_program._pipeline_opt = pipeline_opt @@ -362,6 +371,8 @@ def _insert_allreduce_for_pp(self, params_grads): main_block, strategy=strategy, shard=shard) len_of_ops = len(main_block.ops) + if self.scale_gradient: + self._avg_grad_merge_after_sum(main_block, accumulated_grad_names) first_optimize_op_index = get_first_optimize_op_idx(main_block) if self.pp_allreduce_in_optimize: @@ -429,6 +440,55 @@ def _insert_allreduce_for_pp(self, params_grads): # FIXME(wangxi): if fp16_allreduce, put cast fp16->fp32 to there? + def _avg_grad_merge_after_sum(self, main_block, accumulated_grad_names): + if self.user_defined_strategy.amp and \ + self.user_defined_strategy.amp_configs['use_dynamic_loss_scaling']: + # For AMP, if using dynamic loss scaling the avg + # operation can be simple done by modify the LossScaling op. 
+ for idx, op in enumerate(main_block.ops): + if op.type == 'check_finite_and_unscale': + loss_scale_name = op.input('Scale')[0] + loss_scaling_var = main_block.var(loss_scale_name) + loss_scale_tmp_var_name = loss_scale_name + '@TMP' + loss_scale_tmp_var = main_block.create_var( + name=loss_scale_tmp_var_name, + shape=loss_scaling_var.shape, + dtype=loss_scaling_var.dtype) + main_block._insert_op_without_sync( + idx, + type='scale', + inputs={'X': loss_scaling_var}, + outputs={'Out': loss_scale_tmp_var}, + attrs={ + 'scale': self._gradient_merge_acc_step, + 'bias': 0.0, + 'bias_after_scale': False, + OP_ROLE_KEY: OpRole.Optimize + }) + op._rename_input(loss_scale_name, loss_scale_tmp_var_name) + break + else: + # For pp, do the avg operation for gradient merge after merging + # the gradient to meet the logic for gradient merge under pure dp. + tmp_first_opt_idx = None + for idx, op in enumerate(main_block.ops): + if is_optimizer_op(op) and op.type != 'c_sync_comm_stream': + tmp_first_opt_idx = idx + break + assert tmp_first_opt_idx is not None, 'Occurs some errors, no optimize ops' + for grad in accumulated_grad_names: + main_block._insert_op_without_sync( + tmp_first_opt_idx, + type='scale', + inputs={'X': grad}, + outputs={'Out': grad}, + attrs={ + 'scale': 1.0 / self._gradient_merge_acc_step, + 'bias': 0.0, + 'bias_after_scale': False, + OP_ROLE_KEY: OpRole.Optimize + }) + def _adapt_amp_clip_without_sharding(self): # if not use sharding, adapt amp/clip, for remain parallelism. # cast --> amp --> clip --> opt diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index b81862adf5e656..efdd55d856f398 100755 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -5820,6 +5820,7 @@ def minimize(self, self.global_ring_id = pipeline_opt['global_ring_id'] self.mp_degree = pipeline_opt['mp_degree'] self.mp_rank = pipeline_opt['mp_rank'] + self.scale_gradient = pipeline_opt.get('scale_gradient', False) assert self.mp_degree >= 1 assert 0 <= self.mp_rank < self.mp_degree @@ -5886,7 +5887,8 @@ def device_cmp(device1, device2): "startup_program": new_startup_program, } real_block = program_list[self.local_rank].global_block() - self._insert_loss_scale(real_block) + if not self.scale_gradient: + self._insert_loss_scale(real_block) if not self.use_sharding: # Step7: clear gradients before each mini-batch and # accumulate gradients during backward diff --git a/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py index 7cb033b748874c..c7eaf4e0ff33db 100755 --- a/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py @@ -1272,6 +1272,201 @@ def test_hybrid_with_pp_dp_amp_with_gradient_fuse(self): self.assertEqual(dp_group_waiting_ports, ['127.0.0.1:36002']) + def test_hybrid_with_pp_dp_amp_with_gradient_fuse_and_avg_after_sum(self): + train_prog, startup_prog = paddle.fluid.Program(), paddle.fluid.Program( + ) + avg_cost, strategy = self.pp_net(train_prog, startup_prog) + strategy.amp = True + strategy.amp_configs = {'custom_black_varnames': ['fc_6.b_0'], } + strategy.sharding = True + strategy.sharding_configs = { + "sharding_degree": 1, + "mp_degree": 1, + "pp_degree": 2, + "dp_degree": 2, + } + strategy.pipeline = True + strategy.pipeline_configs = { + "schedule_mode": "1F1B", + "micro_batch_size": 2, + "accumulate_steps": 4 + } + 
strategy.gradient_scale_configs = { + 'scale_strategy': 'avg', + 'scale_gradient': True + } + strategy.fuse_grad_merge = True + self.optimizer(avg_cost, strategy, train_prog, startup_prog) + train_prog = train_prog._pipeline_opt['section_program'] + startup_prog = startup_prog._pipeline_opt['startup_program'] + + startup_prog_ops = startup_prog.global_block().ops + main_prog_ops = train_prog.global_block().ops + + # check program + startup_prog_op_types = [op.type for op in startup_prog_ops] + main_prog_op_types = [op.type for op in main_prog_ops] + + self.assertEqual(startup_prog_op_types, [ + 'uniform_random', 'fill_constant', 'uniform_random', + 'fill_constant', 'uniform_random', 'fill_constant', + 'uniform_random', 'fill_constant', 'fill_constant', 'fill_constant', + 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', + 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', + 'fill_constant', 'fill_constant', 'c_gen_nccl_id', 'c_comm_init', + 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', + 'c_gen_nccl_id', 'c_comm_init', 'c_broadcast', 'c_broadcast', + 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', + 'c_broadcast', 'c_broadcast' + ]) + + self.assertEqual(main_prog_op_types, [ + 'recv_v2', 'cast', 'mul', 'cast', 'elementwise_add', 'tanh', 'cast', + 'mul', 'cast', 'elementwise_add', 'tanh', 'cast', 'mul', 'cast', + 'elementwise_add', 'tanh', 'cast', 'mul', 'cast', 'elementwise_add', + 'softmax', 'cross_entropy2', 'mean', 'elementwise_mul', + 'coalesce_tensor', 'coalesce_tensor', 'coalesce_tensor', + 'coalesce_tensor', 'fill_constant', 'elementwise_mul_grad', + 'mean_grad', 'cross_entropy_grad2', 'softmax_grad', + 'elementwise_add_grad', 'cast', 'mul_grad', 'tanh_grad', + 'elementwise_add_grad', 'mul_grad', 'tanh_grad', + 'elementwise_add_grad', 'mul_grad', 'tanh_grad', + 'elementwise_add_grad', 'mul_grad', 'c_sync_calc_stream', 'send_v2', + 'cast', 'sum', 'sum', 'c_allreduce_sum', 'c_allreduce_sum', + 'c_sync_comm_stream', 'scale', 'check_finite_and_unscale', 'cast', + 'c_allreduce_max', 'cast', 'update_loss_scaling', 'momentum', + 'momentum', 'momentum', 'momentum', 'momentum', 'momentum', + 'momentum', 'momentum' + ]) + + def test_hybrid_with_pp_dp_with_gradient_fuse_and_avg_after_sum(self): + train_prog, startup_prog = paddle.fluid.Program(), paddle.fluid.Program( + ) + avg_cost, strategy = self.pp_net(train_prog, startup_prog) + strategy.sharding = True + strategy.sharding_configs = { + "sharding_degree": 1, + "mp_degree": 1, + "pp_degree": 2, + "dp_degree": 2, + } + strategy.pipeline = True + strategy.pipeline_configs = { + "schedule_mode": "1F1B", + "micro_batch_size": 2, + "accumulate_steps": 4 + } + strategy.gradient_scale_configs = { + 'scale_strategy': 'avg', + 'scale_gradient': True + } + strategy.fuse_grad_merge = True + self.optimizer(avg_cost, strategy, train_prog, startup_prog) + train_prog = train_prog._pipeline_opt['section_program'] + startup_prog = startup_prog._pipeline_opt['startup_program'] + + startup_prog_ops = startup_prog.global_block().ops + main_prog_ops = train_prog.global_block().ops + + # check program + startup_prog_op_types = [op.type for op in startup_prog_ops] + main_prog_op_types = [op.type for op in main_prog_ops] + + self.assertEqual(startup_prog_op_types, [ + 'uniform_random', 'fill_constant', 'uniform_random', + 'fill_constant', 'uniform_random', 'fill_constant', + 'uniform_random', 'fill_constant', 'fill_constant', 'fill_constant', + 'fill_constant', 'fill_constant', 'fill_constant', 
'fill_constant', + 'fill_constant', 'fill_constant', 'fill_constant', 'c_gen_nccl_id', + 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', + 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_broadcast', + 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', + 'c_broadcast', 'c_broadcast', 'c_broadcast' + ]) + + self.assertEqual(main_prog_op_types, [ + 'recv_v2', 'mul', 'elementwise_add', 'tanh', 'mul', + 'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'tanh', 'mul', + 'elementwise_add', 'softmax', 'cross_entropy2', 'mean', + 'coalesce_tensor', 'coalesce_tensor', 'fill_constant', 'mean_grad', + 'cross_entropy_grad2', 'softmax_grad', 'elementwise_add_grad', + 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', + 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad', + 'elementwise_add_grad', 'mul_grad', 'c_sync_calc_stream', 'send_v2', + 'sum', 'c_allreduce_sum', 'c_sync_comm_stream', 'scale', 'momentum', + 'momentum', 'momentum', 'momentum', 'momentum', 'momentum', + 'momentum', 'momentum' + ]) + + def test_hybrid_with_pp_dp_with_amp_no_dynamic_gradient_fuse_and_avg_after_sum( + self): + train_prog, startup_prog = paddle.fluid.Program(), paddle.fluid.Program( + ) + avg_cost, strategy = self.pp_net(train_prog, startup_prog) + strategy.sharding = True + strategy.sharding_configs = { + "sharding_degree": 1, + "mp_degree": 1, + "pp_degree": 2, + "dp_degree": 2, + } + strategy.amp = True + strategy.amp_configs = { + 'custom_black_varnames': ['fc_6.b_0'], + 'use_dynamic_loss_scaling': False + } + strategy.pipeline = True + strategy.pipeline_configs = { + "schedule_mode": "1F1B", + "micro_batch_size": 2, + "accumulate_steps": 4 + } + strategy.gradient_scale_configs = { + 'scale_strategy': 'avg', + 'scale_gradient': True + } + strategy.fuse_grad_merge = True + self.optimizer(avg_cost, strategy, train_prog, startup_prog) + train_prog = train_prog._pipeline_opt['section_program'] + startup_prog = startup_prog._pipeline_opt['startup_program'] + + startup_prog_ops = startup_prog.global_block().ops + main_prog_ops = train_prog.global_block().ops + + # check program + startup_prog_op_types = [op.type for op in startup_prog_ops] + main_prog_op_types = [op.type for op in main_prog_ops] + + self.assertEqual(startup_prog_op_types, [ + 'uniform_random', 'fill_constant', 'uniform_random', + 'fill_constant', 'uniform_random', 'fill_constant', + 'uniform_random', 'fill_constant', 'fill_constant', 'fill_constant', + 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', + 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', + 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', + 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', + 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', + 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast' + ]) + + self.assertEqual(main_prog_op_types, [ + 'recv_v2', 'cast', 'mul', 'cast', 'elementwise_add', 'tanh', 'cast', + 'mul', 'cast', 'elementwise_add', 'tanh', 'cast', 'mul', 'cast', + 'elementwise_add', 'tanh', 'cast', 'mul', 'cast', 'elementwise_add', + 'softmax', 'cross_entropy2', 'mean', 'elementwise_mul', + 'coalesce_tensor', 'coalesce_tensor', 'coalesce_tensor', + 'coalesce_tensor', 'fill_constant', 'elementwise_mul_grad', + 'mean_grad', 'cross_entropy_grad2', 'softmax_grad', + 'elementwise_add_grad', 'cast', 'mul_grad', 'tanh_grad', + 'elementwise_add_grad', 'mul_grad', 'tanh_grad', + 'elementwise_add_grad', 'mul_grad', 'tanh_grad', + 'elementwise_add_grad', 
'mul_grad', 'c_sync_calc_stream', 'send_v2', + 'cast', 'sum', 'sum', 'c_allreduce_sum', 'c_allreduce_sum', + 'c_sync_comm_stream', 'scale', 'scale', 'check_finite_and_unscale', + 'momentum', 'momentum', 'momentum', 'momentum', 'momentum', + 'momentum', 'momentum', 'momentum' + ]) + if __name__ == "__main__": unittest.main() From fb68ea6247b9e9058f7b2bfd563bcdada4cdee87 Mon Sep 17 00:00:00 2001 From: Zeng Jinle <32832641+sneaxiy@users.noreply.github.com> Date: Thu, 14 Oct 2021 10:40:19 +0800 Subject: [PATCH 155/298] Add static memory analysis module (#36408) * add memory_analysis * fix has_none --- python/paddle/fluid/memory_analysis.py | 77 +++++++++++++++++++ .../tests/unittests/test_memory_analysis.py | 52 +++++++++++++ 2 files changed, 129 insertions(+) create mode 100644 python/paddle/fluid/memory_analysis.py create mode 100644 python/paddle/fluid/tests/unittests/test_memory_analysis.py diff --git a/python/paddle/fluid/memory_analysis.py b/python/paddle/fluid/memory_analysis.py new file mode 100644 index 00000000000000..0bcfeed3516152 --- /dev/null +++ b/python/paddle/fluid/memory_analysis.py @@ -0,0 +1,77 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import core +import numpy as np + + +def get_var_and_memory_size(block, var_name, batch_size=None): + var = block._find_var_recursive(var_name) + assert var is not None, "Variable {} cannot be found".format(var_name) + assert var.type == core.VarDesc.VarType.LOD_TENSOR, "Variable {} is not Tensor".format( + var_name) + shape = list(var.shape) + if not shape: + return var, 0 + + has_none = False + for i, s in enumerate(shape): + if s is None or s < 0: + assert not has_none + shape[i] = batch_size + has_none = True + assert all( + [s >= 0 for s in shape]), "shape {} is not deterministic".format(shape) + mem_size = int(np.prod(shape)) * core.size_of_dtype(var.dtype) + return var, mem_size + + +def pre_allocate_memory(size, place): + t = core.LoDTensor() + t._set_dims([size]) + t._mutable_data(place, core.VarDesc.VarType.INT8) + del t + + +# NOTE: does not consider inplace yet. 
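+# Walks the ops of the (single-block) program in order: sizes of persistable
+# variables are summed once into the persistable total, while non-persistable
+# variables are added to a running temporary total when first seen and
+# subtracted again when the eager-deletion info marks them as freed after the
+# current op. The peak of that running total is the returned max_tmp_mem.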
+def get_max_memory_info(program, batch_size=None): + assert program.num_blocks == 1, "only support to analysis program with only one block" + cur_tmp_mem = 0 + max_tmp_mem = 0 + max_persistable_mem = 0 + visited_vars = set() + alived_vars = [] + + block = program.global_block() + gc_vars = core._get_eager_deletion_vars(program.desc, [])[0] + for i, op in enumerate(block.ops): + var_names = op.input_arg_names + op.output_arg_names + for var_name in var_names: + if var_name in visited_vars: + continue + visited_vars.add(var_name) + var, mem_size = get_var_and_memory_size(block, var_name, batch_size) + if var.persistable: + max_persistable_mem += mem_size + else: + cur_tmp_mem += mem_size + max_tmp_mem = max(max_tmp_mem, cur_tmp_mem) + + cur_gc_vars = gc_vars[i] + for var_name in var_names: + if var_name not in cur_gc_vars: + continue + _, mem_size = get_var_and_memory_size(block, var_name, batch_size) + cur_tmp_mem -= mem_size + return max_tmp_mem, max_persistable_mem diff --git a/python/paddle/fluid/tests/unittests/test_memory_analysis.py b/python/paddle/fluid/tests/unittests/test_memory_analysis.py new file mode 100644 index 00000000000000..9388e07dbf8911 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_memory_analysis.py @@ -0,0 +1,52 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
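+
+# The tests below build a simple FC net and check that the reported peak
+# temporary memory grows with batch_size while the persistable (parameter)
+# memory stays constant, and that pre_allocate_memory runs on the CPU place
+# and, when CUDA is available, on CUDAPlace(0).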
+ +import unittest +import paddle +from paddle.fluid.memory_analysis import pre_allocate_memory, get_max_memory_info +from simple_nets import simple_fc_net + + +class TestMemoryAnalysis(unittest.TestCase): + def setUp(self): + paddle.enable_static() + + def test_get_memory_info(self): + loss = simple_fc_net() + optimizer = paddle.optimizer.Adam(learning_rate=1e-3) + optimizer.minimize(loss) + main_prog = paddle.static.default_main_program() + max_tmp_mem_1, max_persitable_mem_1 = get_max_memory_info( + main_prog, batch_size=32) + self.assertGreater(max_tmp_mem_1, 0) + self.assertGreater(max_persitable_mem_1, 0) + max_tmp_mem_2, max_persitable_mem_2 = get_max_memory_info( + main_prog, batch_size=64) + self.assertEqual(max_persitable_mem_1, max_persitable_mem_2) + self.assertLess(max_tmp_mem_1, max_tmp_mem_2) + + +class TestPreAllocateMemory(unittest.TestCase): + def setUp(self): + paddle.enable_static() + + def test_pre_allocate(self): + size = 32 * 1024 * 1024 + pre_allocate_memory(size, paddle.CPUPlace()) + if paddle.is_compiled_with_cuda(): + pre_allocate_memory(size, paddle.CUDAPlace(0)) + + +if __name__ == "__main__": + unittest.main() From cb5bf583c947d3eb026833a1b9005191dee23099 Mon Sep 17 00:00:00 2001 From: levi131 <83750468+levi131@users.noreply.github.com> Date: Thu, 14 Oct 2021 10:43:12 +0800 Subject: [PATCH 156/298] fix import bug for assign (#36406) --- python/paddle/autograd/functional.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/autograd/functional.py b/python/paddle/autograd/functional.py index 4d7fcd733cdb0b..17c7ad5b18af5f 100644 --- a/python/paddle/autograd/functional.py +++ b/python/paddle/autograd/functional.py @@ -16,7 +16,7 @@ import paddle from ..fluid import framework from ..fluid.dygraph import grad -from ..nn.initializer import assign +from ..tensor.creation import assign from ..tensor import reshape, zeros_like, to_tensor from .utils import _tensors, _stack_tensor_or_return_none, _replace_none_with_zero_tensor From 693b1aa15d95b281ca61c2ad46fb60ab6f0695d3 Mon Sep 17 00:00:00 2001 From: Sing_chan <51314274+betterpig@users.noreply.github.com> Date: Thu, 14 Oct 2021 10:49:56 +0800 Subject: [PATCH 157/298] reduce some unittest's parallel number to avoding timeout failure (#36397) --- paddle/scripts/paddle_build.bat | 16 ++++++++-------- tools/parallel_UT_rule.py | 14 +++++--------- 2 files changed, 13 insertions(+), 17 deletions(-) diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat index e6320d5bd154d4..e44c877d6a2f32 100644 --- a/paddle/scripts/paddle_build.bat +++ b/paddle/scripts/paddle_build.bat @@ -381,7 +381,7 @@ if not exist %THIRD_PARTY_PATH% ( echo There is no usable third_party cache in %THIRD_PARTY_PATH%, will download from bos. pip install wget if not exist %THIRD_PARTY_HOME% mkdir "%THIRD_PARTY_HOME%" - cd %THIRD_PARTY_HOME% + cd /d %THIRD_PARTY_HOME% echo Getting third party: downloading ... %PYTHON_ROOT%\python.exe -c "import wget;wget.download('https://paddle-windows.bj.bcebos.com/third_party/%sub_dir%/%md5%.tar.gz')" 2>nul if !ERRORLEVEL! EQU 0 ( @@ -397,7 +397,7 @@ if not exist %THIRD_PARTY_PATH% ( echo Get third party failed, reason: download failed, will build locally. ) if not exist %THIRD_PARTY_PATH% set UPLOAD_TP_FILE=ON - cd %work_dir%\%BUILD_DIR% + cd /d %work_dir%\%BUILD_DIR% ) else ( echo Found reusable third_party cache in %THIRD_PARTY_PATH%, will reuse it. ) @@ -519,16 +519,16 @@ if "%UPLOAD_TP_FILE%"=="ON" ( echo Uploading third_party: checking bce ... 
if not exist %cache_dir%\bce-python-sdk-0.8.33 ( echo There is no bce in this PC, will install bce. - cd %cache_dir% + cd /d %cache_dir% echo Download package from https://paddle-windows.bj.bcebos.com/bce-python-sdk-0.8.33.tar.gz %PYTHON_ROOT%\python.exe -c "import wget;wget.download('https://paddle-windows.bj.bcebos.com/bce-python-sdk-0.8.33.tar.gz')" %PYTHON_ROOT%\python.exe -c "import shutil;shutil.unpack_archive('bce-python-sdk-0.8.33.tar.gz', extract_dir='./',format='gztar')" - cd %cache_dir%\bce-python-sdk-0.8.33 + cd /d %cache_dir%\bce-python-sdk-0.8.33 %PYTHON_ROOT%\python.exe setup.py install 1>nul del %cache_dir%\bce-python-sdk-0.8.33.tar.gz ) if !errorlevel! EQU 0 ( - cd %THIRD_PARTY_HOME% + cd /d %THIRD_PARTY_HOME% echo Uploading third_party: compressing ... tar -zcf %md5%.tar.gz %md5% if !errorlevel! EQU 0 ( @@ -546,7 +546,7 @@ if "%UPLOAD_TP_FILE%"=="ON" ( ) else ( echo Failed upload third party to bos, reason: install bce failed. ) - cd %work_dir%\%BUILD_DIR% + cd /d %work_dir%\%BUILD_DIR% ) echo Build Paddle successfully! @@ -711,7 +711,7 @@ for /F %%i in ("%libsize%") do ( echo ipipe_log_param_Windows_Paddle_Inference_Size: !libsize_m!M ) -cd %work_dir%\paddle\fluid\inference\api\demo_ci +cd /d %work_dir%\paddle\fluid\inference\api\demo_ci %cache_dir%\tools\busybox64.exe bash run.sh %work_dir:\=/% %WITH_MKL% %WITH_GPU% %cache_dir:\=/%/inference_demo %TENSORRT_ROOT%/include %TENSORRT_ROOT%/lib %MSVC_STATIC_CRT% goto:eof @@ -811,7 +811,7 @@ echo ======================================== echo Step 7. Testing fluid library with infer_ut for inference ... echo ======================================== -cd %work_dir%\paddle\fluid\inference\tests\infer_ut +cd /d %work_dir%\paddle\fluid\inference\tests\infer_ut %cache_dir%\tools\busybox64.exe bash run.sh %work_dir:\=/% %WITH_MKL% %WITH_GPU% %cache_dir:\=/%/inference_demo %TENSORRT_ROOT% %MSVC_STATIC_CRT% goto:eof diff --git a/tools/parallel_UT_rule.py b/tools/parallel_UT_rule.py index 54e8d608ac67d3..803e173e071f69 100644 --- a/tools/parallel_UT_rule.py +++ b/tools/parallel_UT_rule.py @@ -676,12 +676,10 @@ 'test_static_save_load_large', 'version_test', 'var_type_traits_test', - 'var_type_inference_test', 'variable_test', 'unroll_array_ops_test', 'tuple_test', 'to_string_test', - 'timer_test', 'threadpool_test', 'test_zeros_op', 'test_while_op', @@ -1015,7 +1013,6 @@ 'program_desc_test', 'profiler_test', 'place_test', - 'pass_test', 'op_version_registry_test', 'op_tester', 'op_proto_maker_test', @@ -1179,7 +1176,6 @@ 'test_fleet_sharding_meta_optimizer', 'test_listen_and_serv_op', 'test_analyzer_zerocopytensor_tensor', - 'test_conv_bn_fuse_pass_cc', 'test_collective_optimizer', 'test_bf16_utils', 'test_analyzer_seq_pool1_compare_determine', @@ -1236,6 +1232,9 @@ # It run 4 job each time, If it failed due to Insufficient GPU memory or CUBLAS_STATUS_ALLOC_FAILED, # just remove it from this list. 
TETRAD_PARALLEL_JOB = [ + 'timer_test', + 'var_type_inference_test', + 'pass_test', 'graph_node_test', 'test_assert', 'test_nce', @@ -1254,7 +1253,6 @@ 'test_imperative_using_non_zero_gpu', 'retry_allocator_test', 'system_allocator_test', - 'test_fc_fuse_pass_cc', 'test_fc_lstm_fuse_pass_cc', 'test_fc_gru_fuse_pass_cc', 'test_conv_bn_fuse_pass_cc', @@ -1281,14 +1279,11 @@ 'test_analyzer_bert', 'test_analyzer_googlenet', 'test_fleet_base', - 'test_sequential', - 'test_sequential', 'test_imperative_layers', 'test_dgc_momentum_op', 'test_memcpy_op', 'test_dgc_op', 'test_lookahead', - 'test_callback_visualdl', 'test_new_group_api', 'test_collective_split_embedding_none_divisible', 'test_collective_wait', @@ -1304,6 +1299,8 @@ # It run 2 job each time, If it failed due to Insufficient GPU memory or CUBLAS_STATUS_ALLOC_FAILED, # just remove it from this list. TWO_PARALLEL_JOB = [ + 'test_callback_visualdl', + 'test_sequential', 'test_lambv2_op', 'test_math_op_patch', 'test_tensor_to_numpy', @@ -1398,7 +1395,6 @@ 'test_kron_op', 'test_isfinite_v2_op', 'test_ctc_align', - 'test_imperative_save_load_v2', 'test_decayed_adagrad_op', 'test_dropout_op', 'test_functional_conv3d', From 8ffcc7c85cd4538314bf3159dd8d37ba75d80e17 Mon Sep 17 00:00:00 2001 From: ShenLiang <1422485404@qq.com> Date: Thu, 14 Oct 2021 10:57:24 +0800 Subject: [PATCH 158/298] [HybridParallel]Rebuild code for pipeline (#36396) * add no_sync for parameters sync * add pipeline for moe --- .../fleet/meta_parallel/pipeline_parallel.py | 55 +++++++++++-------- python/paddle/fluid/dygraph/parallel.py | 10 +++- 2 files changed, 40 insertions(+), 25 deletions(-) diff --git a/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py b/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py index 431bc6d7bc389c..90960973972777 100755 --- a/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py +++ b/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py @@ -77,26 +77,15 @@ def __init__(self, layers, hcg, strategy): logger.info("start broadcast dp parameters") broadcast_dp_parameters(self._layers, self._hcg) - def train_batch(self, data, optimizer, lr_scheduler=None, scaler=None): - assert isinstance(optimizer, HybridParallelOptimizer), ( - 'optimizer should be HybridParallelOptimizer subclass.') - - assert fluid.framework._dygraph_tracer()._has_grad, ( - 'Please enable the generation of gradients.') - - if self.is_first_stage or self.is_last_stage: - assert data is not None, ( - "For the first and the last stage, the data must be set.") - else: - data = None + def forward_backward_pipeline(self, data, scaler=None): + # use the 1f1b scheduling strategy. + # this strategy is inspired by: + # https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/schedules.py - self.optimizer = optimizer - self.lr_scheduler = lr_scheduler self.scaler = scaler - self.data = data - self._compute_loss = True - self._layers.train() + # store data for train + self.data = data # store total loss of entire batch self.total_loss = None @@ -104,10 +93,6 @@ def train_batch(self, data, optimizer, lr_scheduler=None, scaler=None): # store data id for micro_batch self.micro_batch_id = 0 - # Next, use the 1f1b scheduling strategy. 
- # this strategy is inspired by: - # https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/schedules.py - startup_steps = (self.num_stages - self.stage_id - 1) startup_steps = min(startup_steps, self.accumulate_steps) steady_steps = self.accumulate_steps - startup_steps @@ -161,11 +146,35 @@ def train_batch(self, data, optimizer, lr_scheduler=None, scaler=None): self._layers.allreduce_shared_weight_gradients() - self.train_loss = self._broadcast_final_loss() + train_loss = self._broadcast_final_loss() + + return train_loss + + def train_batch(self, data, optimizer, lr_scheduler=None, scaler=None): + assert isinstance(optimizer, HybridParallelOptimizer), ( + 'optimizer should be HybridParallelOptimizer subclass.') + + assert fluid.framework._dygraph_tracer()._has_grad, ( + 'Please enable the generation of gradients.') + + if self.is_first_stage or self.is_last_stage: + assert data is not None, ( + "For the first and the last stage, the data must be set.") + else: + data = None + + self.optimizer = optimizer + self.lr_scheduler = lr_scheduler + + self._layers.train() + + # 1f1b for pipeline + train_loss = self.forward_backward_pipeline(data, scaler) # optimizer self._optimizer_step() - return self.train_loss + + return train_loss def eval_batch(self, data, compute_loss=False): self._layers.eval() diff --git a/python/paddle/fluid/dygraph/parallel.py b/python/paddle/fluid/dygraph/parallel.py index e4525a8d17992a..7dd8d38aa70efb 100644 --- a/python/paddle/fluid/dygraph/parallel.py +++ b/python/paddle/fluid/dygraph/parallel.py @@ -354,9 +354,15 @@ def sync_params_buffers(model, if not isinstance(param, core.VarBase): raise TypeError("The data type of '%s' must be Varbase" % param.name) + # is_distributed param not need to sync when in mp mode - if is_model_parallel and isinstance(param, ParamBase): - if param.is_distributed: + if isinstance(param, ParamBase): + if is_model_parallel and param.is_distributed: + continue + + # NOTE(shenliang03): Support situations that do not require synchronization parameters, + # such as moe's expert parameters + if getattr(param, "no_sync", False): continue model_vars.append(param.detach()) From eb722e34596be4f3980d59408c924727309f9582 Mon Sep 17 00:00:00 2001 From: Zeng Jinle <32832641+sneaxiy@users.noreply.github.com> Date: Thu, 14 Oct 2021 11:21:04 +0800 Subject: [PATCH 159/298] refine lars (#36409) --- python/paddle/fluid/optimizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index efdd55d856f398..228ba08499808f 100755 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -2086,7 +2086,7 @@ def _append_optimize_op(self, block, param_and_grad): # create the momentum optimize op momentum_op = block.append_op( - type=self.type, + type=self.type if _lars_weight_decay != 0.0 else 'momentum', inputs=inputs, outputs=outputs, attrs=attrs, From f4eda869f3f46d0f5097e4a10af4566a9e15e786 Mon Sep 17 00:00:00 2001 From: Zeng Jinle <32832641+sneaxiy@users.noreply.github.com> Date: Thu, 14 Oct 2021 14:41:15 +0800 Subject: [PATCH 160/298] Merge momentum ops/kernels (#36380) * merge momentum ops * update * add ut to improve coverage * remove optimizer change * fix error msg * update ut * add __restrict__ for CUDA * update ut * move merged_momentum_op to optimizer dir * fix coverage --- .../optimizers/merged_momentum_op.cc | 95 +++++++++ .../optimizers/merged_momentum_op.cu | 24 +++ .../operators/optimizers/merged_momentum_op.h | 197 
++++++++++++++++++ paddle/fluid/platform/macros.h | 6 + .../unittests/test_merged_momentum_op.py | 194 +++++++++++++++++ 5 files changed, 516 insertions(+) create mode 100644 paddle/fluid/operators/optimizers/merged_momentum_op.cc create mode 100644 paddle/fluid/operators/optimizers/merged_momentum_op.cu create mode 100644 paddle/fluid/operators/optimizers/merged_momentum_op.h create mode 100644 python/paddle/fluid/tests/unittests/test_merged_momentum_op.py diff --git a/paddle/fluid/operators/optimizers/merged_momentum_op.cc b/paddle/fluid/operators/optimizers/merged_momentum_op.cc new file mode 100644 index 00000000000000..6c63376b5eb425 --- /dev/null +++ b/paddle/fluid/operators/optimizers/merged_momentum_op.cc @@ -0,0 +1,95 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/optimizers/merged_momentum_op.h" + +namespace paddle { +namespace operators { + +class MergedMomentumOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext *ctx) const override {} + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + auto param_dtype = + framework::OperatorWithKernel::IndicateVarDataType(ctx, "Param"); + return framework::OpKernelType(param_dtype, ctx.GetPlace()); + } +}; + +class MergedMomentumOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("Param", + "(Tensor, default Tensor) " + "Input parameter that has to be updated") + .AsDuplicable(); + AddInput("Grad", + "(Tensor, default Tensor) " + "Input gradient of the parameter") + .AsDuplicable(); + AddInput("Velocity", + "(Tensor, default Tensor) " + "Input velocity (corresponding to the parameter) " + "that has to be updated") + .AsDuplicable(); + AddInput("LearningRate", + "(Tensor, default Tensor) " + "Input learning rate"); + AddInput("MasterParam", "FP32 master weight for AMP.") + .AsDispensable() + .AsDuplicable(); + AddOutput("ParamOut", + "(Tensor) This output is updated parameter. " + "It shared memory with Input(Param).") + .AsDuplicable(); + AddOutput("VelocityOut", + "(Tensor) This output is updated velocity. " + "It shared memory with Input(Velocity).") + .AsDuplicable(); + AddOutput("MasterParamOut", + "The updated FP32 master weight for AMP. " + "It shared memory with Input(MasterParam).") + .AsDispensable() + .AsDuplicable(); + AddAttr("mu", "(float) Momentum coefficient"); + AddAttr("multi_precision", + "(bool, default false) " + "Whether to use multi-precision during weight updating.") + .SetDefault(false); + AddAttr( + "rescale_grad", + "(float, default 1.0) Multiply the gradient with `rescale_grad`" + "before updating. 
Often choose to be `1.0/batch_size`.") + .SetDefault(1.0f); + AddComment(R"DOC(Merged Momentum Optimizer.)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_WITHOUT_GRADIENT(merged_momentum, ops::MergedMomentumOp, + ops::MergedMomentumOpMaker); + +REGISTER_OP_CPU_KERNEL( + merged_momentum, ops::MergedMomentumOpKernel, + ops::MergedMomentumOpKernel); diff --git a/paddle/fluid/operators/optimizers/merged_momentum_op.cu b/paddle/fluid/operators/optimizers/merged_momentum_op.cu new file mode 100644 index 00000000000000..7e4bbd9807938c --- /dev/null +++ b/paddle/fluid/operators/optimizers/merged_momentum_op.cu @@ -0,0 +1,24 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/optimizers/merged_momentum_op.h" + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL( + merged_momentum, + ops::MergedMomentumOpKernel, + ops::MergedMomentumOpKernel, + ops::MergedMomentumOpKernel); diff --git a/paddle/fluid/operators/optimizers/merged_momentum_op.h b/paddle/fluid/operators/optimizers/merged_momentum_op.h new file mode 100644 index 00000000000000..4dfaa4de3ad447 --- /dev/null +++ b/paddle/fluid/operators/optimizers/merged_momentum_op.h @@ -0,0 +1,197 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
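+
+// The merged_momentum op applies the momentum update
+//   velocity = mu * velocity + grad * rescale_grad
+//   param    = param - lr * velocity
+// to a whole list of param/grad/velocity tensors in one go, packing the
+// pointers of up to kParamNum tensors into a single functor per kernel
+// launch and optionally keeping FP32 master weights for AMP.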
+ +#pragma once + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/operators/amp/fp16_type_traits.h" +#include "paddle/fluid/platform/for_range.h" +#include "paddle/fluid/platform/macros.h" + +namespace paddle { +namespace operators { + +template +struct MergedMomentumMasterParams { + MT *PADDLE_RESTRICT master_params[kParamNum]; + + HOSTDEVICE MT *MasterParam(size_t idx) const { return master_params[idx]; } + HOSTDEVICE void SetMasterParam(size_t idx, MT *p) { master_params[idx] = p; } +}; + +template +struct MergedMomentumMasterParams { + HOSTDEVICE constexpr MT *MasterParam(size_t) const { return nullptr; } + HOSTDEVICE constexpr void SetMasterParam(size_t, MT *) {} +}; + +template +struct MergedMomentumKernelParam + : public MergedMomentumMasterParams { + static constexpr auto N = kParamNum; + size_t sizes[N]; + T *PADDLE_RESTRICT params[N]; + const T *PADDLE_RESTRICT grads[N]; + MT *PADDLE_RESTRICT velocitys[N]; + const MT *PADDLE_RESTRICT lr; + MT mu; + MT rescale_grad; + uint32_t param_num; + + HOSTDEVICE void operator()(size_t i) const { + const auto lr_val = *lr; + for (uint32_t idx = 0; idx < param_num; ++idx) { + auto size = sizes[idx]; + if (i >= size) continue; + + auto param_p = params[idx]; + auto grad_p = grads[idx]; + auto velocity_p = velocitys[idx]; + auto master_param_p = this->MasterParam(idx); + + const MT param = + master_param_p ? master_param_p[i] : static_cast(param_p[i]); + const MT grad = static_cast(grad_p[i]) * rescale_grad; + const MT velocity = velocity_p[i]; + const MT velocity_out = velocity * mu + grad; + const MT param_out = param - lr_val * velocity_out; + velocity_p[i] = velocity_out; + param_p[i] = static_cast(param_out); + if (master_param_p) { + master_param_p[i] = param_out; + } + } + } +}; + +template +class MergedMomentumOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto params = ctx.MultiInput("Param"); + auto params_out = ctx.MultiOutput("ParamOut"); + size_t n = params.size(); + PADDLE_ENFORCE_EQ( + n, params_out.size(), + platform::errors::InvalidArgument( + "Output(ParamOut) number must be equal to Input(Param) number.")); + for (size_t i = 0; i < n; ++i) { + PADDLE_ENFORCE_EQ( + params[i], params_out[i], + platform::errors::InvalidArgument( + "Input(Param) and Output(ParamOut) must be the same Tensors.")); + } + + auto grads = ctx.MultiInput("Grad"); + PADDLE_ENFORCE_EQ( + n, grads.size(), + platform::errors::InvalidArgument( + "Input(Grad) number must be equal to Input(Param) number.")); + + auto velocitys = ctx.MultiInput("Velocity"); + PADDLE_ENFORCE_EQ(n, velocitys.size(), + platform::errors::InvalidArgument( + "Input(Velocity) number and Input(Param) number.")); + + auto velocitys_out = ctx.MultiOutput("VelocityOut"); + PADDLE_ENFORCE_EQ( + n, velocitys_out.size(), + platform::errors::InvalidArgument("Output(VelocityOut) number must be " + "equal to Input(Param) number.")); + for (size_t i = 0; i < n; ++i) { + PADDLE_ENFORCE_EQ(velocitys[i], velocitys_out[i], + platform::errors::InvalidArgument( + "Input(Velocity) and Output(VelocityOut) must be " + "the same Tensors.")); + } + + auto master_params = ctx.MultiInput("MasterParam"); + auto master_params_out = + ctx.MultiOutput("MasterParamOut"); + auto multi_precision = ctx.Attr("multi_precision"); + if (multi_precision) { + PADDLE_ENFORCE_EQ( + n, master_params.size(), + 
platform::errors::InvalidArgument("Input(MasterParam) number must be " + "equal to Input(Param) number.")); + PADDLE_ENFORCE_EQ(n, master_params_out.size(), + platform::errors::InvalidArgument( + "Output(MasterParamOut) number must be equal to " + "Input(MasterParam) number.")); + for (size_t i = 0; i < n; ++i) { + PADDLE_ENFORCE_EQ(master_params[i], master_params_out[i], + platform::errors::InvalidArgument( + "Input(MasterParam) and Output(MasterParamOut) " + "must be the same Tensors.")); + PADDLE_ENFORCE_NOT_NULL(master_params[i], + platform::errors::InvalidArgument( + "Input(MasterParam) must be provided when " + "multi_precision=True.")); + } + } else { + master_params.clear(); + master_params_out.clear(); + } + + auto lr = ctx.Input("LearningRate"); + auto mu = ctx.Attr("mu"); + auto rescale_grad = ctx.Attr("rescale_grad"); + using MPType = typename operators::details::MPTypeTrait::Type; + + auto &dev_ctx = ctx.template device_context(); + +#define PADDLE_LAUNCH_MERGED_MOMENTUM_KERNEL(kMultiPrecision) \ + MergedMomentumKernelParam kernel_params; \ + constexpr auto kMaxMergedNum = decltype(kernel_params)::N; \ + size_t kernel_num = (n + kMaxMergedNum - 1) / kMaxMergedNum; \ + kernel_params.mu = static_cast(mu); \ + kernel_params.rescale_grad = static_cast(rescale_grad); \ + kernel_params.lr = lr->data(); \ + for (size_t i = 0; i < kernel_num; ++i) { \ + size_t start = i * kMaxMergedNum; \ + size_t end = std::min((i + 1) * kMaxMergedNum, n); \ + kernel_params.param_num = static_cast(end - start); \ + size_t max_size = 0; \ + for (size_t j = 0; j < kernel_params.param_num; ++j) { \ + auto size = static_cast(params_out[j + start]->numel()); \ + max_size = std::max(max_size, size); \ + kernel_params.sizes[j] = size; \ + kernel_params.params[j] = params_out[j + start]->data(); \ + kernel_params.grads[j] = grads[j + start]->data(); \ + kernel_params.velocitys[j] = velocitys_out[j + start]->data(); \ + kernel_params.SetMasterParam( \ + j, kMultiPrecision ? master_params_out[j + start]->data() \ + : nullptr); \ + } \ + platform::ForRange for_range(dev_ctx, max_size); \ + for_range(kernel_params); \ + VLOG(10) << "Launch MergedMomentum kernel " << i << " " \ + << kernel_params.param_num; \ + } + + if (multi_precision) { + PADDLE_LAUNCH_MERGED_MOMENTUM_KERNEL(true); + } else { + PADDLE_LAUNCH_MERGED_MOMENTUM_KERNEL(false); + } + +#undef PADDLE_LAUNCH_MERGED_MOMENTUM_KERNEL + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/platform/macros.h b/paddle/fluid/platform/macros.h index fb5cf9fb319157..bf089ac117d415 100644 --- a/paddle/fluid/platform/macros.h +++ b/paddle/fluid/platform/macros.h @@ -30,3 +30,9 @@ limitations under the License. */ #define FLT_MAX __FLT_MAX__ #endif // __FLT_MAX__ #endif // PADDLE_WITH_MUSL + +#if defined(__NVCC__) || defined(__HIPCC__) +#define PADDLE_RESTRICT __restrict__ +#else +#define PADDLE_RESTRICT +#endif diff --git a/python/paddle/fluid/tests/unittests/test_merged_momentum_op.py b/python/paddle/fluid/tests/unittests/test_merged_momentum_op.py new file mode 100644 index 00000000000000..0118a372c3f4d4 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_merged_momentum_op.py @@ -0,0 +1,194 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import paddle +import numpy as np +from paddle.fluid.layer_helper import LayerHelper +from collections import OrderedDict + + +def run_momentum_op(params, + grads, + velocitys, + master_params, + learning_rate, + place, + multi_precision, + mu=0.9, + rescale_grad=0.01, + use_merged=False): + assert len(params) == len(grads) + assert len(params) == len(velocitys) + if multi_precision: + assert len(params) == len(master_params) + op_type = 'merged_momentum' if use_merged else 'momentum' + main = paddle.static.Program() + startup = paddle.static.Program() + with paddle.static.program_guard(main, startup): + helper = LayerHelper(op_type, **locals()) + attrs = { + 'mu': mu, + 'multi_precision': multi_precision, + 'rescale_grad': rescale_grad, + } + + param_vars = [ + helper.create_variable( + persistable=True, shape=p.shape, dtype=p.dtype) for p in params + ] + grad_vars = [ + helper.create_variable( + shape=g.shape, dtype=g.dtype) for g in grads + ] + velocity_vars = [ + helper.create_variable( + persistable=True, shape=v.shape, dtype=v.dtype) + for v in velocitys + ] + lr_var = helper.create_variable( + persistable=True, + shape=learning_rate.shape, + dtype=learning_rate.dtype) + + feed_dict = OrderedDict() + + feed_dict.update( + OrderedDict([(p_var.name, p_val) + for p_var, p_val in zip(param_vars, params)])) + feed_dict.update( + OrderedDict([(v_var.name, v_val) + for v_var, v_val in zip(velocity_vars, velocitys)])) + fetch_list = list(feed_dict.keys()) + + feed_dict.update( + OrderedDict([(g_var.name, g_val) + for g_var, g_val in zip(grad_vars, grads)])) + feed_dict.update({lr_var.name: learning_rate}) + + if multi_precision: + master_param_vars = [ + helper.create_variable( + persistable=True, shape=p.shape, dtype=p.dtype) + for p in master_params + ] + feed_dict.update( + OrderedDict([(mp_var.name, mp_val) + for mp_var, mp_val in zip(master_param_vars, + master_params)])) + # CPUPlace does not use MasterParam + if isinstance(place, paddle.CUDAPlace): + fetch_list = fetch_list + [ + mp_var.name for mp_var in master_param_vars + ] + else: + master_param_vars = None + + if not use_merged: + for i, (p, g, + v) in enumerate(zip(param_vars, grad_vars, velocity_vars)): + inputs = { + 'Param': p, + 'Grad': g, + 'Velocity': v, + 'LearningRate': lr_var + } + outputs = {'ParamOut': p, 'VelocityOut': v} + if multi_precision: + inputs['MasterParam'] = master_param_vars[i] + outputs['MasterParamOut'] = master_param_vars[i] + helper.append_op( + type=op_type, inputs=inputs, outputs=outputs, attrs=attrs) + else: + inputs = { + 'Param': param_vars, + 'Grad': grad_vars, + 'Velocity': velocity_vars, + 'LearningRate': lr_var + } + outputs = {'ParamOut': param_vars, 'VelocityOut': velocity_vars} + if multi_precision: + inputs['MasterParam'] = master_param_vars + outputs['MasterParamOut'] = master_param_vars + helper.append_op( + type=op_type, inputs=inputs, outputs=outputs, attrs=attrs) + + exe = paddle.static.Executor(place) + with paddle.static.scope_guard(paddle.static.Scope()): + exe.run(startup) + return exe.run(main, feed=feed_dict, fetch_list=fetch_list) + + +class 
TestMergedMomentum(unittest.TestCase): + def setUp(self): + paddle.enable_static() + self.shapes = [[3, 4], [2, 7], [5, 6], [7, 8]] + self.seed = 10 + + def gen_rand_data(self, shapes, dtype): + return [np.random.random(s).astype(dtype) for s in shapes] + + def prepare_data(self, shapes, multi_precision, seed, place): + np.random.seed(seed) + mp_dtype = np.float32 + dtype = np.float16 if multi_precision and isinstance( + place, paddle.CUDAPlace) else np.float32 + params = self.gen_rand_data(shapes, dtype) + grads = self.gen_rand_data(shapes, dtype) + velocitys = self.gen_rand_data(shapes, mp_dtype) + learning_rate = self.gen_rand_data([[1]], mp_dtype)[0] + if multi_precision: + master_params = [p.astype(mp_dtype) for p in params] + else: + master_params = None + return params, grads, velocitys, master_params, learning_rate + + def check_with_place(self, place, multi_precision): + params, grads, velocitys, master_params, learning_rate = self.prepare_data( + self.shapes, multi_precision, self.seed, place) + + def run_op(use_merged): + # FIXME(zengjinle): CPU Momentum Op does not support rescale_grad + rescale_grad = 1.0 if isinstance(place, paddle.CPUPlace) else 0.01 + return run_momentum_op( + params, + grads, + velocitys, + master_params, + learning_rate, + place, + multi_precision, + rescale_grad=rescale_grad, + use_merged=use_merged) + + outs1 = run_op(True) + outs2 = run_op(False) + self.assertEqual(len(outs1), len(outs2)) + for i, (out1, out2) in enumerate(zip(outs1, outs2)): + self.assertTrue(np.allclose(out1, out2, atol=1e-7)) + + def get_places(self): + places = [paddle.CPUPlace()] + if paddle.is_compiled_with_cuda(): + places.append(paddle.CUDAPlace(0)) + return places + + def test_main(self): + for multi_precision in [False, True]: + for place in self.get_places(): + self.check_with_place(place, multi_precision) + + +if __name__ == "__main__": + unittest.main() From 5d18967b66af832435856c76db174faf8919fa26 Mon Sep 17 00:00:00 2001 From: lidanqing Date: Thu, 14 Oct 2021 15:24:34 +0800 Subject: [PATCH 161/298] Revert "Implemented LRU based cache clearing (#36290)" (#36426) This reverts commit bf748f245eb74ffc86e44853fa9ebad7c858b015. 
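For reference, the change being reverted had replaced the plain
sBlob->erase(sBlob->begin()) eviction of cached input shapes in
MKLDNNDeviceContext with a PickLeastUsedShape() helper that evicted the
entry carrying the oldest __rdtsc timestamp. A minimal sketch of that
timestamp-based policy, with a plain Python dict standing in for Paddle's
BlobMap types (names below are illustrative only, not Paddle APIs):

    # cache maps shape_str -> (timestamp, blobs); PickLeastUsedShape scanned
    # all cached shapes and returned the one with the smallest timestamp.
    def pick_least_used_shape(cache):
        return min(cache, key=lambda shape: cache[shape][0])

    cache = {"2,3,224,224": (101, {}), "2,3,112,112": (57, {})}
    cache.pop(pick_least_used_shape(cache))  # evicts the shape stamped 57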
--- .../fluid/operators/mkldnn/conv_mkldnn_op.cc | 49 ++++---- .../mkldnn/conv_transpose_mkldnn_op.cc | 33 +++--- .../operators/mkldnn/quantize_mkldnn_op.cc | 105 ++++++++++++------ paddle/fluid/platform/device_context.cc | 63 ++++------- paddle/fluid/platform/device_context.h | 15 +-- paddle/fluid/platform/mkldnn_reuse.h | 17 ++- 6 files changed, 146 insertions(+), 136 deletions(-) diff --git a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc index 84c989f64e46c0..cce835e6bc0354 100644 --- a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc @@ -78,8 +78,7 @@ class ConvMKLDNNHandlerT mkldnn::convolution_backward_weights>( dev_ctx, mkldnn_engine, cpu_place, platform::CreateKey(dev_ctx, framework::vectorize(input->dims()), - unique_name)), - is_test_(ctx.Attr("is_test")) { + unique_name)) { if (!this->isCached()) { PADDLE_ENFORCE_EQ( input->layout(), framework::DataLayout::kMKLDNN, @@ -160,6 +159,7 @@ class ConvMKLDNNHandlerT framework::slice_ddim(filter_dims, 2, filter_dims.size()); const auto ksize = framework::vectorize(filter_data_dims); + const bool is_test = ctx.Attr("is_test"); auto strides_temp = ctx.Attr>("strides"); std::vector strides(begin(strides_temp), end(strides_temp)); @@ -214,8 +214,9 @@ class ConvMKLDNNHandlerT const auto dst_md = platform::MKLDNNMemDesc( dst_tz, platform::MKLDNNGetDataType(), chosen_memory_format); - const auto fwd_prop_kind = is_test_ ? mkldnn::prop_kind::forward_inference - : mkldnn::prop_kind::forward_training; + const auto fwd_prop_kind = is_test ? mkldnn::prop_kind::forward_inference + : mkldnn::prop_kind::forward_training; + float sum_scale = 1.0f; std::vector output_shift_scale; if (platform::is_int8()) @@ -260,8 +261,7 @@ class ConvMKLDNNHandlerT mkldnn::convolution_backward_weights>( dev_ctx, dev_ctx.GetEngine(), cpu_place, platform::CreateKey(dev_ctx, framework::vectorize(in->dims()), - unique_name)), - is_test_(false) { + unique_name)) { if (!this->isBwdCached()) { PADDLE_ENFORCE_EQ( in->layout(), framework::DataLayout::kMKLDNN, @@ -291,7 +291,7 @@ class ConvMKLDNNHandlerT "Wrong format set for output_grad tensor")); PADDLE_ENFORCE_EQ( - is_test_, false, + ctx.Attr("is_test"), false, platform::errors::InvalidArgument( "is_test attribute should be set to False in training phase.")); @@ -557,14 +557,13 @@ class ConvMKLDNNHandlerT framework::vectorize(in_mem->dims()), platform::MKLDNNGetDataType(), in_mem->format()); return this->AcquireMemoryWithReorder( - user_mem_md, mem_md, platform::to_void_cast(in_mem_data), key_mem, - is_test_); + user_mem_md, mem_md, platform::to_void_cast(in_mem_data), key_mem); } else { const std::string target_key_suffix{key_mem_target}; const auto target_mem_p = this->AcquireMemory(target_key_suffix); user_mem_p->set_data_handle(platform::to_void_cast(in_mem_data)); if (user_mem_p != target_mem_p) { - this->AcquireReorder(user_mem_p, target_mem_p); + this->AcquireReorder(user_mem_p, target_mem_p, key_mem); } return target_mem_p; } @@ -572,11 +571,12 @@ class ConvMKLDNNHandlerT std::shared_ptr AcquireWeightsMemoryWithReorder( const framework::Tensor* filter, const int groups, const bool is_conv3d, - const std::vector& scale_data = {1.0f}, int mask = 0) { + const bool is_test, const std::vector& scale_data = {1.0f}, + int mask = 0) { // This is workaround to make execution faster, delete // if statement after including md inside Tensor auto weights_mem_p = this->AcquireMemory("@weights_mem_p_target"); - if (is_test_ && 
weights_mem_p) { + if (is_test && weights_mem_p) { return weights_mem_p; } else { const K* filter_data = filter->data(); @@ -589,16 +589,16 @@ class ConvMKLDNNHandlerT return this->AcquireMemoryWithReorder( user_src_md, this->fwd_pd_->weights_desc(), - platform::to_void_cast(filter_data), "@weights_mem_p", is_test_, - {}, scale_data, mask); + platform::to_void_cast(filter_data), "@weights_mem_p", is_test, {}, + scale_data, mask); } } std::shared_ptr AcquireBiasMemoryWithReorder( - const framework::Tensor* bias, + const framework::Tensor* bias, const bool is_test, const std::vector& scale_data = {1.0f}, int mask = 0) { auto bias_mem_p = this->AcquireMemory("@bias_mem_p_target"); - if (is_test_ && bias_mem_p) { + if (is_test && bias_mem_p) { return bias_mem_p; } else { const K* bias_data = bias->data(); @@ -608,7 +608,7 @@ class ConvMKLDNNHandlerT return this->AcquireMemoryWithReorder( user_bias_md, this->fwd_pd_->bias_desc(), - platform::to_void_cast(bias_data), "@bias_mem_p", is_test_, {}, + platform::to_void_cast(bias_data), "@bias_mem_p", is_test, {}, scale_data, mask); } } @@ -641,7 +641,7 @@ class ConvMKLDNNHandlerT platform::GetMKLDNNFormat(this->fwd_pd_->dst_desc())) { auto residual_memory_p = this->AcquireResidualMemory(residual_param); dst_memory_p = this->template AcquireDstMemory(output); - this->AcquireReorder(residual_memory_p, dst_memory_p); + this->AcquireReorder(residual_memory_p, dst_memory_p, "@residual_dst"); } else { // Changing ShareDataWith to TensorCopy results in performance drop // on ResNet architectures @@ -651,9 +651,6 @@ class ConvMKLDNNHandlerT } return dst_memory_p; } - - private: - const bool is_test_; }; } // anonymous namespace @@ -698,6 +695,7 @@ class ConvMKLDNNOpKernel : public framework::OpKernel { ctx.template device_context(); const auto& mkldnn_engine = dev_ctx.GetEngine(); + const bool is_test = ctx.Attr("is_test"); const bool is_conv3d = ctx.Attr>("strides").size() == 3U; const bool fuse_residual_conn = ctx.Attr("fuse_residual_connection"); @@ -714,7 +712,7 @@ class ConvMKLDNNOpKernel : public framework::OpKernel { auto src_memory_p = handler.AcquireSrcMemoryWithReorder(input); auto weights_memory_p = handler.AcquireWeightsMemoryWithReorder( - filter, ctx.Attr("groups"), is_conv3d); + filter, ctx.Attr("groups"), is_conv3d, is_test); std::shared_ptr dst_memory_p; if (fuse_residual_conn) { @@ -733,7 +731,7 @@ class ConvMKLDNNOpKernel : public framework::OpKernel { {MKLDNN_ARG_DST, *dst_memory_p}}; if (bias) { - auto bias_memory_p = handler.AcquireBiasMemoryWithReorder(bias); + auto bias_memory_p = handler.AcquireBiasMemoryWithReorder(bias, is_test); args.insert({MKLDNN_ARG_BIAS, *bias_memory_p}); } @@ -785,10 +783,11 @@ class ConvMKLDNNOpKernel : public framework::OpKernel { ctx.Attr>("Scale_weights"); const bool is_multi_channel = scale_weights_data.size() > 1; const int& groups = ctx.Attr("groups"); + const bool& is_test = ctx.Attr("is_test"); int mask_reorder = is_multi_channel ? ((groups != 1) ? 
(1 << 1) + (1 << 0) : 1 << 0) : 0; auto weights_memory_p = handler.AcquireWeightsMemoryWithReorder( - filter, groups, false, scale_weights_data, mask_reorder); + filter, groups, false, is_test, scale_weights_data, mask_reorder); std::shared_ptr dst_memory_p; if (fuse_residual_conn) { @@ -823,7 +822,7 @@ class ConvMKLDNNOpKernel : public framework::OpKernel { handler.get_int8_bias_scales(ctx); auto bias_memory_p = handler.AcquireBiasMemoryWithReorder( - bias, scale_bias_data, mask_reorder); + bias, is_test, scale_bias_data, mask_reorder); args.insert({MKLDNN_ARG_BIAS, *bias_memory_p}); } diff --git a/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc index 4c374d72c046fc..8d43e9f0dca44f 100644 --- a/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc @@ -51,10 +51,10 @@ class ConvTransposeMKLDNNHandlerT : platform::MKLDNNHandlerT( dev_ctx, mkldnn_engine, cpu_place, platform::CreateKey(dev_ctx, framework::vectorize(input->dims()), - unique_name)), - is_test_(ctx.Attr("is_test")) { + unique_name)) { if (!this->isCached()) { - PADDLE_ENFORCE_EQ(is_test_, true, + const bool is_test = ctx.Attr("is_test"); + PADDLE_ENFORCE_EQ(is_test, true, platform::errors::InvalidArgument( "ConvTransposeMKLDNN works only for inference. " "The attribute \'is_test\' value should be set to " @@ -169,8 +169,8 @@ class ConvTransposeMKLDNNHandlerT const mkldnn::primitive_attr conv_trans_attr = CreatePostOps(fuse_activation, fuse_alpha, fuse_beta); - auto fwd_prop_kind = is_test_ ? mkldnn::prop_kind::forward_inference - : mkldnn::prop_kind::forward_training; + auto fwd_prop_kind = is_test ? mkldnn::prop_kind::forward_inference + : mkldnn::prop_kind::forward_training; if (bias) { std::vector bias_tz = framework::vectorize(bias->dims()); const auto bias_md = @@ -231,18 +231,18 @@ class ConvTransposeMKLDNNHandlerT const auto target_src_mem_p = this->AcquireMemory(target_key_suffix); user_src_mem_p->set_data_handle(platform::to_void_cast(input_data)); if (user_src_mem_p != target_src_mem_p) { - this->AcquireReorder(user_src_mem_p, target_src_mem_p); + this->AcquireReorder(user_src_mem_p, target_src_mem_p, "@src_mem_p"); } return target_src_mem_p; } } std::shared_ptr AcquireWeightsMemoryWithReorder( - const framework::Tensor* filter, const int& groups) { + const framework::Tensor* filter, const int& groups, const bool& is_test) { // This is workaround to make execution faster, delete // if statement after including md inside Tensor auto weights_mem_p = this->AcquireMemory("@weights_mem_p_target"); - if (is_test_ && weights_mem_p) { + if (is_test && weights_mem_p) { return weights_mem_p; } else { const K* filter_data = filter->data(); @@ -277,15 +277,15 @@ class ConvTransposeMKLDNNHandlerT return this->template AcquireMemoryWithReorder( user_src_md, this->fwd_pd_->weights_desc(), - platform::to_void_cast(filter_data), "@weights_mem_p", is_test_, + platform::to_void_cast(filter_data), "@weights_mem_p", is_test, iohw2oihw_reorder); } } std::shared_ptr AcquireBiasMemoryWithReorder( - const framework::Tensor* bias) { + const framework::Tensor* bias, const bool& is_test) { auto bias_mem_p = this->AcquireMemory("@bias_mem_p_target"); - if (is_test_ && bias_mem_p) { + if (is_test && bias_mem_p) { return bias_mem_p; } else { const K* bias_data = bias->data(); @@ -294,12 +294,9 @@ class ConvTransposeMKLDNNHandlerT MKLDNNMemoryFormat::x); return this->AcquireMemoryWithReorder( user_bias_md, 
this->fwd_pd_->bias_desc(), - platform::to_void_cast(bias_data), "@bias_mem_p", is_test_); + platform::to_void_cast(bias_data), "@bias_mem_p", is_test); } } - - private: - const bool is_test_; }; template @@ -328,6 +325,8 @@ class ConvTransposeMKLDNNOpKernel : public framework::OpKernel { ctx.template device_context(); const auto& mkldnn_engine = dev_ctx.GetEngine(); + const bool is_test = ctx.Attr("is_test"); + const auto* input = ctx.Input("Input"); const auto* filter = ctx.Input("Filter"); const auto* bias = @@ -341,7 +340,7 @@ class ConvTransposeMKLDNNOpKernel : public framework::OpKernel { output, unique_name); auto src_memory_p = handler.AcquireSrcMemoryWithReorder(input); auto weights_memory_p = handler.AcquireWeightsMemoryWithReorder( - filter, ctx.Attr("groups")); + filter, ctx.Attr("groups"), is_test); std::shared_ptr dst_memory_p = handler.template AcquireDstMemory(output); @@ -353,7 +352,7 @@ class ConvTransposeMKLDNNOpKernel : public framework::OpKernel { {MKLDNN_ARG_DST, *dst_memory_p}}; if (bias) { - auto bias_memory_p = handler.AcquireBiasMemoryWithReorder(bias); + auto bias_memory_p = handler.AcquireBiasMemoryWithReorder(bias, is_test); args.insert({MKLDNN_ARG_BIAS, *bias_memory_p}); } auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); diff --git a/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc index 815af4eaaf1b37..819c0d15505ca9 100644 --- a/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc @@ -64,46 +64,81 @@ class QuantOpKernel : public framework::OpKernel { bool is_negative_input = ctx.Attr("is_negative_input"); bool bfloat16 = ctx.Attr("bfloat16"); - // TODO(jczaja): Refactor with Acquire API + std::string key = + platform::CreateKey(dev_ctx, src_tz, scale_data, scale_shift, + is_negative_input, ctx.OutputName("Output")); + key = platform::ExtendKeyWithThreadInfoIfNeeded(dev_ctx, key); + + const std::string key_prim = key + "@r"; + const std::string key_src_mem = key + "@s"; + const std::string key_dst_mem = key + "@d"; + std::shared_ptr src_memory; std::shared_ptr dst_memory; std::shared_ptr reorder_p; - - std::string out_layout = ctx.Attr("output_format"); - MKLDNNMemoryFormat out_format = - platform::data_format_to_memory_format(out_layout); - mkldnn::primitive_attr attri; - int mask = 0; - attri.set_output_scales(mask, {scale_data}); - - if (with_shift) { - mkldnn::post_ops post_operations; - post_operations.append_sum(); - attri.set_post_ops(post_operations); - uint8_t* output_data = output->mutable_data(ctx.GetPlace()); - // memset casts scale_shift to unsigned char (uint8_t) internally - std::memset(output_data, scale_shift, output->numel()); - } - - auto src_md = platform::MKLDNNMemDesc({src_tz}, memory::data_type::f32, - input->format()); - src_memory = std::make_shared(src_md, engine, - to_void_cast(input_data)); - - std::shared_ptr dst_md; - if (bfloat16) { - platform::SetDstMemoryQuantized( - ctx, output, dst_tz, engine, dst_md, dst_memory, out_format); - } else if (is_negative_input && !with_shift) { - platform::SetDstMemoryQuantized(ctx, output, dst_tz, engine, - dst_md, dst_memory, out_format); + reorder_p = std::static_pointer_cast(dev_ctx.GetBlob(key_prim)); + + if (reorder_p == nullptr) { + std::string out_layout = ctx.Attr("output_format"); + MKLDNNMemoryFormat out_format = + platform::data_format_to_memory_format(out_layout); + mkldnn::primitive_attr attri; + int mask = 0; + attri.set_output_scales(mask, 
{scale_data}); + + if (with_shift) { + mkldnn::post_ops post_operations; + post_operations.append_sum(); + attri.set_post_ops(post_operations); + uint8_t* output_data = output->mutable_data(ctx.GetPlace()); + // memset casts scale_shift to unsigned char (uint8_t) internally + std::memset(output_data, scale_shift, output->numel()); + } + + auto src_md = platform::MKLDNNMemDesc({src_tz}, memory::data_type::f32, + input->format()); + src_memory = std::make_shared( + src_md, engine, to_void_cast(input_data)); + + std::shared_ptr dst_md; + if (bfloat16) { + platform::SetDstMemoryQuantized( + ctx, output, dst_tz, engine, dst_md, dst_memory, out_format); + } else if (is_negative_input && !with_shift) { + platform::SetDstMemoryQuantized(ctx, output, dst_tz, engine, + dst_md, dst_memory, out_format); + } else { + platform::SetDstMemoryQuantized( + ctx, output, dst_tz, engine, dst_md, dst_memory, out_format); + } + auto reorder_pd = std::shared_ptr( + new reorder::primitive_desc(*src_memory, *dst_memory, attri)); + reorder_p = std::shared_ptr(new reorder(*reorder_pd)); + + dev_ctx.SetBlob(key_prim, reorder_p); + dev_ctx.SetBlob(key_src_mem, src_memory); + dev_ctx.SetBlob(key_dst_mem, dst_memory); } else { - platform::SetDstMemoryQuantized(ctx, output, dst_tz, engine, - dst_md, dst_memory, out_format); + src_memory = std::static_pointer_cast( + dev_ctx.GetBlob(key_src_mem)); + src_memory->set_data_handle(to_void_cast(input_data)); + + dst_memory = std::static_pointer_cast( + dev_ctx.GetBlob(key_dst_mem)); + auto place = ctx.GetPlace(); + + if (bfloat16) { + dst_memory->set_data_handle( + output->mutable_data(place)); + } else if (with_shift || !is_negative_input) { + uint8_t* output_data = output->mutable_data(ctx.GetPlace()); + if (with_shift) std::memset(output_data, scale_shift, output->numel()); + dst_memory->set_data_handle(output_data); + } else { + dst_memory->set_data_handle( + output->mutable_data(ctx.GetPlace())); + } } - auto reorder_pd = std::shared_ptr( - new reorder::primitive_desc(*src_memory, *dst_memory, attri)); - reorder_p = std::shared_ptr(new reorder(*reorder_pd)); auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); { diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 8c81db8c26b0be..587ad5f37e55e5 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -11,12 +11,6 @@ See the License for the specific language governing permissions and limitations under the License. 
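A side note on the caching pattern introduced in the quantize kernel above: the reorder primitive and its src/dst memories are stored in the device context under a key built from the source shape, scales, shift and output name, so later iterations only look the cached objects up and reset their data handles. A minimal standalone sketch of that get-or-create pattern follows; BlobStore, Reorder and GetOrCreateReorder are illustrative names for this sketch, not Paddle or oneDNN API.

#include <memory>
#include <string>
#include <unordered_map>
#include <utility>

// Hypothetical stand-in for the per-thread blob cache used by the kernel
// above; it is not the real MKLDNNDeviceContext interface.
class BlobStore {
 public:
  std::shared_ptr<void> Get(const std::string& key) const {
    auto it = blobs_.find(key);
    return it == blobs_.end() ? nullptr : it->second;
  }
  void Set(const std::string& key, std::shared_ptr<void> value) {
    blobs_[key] = std::move(value);
  }

 private:
  std::unordered_map<std::string, std::shared_ptr<void>> blobs_;
};

struct Reorder {};  // placeholder for the mkldnn::reorder primitive

// First call for a given key builds and caches the primitive; later calls
// only fetch it, and the caller just resets the memory data handles.
std::shared_ptr<Reorder> GetOrCreateReorder(BlobStore* store,
                                            const std::string& key) {
  auto cached = std::static_pointer_cast<Reorder>(store->Get(key));
  if (cached == nullptr) {
    cached = std::make_shared<Reorder>();
    store->Set(key, cached);
  }
  return cached;
}

int main() {
  BlobStore store;
  auto r1 = GetOrCreateReorder(&store, "1x3x224x224;0.5;Out");
  auto r2 = GetOrCreateReorder(&store, "1x3x224x224;0.5;Out");
  return r1 == r2 ? 0 : 1;  // same cached object on the second lookup
}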
*/ #include "paddle/fluid/platform/device_context.h" #include -#include -#ifdef _WIN32 -#include -#else -#include -#endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/memory/allocation/cuda_device_context_allocator.h" @@ -672,7 +666,7 @@ void MKLDNNDeviceContext::ResetBlobMap(void* ptr) { // of this executor for (auto& s : *p_exec_items_) { for (auto& v : (*s.second)[ptr]) { - (v.first)->second.erase(v.second); + (v.first)->erase(v.second); } s.second->erase(ptr); } @@ -683,27 +677,12 @@ void MKLDNNDeviceContext::ResetBlobMap(void* ptr) { } } -std::string MKLDNNDeviceContext::PickLeastUsedShape( - BlobPtr_t sb) const { - auto ancient_one = sb->begin(); - for (auto v = std::next(sb->begin()); v != sb->end(); ++v) { - if (v->second->first < ancient_one->second->first) { - ancient_one = v; - } - } - VLOG(2) << "num_shapes: " << sb->size() - << ", remove all blobs of shape: " << ancient_one->first; - return ancient_one->first; -} - -void MKLDNNDeviceContext::RemoveShapeEntriesWithExecutor( - std::string shape_to_be_removed) const { - p_exec_items_->erase(shape_to_be_removed); +void MKLDNNDeviceContext::RemoveShapeEntriesWithExecutor(void) const { + p_exec_items_->erase(p_exec_items_->begin()); } -void MKLDNNDeviceContext::LinkEntryWithExecutor( - BlobPtr_t> pblob, - KeyBlob::iterator it) const { +void MKLDNNDeviceContext::LinkEntryWithExecutor(BlobPtr_t pblob, + KeyBlob::iterator it) const { // Take current input shape from TLS // Take current executor addess from TLS // and for this executor's items add the one defined with arguments @@ -740,7 +719,7 @@ void MKLDNNDeviceContext::SetBlob(const std::string& name, BlobPtr_t data) const { BlobMap* pMap = p_blobmap_.get(); BlobPtr_t sBlob = nullptr; - BlobPtr_t> pBlob = nullptr; + BlobPtr_t pBlob = nullptr; int sid = tls().get_cur_mkldnn_session_id(); @@ -769,24 +748,22 @@ void MKLDNNDeviceContext::SetBlob(const std::string& name, sBlob->size() && (sBlob->size() >= static_cast(tls().cur_input_shape_cache_capacity))) { - auto shape_to_be_erased = PickLeastUsedShape(sBlob); - sBlob->erase(shape_to_be_erased); - RemoveShapeEntriesWithExecutor(shape_to_be_erased); + VLOG(2) << "sid=" << sid + << ", remove all blobs of shape: " << sBlob->begin()->first; + sBlob->erase(sBlob->begin()->first); + RemoveShapeEntriesWithExecutor(); } - pBlob = std::make_shared>(); - pBlob->first = __rdtsc(); + pBlob = std::make_shared(); (*sBlob)[tls().cur_input_shape_str] = pBlob; } else { pBlob = key_it->second; - // Update time stamp - pBlob->first = __rdtsc(); } // Find Blob via name - auto blob_it = pBlob->second.find(name); - if (blob_it == pBlob->second.end()) { - auto el = pBlob->second.insert( - std::make_pair(name, data)); // (*pBlob)[name] = data; + auto blob_it = pBlob->find(name); + if (blob_it == pBlob->end()) { + auto el = + pBlob->insert(std::make_pair(name, data)); // (*pBlob)[name] = data; // Register new element in per executor map // to have easily erased when executor terminated LinkEntryWithExecutor(pBlob, el.first); @@ -802,7 +779,7 @@ unsigned int MKLDNNDeviceContext::GetCachedObjectsNumber(void) const { unsigned int num_entries = 0; for (auto const& l3 : *p_blobmap_) { for (auto const& l2 : *(l3.second)) { - num_entries += (l2.second->second).size(); + num_entries += (l2.second)->size(); } } return num_entries; @@ -812,7 +789,7 @@ MKLDNNDeviceContext::BlobPtr_t MKLDNNDeviceContext::GetBlob( const std::string& name) const { BlobMap* pMap = p_blobmap_.get(); BlobPtr_t sBlob = nullptr; - BlobPtr_t> pBlob = nullptr; + 
BlobPtr_t pBlob = nullptr; int sid = tls().get_cur_mkldnn_session_id(); @@ -836,14 +813,12 @@ MKLDNNDeviceContext::BlobPtr_t MKLDNNDeviceContext::GetBlob( pBlob = sBlob_it->second; // Find Blob via name - auto key_it = pBlob->second.find(name); + auto key_it = pBlob->find(name); - if (key_it == pBlob->second.end()) { + if (key_it == pBlob->end()) { VLOG(2) << "GetBlob sid=" << sid << ", miss blob=" << name << "\n"; return nullptr; } - // Update timestamp - sBlob_it->second->first = __rdtsc(); // TODO(windows) VLOG(2) << "GetBlob sid=" << sid << ", get blob=" << name << "\n"; // lock will be automatically released when out of scope diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index ee6bbbf23778db..13a1040dd19df2 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -757,20 +757,18 @@ class MKLDNNDeviceContext : public CPUDeviceContext { // Following three maps are used to cache MKLDNN primitives. // There relations are: // - BlobMap = Map - // - ShapeBlob = Map> + // - ShapeBlob = Map // - KeyBlob = Map using KeyBlob = umap_key_string_t; - using ShapeBlob = umap_key_string_t>; + using ShapeBlob = umap_key_string_t; using BlobMap = umap_value_smart_t; // Auxillary two-level structure (shape, executor) to easier control // clearing cache objects related to specific executor using ExecKey = void*; - using ExecMapCacheIterPair = - std::pair>, - KeyBlob::iterator>; + using ExecMapCacheIterPair = std::pair, KeyBlob::iterator>; using ExecMap = std::unordered_map>; using ExecShape = std::unordered_map>; @@ -781,11 +779,8 @@ class MKLDNNDeviceContext : public CPUDeviceContext { const mkldnn::engine& GetEngine() const { return tls().get_engine(); } // Register object to currently used executor's map - void LinkEntryWithExecutor( - BlobPtr_t> pblob, - KeyBlob::iterator it) const; - void RemoveShapeEntriesWithExecutor(std::string) const; - std::string PickLeastUsedShape(BlobPtr_t sb) const; + void LinkEntryWithExecutor(BlobPtr_t, KeyBlob::iterator) const; + void RemoveShapeEntriesWithExecutor(void) const; // Remove all entries from the blob map void ResetBlobMap(void* ptr); diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index 5d725307e59208..084b47bb3c7a3b 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -500,9 +500,18 @@ class MKLDNNHandlerT { } void AcquireReorder(const std::shared_ptr& user_memory_p, - const std::shared_ptr& target_memory_p) { - auto reorder_p = - std::make_shared(*user_memory_p, *target_memory_p); + const std::shared_ptr& target_memory_p, + const std::string& suffix) { + const auto key_reorder_p = key_ + suffix + "reorder_p"; + + auto reorder_p = std::static_pointer_cast( + dev_ctx_.GetBlob(key_reorder_p)); + + if (reorder_p == nullptr) { + reorder_p = + std::make_shared(*user_memory_p, *target_memory_p); + dev_ctx_.SetBlob(key_reorder_p, reorder_p); + } auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); @@ -569,8 +578,6 @@ class MKLDNNHandlerT { std::static_pointer_cast(dev_ctx_.GetBlob(user_key)); user_memory_p->set_data_handle(ptr); - // TODO(jczaja): Here we detect if reorder is cached it means it is needed - // need to change this to get rid of keys auto reorder_p = std::static_pointer_cast( dev_ctx_.GetBlob(key_reorder_p)); if (reorder_p != nullptr) { From bed4fb2702345d330fc5813cf8b4ecca2ce713f6 Mon Sep 17 00:00:00 2001 From: zhulei <563755780@qq.com> Date: Thu, 14 Oct 2021 
15:25:43 +0800 Subject: [PATCH 162/298] [NPU] Add density_prior_box (#36361) * [NPU] Add density_prior_box op * [NPU] Add density_prior_box op --- .../fluid/operators/detection/CMakeLists.txt | 3 +- .../detection/density_prior_box_op_npu.cc | 379 ++++++++++++++++++ .../npu/test_density_prior_box_op_npu.py | 196 +++++++++ 3 files changed, 577 insertions(+), 1 deletion(-) create mode 100644 paddle/fluid/operators/detection/density_prior_box_op_npu.cc create mode 100644 python/paddle/fluid/tests/unittests/npu/test_density_prior_box_op_npu.py diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt index 4e951f6318cc9c..871240aa15fce0 100644 --- a/paddle/fluid/operators/detection/CMakeLists.txt +++ b/paddle/fluid/operators/detection/CMakeLists.txt @@ -17,14 +17,15 @@ endfunction() if (WITH_ASCEND_CL) detection_library(box_coder_op SRCS box_coder_op.cc box_coder_op.cu box_coder_op_npu.cc) + detection_library(density_prior_box_op SRCS density_prior_box_op.cc density_prior_box_op.cu density_prior_box_op_npu.cc) else() detection_library(box_coder_op SRCS box_coder_op.cc box_coder_op.cu) + detection_library(density_prior_box_op SRCS density_prior_box_op.cc density_prior_box_op.cu) endif() detection_library(bipartite_match_op SRCS bipartite_match_op.cc) detection_library(mine_hard_examples_op SRCS mine_hard_examples_op.cc) detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op.cu) -detection_library(density_prior_box_op SRCS density_prior_box_op.cc density_prior_box_op.cu) detection_library(anchor_generator_op SRCS anchor_generator_op.cc anchor_generator_op.cu) detection_library(target_assign_op SRCS target_assign_op.cc diff --git a/paddle/fluid/operators/detection/density_prior_box_op_npu.cc b/paddle/fluid/operators/detection/density_prior_box_op_npu.cc new file mode 100644 index 00000000000000..cb58640056438b --- /dev/null +++ b/paddle/fluid/operators/detection/density_prior_box_op_npu.cc @@ -0,0 +1,379 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
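Before the kernel itself, a quick sanity check of the geometry it produces, mirroring the defaults of the unit test added below (densities {4, 2, 1}, a single fixed ratio, 17x17 feature map); this is a standalone sketch, not part of the op:

#include <cstdio>
#include <vector>

// Each feature-map location gets len(fixed_ratios) * sum(density^2) priors.
int main() {
  std::vector<int> densities = {4, 2, 1};
  int ratios_size = 1;
  int num_priors = 0;
  for (int d : densities) num_priors += ratios_size * d * d;  // 16 + 4 + 1
  std::printf("priors per location = %d\n", num_priors);      // 21
  // Boxes/Variances are laid out as (layer_h, layer_w, num_priors, 4),
  // i.e. (17, 17, 21, 4) for this configuration.
  return 0;
}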
*/ + +#include "paddle/fluid/operators/detection/density_prior_box_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using fp16 = paddle::platform::float16; + +template +struct DensityPriorBoxFunction { + public: + explicit DensityPriorBoxFunction(const framework::ExecutionContext& ctx) + : ctx(ctx) { + place = ctx.GetPlace(); + stream = ctx.template device_context().stream(); + t0.mutable_data({1}, place); + t1.mutable_data({1}, place); + tn.mutable_data({1}, place); + FillNpuTensorWithConstant(&t0, static_cast(0)); + FillNpuTensorWithConstant(&t1, static_cast(1)); + } + void Arange(int n, Tensor* x) { + // x should be init first + FillNpuTensorWithConstant(&tn, static_cast(n)); + const auto& runner = NpuOpRunner("Range", {t0, tn, t1}, {*x}, {}); + runner.Run(stream); + } + void Add(const Tensor* x, const Tensor* y, Tensor* z) { + // z should be init first + const auto& runner = NpuOpRunner("AddV2", {*x, *y}, {*z}, {}); + runner.Run(stream); + } + void Cast(const Tensor* x, Tensor* y) { + auto dst_dtype = ConvertToNpuDtype(y->type()); + const auto& runner = NpuOpRunner( + "Cast", {*x}, {*y}, {{"dst_type", static_cast(dst_dtype)}}); + runner.Run(stream); + } + void Sub(const Tensor* x, const Tensor* y, Tensor* z) { + // z should be init first + const auto& runner = NpuOpRunner("Sub", {*x, *y}, {*z}, {}); + runner.Run(stream); + } + void Mul(const Tensor* x, const Tensor* y, Tensor* z) { + // y should be init first + const auto& runner = NpuOpRunner("Mul", {*x, *y}, {*z}, {}); + runner.Run(stream); + } + void Adds(const Tensor* x, float scalar, Tensor* y) { + // y should be init first + const auto& runner = NpuOpRunner("Adds", {*x}, {*y}, {{"value", scalar}}); + runner.Run(stream); + } + void Muls(const Tensor* x, float scalar, Tensor* y) { + // y should be init first + const auto& runner = NpuOpRunner("Muls", {*x}, {*y}, {{"value", scalar}}); + runner.Run(stream); + } + void Maximum(const Tensor* x, const Tensor* y, Tensor* z) { + // y should be init first + const auto& runner = NpuOpRunner("Maximum", {*x, *y}, {*z}, {}); + runner.Run(stream); + } + void Minimum(const Tensor* x, const Tensor* y, Tensor* z) { + // y should be init first + const auto& runner = NpuOpRunner("Minimum", {*x, *y}, {*z}, {}); + runner.Run(stream); + } + void Concat(const std::vector& inputs, int axis, Tensor* output) { + // output should be init first + std::vector names; + for (size_t i = 0; i < inputs.size(); i++) { + names.push_back("x" + std::to_string(i)); + } + NpuOpRunner runner{ + "ConcatD", + {inputs}, + {*output}, + {{"concat_dim", axis}, {"N", static_cast(inputs.size())}}}; + runner.AddInputNames(names); + runner.Run(stream); + } + void Tile(const Tensor* x, Tensor* y, const std::vector& multiples) { + // y should be init first + if (x->dims() == y->dims()) { + framework::TensorCopy( + *x, place, ctx.template device_context(), + y); + return; + } + const auto& runner = + NpuOpRunner("TileD", {*x}, {*y}, {{"multiples", multiples}}); + runner.Run(stream); + } + void FloatVec2Tsr(const std::vector& vec, Tensor* tsr_dst) { + // + framework::TensorFromVector(vec, ctx.device_context(), tsr_dst); + ctx.template device_context().Wait(); + } + + private: + platform::Place place; + aclrtStream stream; + const framework::ExecutionContext& ctx; + Tensor t0; + Tensor t1; + Tensor tn; +}; + +template <> +void DensityPriorBoxFunction::Arange(int n, Tensor* x) { + Tensor x_fp32(framework::proto::VarType::FP32); + 
x_fp32.mutable_data(x->dims(), place); + FillNpuTensorWithConstant(&tn, static_cast(n)); + const auto& runner = NpuOpRunner("Range", {t0, tn, t1}, {x_fp32}, {}); + runner.Run(stream); + Cast(&x_fp32, x); +} + +template <> +void DensityPriorBoxFunction::FloatVec2Tsr(const std::vector& vec, + Tensor* tsr_dst) { + Tensor tsr_fp32(framework::proto::VarType::FP32); + tsr_fp32.mutable_data(tsr_dst->dims(), place); + framework::TensorFromVector(vec, ctx.device_context(), &tsr_fp32); + ctx.template device_context().Wait(); + Cast(&tsr_fp32, tsr_dst); +} + +template +class DensityPriorBoxOpNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("Input"); + auto* image = ctx.Input("Image"); + auto* boxes = ctx.Output("Boxes"); + auto* vars = ctx.Output("Variances"); + + auto variances = ctx.Attr>("variances"); + auto clip = ctx.Attr("clip"); + + auto fixed_sizes = ctx.Attr>("fixed_sizes"); + auto fixed_ratios = ctx.Attr>("fixed_ratios"); + auto densities = ctx.Attr>("densities"); + + float step_w = ctx.Attr("step_w"); + float step_h = ctx.Attr("step_h"); + float offset = ctx.Attr("offset"); + + int image_w = image->dims()[3]; + int image_h = image->dims()[2]; + int layer_w = input->dims()[3]; + int layer_h = input->dims()[2]; + + auto _type = input->type(); + auto place = ctx.GetPlace(); + DensityPriorBoxFunction F(ctx); + + Tensor h(_type); + h.mutable_data({layer_h}, place); + Tensor w(_type); + w.mutable_data({layer_w}, place); + F.Arange(layer_h, &h); + F.Arange(layer_w, &w); + h.Resize({layer_h, 1, 1, 1}); + w.Resize({1, layer_w, 1, 1}); + + step_w = step_w > 0 ? step_w : static_cast(image_w) / layer_w; + step_h = step_h > 0 ? step_h : static_cast(image_h) / layer_h; + int step_average = static_cast((step_w + step_h) * 0.5); + + int ratios_size = fixed_ratios.size(); + int num_priors_per_ratio = 0; + for (size_t i = 0; i < densities.size(); ++i) { + num_priors_per_ratio += densities[i] * densities[i]; + } + Tensor di(_type); + Tensor dj(_type); + Tensor shifts(_type); + Tensor box_w_ratio(_type); + Tensor box_h_ratio(_type); + di.mutable_data({ratios_size * num_priors_per_ratio}, place); + dj.mutable_data({ratios_size * num_priors_per_ratio}, place); + shifts.mutable_data({ratios_size * num_priors_per_ratio}, place); + box_w_ratio.mutable_data({ratios_size * num_priors_per_ratio}, place); + box_h_ratio.mutable_data({ratios_size * num_priors_per_ratio}, place); + + int64_t start = 0; + std::vector vec_tile = {0, 0, 0}; + for (size_t i = 0; i < densities.size(); ++i) { + // Range = start:start+ratios_size*density_sqr, density = densities[i] + int density_sqr = densities[i] * densities[i]; + // shifts[Range] = [step_average/density]*ratios_size*density_sqr + Tensor shifts_part = + shifts.Slice(start, start + ratios_size * density_sqr); + FillNpuTensorWithConstant(&shifts_part, + static_cast(step_average / densities[i])); + + // di[Range] = [ i // density for i in range(density_sqr) ] * ratios_size + // dj[Range] = [ i % density for i in range(density_sqr) ] * ratios_size + Tensor di_part = di.Slice(start, start + ratios_size * density_sqr); + Tensor dj_part = dj.Slice(start, start + ratios_size * density_sqr); + if (densities[i] > 1) { + di_part.Resize({ratios_size, densities[i], densities[i]}); + dj_part.Resize({ratios_size, densities[i], densities[i]}); + Tensor range_n(_type); + range_n.mutable_data({densities[i]}, place); + F.Arange(densities[i], &range_n); + range_n.Resize({1, densities[i], 1}); + 
vec_tile[0] = ratios_size; + vec_tile[1] = 1; + vec_tile[2] = densities[i]; + F.Tile(&range_n, &di_part, vec_tile); + range_n.Resize({1, 1, densities[i]}); + vec_tile[1] = densities[i]; + vec_tile[2] = 1; + F.Tile(&range_n, &dj_part, vec_tile); + } else { + FillNpuTensorWithConstant(&di_part, static_cast(0)); + FillNpuTensorWithConstant(&dj_part, static_cast(0)); + } + + int start_box_ratio = start; + for (float ar : fixed_ratios) { + // Range_mini = start_box_ratio:start_box_ratio+density_sqr + // box_h_ratio[Range_mini] = [fixed_sizes[i] * sqrt(ar)] * density_sqr + // box_w_ratio[Range_mini] = [fixed_sizes[i] / sqrt(ar)] * density_sqr + Tensor box_h_ratio_part = + box_h_ratio.Slice(start_box_ratio, start_box_ratio + density_sqr); + Tensor box_w_ratio_part = + box_w_ratio.Slice(start_box_ratio, start_box_ratio + density_sqr); + FillNpuTensorWithConstant(&box_w_ratio_part, + static_cast(fixed_sizes[i] * sqrt(ar))); + FillNpuTensorWithConstant(&box_h_ratio_part, + static_cast(fixed_sizes[i] / sqrt(ar))); + start_box_ratio += density_sqr; + } + start = start_box_ratio; + } + di.Resize({1, 1, ratios_size * num_priors_per_ratio, 1}); + dj.Resize({1, 1, ratios_size * num_priors_per_ratio, 1}); + shifts.Resize({1, 1, ratios_size * num_priors_per_ratio, 1}); + box_w_ratio.Resize({1, 1, ratios_size * num_priors_per_ratio, 1}); + box_h_ratio.Resize({1, 1, ratios_size * num_priors_per_ratio, 1}); + + // c_x = (w+offset)*step_w - 0.5*step_average + 0.5*shifts + dj*shifts + // c_y = (h+offset)*step_h - 0.5*step_average + 0.5*shifts + di*shifts + Tensor c_x(_type); + Tensor c_y(_type); + auto dim0 = framework::make_ddim( + {1, layer_w, ratios_size * num_priors_per_ratio, 1}); + auto dim1 = framework::make_ddim( + {layer_h, 1, ratios_size * num_priors_per_ratio, 1}); + c_x.mutable_data(dim0, place); + c_y.mutable_data(dim1, place); + F.Adds(&w, offset, &w); + F.Muls(&w, step_w, &w); + F.Adds(&w, static_cast(-step_average) * static_cast(0.5), &w); + F.Adds(&h, offset, &h); + F.Muls(&h, step_h, &h); + F.Adds(&h, static_cast(-step_average) * static_cast(0.5), &h); + F.Mul(&di, &shifts, &di); + F.Mul(&dj, &shifts, &dj); + F.Muls(&shifts, static_cast(0.5), &shifts); + F.Add(&di, &shifts, &di); + F.Add(&dj, &shifts, &dj); + F.Add(&dj, &w, &c_x); + F.Add(&di, &h, &c_y); + + // box_w_ratio = box_w_ratio / 2 + // box_h_ratio = box_h_ratio / 2 + F.Muls(&box_w_ratio, static_cast(0.5), &box_w_ratio); + F.Muls(&box_h_ratio, static_cast(0.5), &box_h_ratio); + + Tensor zero_t(_type); + Tensor one_t(_type); + zero_t.mutable_data({1}, place); + one_t.mutable_data({1}, place); + FillNpuTensorWithConstant(&zero_t, static_cast(0)); + FillNpuTensorWithConstant(&one_t, static_cast(1)); + + Tensor outbox0(_type); + Tensor outbox1(_type); + Tensor outbox2(_type); + Tensor outbox3(_type); + outbox0.mutable_data(dim0, place); + outbox1.mutable_data(dim1, place); + outbox2.mutable_data(dim0, place); + outbox3.mutable_data(dim1, place); + + // outbox0 = max ( (c_x - box_w_ratio)/image_w, 0 ) + // outbox1 = max ( (c_y - box_h_ratio)/image_h, 0 ) + // outbox2 = min ( (c_x + box_w_ratio)/image_w, 1 ) + // outbox3 = min ( (c_y + box_h_ratio)/image_h, 1 ) + F.Sub(&c_x, &box_w_ratio, &outbox0); + F.Sub(&c_y, &box_h_ratio, &outbox1); + F.Add(&c_x, &box_w_ratio, &outbox2); + F.Add(&c_y, &box_h_ratio, &outbox3); + F.Muls(&outbox0, static_cast(1.0 / image_w), &outbox0); + F.Muls(&outbox1, static_cast(1.0 / image_h), &outbox1); + F.Muls(&outbox2, static_cast(1.0 / image_w), &outbox2); + F.Muls(&outbox3, static_cast(1.0 / image_h), &outbox3); 
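// Worked instance of the formulas above, using the defaults of the unit test
// added below (533x533 image, 17x17 feature map, fixed_size 32 with density 4,
// ratio 1, offset 0.5); values are illustrative only:
//   step_w = step_h = 533 / 17 = 31.35..., step_average = 31, shift = 31 / 4 = 7
//   half box extents after the Muls(0.5): 32 * sqrt(1) / 2 = 32 / sqrt(1) / 2 = 16
//   cell (h = 0, w = 0), sub-position (di = 0, dj = 0):
//     c_x = c_y = 0.5 * 31.35 - 0.5 * 31 + 0.5 * 7 + 0 * 7 = 3.68
//     xmin = ymin = max((3.68 - 16) / 533, 0) = 0
//     xmax = ymax = min((3.68 + 16) / 533, 1) = 0.0369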
+ + F.Maximum(&outbox0, &zero_t, &outbox0); + F.Maximum(&outbox1, &zero_t, &outbox1); + F.Minimum(&outbox2, &one_t, &outbox2); + F.Minimum(&outbox3, &one_t, &outbox3); + if (clip) { + // outbox0 = min ( outbox0, 1 ) + // outbox1 = min ( outbox1, 1 ) + // outbox2 = max ( outbox2, 0 ) + // outbox3 = max ( outbox3, 0 ) + F.Minimum(&outbox0, &one_t, &outbox0); + F.Minimum(&outbox1, &one_t, &outbox1); + F.Maximum(&outbox2, &zero_t, &outbox2); + F.Maximum(&outbox3, &zero_t, &outbox3); + } + + auto out_dim = framework::make_ddim( + {layer_h, layer_w, ratios_size * num_priors_per_ratio, 4}); + boxes->mutable_data(place); + vars->mutable_data(place); + Tensor boxes_share(_type); + Tensor vars_share(_type); + boxes_share.ShareDataWith(*boxes); + boxes_share.Resize(out_dim); + vars_share.ShareDataWith(*vars); + vars_share.Resize(out_dim); + + Tensor box0(_type); + Tensor box1(_type); + Tensor box2(_type); + Tensor box3(_type); + // out_dim = {layer_h, layer_w, ratios_size*num_priors_per_ratio, 1} + out_dim[3] = 1; + box0.mutable_data(out_dim, place); + box1.mutable_data(out_dim, place); + box2.mutable_data(out_dim, place); + box3.mutable_data(out_dim, place); + + std::vector vec_exp_out02 = {layer_h, 1, 1, 1}; + std::vector vec_exp_out13 = {1, layer_w, 1, 1}; + F.Tile(&outbox0, &box0, vec_exp_out02); + F.Tile(&outbox1, &box1, vec_exp_out13); + F.Tile(&outbox2, &box2, vec_exp_out02); + F.Tile(&outbox3, &box3, vec_exp_out13); + F.Concat({box0, box1, box2, box3}, 3, &boxes_share); + + std::vector multiples = {layer_h, layer_w, + ratios_size * num_priors_per_ratio, 1}; + Tensor variances_t(_type); + // variances.size() == 4 + variances_t.mutable_data({4}, place); + F.FloatVec2Tsr(variances, &variances_t); + F.Tile(&variances_t, &vars_share, multiples); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(density_prior_box, + ops::DensityPriorBoxOpNPUKernel, + ops::DensityPriorBoxOpNPUKernel); diff --git a/python/paddle/fluid/tests/unittests/npu/test_density_prior_box_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_density_prior_box_op_npu.py new file mode 100644 index 00000000000000..a190aa9b6f2be5 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_density_prior_box_op_npu.py @@ -0,0 +1,196 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import numpy as np +import sys +sys.path.append("..") +import math +import paddle +from op_test import OpTest + +paddle.enable_static() + +np.random.seed(2021) + + +class TestNpuDensityPriorBoxOp(OpTest): + def set_data(self): + self.init_test_params() + self.init_test_input() + self.init_test_output() + #self.init_test_output2() + self.inputs = {'Input': self.input, 'Image': self.image} + + self.attrs = { + 'variances': self.variances, + 'clip': self.clip, + 'step_w': self.step_w, + 'step_h': self.step_h, + 'offset': self.offset, + 'densities': self.densities, + 'fixed_sizes': self.fixed_sizes, + 'fixed_ratios': self.fixed_ratios, + 'flatten_to_2d': self.flatten_to_2d + } + self.outputs = {'Boxes': self.out_boxes, 'Variances': self.out_var} + + def test_check_output(self): + self.check_output_with_place(self.place, atol=self.atol) + + def setUp(self): + self.__class__.use_npu = True + self.op_type = 'density_prior_box' + self.place = paddle.NPUPlace(0) + self.init_dtype() + self.set_data() + + def init_dtype(self): + self.dtype = np.float32 + + def set_density(self): + self.densities = [4, 2, 1] + self.fixed_sizes = [32.0, 64.0, 128.0] + self.fixed_ratios = [1.0] + self.layer_w = 17 + self.layer_h = 17 + self.image_w = 533 + self.image_h = 533 + self.flatten_to_2d = False + + def init_test_params(self): + self.set_density() + + self.step_w = float(self.image_w) / float(self.layer_w) + self.step_h = float(self.image_h) / float(self.layer_h) + + self.input_channels = 2 + self.image_channels = 3 + self.batch_size = 10 + + self.variances = [0.1, 0.1, 0.2, 0.2] + self.variances = np.array(self.variances, dtype=np.float).flatten() + + self.clip = True + self.num_priors = 0 + if len(self.fixed_sizes) > 0 and len(self.densities) > 0: + for density in self.densities: + if len(self.fixed_ratios) > 0: + self.num_priors += len(self.fixed_ratios) * (pow(density, + 2)) + self.offset = 0.5 + self.atol = 1e-5 + + def init_test_input(self): + self.image = np.random.random( + (self.batch_size, self.image_channels, self.image_h, + self.image_w)).astype(self.dtype) + + self.input = np.random.random( + (self.batch_size, self.input_channels, self.layer_h, + self.layer_w)).astype(self.dtype) + + def init_test_output(self): + out_dim = (self.layer_h, self.layer_w, self.num_priors, 4) + out_boxes = np.zeros(out_dim).astype(self.dtype) + out_var = np.zeros(out_dim).astype(self.dtype) + + step_average = int((self.step_w + self.step_h) * 0.5) + for h in range(self.layer_h): + for w in range(self.layer_w): + idx = 0 + c_x = (w + self.offset) * self.step_w + c_y = (h + self.offset) * self.step_h + # Generate density prior boxes with fixed size + for density, fixed_size in zip(self.densities, + self.fixed_sizes): + if (len(self.fixed_ratios) > 0): + for ar in self.fixed_ratios: + shift = int(step_average / density) + box_width_ratio = fixed_size * math.sqrt(ar) + box_height_ratio = fixed_size / math.sqrt(ar) + for di in range(density): + for dj in range(density): + c_x_temp = c_x - step_average / 2.0 + shift / 2.0 + dj * shift + c_y_temp = c_y - step_average / 2.0 + shift / 2.0 + di * shift + out_boxes[h, w, idx, :] = [ + max((c_x_temp - box_width_ratio / 2.0) / + self.image_w, 0), + max((c_y_temp - box_height_ratio / 2.0) + / self.image_h, 0), + min((c_x_temp + box_width_ratio / 2.0) / + self.image_w, 1), + min((c_y_temp + box_height_ratio / 2.0) + / self.image_h, 1) + ] + idx += 1 + if self.clip: + out_boxes = np.clip(out_boxes, 0.0, 1.0) + out_var = 
np.tile(self.variances, + (self.layer_h, self.layer_w, self.num_priors, 1)) + self.out_boxes = out_boxes.astype(self.dtype) + self.out_var = out_var.astype(self.dtype) + if self.flatten_to_2d: + self.out_boxes = self.out_boxes.reshape((-1, 4)) + self.out_var = self.out_var.reshape((-1, 4)) + + +class TestNpuDensityPriorBoxFlatten(TestNpuDensityPriorBoxOp): + def set_density(self): + self.densities = [3, 4] + self.fixed_sizes = [1.0, 2.0] + self.fixed_ratios = [1.0] + self.layer_w = 32 + self.layer_h = 32 + self.image_w = 40 + self.image_h = 40 + self.flatten_to_2d = True + + +class TestNpuDensityPriorBoxOp1(TestNpuDensityPriorBoxOp): + def set_density(self): + super(TestNpuDensityPriorBoxOp1, self).set_density() + self.layer_w = 1 + self.layer_h = 1 + + +class TestNpuDensityPriorBoxOp2(TestNpuDensityPriorBoxOp): + def set_density(self): + super(TestNpuDensityPriorBoxOp2, self).set_density() + self.layer_w = 15 + self.layer_h = 17 + self.image_w = 533 + self.image_h = 532 + + +class TestNpuDensityPriorBoxOp3(TestNpuDensityPriorBoxOp): + def set_density(self): + super(TestNpuDensityPriorBoxOp3, self).set_density() + self.fixed_ratios = [1.0, 4.0] + + +class TestNpuDensityPriorBoxOpFP16(TestNpuDensityPriorBoxOp): + def init_dtype(self): + self.dtype = np.float16 + + def init_test_params(self): + super(TestNpuDensityPriorBoxOpFP16, self).init_test_params() + self.atol = 1e-3 + self.clip = False + + +if __name__ == '__main__': + unittest.main() From 12e6dbbcf3effc97ca427b75143255e590e7ee96 Mon Sep 17 00:00:00 2001 From: Zhang Zheng <32410583+ZzSean@users.noreply.github.com> Date: Thu, 14 Oct 2021 15:27:28 +0800 Subject: [PATCH 163/298] Add the complete code and related files of resnet_unit_op (#36366) --- cmake/operators.cmake | 2 +- paddle/fluid/operators/fused/CMakeLists.txt | 6 +- .../operators/fused/cudnn_bn_add_relu_test.cc | 6 +- .../operators/fused/cudnn_fusion_helper.h | 10 +- .../fused/cudnn_scale_bias_add_relu.cu.h | 35 +- .../fluid/operators/fused/resnet_unit_op.cc | 410 ++++++++++++++++++ .../fluid/operators/fused/resnet_unit_op.cu | 298 +++++++++++++ .../contrib/mixed_precision/fp16_utils.py | 41 +- 8 files changed, 768 insertions(+), 40 deletions(-) create mode 100644 paddle/fluid/operators/fused/resnet_unit_op.cc create mode 100644 paddle/fluid/operators/fused/resnet_unit_op.cu diff --git a/cmake/operators.cmake b/cmake/operators.cmake index 7541b234ceaa69..228da9f77739d7 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -216,7 +216,7 @@ function(op_library TARGET) "fusion_transpose_flatten_concat_op" "fusion_conv_inception_op" "sync_batch_norm_op" "sparse_attention_op" "dgc_op" "fused_fc_elementwise_layernorm_op" "skip_layernorm_op" "multihead_matmul_op" "fusion_group_op" "fused_bn_activation_op" "fused_embedding_eltwise_layernorm_op" "fusion_gru_op" "fusion_lstm_op" -"fused_bn_add_activation_op") +"fused_bn_add_activation_op" "resnet_unit_op") if ("${TARGET}" STREQUAL "${manual_pybind_op}") set(pybind_flag 1) endif() diff --git a/paddle/fluid/operators/fused/CMakeLists.txt b/paddle/fluid/operators/fused/CMakeLists.txt index 2630c12db2fc9a..2286aaaf85969f 100644 --- a/paddle/fluid/operators/fused/CMakeLists.txt +++ b/paddle/fluid/operators/fused/CMakeLists.txt @@ -16,7 +16,8 @@ register_operators(EXCLUDES fusion_gru_op fusion_lstm_op fused_bn_add_activation_op - fused_transformer_op) + fused_transformer_op + resnet_unit_op) # fusion_gru_op does not have CUDA kernel op_library(fusion_gru_op) @@ -78,7 +79,10 @@ if (WITH_GPU OR WITH_ROCM) 
nv_test(test_fused_dropout_act_bias SRCS fused_dropout_act_bias_test.cu DEPS tensor op_registry dropout_op layer_norm_op device_context generator memory) nv_test(test_fused_layernorm_residual_dropout_bias SRCS fused_layernorm_residual_dropout_bias_test.cu DEPS tensor op_registry dropout_op layer_norm_op device_context generator memory) endif() + # resnet_unit needs cudnn 8.0 above if ((NOT WITH_ROCM) AND (NOT ${CUDNN_VERSION} VERSION_LESS 8000)) + op_library(resnet_unit_op) + file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(resnet_unit);\n") cc_test(test_cudnn_norm_conv SRCS cudnn_norm_conv_test.cc DEPS conv_op blas im2col vol2col depthwise_conv eigen_function tensor op_registry device_context generator memory) cc_test(test_cudnn_bn_add_relu SRCS cudnn_bn_add_relu_test.cc DEPS batch_norm_op fused_bn_add_activation_op tensor op_registry device_context generator memory) endif() diff --git a/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc b/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc index 709d69214c603f..c5995fe3554b4e 100644 --- a/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc +++ b/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc @@ -631,8 +631,8 @@ class CudnnBNAddReluTester { op::CudnnScaleBiasAddRelu sbar_op(ctx, act_type_, fuse_add_, has_shortcut_, data_shape, param_shape, bitmask_shape); - sbar_op.Forward(ctx, x, equiv_scale_x, equiv_bias_x, z, equiv_scale_z, - equiv_bias_z, &y, &bitmask); + sbar_op.Forward(ctx, x, equiv_scale_x, equiv_bias_x, &z, &equiv_scale_z, + &equiv_bias_z, &y, &bitmask); TensorCopySync(mean_x, platform::CPUPlace(), cpu_mean_x); TensorCopySync(var_x, platform::CPUPlace(), cpu_var_x); @@ -690,7 +690,7 @@ class CudnnBNAddReluTester { op::CudnnScaleBiasAddRelu sbar_op(ctx, act_type, true, false, data_shape, param_shape, bitmask_shape); sbar_op.Backward(ctx, dy, x, bn_scale, bn_bias, saved_mean, saved_var, - bitmask, &dx, &dz, &dscale, &dbias, eps_); + &bitmask, &dx, &dz, &dscale, &dbias, eps_); TensorCopySync(dx, platform::CPUPlace(), cpu_dx); TensorCopySync(dz, platform::CPUPlace(), cpu_dz); diff --git a/paddle/fluid/operators/fused/cudnn_fusion_helper.h b/paddle/fluid/operators/fused/cudnn_fusion_helper.h index fcd354df938ace..1de64cf5ad947d 100644 --- a/paddle/fluid/operators/fused/cudnn_fusion_helper.h +++ b/paddle/fluid/operators/fused/cudnn_fusion_helper.h @@ -38,10 +38,12 @@ class CudnnFusionOp { &op_variant_params_, op_id)); } - ~CudnnFusionOp() { - dynload::cudnnDestroyFusedOpsVariantParamPack(op_variant_params_); - dynload::cudnnDestroyFusedOpsConstParamPack(op_const_params_); - dynload::cudnnDestroyFusedOpsPlan(op_); + ~CudnnFusionOp() PADDLE_MAY_THROW { + PADDLE_ENFORCE_CUDA_SUCCESS( + dynload::cudnnDestroyFusedOpsVariantParamPack(op_variant_params_)); + PADDLE_ENFORCE_CUDA_SUCCESS( + dynload::cudnnDestroyFusedOpsConstParamPack(op_const_params_)); + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroyFusedOpsPlan(op_)); } // Execute fused op diff --git a/paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h b/paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h index b48c964d264add..5166ff27234f23 100644 --- a/paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h +++ b/paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h @@ -94,13 +94,13 @@ template class CudnnScaleBiasAddRelu { public: CudnnScaleBiasAddRelu(const platform::CUDADeviceContext &ctx, - const std::string &act_type, bool fused_add, + const std::string &act_type, bool fuse_add, bool has_shortcut, const std::vector &data_shape, const 
std::vector ¶m_shape, const std::vector &bitmask_shape) : fwd_op_(CUDNN_FUSED_SCALE_BIAS_ADD_ACTIVATION_GEN_BITMASK), bwd_op_(CUDNN_FUSED_DACTIVATION_FORK_DBATCHNORM) { - fused_add_ = fused_add; + fuse_add_ = fuse_add; has_shortcut_ = has_shortcut; args_.Set(act_type, data_shape, param_shape, bitmask_shape); } @@ -108,8 +108,8 @@ class CudnnScaleBiasAddRelu { ~CudnnScaleBiasAddRelu() {} void Forward(const platform::CUDADeviceContext &ctx, const Tensor &x, - const Tensor &x_scale, const Tensor &x_bias, const Tensor &z, - const Tensor &z_scale, const Tensor &z_bias, Tensor *out, + const Tensor &x_scale, const Tensor &x_bias, const Tensor *z, + const Tensor *z_scale, const Tensor *z_bias, Tensor *out, Tensor *bitmask) { ForwardInit(ctx); auto handle = ctx.cudnn_handle(); @@ -125,15 +125,15 @@ class CudnnScaleBiasAddRelu { fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_EQSCALE, x_scale_ptr); fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_EQBIAS, x_bias_ptr); if (has_shortcut_) { - T *z_ptr = const_cast(z.data()); - T *z_scale_ptr = const_cast(z_scale.data()); - T *z_bias_ptr = const_cast(z_bias.data()); + T *z_ptr = const_cast(z->data()); + T *z_scale_ptr = const_cast(z_scale->data()); + T *z_bias_ptr = const_cast(z_bias->data()); fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_ZDATA, z_ptr); fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_Z_EQSCALE, z_scale_ptr); fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_Z_EQBIAS, z_bias_ptr); } else { - if (fused_add_) { - T *z_ptr = const_cast(z.data()); + if (fuse_add_) { + T *z_ptr = const_cast(z->data()); fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_ZDATA, z_ptr); } } @@ -160,7 +160,7 @@ class CudnnScaleBiasAddRelu { void Backward(const platform::CUDADeviceContext &ctx, const Tensor &dy, const Tensor &x, const Tensor &scale, const Tensor &bias, const Tensor &saved_mean, const Tensor &saved_invstd, - const Tensor &bitmask, Tensor *dx, Tensor *dz, Tensor *dscale, + const Tensor *bitmask, Tensor *dx, Tensor *dz, Tensor *dscale, Tensor *dbias, double eps) { BackwardInit(ctx); auto handle = ctx.cudnn_handle(); @@ -175,7 +175,8 @@ class CudnnScaleBiasAddRelu { float *bias_ptr = const_cast(bias.data()); float *saved_mean_ptr = const_cast(saved_mean.data()); float *saved_invstd_ptr = const_cast(saved_invstd.data()); - int32_t *bitmask_ptr = const_cast(bitmask.data()); + int32_t *bitmask_ptr = + bitmask ? const_cast(bitmask->data()) : nullptr; T *dx_ptr = dx->mutable_data(place); T *dz_ptr = dz ? dz->mutable_data(place) : nullptr; float *dscale_ptr = dscale ? 
dscale->mutable_data(place) : nullptr; @@ -199,7 +200,7 @@ class CudnnScaleBiasAddRelu { bwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_DBIAS, dbias_ptr); bwd_op_.SetOpVariantParamAttrPtr(CUDNN_SCALAR_DOUBLE_BN_EPSILON, &eps); - if (has_shortcut_ || fused_add_) { + if (has_shortcut_ || fuse_add_) { bwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_DZDATA, dz_ptr); } @@ -226,14 +227,14 @@ class CudnnScaleBiasAddRelu { {CUDNN_PARAM_ZDATA_PLACEHOLDER, CUDNN_PARAM_BN_Z_EQSCALE_PLACEHOLDER, CUDNN_PARAM_BN_Z_EQBIAS_PLACEHOLDER}, CUDNN_PTR_16B_ALIGNED); - } else if (fused_add_) { + } else if (fuse_add_) { fwd_op_.SetOpConstParamAttr(CUDNN_PARAM_ZDATA_PLACEHOLDER, CUDNN_PTR_16B_ALIGNED); } // input desc fwd_op_.SetOpConstParamDesc(CUDNN_PARAM_XDESC, args_.in_desc.desc()); - if (has_shortcut_ || fused_add_) { + if (has_shortcut_ || fuse_add_) { fwd_op_.SetOpConstParamDesc(CUDNN_PARAM_ZDESC, args_.in_desc.desc()); } @@ -271,7 +272,7 @@ class CudnnScaleBiasAddRelu { CUDNN_PARAM_BN_DSCALE_PLACEHOLDER, CUDNN_PARAM_BN_DBIAS_PLACEHOLDER, CUDNN_PARAM_ACTIVATION_BITMASK_PLACEHOLDER}, CUDNN_PTR_16B_ALIGNED); - if (has_shortcut_ || fused_add_) { + if (has_shortcut_ || fuse_add_) { bwd_op_.SetOpConstParamAttr(CUDNN_PARAM_DZDATA_PLACEHOLDER, CUDNN_PTR_16B_ALIGNED); } @@ -279,7 +280,7 @@ class CudnnScaleBiasAddRelu { // input desc bwd_op_.SetOpConstParamDesc(CUDNN_PARAM_XDESC, args_.in_desc.desc()); bwd_op_.SetOpConstParamDesc(CUDNN_PARAM_DXDESC, args_.in_desc.desc()); - if (has_shortcut_ || fused_add_) { + if (has_shortcut_ || fuse_add_) { bwd_op_.SetOpConstParamDesc(CUDNN_PARAM_DZDESC, args_.in_desc.desc()); } @@ -303,7 +304,7 @@ class CudnnScaleBiasAddRelu { CUDNN_BATCHNORM_SPATIAL_PERSISTENT); } - bool fused_add_ = false; + bool fuse_add_ = false; bool has_shortcut_ = false; size_t fwd_workspace_byte_; size_t bwd_workspace_byte_; diff --git a/paddle/fluid/operators/fused/resnet_unit_op.cc b/paddle/fluid/operators/fused/resnet_unit_op.cc new file mode 100644 index 00000000000000..062fd3f1cf4088 --- /dev/null +++ b/paddle/fluid/operators/fused/resnet_unit_op.cc @@ -0,0 +1,410 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
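Before the operator definition, a deliberately simplified CPU sketch of the dataflow this fused op implements: y = relu(bn(conv(x)) + bn(conv(z))) when has_shortcut, y = relu(bn(conv(x)) + z) when fuse_add, and y = relu(bn(conv(x))) otherwise. The 1x1 "convolution" and naive batch norm below are stand-ins chosen only to keep the sketch runnable; they are not the cuDNN kernels this op dispatches to.

#include <cmath>
#include <cstdio>
#include <vector>

using Vec = std::vector<float>;

// Degenerate single-channel 1x1 convolution: one weight per tensor.
Vec Conv1x1(const Vec& x, float w) {
  Vec y(x.size());
  for (size_t i = 0; i < x.size(); ++i) y[i] = w * x[i];
  return y;
}

// Naive batch norm over the whole vector with a scalar scale/bias.
Vec BatchNorm(const Vec& x, float scale, float bias, float eps = 1e-5f) {
  float mean = 0.f, var = 0.f;
  for (float v : x) mean += v;
  mean /= x.size();
  for (float v : x) var += (v - mean) * (v - mean);
  var /= x.size();
  Vec y(x.size());
  for (size_t i = 0; i < x.size(); ++i)
    y[i] = scale * (x[i] - mean) / std::sqrt(var + eps) + bias;
  return y;
}

Vec ResNetUnitRef(const Vec& x, float wx, float sx, float bx, const Vec* z,
                  float wz, float sz, float bz, bool fuse_add,
                  bool has_shortcut) {
  Vec out = BatchNorm(Conv1x1(x, wx), sx, bx);
  if (has_shortcut) {
    Vec shortcut = BatchNorm(Conv1x1(*z, wz), sz, bz);
    for (size_t i = 0; i < out.size(); ++i) out[i] += shortcut[i];
  } else if (fuse_add) {
    for (size_t i = 0; i < out.size(); ++i) out[i] += (*z)[i];
  }
  for (float& v : out) v = std::fmax(v, 0.f);  // relu
  return out;
}

int main() {
  Vec x = {1.f, 2.f, 3.f, 4.f}, z = {0.5f, -0.5f, 1.f, -1.f};
  Vec y = ResNetUnitRef(x, /*wx=*/2.f, /*sx=*/1.f, /*bx=*/0.f, &z, 0.f, 0.f,
                        0.f, /*fuse_add=*/true, /*has_shortcut=*/false);
  for (float v : y) std::printf("%f ", v);
  std::printf("\n");
  return 0;
}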
*/ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/float16.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +// Shape of bitmask +static framework::DDim GetBitmaskDims(std::vector out_shape) { + int c = out_shape.back(); + int64_t nhw = std::accumulate(out_shape.begin(), out_shape.end(), 1, + std::multiplies()) / + c; + int32_t c_int32_elems = ((c + 63) & ~63) / 32; + int32_t nhw_int32_elems = ((nhw + 31) & ~31); + std::vector bitmask_shape = {nhw_int32_elems, c_int32_elems, 1}; + return framework::make_ddim(bitmask_shape); +} + +class ResNetUnitOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const { + // Check input + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "ResNetUnitOp"); + OP_INOUT_CHECK(ctx->HasInput("FilterX"), "Input", "FilterX", + "ResNetUnitOp"); + OP_INOUT_CHECK(ctx->HasInput("ScaleX"), "Input", "ScaleX", "ResNetUnitOp"); + OP_INOUT_CHECK(ctx->HasInput("BiasX"), "Input", "BiasX", "ResNetUnitOp"); + OP_INOUT_CHECK(ctx->HasInput("MeanX"), "Input", "MeanX", "ResNetUnitOp"); + OP_INOUT_CHECK(ctx->HasInput("VarX"), "Input", "VarX", "ResNetUnitOp"); + + bool fuse_add = ctx->Attrs().Get("fuse_add"); + bool has_shortcut = ctx->Attrs().Get("has_shortcut"); + if (fuse_add || has_shortcut) { + OP_INOUT_CHECK(ctx->HasInput("Z"), "Input", "Z", "ResNetUnitOp"); + } + if (has_shortcut) { + OP_INOUT_CHECK(ctx->HasInput("FilterZ"), "Input", "FilterZ", + "ResNetUnitOp"); + OP_INOUT_CHECK(ctx->HasInput("ScaleZ"), "Input", "ScaleZ", + "ResNetUnitOp"); + OP_INOUT_CHECK(ctx->HasInput("BiasZ"), "Input", "BiasZ", "ResNetUnitOp"); + OP_INOUT_CHECK(ctx->HasInput("MeanZ"), "Input", "MeanZ", "ResNetUnitOp"); + OP_INOUT_CHECK(ctx->HasInput("VarZ"), "Input", "VarZ", "ResNetUnitOp"); + } + + // Check output + OP_INOUT_CHECK(ctx->HasOutput("Y"), "Output", "Y", "ResNetUnitOp"); + OP_INOUT_CHECK(ctx->HasOutput("BitMask"), "Output", "BitMask", + "ResNetUnitOp"); + OP_INOUT_CHECK(ctx->HasOutput("ConvX"), "Output", "ConvX", "ResNetUnitOp"); + OP_INOUT_CHECK(ctx->HasOutput("SavedMeanX"), "Output", "SavedMeanX", + "ResNetUnitOp"); + OP_INOUT_CHECK(ctx->HasOutput("SavedInvstdX"), "Output", "SavedInvstdX", + "ResNetUnitOp"); + OP_INOUT_CHECK(ctx->HasOutput("RunningMeanX"), "Output", "RunningMeanX", + "ResNetUnitOp"); + OP_INOUT_CHECK(ctx->HasOutput("RunningVarX"), "Output", "RunningVarX", + "ResNetUnitOp"); + if (has_shortcut) { + OP_INOUT_CHECK(ctx->HasOutput("ConvZ"), "Output", "ConvZ", + "ResNetUnitOp"); + OP_INOUT_CHECK(ctx->HasOutput("SavedMeanZ"), "Output", "SavedMeanZ", + "ResNetUnitOp"); + OP_INOUT_CHECK(ctx->HasOutput("SavedInvstdZ"), "Output", "SavedInvstdZ", + "ResNetUnitOp"); + OP_INOUT_CHECK(ctx->HasOutput("RunningMeanZ"), "Output", "RunningMeanZ", + "ResNetUnitOp"); + OP_INOUT_CHECK(ctx->HasOutput("RunningVarZ"), "Output", "RunningVarZ", + "ResNetUnitOp"); + } + + // make sure Mean/RunningMean and Var/RunningVar share memory + PADDLE_ENFORCE_EQ( + ctx->Inputs("MeanX")[0], ctx->Outputs("RunningMeanX")[0], + platform::errors::InvalidArgument( + "MeanX and RunningMeanX should share the same memory")); + PADDLE_ENFORCE_EQ(ctx->Inputs("VarX")[0], ctx->Outputs("RunningVarX")[0], + platform::errors::InvalidArgument( + "VarX and RunningVarX should share the same memory")); + if (has_shortcut) { + PADDLE_ENFORCE_EQ( + ctx->Inputs("MeanZ")[0], ctx->Outputs("RunningMeanZ")[0], + platform::errors::InvalidArgument( + 
"MeanZ and RunningMeanZ should share the same memory")); + PADDLE_ENFORCE_EQ( + ctx->Inputs("VarZ")[0], ctx->Outputs("RunningVarZ")[0], + platform::errors::InvalidArgument( + "VarZ and RunningVarZ should share the same memory")); + } + + // Check dims of inputs + const auto x_dims = ctx->GetInputDim("X"); + const auto w_dims = ctx->GetInputDim("FilterX"); + const auto bn_param_dims = ctx->GetInputDim("ScaleX"); + PADDLE_ENFORCE_EQ(x_dims.size(), 4, platform::errors::InvalidArgument( + "The dimensions of input " + "must equal to 4." + "But received: the shape of input " + "= [%s], the dimension of input = " + "[%d]", + x_dims, x_dims.size())); + PADDLE_ENFORCE_EQ(w_dims.size(), 4, + platform::errors::InvalidArgument( + "The dimensions of filter " + "must equal to 4." + "But received: the shape of filter " + "= [%s], the dimension of filter = [%d] ", + w_dims, w_dims.size())); + PADDLE_ENFORCE_EQ(bn_param_dims.size(), 4, + platform::errors::InvalidArgument( + "The dimensions of bn param " + "must equal to 4." + "But received: the shape of bn param " + "= [%s], the dimension of bn param = [%d] ", + bn_param_dims, bn_param_dims.size())); + auto data_format = ctx->Attrs().Get("data_format"); + PADDLE_ENFORCE_EQ( + data_format, "NHWC", + platform::errors::InvalidArgument("The data format must equal to NHWC. " + "But received: the data format " + "= [%s]", + data_format)); + // Calculate the dims of outputs + int batch = x_dims[0]; + int output_channel = w_dims[0]; + int filter_size = w_dims[2]; + int stride = ctx->Attrs().Get("stride"); + int padding = ctx->Attrs().Get("padding"); + int out_h = (x_dims[1] + padding * 2 - filter_size) / stride + 1; + int out_w = (x_dims[2] + padding * 2 - filter_size) / stride + 1; + std::vector out_shape = {batch, out_h, out_w, output_channel}; + + auto y_dims = framework::make_ddim(out_shape); + auto bitmask_dims = GetBitmaskDims(out_shape); + // Set dims of outputs + ctx->SetOutputDim("Y", y_dims); + ctx->SetOutputDim("BitMask", bitmask_dims); + ctx->SetOutputDim("ConvX", y_dims); + ctx->SetOutputDim("SavedMeanX", bn_param_dims); + ctx->SetOutputDim("SavedInvstdX", bn_param_dims); + ctx->SetOutputDim("RunningMeanX", bn_param_dims); + ctx->SetOutputDim("RunningVarX", bn_param_dims); + if (has_shortcut) { + ctx->SetOutputDim("ConvZ", y_dims); + ctx->SetOutputDim("SavedMeanZ", bn_param_dims); + ctx->SetOutputDim("SavedInvstdZ", bn_param_dims); + ctx->SetOutputDim("RunningMeanZ", bn_param_dims); + ctx->SetOutputDim("RunningVarZ", bn_param_dims); + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); + // By default, the type of the scale, bias, mean, + // and var tensors should be float when input tensor's dtype is float16. 
+ auto bn_param_type = framework::proto::VarType::FP32; + + PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input("ScaleX")->type(), + platform::errors::InvalidArgument( + "Scale input should be of float type")); + PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input("BiasX")->type(), + platform::errors::InvalidArgument( + "Bias input should be of float type")); + framework::LibraryType library = framework::LibraryType::kPlain; + framework::DataLayout layout = framework::DataLayout::kAnyLayout; + return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout, + library); + } +}; + +class ResNetUnitOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("X", "The input 1 tensor"); + AddInput("FilterX", "Filter tensor of input 1"); + AddInput("ScaleX", "Scale tensor of input 1 used in batchnorm"); + AddInput("BiasX", "Bias tensor of input 1 used in batchnorm"); + AddInput("MeanX", "Mean tensor of input 1 used in batchnorm"); + AddInput("VarX", "Variance tensor of input 1 used in batchnorm"); + AddInput("Z", "The input 2 tensor").AsDispensable(); + AddInput("FilterZ", "Filter tensor of input 2").AsDispensable(); + AddInput("ScaleZ", "Scale tensor of input 2").AsDispensable(); + AddInput("BiasZ", "Bias tensor of input 2").AsDispensable(); + AddInput("MeanZ", "Mean tensor of input 2").AsDispensable(); + AddInput("VarZ", "Variance tensor of input 2").AsDispensable(); + AddOutput("Y", "The result of the resnet unit"); + AddOutput("BitMask", "The bitmask generated after relu"); + AddOutput("ConvX", "The output of input 1 after conv"); + AddOutput("SavedMeanX", "Mean of input 1 in the current batch"); + AddOutput("SavedInvstdX", "Invstd of input 1 in the current batch"); + AddOutput("RunningMeanX", "Shared memory with MeanX"); + AddOutput("RunningVarX", "Shared memory with VarX"); + AddOutput("ConvZ", "The output of input 2 after conv").AsDispensable(); + AddOutput("SavedMeanZ", "Mean of input 1 in the current batch") + .AsDispensable(); + AddOutput("SavedInvstdZ", "Invstd of input 1 in the current batch") + .AsDispensable(); + AddOutput("RunningMeanZ", "Shared memory with MeanZ").AsDispensable(); + AddOutput("RunningVarZ", "Shared memory with VarZ").AsDispensable(); + AddAttr("stride", "").SetDefault(1); + AddAttr("stride_z", "").SetDefault(1); + AddAttr("padding", "").SetDefault(0); + AddAttr("dilation", "").SetDefault(1); + AddAttr("group", "").SetDefault(1); + AddAttr("momentum", "").SetDefault(0.9); + AddAttr("epsilon", "").SetDefault(1e-5); + AddAttr("data_format", "").SetDefault("NHWC"); + AddAttr("fuse_add", "").SetDefault(false); + AddAttr("has_shortcut", "").SetDefault(false); + AddAttr("use_global_stats", "").SetDefault(false); + AddAttr("is_test", + "(bool, default false) Set to true for inference only, false " + "for training. Some layers may run faster when this is true.") + .SetDefault(false); + AddAttr("act_type", "The activation type to be fused.") + .SetDefault("relu"); + AddComment(R"DOC( +Fusion op of the basic unit of resnet block. + +The implementation is based on the latest fusion op interface in cuDNN v8.0. 
+For more details: +https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnFusedOps_t + +)DOC"); + } +}; + +class ResNetUnitGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const { + // check input + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "ResNetUnitGradOp"); + OP_INOUT_CHECK(ctx->HasInput("FilterX"), "Input", "FilterX", + "ResNetUnitGradOp"); + OP_INOUT_CHECK(ctx->HasInput("ConvX"), "Input", "ConvX", + "ResNetUnitGradOp"); + OP_INOUT_CHECK(ctx->HasInput("ScaleX"), "Input", "ScaleX", + "ResNetUnitGradOp"); + OP_INOUT_CHECK(ctx->HasInput("BiasX"), "Input", "BiasX", + "ResNetUnitGradOp"); + OP_INOUT_CHECK(ctx->HasInput("SavedMeanX"), "Input", "SavedMeanX", + "ResNetUnitGradOp"); + OP_INOUT_CHECK(ctx->HasInput("SavedInvstdX"), "Input", "SavedInvstdX", + "ResNetUnitGradOp"); + + bool fuse_add = ctx->Attrs().Get("fuse_add"); + bool has_shortcut = ctx->Attrs().Get("has_shortcut"); + if (fuse_add || has_shortcut) { + OP_INOUT_CHECK(ctx->HasInput("Z"), "Input", "Z", "ResNetUnitGradOp"); + } + if (has_shortcut) { + OP_INOUT_CHECK(ctx->HasInput("FilterZ"), "Input", "FilterZ", + "ResNetUnitGradOp"); + OP_INOUT_CHECK(ctx->HasInput("ConvZ"), "Input", "ConvZ", + "ResNetUnitGradOp"); + OP_INOUT_CHECK(ctx->HasInput("ScaleZ"), "Input", "ScaleZ", + "ResNetUnitGradOp"); + OP_INOUT_CHECK(ctx->HasInput("BiasZ"), "Input", "BiasZ", + "ResNetUnitGradOp"); + OP_INOUT_CHECK(ctx->HasInput("SavedMeanZ"), "Input", "SavedMeanZ", + "ResNetUnitGradOp"); + OP_INOUT_CHECK(ctx->HasInput("SavedInvstdZ"), "Input", "SavedInvstdZ", + "ResNetUnitGradOp"); + } + OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "ResNetUnitGradOp"); + OP_INOUT_CHECK(ctx->HasInput("BitMask"), "Input", "BitMask", + "ResNetUnitGradOp"); + OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Y")), "Input", + framework::GradVarName("Y"), "ResNetUnitGradOp"); + + // check output + OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X")), "Output", + framework::GradVarName("X"), "ResNetUnitGradOp"); + OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("FilterX")), "Output", + framework::GradVarName("FilterX"), "ResNetUnitGradOp"); + OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("ScaleX")), "Output", + framework::GradVarName("ScaleX"), "ResNetUnitGradOp"); + OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("BiasX")), "Output", + framework::GradVarName("BiasX"), "ResNetUnitGradOp"); + if (fuse_add) { + OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("Z")), "Output", + framework::GradVarName("Z"), "ResNetUnitGradOp"); + } + if (has_shortcut) { + OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("FilterZ")), + "Output", framework::GradVarName("FilterZ"), + "ResNetUnitGradOp"); + OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("ScaleZ")), "Output", + framework::GradVarName("ScaleZ"), "ResNetUnitGradOp"); + OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("BiasZ")), "Output", + framework::GradVarName("BiasZ"), "ResNetUnitGradOp"); + } + const auto x_dims = ctx->GetInputDim("X"); + const auto filter_x_dims = ctx->GetInputDim("FilterX"); + const auto param_dims = ctx->GetInputDim("ScaleX"); + ctx->SetOutputDim(framework::GradVarName("X"), x_dims); + ctx->SetOutputDim(framework::GradVarName("FilterX"), filter_x_dims); + ctx->SetOutputDim(framework::GradVarName("ScaleX"), param_dims); + ctx->SetOutputDim(framework::GradVarName("BiasX"), param_dims); + if (fuse_add || has_shortcut) { + const auto 
z_dims = ctx->GetInputDim("Z"); + ctx->SetOutputDim(framework::GradVarName("Z"), z_dims); + } + if (has_shortcut) { + const auto filter_z_dims = ctx->GetInputDim("FilterZ"); + ctx->SetOutputDim(framework::GradVarName("FilterZ"), filter_z_dims); + ctx->SetOutputDim(framework::GradVarName("ScaleZ"), param_dims); + ctx->SetOutputDim(framework::GradVarName("BiasZ"), param_dims); + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + PADDLE_ENFORCE_NOT_NULL( + ctx.InputVar(framework::GradVarName("Y")), + platform::errors::NotFound( + "Can not find Y@GRAD in the execution context.")); + + framework::LibraryType library = framework::LibraryType::kPlain; + framework::DataLayout layout = framework::DataLayout::kAnyLayout; + + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace(), + layout, library); + } +}; + +template +class ResNetUnitGradOpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType("resnet_unit_grad"); + op->SetInput("X", this->Input("X")); + op->SetInput("FilterX", this->Input("FilterX")); + op->SetInput("ConvX", this->Output("ConvX")); + op->SetInput("ScaleX", this->Input("ScaleX")); + op->SetInput("BiasX", this->Input("BiasX")); + op->SetInput("SavedMeanX", this->Output("SavedMeanX")); + op->SetInput("SavedInvstdX", this->Output("SavedInvstdX")); + op->SetInput("Z", this->Input("Z")); + op->SetInput("FilterZ", this->Input("FilterZ")); + op->SetInput("ConvZ", this->Output("ConvZ")); + op->SetInput("ScaleZ", this->Input("ScaleZ")); + op->SetInput("BiasZ", this->Input("BiasZ")); + op->SetInput("SavedMeanZ", this->Output("SavedMeanZ")); + op->SetInput("SavedInvstdZ", this->Output("SavedInvstdZ")); + op->SetInput("Y", this->Output("Y")); + op->SetInput("BitMask", this->Output("BitMask")); + op->SetInput(framework::GradVarName("Y"), this->OutputGrad("Y")); + + op->SetAttrMap(this->Attrs()); + + op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); + op->SetOutput(framework::GradVarName("FilterX"), + this->InputGrad("FilterX")); + op->SetOutput(framework::GradVarName("ScaleX"), this->InputGrad("ScaleX")); + op->SetOutput(framework::GradVarName("BiasX"), this->InputGrad("BiasX")); + op->SetOutput(framework::GradVarName("Z"), this->InputGrad("Z")); + op->SetOutput(framework::GradVarName("FilterZ"), + this->InputGrad("FilterZ")); + op->SetOutput(framework::GradVarName("ScaleZ"), this->InputGrad("ScaleZ")); + op->SetOutput(framework::GradVarName("BiasZ"), this->InputGrad("BiasZ")); + } +}; + +class ResNetUnitOpInferVarType + : public framework::PassInDtypeAndVarTypeToOutput { + protected: + std::unordered_map& GetInputOutputWithSameType() + const override { + static std::unordered_map m{{"X", /*->*/ "Y"}}; + return m; + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(resnet_unit, ops::ResNetUnitOp, ops::ResNetUnitOpMaker, + ops::ResNetUnitOpInferVarType, + ops::ResNetUnitGradOpMaker, + ops::ResNetUnitGradOpMaker); +REGISTER_OPERATOR(resnet_unit_grad, ops::ResNetUnitGradOp); diff --git a/paddle/fluid/operators/fused/resnet_unit_op.cu b/paddle/fluid/operators/fused/resnet_unit_op.cu new file mode 100644 index 00000000000000..a0126e5a9d4283 --- /dev/null +++ b/paddle/fluid/operators/fused/resnet_unit_op.cu @@ -0,0 +1,298 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h" +#include "paddle/fluid/operators/fused/cudnn_norm_conv.cu.h" +#include "paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h" +#include "paddle/fluid/platform/float16.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class ResNetUnitKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PADDLE_ENFORCE_EQ( + platform::is_gpu_place(ctx.GetPlace()), true, + platform::errors::PreconditionNotMet("It must use CUDAPlace.")); + PADDLE_ENFORCE_EQ(platform::CudnnDataType::type, CUDNN_DATA_HALF, + platform::errors::Unavailable( + "ResNetUnitOp only supports float16 for now.")); + + // input x + const Tensor *input_x = ctx.Input("X"); + const Tensor *filter_x = ctx.Input("FilterX"); + const Tensor *scale_x = ctx.Input("ScaleX"); + const Tensor *bias_x = ctx.Input("BiasX"); + // norm conv + Tensor *conv_out_x = ctx.Output("ConvX"); + // bn finalize + Tensor *saved_mean_x = ctx.Output("SavedMeanX"); + Tensor *saved_invstd_x = ctx.Output("SavedInvstdX"); + Tensor *running_mean_x = ctx.Output("RunningMeanX"); + Tensor *running_var_x = ctx.Output("RunningVarX"); + // sbar + Tensor *output = ctx.Output("Y"); + Tensor *bitmask = ctx.Output("BitMask"); + // attrs + int padding = ctx.Attr("padding"); + int stride = ctx.Attr("stride"); + int stride_z = ctx.Attr("stride_z"); + int dilate = ctx.Attr("dilate"); + int group = ctx.Attr("group"); + double eps = static_cast(ctx.Attr("epsilon")); + double momentum = static_cast(ctx.Attr("momentum")); + bool has_shortcut = ctx.Attr("has_shortcut"); + bool fuse_add = ctx.Attr("fuse_add"); + bool use_global_stats = ctx.Attr("use_global_stats"); + bool is_test = ctx.Attr("is_test"); + bool is_train = !is_test && !use_global_stats; + std::string act_type = ctx.Attr("act_type"); + + auto input_x_shape = framework::vectorize(input_x->dims()); + auto filter_x_shape = framework::vectorize(filter_x->dims()); + auto param_dims = scale_x->dims(); + auto param_shape = framework::vectorize(scale_x->dims()); + auto output_shape = framework::vectorize(output->dims()); + auto bitmask_shape = framework::vectorize(bitmask->dims()); + int output_channel = filter_x_shape[0]; + int64_t ele_count = + std::accumulate(output_shape.begin(), output_shape.end(), 1, + std::multiplies()) / + output_channel; + + auto place = ctx.GetPlace(); + auto &dev_ctx = ctx.template device_context(); + + // 1. Conv + Tensor sum_x; + Tensor sum_of_squares_x; + sum_x.Resize(param_dims); + sum_of_squares_x.Resize(param_dims); + CudnnNormConvolution conv_x_op(dev_ctx, input_x_shape, filter_x_shape, + output_shape, padding, stride, dilate, + group); + conv_x_op.Forward(dev_ctx, *input_x, *filter_x, conv_out_x, &sum_x, + &sum_of_squares_x); + + // 2. 
BN + Tensor equiv_scale_x; + Tensor equiv_bias_x; + equiv_scale_x.Resize(param_dims); + equiv_bias_x.Resize(param_dims); + CudnnBNStatsFinalize bn_x_op(dev_ctx, param_shape); + bn_x_op.Forward(dev_ctx, sum_x, sum_of_squares_x, *scale_x, *bias_x, + saved_mean_x, saved_invstd_x, running_mean_x, running_var_x, + &equiv_scale_x, &equiv_bias_x, eps, momentum, ele_count, + is_train); + + // 3. scale + bias + add + relu + CudnnScaleBiasAddRelu sbar_op(dev_ctx, act_type, fuse_add, has_shortcut, + output_shape, param_shape, bitmask_shape); + if (has_shortcut) { + // input z + const Tensor *input_z = ctx.Input("Z"); + const Tensor *filter_z = ctx.Input("FilterZ"); + const Tensor *scale_z = ctx.Input("ScaleZ"); + const Tensor *bias_z = ctx.Input("BiasZ"); + // norm conv + Tensor *conv_out_z = ctx.Output("ConvZ"); + // bn finalize + Tensor *saved_mean_z = ctx.Output("SavedMeanZ"); + Tensor *saved_invstd_z = ctx.Output("SavedInvstdZ"); + Tensor *running_mean_z = ctx.Output("RunningMeanZ"); + Tensor *running_var_z = ctx.Output("RunningVarZ"); + + auto input_z_shape = framework::vectorize(input_z->dims()); + auto filter_z_shape = framework::vectorize(filter_z->dims()); + + // 3.1 Conv for second input + Tensor sum_z; + Tensor sum_of_squares_z; + sum_z.Resize(param_dims); + sum_of_squares_z.Resize(param_dims); + CudnnNormConvolution conv_z_op(dev_ctx, input_z_shape, filter_z_shape, + output_shape, padding, stride_z, dilate, + group); + conv_z_op.Forward(dev_ctx, *input_z, *filter_z, conv_out_z, &sum_z, + &sum_of_squares_z); + + // 3.2 BN for second input + Tensor equiv_scale_z; + Tensor equiv_bias_z; + equiv_scale_z.Resize(param_dims); + equiv_bias_z.Resize(param_dims); + CudnnBNStatsFinalize bn_z_op(dev_ctx, param_shape); + bn_z_op.Forward(dev_ctx, sum_z, sum_of_squares_z, *scale_z, *bias_z, + saved_mean_z, saved_invstd_z, running_mean_z, + running_var_z, &equiv_scale_z, &equiv_bias_z, eps, + momentum, ele_count, is_train); + // 3.3 sbar + sbar_op.Forward(dev_ctx, *conv_out_x, equiv_scale_x, equiv_bias_x, + conv_out_z, &equiv_scale_z, &equiv_bias_z, output, + bitmask); + } else { + const Tensor *input_z = fuse_add ? 
ctx.Input("Z") : nullptr; + sbar_op.Forward(dev_ctx, *conv_out_x, equiv_scale_x, equiv_bias_x, + input_z, nullptr, nullptr, output, bitmask); + } + } +}; + +template +class ResNetUnitGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PADDLE_ENFORCE_EQ( + platform::is_gpu_place(ctx.GetPlace()), true, + platform::errors::PreconditionNotMet("It must use CUDAPlace.")); + PADDLE_ENFORCE_EQ(platform::CudnnDataType::type, CUDNN_DATA_HALF, + platform::errors::Unavailable( + "ResNetUnitOp only supports float16 for now.")); + + const Tensor *y_grad = ctx.Input(framework::GradVarName("Y")); + + const Tensor *x = ctx.Input("X"); + const Tensor *filter_x = ctx.Input("FilterX"); + const Tensor *scale_x = ctx.Input("ScaleX"); + const Tensor *bias_x = ctx.Input("BiasX"); + const Tensor *saved_mean_x = ctx.Input("SavedMeanX"); + const Tensor *saved_invstd_x = ctx.Input("SavedInvstdX"); + + const Tensor *conv_out_x = ctx.Input("ConvX"); + const Tensor *output = ctx.Input("Y"); + const Tensor *bitmask = ctx.Input("BitMask"); + + Tensor *x_grad = ctx.Output(framework::GradVarName("X")); + Tensor *filter_x_grad = + ctx.Output(framework::GradVarName("FilterX")); + Tensor *scale_x_grad = ctx.Output(framework::GradVarName("ScaleX")); + Tensor *bias_x_grad = ctx.Output(framework::GradVarName("BiasX")); + + int padding = ctx.Attr("padding"); + int stride = ctx.Attr("stride"); + int stride_z = ctx.Attr("stride_z"); + int dilate = ctx.Attr("dilate"); + int group = ctx.Attr("group"); + double eps = static_cast(ctx.Attr("epsilon")); + double momentum = static_cast(ctx.Attr("momentum")); + bool has_shortcut = ctx.Attr("has_shortcut"); + bool fuse_add = ctx.Attr("fuse_add"); + bool use_global_stats = ctx.Attr("use_global_stats"); + std::string act_type = ctx.Attr("act_type"); + + auto x_shape = framework::vectorize(x->dims()); + auto filter_x_shape = framework::vectorize(filter_x->dims()); + auto param_shape = framework::vectorize(scale_x->dims()); + auto output_shape = framework::vectorize(output->dims()); + auto bitmask_shape = framework::vectorize(bitmask->dims()); + + auto place = ctx.GetPlace(); + auto &dev_ctx = ctx.template device_context(); + + // 1. 
Backward of BN (+ Add + Relu) for x, get conv_out_x_grad, + // scale_x_grad, bias_x_grad + Tensor conv_out_x_grad; + conv_out_x_grad.Resize(conv_out_x->dims()); + CudnnScaleBiasAddRelu sbar_x_op(dev_ctx, act_type, fuse_add, + has_shortcut, output_shape, param_shape, + bitmask_shape); + if (has_shortcut) { + // X Z + // | | + // NormConv NormConv + // | | + // BNStatsFinalize BNStatsFinalize + // \ / + // ScaleBiasAddRelu + // | + // Y + const Tensor *z = ctx.Input("Z"); + const Tensor *filter_z = ctx.Input("FilterZ"); + const Tensor *scale_z = ctx.Input("ScaleZ"); + const Tensor *bias_z = ctx.Input("BiasZ"); + const Tensor *saved_mean_z = ctx.Input("SavedMeanZ"); + const Tensor *saved_invstd_z = ctx.Input("SavedInvstdZ"); + const Tensor *conv_out_z = ctx.Input("ConvZ"); + + Tensor *z_grad = ctx.Output(framework::GradVarName("Z")); + Tensor *filter_z_grad = + ctx.Output(framework::GradVarName("FilterZ")); + Tensor *scale_z_grad = + ctx.Output(framework::GradVarName("ScaleZ")); + Tensor *bias_z_grad = ctx.Output(framework::GradVarName("BiasZ")); + + // 1.1 Backward of BN + Add (+ Relu) for x, get conv_out_x_grad, + // scale_x_grad, bias_x_grad and z_grad_temp + Tensor z_grad_temp; + z_grad_temp.Resize(conv_out_z->dims()); + sbar_x_op.Backward(dev_ctx, *y_grad, *conv_out_x, *scale_x, *bias_x, + *saved_mean_x, *saved_invstd_x, bitmask, + &conv_out_x_grad, &z_grad_temp, scale_x_grad, + bias_x_grad, eps); + + // 1.2 bn backward for z, get conv_out_z_grad, dscale_z, dbias_z + Tensor conv_out_z_grad; + conv_out_z_grad.Resize(conv_out_z->dims()); + CudnnScaleBiasAddRelu sbar_z_op( + dev_ctx, "", false, false, output_shape, param_shape, bitmask_shape); + sbar_z_op.Backward(dev_ctx, z_grad_temp, *conv_out_z, *scale_z, *bias_z, + *saved_mean_z, *saved_invstd_z, nullptr, + &conv_out_z_grad, nullptr, scale_z_grad, bias_z_grad, + eps); + + // 1.3 Backward of Conv for z, get z_grad and filter_z_grad + auto z_shape = framework::vectorize(z->dims()); + auto filter_z_shape = framework::vectorize(filter_z->dims()); + CudnnNormConvolutionGrad conv_z_op(dev_ctx, z_shape, filter_z_shape, + output_shape, padding, stride_z, + dilate, group); + conv_z_op.Backward(dev_ctx, *z, *filter_z, conv_out_z_grad, z_grad, + filter_z_grad); + } else { + // 1.1 Backward of BN (+ Add + Relu) for x, get conv_out_x_grad, + // scale_x_grad, bias_x_grad (and z_grad) + Tensor *z_grad = + fuse_add ? ctx.Output(framework::GradVarName("Z")) : nullptr; + sbar_x_op.Backward(dev_ctx, *y_grad, *conv_out_x, *scale_x, *bias_x, + *saved_mean_x, *saved_invstd_x, bitmask, + &conv_out_x_grad, z_grad, scale_x_grad, bias_x_grad, + eps); + } + + // 2. 
Backward of Conv for x, get x_grad and filter_x_grad + CudnnNormConvolutionGrad conv_x_op(dev_ctx, x_shape, filter_x_shape, + output_shape, padding, stride, dilate, + group); + conv_x_op.Backward(dev_ctx, *x, *filter_x, conv_out_x_grad, x_grad, + filter_x_grad); + } +}; + +} // namespace operators +} // namespace paddle + +#if CUDNN_VERSION >= 8000 +namespace ops = paddle::operators; +namespace plat = paddle::platform; +REGISTER_OP_CUDA_KERNEL(resnet_unit, ops::ResNetUnitKernel); +REGISTER_OP_CUDA_KERNEL(resnet_unit_grad, + ops::ResNetUnitGradKernel); +#endif diff --git a/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py b/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py index 5978d3829aecae..6317be9a2e2e20 100644 --- a/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py +++ b/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py @@ -80,6 +80,27 @@ def _dtype_to_str(dtype): return 'fp32' +def _keep_fp32_input(op, in_name): + op_type = op.type + if op_type in ['batch_norm', 'layer_norm']: + # Scale, Bias, Mean, Variance should be float32. + return in_name != 'X' + if op_type == 'fused_bn_add_activation': + return in_name not in {'X', 'Z'} + if op_type == 'resnet_unit': + return in_name not in {'X', 'FilterX', 'Z', 'FilterZ'} + return False + + +def _keep_fp32_output(op, out_name): + op_type = op.type + if op_type in ['batch_norm', 'fused_bn_add_activation', 'layer_norm']: + return out_name != 'Y' + if op_type == 'resnet_unit': + return out_name not in {'Y', 'ConvX', 'ConvZ'} + return False + + def _insert_cast_op(block, op, idx, src_dtype, dest_dtype): """ Insert cast op and rename args of input and output. @@ -97,11 +118,9 @@ def _insert_cast_op(block, op, idx, src_dtype, dest_dtype): num_cast_ops = 0 for in_name in op.input_names: - if src_dtype == core.VarDesc.VarType.FP32 and op.type in [ - 'batch_norm', 'fused_bn_add_activation', 'layer_norm' - ]: - if in_name not in {'X', 'Z'}: - continue + if src_dtype == core.VarDesc.VarType.FP32 and _keep_fp32_input(op, + in_name): + continue for in_var_name in op.input(in_name): in_var = block._find_var_recursive(in_var_name) if in_var.type not in _valid_types or in_var.dtype == dest_dtype: @@ -154,9 +173,7 @@ def _insert_cast_op(block, op, idx, src_dtype, dest_dtype): op._set_attr('in_dtype', dest_dtype) if src_dtype == core.VarDesc.VarType.FP32 and dest_dtype == core.VarDesc.VarType.FP16: for out_name in op.output_names: - if op.type in [ - 'batch_norm', 'fused_bn_add_activation', 'layer_norm' - ] and out_name != 'Y': + if _keep_fp32_output(op, out_name): continue for out_var_name in op.output(out_name): out_var = block.var(out_var_name) @@ -371,9 +388,7 @@ def cast_model_to_fp16(program, amp_lists=None, use_fp16_guard=True): keep_fp32_ops.add(op) continue # processed below for in_name in op.input_names: - if op.type in { - 'batch_norm', 'fused_bn_add_activation', 'layer_norm' - } and in_name not in {'X', 'Z'}: + if _keep_fp32_input(op, in_name): continue for in_var_name in op.input(in_name): in_var = None @@ -401,9 +416,7 @@ def cast_model_to_fp16(program, amp_lists=None, use_fp16_guard=True): format(op.type, in_var_name, in_var.dtype)) for out_name in op.output_names: - if op.type in { - 'batch_norm', 'fused_bn_add_activation', 'layer_norm' - } and out_name != 'Y': + if _keep_fp32_output(op, out_name): continue for out_var_name in op.output(out_name): out_var = None From 3e6d9dbbcac1b003253f9cb437e51e360970f407 Mon Sep 17 00:00:00 2001 From: Wilber Date: Thu, 14 Oct 2021 16:13:38 +0800 Subject: [PATCH 164/298] 
inference support bert when exists matmul_v2 (#36424) * support bert when exists matmul_v2 * update --- cmake/external/lite.cmake | 2 +- .../framework/ir/graph_pattern_detector.cc | 19 +++ .../framework/ir/graph_pattern_detector.h | 13 ++ .../framework/ir/map_matmul_to_mul_pass.cc | 114 ++++++++++++++++++ .../framework/ir/map_matmul_to_mul_pass.h | 12 ++ .../ir/multihead_matmul_fuse_pass.cc | 33 ++--- .../inference/api/paddle_pass_builder.cc | 3 + .../fluid/inference/lite/test_engine_lite.cc | 35 +++--- .../operators/lite/lite_engine_op_test.cc | 19 +-- 9 files changed, 207 insertions(+), 43 deletions(-) diff --git a/cmake/external/lite.cmake b/cmake/external/lite.cmake index e344ebaa2477ea..097ca38be070ab 100644 --- a/cmake/external/lite.cmake +++ b/cmake/external/lite.cmake @@ -134,7 +134,7 @@ if (NOT LITE_SOURCE_DIR OR NOT LITE_BINARY_DIR) GIT_TAG ${LITE_GIT_TAG} PREFIX ${LITE_SOURCES_DIR} UPDATE_COMMAND "" - PATCH_COMMAND sed -i "s?NNadapter_bridges_path = os.path.abspath('..')+\"\/lite\/kernels\/nnadapter\/bridges\/paddle_use_bridges.h\"?NNadapter_bridges_path = os.path.abspath(\'..\')+\"\/extern_lite\/lite\/kernels\/nnadapter\/bridges\/paddle_use_bridges.h\"?" ${LITE_SOURCES_DIR}/src/extern_lite//lite/tools/cmake_tools/record_supported_kernel_op.py && sed -i "/general::ssa::ConvertToSSA(cpp_prog)$/d" ${LITE_SOURCES_DIR}/src/extern_lite/lite/model_parser/model_parser.cc + PATCH_COMMAND sed -i "s?NNadapter_bridges_path = os.path.abspath('..')+\"\/lite\/kernels\/nnadapter\/bridges\/paddle_use_bridges.h\"?NNadapter_bridges_path = os.path.abspath(\'..\')+\"\/extern_lite\/lite\/kernels\/nnadapter\/bridges\/paddle_use_bridges.h\"?" ${LITE_SOURCES_DIR}/src/extern_lite//lite/tools/cmake_tools/record_supported_kernel_op.py BUILD_COMMAND ${LITE_BUILD_COMMAND} INSTALL_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index 695da372d18f3e..2f18b678e2856b 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -1615,6 +1615,25 @@ PDNode *patterns::Matmul::operator()() { return matmul_out; } +PDNode *patterns::MatmulV2::operator()() { + auto matmul_op = + pattern->NewNode(matmul_op_repr())->assert_is_op("matmul_v2"); + + auto matmul_in_x = pattern->NewNode(matmul_in_x_repr()) + ->AsInput() + ->assert_is_op_input("matmul_v2", "X"); + auto matmul_in_y = pattern->NewNode(matmul_in_y_repr()) + ->assert_is_persistable_var() + ->AsInput() + ->assert_is_op_input("matmul_v2", "Y"); + auto matmul_out = pattern->NewNode(matmul_out_repr()) + ->AsOutput() + ->assert_is_op_output("matmul_v2", "Out"); + + matmul_op->LinksFrom({matmul_in_x, matmul_in_y}).LinksTo({matmul_out}); + return matmul_out; +} + PDNode *patterns::Squeeze2Matmul::operator()() { auto squeeze2_in_x = pattern->NewNode(squeeze2_in_x_repr()) ->assert_is_op_input("squeeze2", "X") diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index 4afb7dfd4991b0..ba0d982dcc481b 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -976,6 +976,19 @@ struct Matmul : public PatternBase { PATTERN_DECL_NODE(matmul_out); }; +// Matmul_v2 op +// Forward pass for matmul_v2. 
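+// The pattern matches a single matmul_v2 whose X comes from a regular input
+// and whose Y is a persistable weight; its Out node is returned so that
+// map_matmul_v2_to_mul_pass can rewrite the matched op into a mul op.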
+struct MatmulV2 : public PatternBase { + MatmulV2(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "matmul_v2") {} + + PDNode* operator()(); + PATTERN_DECL_NODE(matmul_in_x); + PATTERN_DECL_NODE(matmul_in_y); + PATTERN_DECL_NODE(matmul_op); + PATTERN_DECL_NODE(matmul_out); +}; + // Squeeze2 + Matmul // Forward pass. struct Squeeze2Matmul : public PatternBase { diff --git a/paddle/fluid/framework/ir/map_matmul_to_mul_pass.cc b/paddle/fluid/framework/ir/map_matmul_to_mul_pass.cc index 864055cfa3620d..cdec49260f90cd 100644 --- a/paddle/fluid/framework/ir/map_matmul_to_mul_pass.cc +++ b/paddle/fluid/framework/ir/map_matmul_to_mul_pass.cc @@ -16,6 +16,7 @@ #include #include +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/op_proto_maker.h" #include "paddle/fluid/framework/op_version_registry.h" @@ -67,6 +68,42 @@ MapMatmul2MulPass::MapMatmul2MulPass() { .End(); } +MapMatmulv2ToMulPass::MapMatmulv2ToMulPass() { + AddOpCompat(OpCompat("matmul_v2")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("trans_x") + .IsBoolEQ(false) + .End() + .AddAttr("trans_y") + .IsBoolEQ(false) + .End(); + + AddOpCompat(OpCompat("mul")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("x_num_col_dims") + .IsNumGE(1) + .End() + .AddAttr("y_num_col_dims") + .IsNumEQ(1) + .End(); +} + Flatten2MatmulFusePass::Flatten2MatmulFusePass() { AddOpCompat(OpCompat("matmul")) .AddInput("X") @@ -250,6 +287,75 @@ void MapMatmul2MulPass::ApplyImpl(ir::Graph* graph) const { AddStatis(found_count); } +void MapMatmulv2ToMulPass::ApplyImpl(ir::Graph* graph) const { + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); + std::string name_scope = "map_matmul_v2_to_mul_pass"; + FusePassBase::Init(name_scope, graph); + + GraphPatternDetector gpd; + patterns::MatmulV2 matmul_pattern(gpd.mutable_pattern(), name_scope); + matmul_pattern(); + + int found_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + VLOG(4) << "map matmul_v2 to mul"; + GET_IR_NODE_FROM_SUBGRAPH(matmul_in_x, matmul_in_x, matmul_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_in_y, matmul_in_y, matmul_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_op, matmul_op, matmul_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_out, matmul_out, matmul_pattern); + bool flag = true; + + bool trans_x = BOOST_GET_CONST(bool, matmul_op->Op()->GetAttr("trans_x")); + bool trans_y = BOOST_GET_CONST(bool, matmul_op->Op()->GetAttr("trans_y")); + flag = flag && !trans_x && !trans_y; + + std::vector x_shape = matmul_in_x->Var()->GetShape(); + std::vector y_shape = matmul_in_y->Var()->GetShape(); + size_t x_rank = x_shape.size(); + size_t y_rank = y_shape.size(); + flag = flag && (x_rank == 2 || x_rank == 3) && y_rank == 2; + + std::vector& next_ops = matmul_out->outputs; + flag = flag && next_ops.size() == 1 && + next_ops[0]->Name() == "elementwise_add"; + + if (flag) { + if (!IsCompat(subgraph, g)) { + LOG(WARNING) << "Pass in op compat failed."; + return; + } + OpDesc desc(matmul_op->Op()->Block()); + desc.SetType("mul"); + desc.SetInput("X", {matmul_in_x->Name()}); + desc.SetInput("Y", {matmul_in_y->Name()}); + desc.SetOutput("Out", {matmul_out->Name()}); + desc.SetAttr("x_num_col_dims", static_cast(x_rank - 1)); + desc.SetAttr("y_num_col_dims", 1); + if 
(matmul_op->Op()->HasAttr("enable_int8")) { + desc.SetAttr("enable_int8", matmul_op->Op()->GetAttr("enable_int8")); + desc.SetAttr("X_scale", matmul_op->Op()->GetAttr("X_scale")); + desc.SetAttr("weight_scale", matmul_op->Op()->GetAttr("weight_scale")); + } + auto mul_node = g->CreateOpNode(&desc); + IR_NODE_LINK_TO(matmul_in_x, mul_node); + IR_NODE_LINK_TO(matmul_in_y, mul_node); + IR_NODE_LINK_TO(mul_node, matmul_out); + GraphSafeRemoveNodes(graph, {matmul_op}); + ++found_count; + + if (!IsCompat(desc)) { + LOG(WARNING) << "MapMatmulv2ToMulPass in out mul op compat failed."; + return; + } + } + }; + + gpd(graph, handler); + AddStatis(found_count); +} + void Squeeze2MatmulFusePass::ApplyImpl(ir::Graph* graph) const { PADDLE_ENFORCE_NOT_NULL( graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); @@ -567,6 +673,14 @@ REGISTER_PASS_CAPABILITY(map_matmul_to_mul_pass) .LE("matmul", 1) .EQ("mul", 0)); +REGISTER_PASS(map_matmul_v2_to_mul_pass, + paddle::framework::ir::MapMatmulv2ToMulPass); +REGISTER_PASS_CAPABILITY(map_matmul_v2_to_mul_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .EQ("matmul_v2", 0) + .EQ("mul", 0)); + REGISTER_PASS(squeeze2_matmul_fuse_pass, paddle::framework::ir::Squeeze2MatmulFusePass); REGISTER_PASS_CAPABILITY(squeeze2_matmul_fuse_pass) diff --git a/paddle/fluid/framework/ir/map_matmul_to_mul_pass.h b/paddle/fluid/framework/ir/map_matmul_to_mul_pass.h index 192dcfc00f9d34..8f462810fce51a 100644 --- a/paddle/fluid/framework/ir/map_matmul_to_mul_pass.h +++ b/paddle/fluid/framework/ir/map_matmul_to_mul_pass.h @@ -46,6 +46,18 @@ class MapMatmul2MulPass : public FusePassBase { void ApplyImpl(Graph* graph) const override; }; +/* + * Map matmul_v2 to mul, the same as MapMatmul2MulPass. + */ +class MapMatmulv2ToMulPass : public FusePassBase { + public: + MapMatmulv2ToMulPass(); + virtual ~MapMatmulv2ToMulPass() {} + + protected: + void ApplyImpl(Graph* graph) const override; +}; + /* * Fuse squeeze2+matmul to mul, so the optimization can use fc_fuse_pass. 
* The squeeze2 op must satisfy the following conditions: diff --git a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc index c826e1c5a584ac..4c0b28fd422662 100644 --- a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc +++ b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc @@ -425,15 +425,15 @@ PDNode* MultiHeadMatmulPattern::operator()() { PDNode* MultiHeadMatmulV3Pattern::operator()() { std::unordered_set matmul_ops{"matmul", "matmul_v2"}; auto* input0 = pattern->NewNode(input0_repr()); - input0->assert_is_op_input("matmul"); + input0->assert_is_ops_input(matmul_ops); // First path with scale - auto* mul0 = pattern->NewNode(mul0_repr())->assert_is_op("matmul"); + auto* mul0 = pattern->NewNode(mul0_repr())->assert_is_ops(matmul_ops); auto* mul0_w_var = pattern->NewNode(mul0_w_repr()) ->AsInput() - ->assert_is_op_input("matmul", "Y"); + ->assert_is_ops_input(matmul_ops, "Y"); auto* mul0_out_var = - pattern->NewNode(mul0_out_repr())->assert_is_op_output("matmul"); + pattern->NewNode(mul0_out_repr())->assert_is_ops_output(matmul_ops); decltype(mul0) eltadd0; decltype(mul0) eltadd0_b_var; @@ -461,11 +461,12 @@ PDNode* MultiHeadMatmulV3Pattern::operator()() { pattern->NewNode(transpose2_0_repr())->assert_is_op("transpose2"); auto* transpose2_0_out_var = pattern->NewNode(transpose2_0_out_repr()) ->assert_is_op_output("transpose2"); - transpose2_0_out_var->AsIntermediate()->assert_is_op_input("matmul", "X"); + transpose2_0_out_var->AsIntermediate()->assert_is_ops_input(matmul_ops); - auto* matmul_qk = pattern->NewNode(matmul_qk_repr())->assert_is_op("matmul"); + auto* matmul_qk = + pattern->NewNode(matmul_qk_repr())->assert_is_ops(matmul_ops); auto* matmul_qk_out_var = - pattern->NewNode(matmul_qk_out_repr())->assert_is_op_output("matmul"); + pattern->NewNode(matmul_qk_out_repr())->assert_is_ops_output(matmul_ops); matmul_qk_out_var->AsIntermediate()->assert_is_op_input("elementwise_add"); auto* eltadd_qk = @@ -499,15 +500,15 @@ PDNode* MultiHeadMatmulV3Pattern::operator()() { pattern->NewNode(reshape2_qkv_repr())->assert_is_op("reshape2"); auto* reshape2_qkv_out_var = pattern->NewNode(reshape2_qkv_out_repr()) ->assert_is_op_output("reshape2"); - reshape2_qkv_out_var->assert_is_op_input("matmul"); + reshape2_qkv_out_var->assert_is_ops_input(matmul_ops); // Second path to matmul - auto* mul1 = pattern->NewNode(mul1_repr())->assert_is_op("matmul"); + auto* mul1 = pattern->NewNode(mul1_repr())->assert_is_ops(matmul_ops); auto* mul1_w_var = pattern->NewNode(mul1_w_repr()) ->AsInput() - ->assert_is_op_input("matmul", "Y"); + ->assert_is_ops_input(matmul_ops, "Y"); auto* mul1_out_var = - pattern->NewNode(mul1_out_repr())->assert_is_op_output("matmul"); + pattern->NewNode(mul1_out_repr())->assert_is_ops_output(matmul_ops); decltype(mul1) eltadd1; decltype(mul1) eltadd1_b_var; @@ -534,16 +535,16 @@ PDNode* MultiHeadMatmulV3Pattern::operator()() { pattern->NewNode(transpose2_1_repr())->assert_is_op("transpose2"); auto* transpose2_1_out_var = pattern->NewNode(transpose2_1_out_repr()) ->assert_is_op_output("transpose2"); - transpose2_1_out_var->AsIntermediate()->assert_is_op_input( - "matmul", "Y"); // link to matmul qk + transpose2_1_out_var->AsIntermediate()->assert_is_ops_input( + matmul_ops, "Y"); // link to matmul qk // Third path to matmul - auto* mul2 = pattern->NewNode(mul2_repr())->assert_is_op("matmul"); + auto* mul2 = pattern->NewNode(mul2_repr())->assert_is_ops(matmul_ops); auto* mul2_w_var = 
pattern->NewNode(mul2_w_repr()) ->AsInput() - ->assert_is_op_input("matmul", "Y"); + ->assert_is_ops_input(matmul_ops, "Y"); auto* mul2_out_var = - pattern->NewNode(mul2_out_repr())->assert_is_op_output("matmul"); + pattern->NewNode(mul2_out_repr())->assert_is_ops_output(matmul_ops); decltype(mul2) eltadd2; decltype(mul2) eltadd2_b_var; diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index 47e9c1fd202a05..504f81bfa01ac6 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -94,6 +94,7 @@ const std::vector kTRTSubgraphPasses({ "reshape2_matmul_fuse_pass", // "flatten2_matmul_fuse_pass", // "map_matmul_to_mul_pass", // + "map_matmul_v2_to_mul_pass", // "fc_fuse_pass", // "conv_elementwise_add_fuse_pass", // "add_support_int8_pass", @@ -142,6 +143,7 @@ GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) { "reshape2_matmul_fuse_pass", // "flatten2_matmul_fuse_pass", // "map_matmul_to_mul_pass", // + "map_matmul_v2_to_mul_pass", // "fc_fuse_pass", // "fc_elementwise_layernorm_fuse_pass", // #if CUDNN_VERSION >= 7100 // To run conv_fusion, the version of cudnn must be @@ -202,6 +204,7 @@ CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) { "reshape2_matmul_fuse_pass", // "flatten2_matmul_fuse_pass", // "map_matmul_to_mul_pass", // + "map_matmul_v2_to_mul_pass", // "fc_fuse_pass", // "repeated_fc_relu_fuse_pass", // "squared_mat_sub_fuse_pass", // diff --git a/paddle/fluid/inference/lite/test_engine_lite.cc b/paddle/fluid/inference/lite/test_engine_lite.cc index 080622899eb2e7..b2750fd070d3eb 100644 --- a/paddle/fluid/inference/lite/test_engine_lite.cc +++ b/paddle/fluid/inference/lite/test_engine_lite.cc @@ -110,23 +110,24 @@ TEST(EngineManager, engine) { }; LOG(INFO) << "Create EngineManager"; - inference::Singleton::Global().Create( - unique_key, config); - LOG(INFO) << "Create EngineManager done"; - ASSERT_EQ( - inference::Singleton::Global().Empty(), - false); - ASSERT_EQ(inference::Singleton::Global().Has( - unique_key), - true); - paddle::lite_api::PaddlePredictor* engine_0 = - inference::Singleton::Global().Get( - unique_key); - CHECK_NOTNULL(engine_0); - inference::Singleton::Global().DeleteAll(); - CHECK(inference::Singleton::Global().Get( - unique_key) == nullptr) - << "the engine_0 should be nullptr"; + // TODO(wilber): The ut is out of date, we need to a new lite subgraph test. 
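+  // Until that test is added, the original EngineManager create/get/delete
+  // assertions are kept below as commented-out code for reference.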
+ // inference::Singleton::Global().Create( + // unique_key, config); + // LOG(INFO) << "Create EngineManager done"; + // ASSERT_EQ( + // inference::Singleton::Global().Empty(), + // false); + // ASSERT_EQ(inference::Singleton::Global().Has( + // unique_key), + // true); + // paddle::lite_api::PaddlePredictor* engine_0 = + // inference::Singleton::Global().Get( + // unique_key); + // CHECK_NOTNULL(engine_0); + // inference::Singleton::Global().DeleteAll(); + // CHECK(inference::Singleton::Global().Get( + // unique_key) == nullptr) + // << "the engine_0 should be nullptr"; } } // namespace lite diff --git a/paddle/fluid/operators/lite/lite_engine_op_test.cc b/paddle/fluid/operators/lite/lite_engine_op_test.cc index 8b7f1268081343..053ba322d8f4de 100644 --- a/paddle/fluid/operators/lite/lite_engine_op_test.cc +++ b/paddle/fluid/operators/lite/lite_engine_op_test.cc @@ -105,15 +105,16 @@ TEST(LiteEngineOp, engine_op) { engine_op_desc.SetAttr("use_gpu", true); engine_op_desc.SetAttr("zero_copy", true); engine_op_desc.SetBlockAttr("sub_block", &block_desc); - inference::Singleton::Global().Create( - engine_key, config); - LOG(INFO) << "create engine op"; - auto engine_op = framework::OpRegistry::CreateOp(engine_op_desc); - LOG(INFO) << "engine_op " << engine_op.get(); - // Execute them. - LOG(INFO) << "engine_op run"; - engine_op->Run(scope, place); - LOG(INFO) << "done"; + // TODO(wilber): The ut is out of date, we need to a new lite subgraph test. + // inference::Singleton::Global().Create( + // engine_key, config); + // LOG(INFO) << "create engine op"; + // auto engine_op = framework::OpRegistry::CreateOp(engine_op_desc); + // LOG(INFO) << "engine_op " << engine_op.get(); + // // Execute them. + // LOG(INFO) << "engine_op run"; + // engine_op->Run(scope, place); + // LOG(INFO) << "done"; } #endif From 63fd7d6604ecb21a7e5fcaa9b5b578ca48cdd356 Mon Sep 17 00:00:00 2001 From: Zeng Jinle <32832641+sneaxiy@users.noreply.github.com> Date: Thu, 14 Oct 2021 16:30:19 +0800 Subject: [PATCH 165/298] refine merge lars (#36428) --- .../operators/optimizers/lars_momentum_op.cu | 48 +++++++++---------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/paddle/fluid/operators/optimizers/lars_momentum_op.cu b/paddle/fluid/operators/optimizers/lars_momentum_op.cu index caefd496978af2..e90f1136fd30da 100644 --- a/paddle/fluid/operators/optimizers/lars_momentum_op.cu +++ b/paddle/fluid/operators/optimizers/lars_momentum_op.cu @@ -28,7 +28,7 @@ limitations under the License. 
*/ #define LARS_BLOCK_SIZE 512 #endif -#define LARS_MAX_MERGED_OPS 150 +#define LARS_MAX_MERGED_OPS 60 namespace paddle { namespace operators { @@ -256,11 +256,8 @@ template struct LarsParamWarpper { int64_t numel_arr[LARS_MAX_MERGED_OPS]; int repeat_arr[LARS_MAX_MERGED_OPS]; - const T* __restrict__ p_arr[LARS_MAX_MERGED_OPS]; const T* __restrict__ g_arr[LARS_MAX_MERGED_OPS]; - const MT* __restrict__ v_arr[LARS_MAX_MERGED_OPS]; const MT* __restrict__ lr_arr[LARS_MAX_MERGED_OPS]; - const MT* __restrict__ master_p_arr[LARS_MAX_MERGED_OPS]; T* __restrict__ p_out_arr[LARS_MAX_MERGED_OPS]; MT* __restrict__ v_out_arr[LARS_MAX_MERGED_OPS]; MT* __restrict__ master_p_out_arr[LARS_MAX_MERGED_OPS]; @@ -268,7 +265,7 @@ struct LarsParamWarpper { }; template -__global__ void MergedMomentumLarsKernel(LarsParamWarpper* lars_warpper, +__global__ void MergedMomentumLarsKernel(LarsParamWarpper lars_warpper, MT* __restrict__ p_buffer, MT* __restrict__ g_buffer, const int op_num, const MT mu, @@ -279,18 +276,18 @@ __global__ void MergedMomentumLarsKernel(LarsParamWarpper* lars_warpper, int tid = threadIdx.x + blockIdx.x * blockDim.x; const cooperative_groups::grid_group cg = cooperative_groups::this_grid(); for (int i = 0; i < op_num; ++i) { - int numel = lars_warpper->numel_arr[i]; + int numel = lars_warpper.numel_arr[i]; MT param_norm = static_cast(0); MT grad_norm = static_cast(0); - L2NormKernel(&cg, lars_warpper->p_arr[i], lars_warpper->g_arr[i], - p_buffer, g_buffer, numel, lars_warpper->repeat_arr[i], + L2NormKernel(&cg, lars_warpper.p_out_arr[i], lars_warpper.g_arr[i], + p_buffer, g_buffer, numel, lars_warpper.repeat_arr[i], rescale_grad, 0, ¶m_norm, &grad_norm); MomentumUpdate( - lars_warpper->p_arr[i], lars_warpper->g_arr[i], - lars_warpper->v_out_arr[i], lars_warpper->p_out_arr[i], - lars_warpper->v_out_arr[i], lars_warpper->master_p_arr[i], - lars_warpper->master_p_out_arr[i], lars_warpper->lr_arr[i], mu, - lars_warpper->weight_decay_arr[i], lars_coeff, epsilon, rescale_grad, + lars_warpper.p_out_arr[i], lars_warpper.g_arr[i], + lars_warpper.v_out_arr[i], lars_warpper.p_out_arr[i], + lars_warpper.v_out_arr[i], lars_warpper.master_p_out_arr[i], + lars_warpper.master_p_out_arr[i], lars_warpper.lr_arr[i], mu, + lars_warpper.weight_decay_arr[i], lars_coeff, epsilon, rescale_grad, param_norm, grad_norm, tid, grid_stride, numel, is_amp); } } @@ -410,15 +407,21 @@ class LarsMomentumOpCUDAKernel : public framework::OpKernel { size_t temp_numel = param[i]->numel(); total_numel += temp_numel; lars_warpper.numel_arr[i] = temp_numel; - lars_warpper.p_arr[i] = param[i]->data(); lars_warpper.g_arr[i] = grad[i]->data(); - lars_warpper.v_arr[i] = velocity[i]->data(); lars_warpper.lr_arr[i] = learning_rate[i]->data(); lars_warpper.p_out_arr[i] = param_out[i]->mutable_data(ctx.GetPlace()); lars_warpper.v_out_arr[i] = velocity_out[i]->mutable_data(ctx.GetPlace()); lars_warpper.weight_decay_arr[i] = static_cast(weight_decay_arr[i]); + PADDLE_ENFORCE_EQ( + param[i]->data(), lars_warpper.p_out_arr[i], + platform::errors::InvalidArgument( + "Input(Param) and Output(ParamOut) must be the same Tensors.")); + PADDLE_ENFORCE_EQ(velocity[i]->data(), lars_warpper.v_out_arr[i], + platform::errors::InvalidArgument( + "Input(Velocity) and Output(VelocityOut) must be " + "the same Tensors.")); } int64_t avg_numel = total_numel / op_num; LarsThreadConfig lars_thread_config(avg_numel, sm_num, @@ -429,19 +432,16 @@ class LarsMomentumOpCUDAKernel : public framework::OpKernel { } if (multi_precision) { for (int i = 0; i < op_num; ++i) 
{ - lars_warpper.master_p_arr[i] = master_param[i]->data(); lars_warpper.master_p_out_arr[i] = master_param_out[i]->mutable_data(ctx.GetPlace()); + PADDLE_ENFORCE_EQ(master_param[i]->data(), + lars_warpper.master_p_out_arr[i], + platform::errors::InvalidArgument( + "Input(MasterParam) and Output(MasterParamOut) " + "must be the same Tensors.")); } } - auto merged_buf = memory::Alloc(cuda_ctx, sizeof(lars_warpper)); - auto* merged_ptr = - reinterpret_cast*>(merged_buf->ptr()); - memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, cuda_ctx.GetPlace()), - reinterpret_cast(merged_ptr), platform::CPUPlace(), - reinterpret_cast(&lars_warpper), sizeof(lars_warpper), - cuda_ctx.stream()); - void* cuda_param[] = {reinterpret_cast(&merged_ptr), + void* cuda_param[] = {reinterpret_cast(&lars_warpper), reinterpret_cast(&p_buffer), reinterpret_cast(&g_buffer), reinterpret_cast(&op_num), From 3cf5764692fcd1ca6499930f50601611f56463a1 Mon Sep 17 00:00:00 2001 From: levi131 <83750468+levi131@users.noreply.github.com> Date: Thu, 14 Oct 2021 16:45:27 +0800 Subject: [PATCH 166/298] enable 3rd order test case (#36427) --- .../paddle/fluid/tests/unittests/autograd/test_hessian.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/autograd/test_hessian.py b/python/paddle/fluid/tests/unittests/autograd/test_hessian.py index 120a6c853e8d89..1aa0d94de16308 100644 --- a/python/paddle/fluid/tests/unittests/autograd/test_hessian.py +++ b/python/paddle/fluid/tests/unittests/autograd/test_hessian.py @@ -16,6 +16,7 @@ import numpy as np import paddle import paddle.compat as cpt +import paddle.nn.functional as F from utils import _compute_numerical_hessian @@ -107,10 +108,9 @@ def func(x): error_msg = cpt.get_exception_message(e) assert error_msg.find("has no gradient") > 0 - # TODO(levi): enable this test case when matmul_grad_grad_grad is ok - def _test_create_graph_true(self): + def test_create_graph_true(self): def func(x): - return paddle.sum(paddle.matmul(x, x)) + return paddle.sum(F.sigmoid(x)) numerical_hessian = _compute_numerical_hessian( func, self.x, self.numerical_delta, self.np_dtype) From 8256f6fa862e8f46dbd162de8f65939c5f6eeaa9 Mon Sep 17 00:00:00 2001 From: Zeng Jinle <32832641+sneaxiy@users.noreply.github.com> Date: Thu, 14 Oct 2021 17:53:40 +0800 Subject: [PATCH 167/298] fix lars (#36431) --- .../operators/optimizers/lars_momentum_op.cu | 41 ++++++++++++++----- 1 file changed, 31 insertions(+), 10 deletions(-) diff --git a/paddle/fluid/operators/optimizers/lars_momentum_op.cu b/paddle/fluid/operators/optimizers/lars_momentum_op.cu index e90f1136fd30da..b640e62221f777 100644 --- a/paddle/fluid/operators/optimizers/lars_momentum_op.cu +++ b/paddle/fluid/operators/optimizers/lars_momentum_op.cu @@ -165,8 +165,10 @@ __global__ void L2NormKernel( int tid = threadIdx.x + blockDim.x * blockIdx.x; int grid_stride = LARS_BLOCK_SIZE * gridDim.x; const MT rescale_pow = rescale_grad * rescale_grad; - s_buffer[0] = static_cast(0); - s_buffer[1] = static_cast(0); + if (threadIdx.x == 0) { + s_buffer[0] = static_cast(0); + s_buffer[1] = static_cast(0); + } MT p_tmp = static_cast(0); MT g_tmp = static_cast(0); @@ -175,8 +177,12 @@ __global__ void L2NormKernel( p_tmp = static_cast(p_data[tid]); g_tmp = static_cast(g_data[tid]); } - s_buffer[0] += math::blockReduceSum(p_tmp * p_tmp, FINAL_MASK); - s_buffer[1] += math::blockReduceSum(g_tmp * g_tmp, FINAL_MASK); + MT tmp0 = math::blockReduceSum(p_tmp * p_tmp, FINAL_MASK); + MT tmp1 = math::blockReduceSum(g_tmp * g_tmp, 
FINAL_MASK); + if (threadIdx.x == 0) { + s_buffer[0] += tmp0; + s_buffer[1] += tmp1; + } } else { /* Avoid occupy too much temp buffer. Slice the whole data into 2 parts, the front of data whose quantity is excatly multiple of grid-thread @@ -185,8 +191,12 @@ __global__ void L2NormKernel( p_tmp = static_cast(p_data[tid]); g_tmp = static_cast(g_data[tid]); tid += grid_stride; - s_buffer[0] += math::blockReduceSum(p_tmp * p_tmp, FINAL_MASK); - s_buffer[1] += math::blockReduceSum(g_tmp * g_tmp, FINAL_MASK); + MT tmp0 = math::blockReduceSum(p_tmp * p_tmp, FINAL_MASK); + MT tmp1 = math::blockReduceSum(g_tmp * g_tmp, FINAL_MASK); + if (threadIdx.x == 0) { + s_buffer[0] += tmp0; + s_buffer[1] += tmp1; + } __syncthreads(); } MT p_val = 0; @@ -195,8 +205,12 @@ __global__ void L2NormKernel( p_val = static_cast(p_data[tid]); g_val = static_cast(g_data[tid]); } - s_buffer[0] += math::blockReduceSum(p_val * p_val, FINAL_MASK); - s_buffer[1] += math::blockReduceSum(g_val * g_val, FINAL_MASK); + MT tmp0 = math::blockReduceSum(p_val * p_val, FINAL_MASK); + MT tmp1 = math::blockReduceSum(g_val * g_val, FINAL_MASK); + if (threadIdx.x == 0) { + s_buffer[0] += tmp0; + s_buffer[1] += tmp1; + } } __syncthreads(); @@ -208,8 +222,15 @@ __global__ void L2NormKernel( cg->sync(); // Grid sync for writring partial result to gloabl memory MT p_part_sum = threadIdx.x < gridDim.x ? p_buffer[threadIdx.x] : 0; MT g_part_sum = threadIdx.x < gridDim.x ? g_buffer[threadIdx.x] : 0; - *p_n = Sqrt(math::blockReduceSum(p_part_sum, FINAL_MASK)); - *g_n = Sqrt(rescale_pow * math::blockReduceSum(g_part_sum, FINAL_MASK)); + MT tmp0 = math::blockReduceSum(p_part_sum, FINAL_MASK); + MT tmp1 = math::blockReduceSum(g_part_sum, FINAL_MASK); + if (threadIdx.x == 0) { + s_buffer[0] = tmp0; + s_buffer[1] = tmp1; + } + __syncthreads(); + *p_n = Sqrt(s_buffer[0]); + *g_n = Sqrt(rescale_pow * s_buffer[1]); #endif } From 66c58fa3460da2f573a296169479f79dae1e9e17 Mon Sep 17 00:00:00 2001 From: duanboqiang Date: Thu, 14 Oct 2021 18:38:21 +0800 Subject: [PATCH 168/298] optimize-offload support adamw op type (#36432) --- .../fleet/meta_optimizers/sharding/offload_helper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding/offload_helper.py b/python/paddle/distributed/fleet/meta_optimizers/sharding/offload_helper.py index bb6af1b3195f70..9c751c5044701b 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/sharding/offload_helper.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding/offload_helper.py @@ -284,7 +284,7 @@ def offload(self, block, startup_block): break vars_name = [] - if op.type == "adam": + if op.type == "adam" or op.type == "adamw": # {Moment1Out = [''], Moment2Out = [''], ParamOut = ['']} = # adam(inputs={Moment1 = [''], Moment2 = [''], Param = ['']}) vars_name.append(op.desc.input("Moment1")[0]) From 6ccc2a40aa65a3b56563ff932da77fff2005d4fe Mon Sep 17 00:00:00 2001 From: Yanxing Shi <48111042+Yanxing-Shi@users.noreply.github.com> Date: Thu, 14 Oct 2021 19:17:04 +0800 Subject: [PATCH 169/298] add sparse_embedding doc (#36283) * add sparse_embedding doc * delete wrong space * fix error for sample code * fix error for doc compile * delete __all__ * modify sample code --- python/paddle/fluid/contrib/layers/nn.py | 111 ++++++++++++++++++++++- 1 file changed, 110 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/contrib/layers/nn.py b/python/paddle/fluid/contrib/layers/nn.py index eb2c94b20106c5..99ede353c1081e 100644 --- 
a/python/paddle/fluid/contrib/layers/nn.py +++ b/python/paddle/fluid/contrib/layers/nn.py @@ -971,12 +971,121 @@ def sparse_embedding(input, table_class="CommonSparseTable", param_attr=None, dtype='float32'): + r""" + :api_attr: Static Graph + + The OP is used as the operator of the Embedding Lookup layer in the large-scale + sparse training of the parameter server mode, instead of using the paddle.nn.functional.embedding. + + The operator is used to lookup embeddings vector of ids provided by :attr:`input` . + It automatically constructs a 2D embedding matrix based on the input :attr:`size` + (vocab_size, emb_size) and :attr:`dtype` . + + The shape of output Tensor is generated by appending an emb_size dimension to the + last dimension of the input Tensor shape. + + **Note:** The id in :attr:`input` must satisfy :math:`0 =< id < size[0]` , otherwise + the program will throw an exception and exit. + + .. code-block:: text + + Case 1: + + input is a Tensor. padding_idx = -1 + input.data = [[1, 3], [2, 4], [4, 127]] + input.shape = [3, 2] + Given size = [128, 16] + output is a Tensor: + out.shape = [3, 2, 16] + out.data = [[[0.129435295, 0.244512452, ..., 0.436322452], + [0.345421456, 0.524563927, ..., 0.144534654]], + + [[0.345249859, 0.124939536, ..., 0.194353745], + [0.945345345, 0.435394634, ..., 0.435345365]], + + [[0.945345345, 0.435394634, ..., 0.435345365], + [0.0, 0.0, ..., 0.0 ]]] # padding data + The input padding_idx is less than 0, it is automatically converted to padding_idx = -1 + 128 = 127 + It will pad all-zero data when ids is 127. + + Case 2: + + input is a LoDTensor with 1-level LoD. padding_idx = 0 + input.lod = [[2, 3]] + input.data = [[1], [3], [2], [4], [0]] + input.shape = [5, 1] + Given size = [128, 16] + output is a LoDTensor: + out.lod = [[2, 3]] + out.shape = [5, 1, 16] + out.data = [[[0.129435295, 0.244512452, ..., 0.436322452]], + [[0.345421456, 0.524563927, ..., 0.144534654]], + [[0.345249859, 0.124939536, ..., 0.194353745]], + [[0.945345345, 0.435394634, ..., 0.435345365]], + [[0.0, 0.0, ..., 0.0 ]]] # padding data + It will pad all-zero data when ids is 0. + + Args: + input(Variable): A Tensor or LoDTensor with type int64, which contains the id + information. The value of the input id should satisfy :math:`0<= id < size[0]` . + size(tuple|list): The shape of lookup table parameter (vocab_size, emb_size). It + should have two elements which indicates the size of the dictionary of embeddings + and the size of each embedding vector respectively. The initial parameter size + is 0 in the large-scale sparse scenario, which will gradually expand with the + training. So if vocab_size is temporarily useless, its value can be any integer. + The emb_size is the dimensional configuration of the word embedding weight parameter. + padding_idx(int|long|None, optional): padding_idx needs to be in the interval [-vocab_size, vocab_size). + If :math:`padding\_idx < 0`, the :math:`padding\_idx` will automatically be converted + to :math:`vocab\_size + padding\_idx` . It will output all-zero padding data whenever + lookup encounters :math:`padding\_idx` in id. And the padding data will not be updated + while training. If set None, it makes no efe mfect to output. Default: None. + is_test(bool, optional): Training or prediction mode. In prediction mode (is_test=False), + the output is not initialized and created, and it is filled with 0 and returned. Default: False. 
+ entry(str, optional): Entry config with parameter server whose value is ProbabilityEntry, + CountFilterEntry or None. Default: None. + table_class(str, optional): The type of the sparse table. The value can be CommonSparseTable + or SSDSparseTable. The default is CommonSparseTable. + param_attr(ParamAttr, optional): To specify the weight parameter property. Default: None, which means the + default weight parameter property is used. In addition, user-defined or pre-trained word + vectors can be loaded with the :attr:`param_attr` parameter. The local word vector needs + to be transformed into numpy format, and the shape of local word vector should be consistent + with :attr:`size` . + dtype(str|core.VarDesc.VarType): It refers to the data type of output Tensor. It must be float32 or + float64. Default: float32. + + Returns: + Variable: Embedding Tensor or LoDTensor mapped by input. The data type is the same as :attr:`dtype` . + + Examples: + .. code-block:: python + + import paddle + + paddle.enable_static() + sparse_feature_dim = 1024 + embedding_size = 64 + + # Only when the feature appear more than 10 times or more will be participated in the training. + entry = paddle.distributed.CountFilterEntry(10) + + input = paddle.static.data(name='ins', shape=[1], dtype='int64') + + emb = paddle.static.nn.sparse_embedding( + input=input, + size=[sparse_feature_dim, embedding_size], + is_test=False, + entry=entry, + param_attr=paddle.ParamAttr(name="SparseFeatFactors", + initializer=paddle.nn.initializer.Uniform())) + + """ + helper = LayerHelper('sparse_embedding', **locals()) check_variable_and_dtype(input, 'input', ['int64'], 'fluid.contrib.layers.sparse_embedding') - check_dtype(dtype, 'dtype', ['float32'], + check_dtype(dtype, 'dtype', ['float32', 'float64'], 'paddle.static.nn.sparse_embedding') w = helper.create_parameter( From 8566cc98de9a5d42dbe58a65ab42640d30c17337 Mon Sep 17 00:00:00 2001 From: wuhuanzhou Date: Fri, 15 Oct 2021 10:43:08 +0800 Subject: [PATCH 170/298] close some check on CI-OP-Benchmark, test=develop (#36442) --- tools/test_ci_op_benchmark.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/test_ci_op_benchmark.sh b/tools/test_ci_op_benchmark.sh index d60556a242d9a4..23df51f09c8e6a 100644 --- a/tools/test_ci_op_benchmark.sh +++ b/tools/test_ci_op_benchmark.sh @@ -298,7 +298,7 @@ function cpu_op_benchmark { prepare_benchmark_environment load_CHANGE_OP_MAP load_BENCHMARK_OP_MAP - check_CHANGE_OP_MAP + # check_CHANGE_OP_MAP build_whl LOG "[INFO] Op benchmark run success and no error!" exit 0 From 4dda18a8b4f1af281483a16d456798ab00aed1db Mon Sep 17 00:00:00 2001 From: Zeng Jinle <32832641+sneaxiy@users.noreply.github.com> Date: Fri, 15 Oct 2021 11:07:29 +0800 Subject: [PATCH 171/298] fix momentum ops (#36452) --- .../fluid/operators/optimizers/momentum_op.h | 67 ++++++++++--------- .../unittests/test_merged_momentum_op.py | 9 ++- 2 files changed, 41 insertions(+), 35 deletions(-) diff --git a/paddle/fluid/operators/optimizers/momentum_op.h b/paddle/fluid/operators/optimizers/momentum_op.h index f461dec66c0e75..2d713308fd9389 100644 --- a/paddle/fluid/operators/optimizers/momentum_op.h +++ b/paddle/fluid/operators/optimizers/momentum_op.h @@ -173,14 +173,15 @@ class CPUDenseMomentumFunctor { } }; -template +template class DenseMomentumFunctor; // NOTE(dzh) for performance. // avoid if/else in inside kernel, implement GPU UseNesterov/NoNesterov as two // functor. 
-template -class DenseMomentumFunctor { +template +class DenseMomentumFunctor { private: const T* param_; const T* grad_; @@ -193,7 +194,6 @@ class DenseMomentumFunctor { T* param_out_; MT* velocity_out_; MT* master_param_out_; - const RegularizationType regularization_flag_; const MT regularization_coeff_; public: @@ -201,7 +201,6 @@ class DenseMomentumFunctor { const MultiPrecisionType* learning_rate, const MT* master_param, const MT mu, const MT rescale_grad, const int64_t num, - const RegularizationType regularization_flag, const MT regularization_coeff, T* param_out, MT* velocity_out, MT* master_param_out) : param_(param), @@ -215,7 +214,6 @@ class DenseMomentumFunctor { param_out_(param_out), velocity_out_(velocity_out), master_param_out_(master_param_out), - regularization_flag_(regularization_flag), regularization_coeff_(regularization_coeff) {} inline HOSTDEVICE void operator()(size_t i) const { // put memory access in register @@ -225,9 +223,9 @@ class DenseMomentumFunctor { const MT lr = static_cast(lr_[0]); const MT velocity = velocity_[i]; - grad = regularization_flag_ == RegularizationType::kL2DECAY - ? grad + regularization_coeff_ * param - : grad; + if (kRegType == RegularizationType::kL2DECAY) { + grad += regularization_coeff_ * param; + } MT velocity_out = velocity * mu_ + grad; MT param_out = param - (grad + velocity_out * mu_) * lr; @@ -240,8 +238,8 @@ class DenseMomentumFunctor { } }; -template -class DenseMomentumFunctor { +template +class DenseMomentumFunctor { private: const T* param_; const T* grad_; @@ -254,7 +252,6 @@ class DenseMomentumFunctor { T* param_out_; MT* velocity_out_; MT* master_param_out_; - const RegularizationType regularization_flag_; const MT regularization_coeff_; public: @@ -262,7 +259,6 @@ class DenseMomentumFunctor { const MultiPrecisionType* learning_rate, const MT* master_param, const MT mu, const MT rescale_grad, const int64_t num, - const RegularizationType regularization_flag, const MT regularization_coeff, T* param_out, MT* velocity_out, MT* master_param_out) : param_(param), @@ -276,7 +272,6 @@ class DenseMomentumFunctor { param_out_(param_out), velocity_out_(velocity_out), master_param_out_(master_param_out), - regularization_flag_(regularization_flag), regularization_coeff_(regularization_coeff) {} inline HOSTDEVICE void operator()(size_t i) const { // put memory access in register @@ -286,9 +281,9 @@ class DenseMomentumFunctor { const MT lr = static_cast(lr_[0]); const MT velocity = velocity_[i]; - grad = regularization_flag_ == RegularizationType::kL2DECAY - ? 
grad + regularization_coeff_ * param - : grad; + if (kRegType == RegularizationType::kL2DECAY) { + grad += regularization_coeff_ * param; + } MT velocity_out = velocity * mu_ + grad; MT param_out = param - lr * velocity_out; @@ -522,23 +517,31 @@ class MomentumOpKernel : public framework::OpKernel { platform::ForRange for_range( static_cast(ctx.device_context()), param->numel()); - if (use_nesterov) { - DenseMomentumFunctor functor( - param->data(), grad->data(), velocity->data(), - learning_rate->data(), master_in_data, mu, rescale_grad, - param->numel(), regularization_flag, regularization_coeff, - param_out->mutable_data(ctx.GetPlace()), - velocity_out->mutable_data(ctx.GetPlace()), master_out_data); - for_range(functor); +#define PADDLE_LAUNCH_DENSE_MOMENTUM_KERNEL(__nesterov, __reg_type) \ + DenseMomentumFunctor functor( \ + param->data(), grad->data(), velocity->data(), \ + learning_rate->data(), master_in_data, mu, rescale_grad, \ + param->numel(), regularization_coeff, \ + param_out->mutable_data(ctx.GetPlace()), \ + velocity_out->mutable_data(ctx.GetPlace()), master_out_data); \ + for_range(functor); + if (use_nesterov) { + if (regularization_flag == RegularizationType::kL2DECAY) { + PADDLE_LAUNCH_DENSE_MOMENTUM_KERNEL(UseNesterov, + RegularizationType::kL2DECAY); + } else { + PADDLE_LAUNCH_DENSE_MOMENTUM_KERNEL(UseNesterov, + RegularizationType::kNONE); + } } else { - DenseMomentumFunctor functor( - param->data(), grad->data(), velocity->data(), - learning_rate->data(), master_in_data, mu, rescale_grad, - param->numel(), regularization_flag, regularization_coeff, - param_out->mutable_data(ctx.GetPlace()), - velocity_out->mutable_data(ctx.GetPlace()), master_out_data); - for_range(functor); + if (regularization_flag == RegularizationType::kL2DECAY) { + PADDLE_LAUNCH_DENSE_MOMENTUM_KERNEL(NoNesterov, + RegularizationType::kL2DECAY); + } else { + PADDLE_LAUNCH_DENSE_MOMENTUM_KERNEL(NoNesterov, + RegularizationType::kNONE); + } } } diff --git a/python/paddle/fluid/tests/unittests/test_merged_momentum_op.py b/python/paddle/fluid/tests/unittests/test_merged_momentum_op.py index 0118a372c3f4d4..96e458795a3c08 100644 --- a/python/paddle/fluid/tests/unittests/test_merged_momentum_op.py +++ b/python/paddle/fluid/tests/unittests/test_merged_momentum_op.py @@ -102,7 +102,7 @@ def run_momentum_op(params, 'Param': p, 'Grad': g, 'Velocity': v, - 'LearningRate': lr_var + 'LearningRate': lr_var, } outputs = {'ParamOut': p, 'VelocityOut': v} if multi_precision: @@ -115,7 +115,7 @@ def run_momentum_op(params, 'Param': param_vars, 'Grad': grad_vars, 'Velocity': velocity_vars, - 'LearningRate': lr_var + 'LearningRate': lr_var, } outputs = {'ParamOut': param_vars, 'VelocityOut': velocity_vars} if multi_precision: @@ -176,7 +176,10 @@ def run_op(use_merged): outs2 = run_op(False) self.assertEqual(len(outs1), len(outs2)) for i, (out1, out2) in enumerate(zip(outs1, outs2)): - self.assertTrue(np.allclose(out1, out2, atol=1e-7)) + if isinstance(place, paddle.CUDAPlace): + self.assertTrue(np.array_equal(out1, out2)) + else: + self.assertTrue(np.allclose(out1, out2, atol=1e-7)) def get_places(self): places = [paddle.CPUPlace()] From 808be6574a46e552688acdd3066e271598c4f132 Mon Sep 17 00:00:00 2001 From: Jiabin Yang Date: Fri, 15 Oct 2021 11:59:29 +0800 Subject: [PATCH 172/298] [New Feature] Support tanh triple grad (#36225) * native commit for triple grad of sigmod * Updated unittests files * init functional jacobian api * Updated trible_test func * Updated gradient_checker & test_script * finish test with 
dtype float32 * add float64 test case * polish code * use atol=1e-5 with dtype float64 * fix for ci * set timeout for test_jacobian * fix dygraph grad to support high differential * polish API docstring * Updated gradient checker and some related files * fix double grad strip error for high differential * fix double grad strip error for high differential * Add Sigmoid triple grad tests * fix dygraph double grad dtype error when calling for high differential senario * Updated triple grad teses func * Use np.random to initialize ddx * Updated triple_grad_check func * add todo for gradient checker and refine some comments * remove additional code * add test for warnging in backward.py * add tanh triple grad * format python code * refine code Co-authored-by: veyron95 Co-authored-by: levi131 --- paddle/fluid/operators/activation_op.cc | 46 ++++++- paddle/fluid/operators/activation_op.cu | 9 ++ paddle/fluid/operators/activation_op.h | 112 ++++++++++++++++++ .../unittests/test_activation_nn_grad.py | 22 ++++ 4 files changed, 188 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index 3cdcfd79235596..5e5cd0ea1c504d 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -940,6 +940,34 @@ class TanhDoubleGradMaker : public ::paddle::framework::SingleGradOpMaker { } }; +template +class TanhTripleGradMaker : public ::paddle::framework::SingleGradOpMaker { + public: + using ::paddle::framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType("tanh_triple_grad"); + // Out, DDX, DOut, D_DDOut, D_DOut_New // input + // D_OutNew, D_DOut, D_DDx // output + // input1: Out + op->SetInput("Out", this->Input("Out")); + // input2: ddx + op->SetInput("DDX", this->Input("DDX")); + // input3: dout + op->SetInput("DOut", this->Input("DOut")); + // input4: d_ddout + op->SetInput("D_DDOut", this->OutputGrad("DDOut")); + // input5: d_dout_new + op->SetInput("D_DOut_New", this->OutputGrad("DOutNew")); + op->SetAttrMap(this->Attrs()); + + // output: d_dOut, d_OutNew, d_ddx + op->SetOutput("D_OutNew", this->InputGrad("Out")); + op->SetOutput("D_DOut", this->InputGrad("DOut")); + op->SetOutput("D_DDx", this->InputGrad("DDX")); + } +}; // ReluGrad: dx = dy if y >= 0 else 0 // ReluGradGrad: ddy = ddx if y >= 0 else 0 template @@ -1299,7 +1327,14 @@ REGISTER_OPERATOR(tanh_grad, ops::ActivationOpGrad, REGISTER_OPERATOR( tanh_grad_grad, ops::ActivationOpDoubleGrad::FwdDeps()>, - ops::ActivationDoubleGradOpInplaceInferer); + ops::ActivationDoubleGradOpInplaceInferer, + ops::TanhTripleGradMaker, + ops::TanhTripleGradMaker); + +REGISTER_OPERATOR( + tanh_triple_grad, + ops::ActivationOpTripleGrad::FwdDeps()>, + ops::ActivationTripleGradOpInplaceInferer); REGISTER_ACTIVATION_CPU_KERNEL(tanh, Tanh, TanhFunctor, TanhGradFunctor); REGISTER_OP_CPU_KERNEL( @@ -1309,6 +1344,15 @@ REGISTER_OP_CPU_KERNEL( ops::TanhGradGradFunctor>, ops::TanhDoubleGradKernel>); +// Register TripleGrad Kernel +REGISTER_OP_CPU_KERNEL( + tanh_triple_grad, + ops::TanhTripeGradKernel>, + ops::TanhTripeGradKernel>, + ops::TanhTripeGradKernel>); /* ========================================================================== */ /* ========================== relu register ============================= */ diff --git a/paddle/fluid/operators/activation_op.cu b/paddle/fluid/operators/activation_op.cu index d83a63015cfe5b..cde8e9a4507441 100644 --- a/paddle/fluid/operators/activation_op.cu 
+++ b/paddle/fluid/operators/activation_op.cu @@ -1487,6 +1487,15 @@ REGISTER_OP_CUDA_KERNEL( ops::TanhGradGradFunctor>, ops::TanhDoubleGradKernel>); + +REGISTER_OP_CUDA_KERNEL( + tanh_triple_grad, + ops::TanhTripeGradKernel>, + ops::TanhTripeGradKernel>, + ops::TanhTripeGradKernel>); /* ========================================================================== */ /* =========================== sqrt register ============================= */ diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h index a6240c038b1100..627522e1da06d9 100644 --- a/paddle/fluid/operators/activation_op.h +++ b/paddle/fluid/operators/activation_op.h @@ -536,6 +536,61 @@ struct TanhGradGradFunctor : public BaseActivationFunctor { } static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } }; +/* + Out + DOut D_Dout + DDx -> TanhTripleGrad -> D_DDx + D_DDout d_OutNew + D_Dout_new + + D_Dout = (-2) * Out * DDx * D_Dout_new + D_DDx = (1-Out^2)*D_DDout + (-2) * Out * DOut * D_Dout_new + D_OutNew = (-2) * Out * DDx * D_DDout + (-2) * DOut * DDx * D_Dout_new + + Out, DDX, DOut, D_DDOut, D_DOut_New // input + D_OutNew, D_DOut, D_DDx // output +*/ +template +struct TanhTripleGradFunctor : public BaseActivationFunctor { + template + void operator()(const Device& dev, const framework::Tensor* Out, + const framework::Tensor* ddX, const framework::Tensor* dOut, + const framework::Tensor* d_DDOut, + const framework::Tensor* d_dOut_New, + framework::Tensor* d_d_Out, framework::Tensor* d_Out_New, + framework::Tensor* d_DDx) const { + auto* d = dev.eigen_device(); + auto ddx = framework::EigenVector::Flatten( + GET_DATA_SAFELY(ddX, "Input", "DDX", "TanhTripleGrad")); + auto out = framework::EigenVector::Flatten( + GET_DATA_SAFELY(Out, "Input", "Out", "TanhTripleGrad")); + auto dout = framework::EigenVector::Flatten( + GET_DATA_SAFELY(dOut, "Input", "DOut", "TanhTripleGrad")); + auto d_ddOut = framework::EigenVector::Flatten( + GET_DATA_SAFELY(d_DDOut, "Input", "D_DDOut", "TanhTripleGrad")); + auto d_dOutNew = framework::EigenVector::Flatten( + GET_DATA_SAFELY(d_dOut_New, "Input", "D_DOut_New", "TanhTripleGrad")); + + if (d_Out_New) { + auto d_OutNew = framework::EigenVector::Flatten( + GET_DATA_SAFELY(d_Out_New, "Output", "D_OutNew", "TanhTripleGrad")); + d_OutNew.device(*d) = (static_cast(-2) * out * ddx * d_ddOut) - + (static_cast(2) * dout * ddx * d_dOutNew); + } + if (d_d_Out) { + auto d_dOut = framework::EigenVector::Flatten( + GET_DATA_SAFELY(d_d_Out, "Output", "D_DOut", "TanhTripleGrad")); + d_dOut.device(*d) = static_cast(-2) * out * ddx * d_dOutNew; + } + if (d_DDx) { + auto d_ddx = framework::EigenVector::Flatten( + GET_DATA_SAFELY(d_DDx, "Output", "D_DDx", "TanhTripleGrad")); + d_ddx.device(*d) = (static_cast(1) - (out * out)) * d_ddOut - + static_cast(2) * out * dout * d_dOutNew; + } + } + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } +}; // tanhshrink(x) = x - tanh(x) // where tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x)) @@ -2137,6 +2192,63 @@ class TanhDoubleGradKernel functor(place, Out, ddX, dOut, dOutNew, ddOut); } }; + +template +class TanhTripeGradKernel + : public framework::OpKernel { + public: + using T = typename Functor::ELEMENT_TYPE; + void Compute(const framework::ExecutionContext& ctx) const override { + const framework::Tensor *Out, *ddX, *dOut, *d_ddOut, *d_dOutNew; + framework::Tensor *d_OutNew, *d_dOut, *d_ddx; + Out = ddX = dOut = d_ddOut = d_dOutNew = nullptr; + d_OutNew = d_dOut = d_ddx = nullptr; + + // extract ddx(input), 
out(input), dOut(input), d_ddOut(input), + // d_dOutNew(input) + ddX = ctx.Input("DDX"); + Out = ctx.Input("Out"); + dOut = ctx.Input("DOut"); + d_ddOut = ctx.Input("D_DDOut"); + d_dOutNew = ctx.Input("D_DOut_New"); + + PADDLE_ENFORCE_NOT_NULL( + ddX, platform::errors::NotFound( + "Cannot get input Variable ddX, variable name = %s", + ctx.InputName("DDX"))); + PADDLE_ENFORCE_NOT_NULL( + Out, platform::errors::NotFound( + "Cannot get input Variable Out, variable name = %s", + ctx.InputName("Out"))); + PADDLE_ENFORCE_NOT_NULL( + dOut, platform::errors::NotFound( + "Cannot get input Variable dOut, variable name = %s", + ctx.InputName("DOut"))); + PADDLE_ENFORCE_NOT_NULL( + d_ddOut, platform::errors::NotFound( + "Cannot get input Variable d_ddOut, variable name = %s", + ctx.InputName("D_DDOut"))); + PADDLE_ENFORCE_NOT_NULL( + d_dOutNew, + platform::errors::NotFound( + "Cannot get input Variable d_dOutNew, variable name = %s", + ctx.InputName("D_DOutNew"))); + + // set output d_OutNew、d_dOut、d_ddx + d_dOut = ctx.Output("D_DOut"); + d_OutNew = ctx.Output("D_OutNew"); + d_ddx = ctx.Output("D_DDx"); + + if (d_dOut) d_dOut->mutable_data(Out->dims(), ctx.GetPlace()); + if (d_OutNew) d_OutNew->mutable_data(Out->dims(), ctx.GetPlace()); + if (d_ddx) d_ddx->mutable_data(ddX->dims(), ctx.GetPlace()); + auto& place = ctx.template device_context(); + Functor functor; + functor(place, Out, ddX, dOut, d_ddOut, d_dOutNew, // input + d_dOut, d_OutNew, d_ddx); // output + } +}; + template class SquareDoubleGradKernel : public framework::OpKernel { diff --git a/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py b/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py index c54f711c7ce129..825d74388bc0b4 100644 --- a/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py +++ b/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py @@ -71,6 +71,28 @@ def test_grad(self): self.func(p) +class TestTanhTripleGradCheck(unittest.TestCase): + @prog_scope() + def func(self, place): + shape = [2, 3, 7, 9] + eps = 0.0005 + dtype = np.float64 + x = layers.data('x', shape, False, dtype=dtype) + x.persistable = True + y = layers.tanh(x) + x_arr = np.random.random(shape).astype(dtype) + x_arr[np.abs(x_arr) < 0.005] = 0.002 + gradient_checker.triple_grad_check( + [x], y, x_init=x_arr, place=place, eps=eps) + + def test_grad(self): + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + for p in places: + self.func(p) + + class TestTanhDoubleGradCheck(unittest.TestCase): @prog_scope() def func(self, place): From b3f02c57ea6d4088c58458f56b9041848bbd7ae4 Mon Sep 17 00:00:00 2001 From: jiangcheng Date: Fri, 15 Oct 2021 12:27:06 +0800 Subject: [PATCH 173/298] Add BuildCinnPass (#36345) * Add CinnSubgraphSearchPass * solve CI problem of subgraph order not same * fix some bug by review advices * ensure the independently of subgraph, that mean the subgraph should not have link to out-graph * rename cinn_subgraph_search_pass to build_cinn_pass and delete paddle_to_cinn_pass * add flag to control wheter append build cinn pass * remove AppendPass at ParallelExecutorPassBuilder * rename paddle_to_cinn_pass to build_cinn_pass in build_strategy and close test_run_from_cinn --- paddle/fluid/framework/details/CMakeLists.txt | 2 +- .../fluid/framework/details/build_strategy.cc | 3 +- paddle/fluid/framework/ir/CMakeLists.txt | 2 - .../fluid/framework/ir/paddle_to_cinn_pass.cc | 31 -- .../fluid/framework/ir/paddle_to_cinn_pass.h | 30 -- 
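
The closed-form expressions in TanhTripleGradFunctor can be sanity-checked outside the framework: treating the double-grad map (Out, DOut, DDx) -> (DOutNew, DDOut) as an ordinary elementwise function, its vector-Jacobian product should reproduce the three formulas above. The following is a minimal NumPy sketch of that check, independent of Paddle; array size, seed, and finite-difference step are arbitrary choices.

import numpy as np

def tanh_double_grad(out, dout, ddx):
    # double-grad forward map: DDOut = (1 - Out^2) * DDx, DOutNew = -2 * Out * DOut * DDx
    ddout = (1.0 - out * out) * ddx
    dout_new = -2.0 * out * dout * ddx
    return dout_new, ddout

def tanh_triple_grad(out, dout, ddx, d_dout_new, d_ddout):
    # the closed forms used by TanhTripleGradFunctor above
    d_out_new = -2.0 * out * ddx * d_ddout - 2.0 * dout * ddx * d_dout_new
    d_dout = -2.0 * out * ddx * d_dout_new
    d_ddx = (1.0 - out * out) * d_ddout - 2.0 * out * dout * d_dout_new
    return d_out_new, d_dout, d_ddx

rng = np.random.default_rng(0)
out, dout, ddx, d_dout_new, d_ddout = (rng.standard_normal(16) for _ in range(5))

def numeric_vjp(idx, eps=1e-6):
    # central-difference VJP of tanh_double_grad w.r.t. its idx-th argument
    grad = np.zeros(16)
    for i in range(16):
        for sign in (1.0, -1.0):
            args = [out.copy(), dout.copy(), ddx.copy()]
            args[idx][i] += sign * eps
            a, b = tanh_double_grad(*args)
            grad[i] += sign * (a @ d_dout_new + b @ d_ddout)
    return grad / (2.0 * eps)

for i, analytic in enumerate(tanh_triple_grad(out, dout, ddx, d_dout_new, d_ddout)):
    assert np.allclose(analytic, numeric_vjp(i), atol=1e-4)

This mirrors what gradient_checker.triple_grad_check does in the unit test above, only without building a Paddle program.
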
.../framework/ir/paddle_to_cinn_pass_test.cc | 40 -- .../framework/paddle2cinn/CMakeLists.txt | 2 + .../framework/paddle2cinn/build_cinn_pass.cc | 293 ++++++++++++ .../framework/paddle2cinn/build_cinn_pass.h | 61 +++ .../paddle2cinn/build_cinn_pass_test.cc | 442 ++++++++++++++++++ .../test_parallel_executor_run_cinn.py | 2 +- 11 files changed, 802 insertions(+), 106 deletions(-) delete mode 100644 paddle/fluid/framework/ir/paddle_to_cinn_pass.cc delete mode 100644 paddle/fluid/framework/ir/paddle_to_cinn_pass.h delete mode 100644 paddle/fluid/framework/ir/paddle_to_cinn_pass_test.cc create mode 100644 paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc create mode 100644 paddle/fluid/framework/paddle2cinn/build_cinn_pass.h create mode 100644 paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index ad81b48847af9f..5e2fd08406fa75 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -139,7 +139,7 @@ set(IR_PASS_DEPS graph_viz_pass multi_devices_graph_pass coalesce_grad_tensor_pass fuse_all_reduce_op_pass backward_optimizer_op_deps_pass fuse_adam_op_pass fuse_sgd_op_pass fuse_momentum_op_pass sync_batch_norm_pass runtime_context_cache_pass graph_to_program_pass - paddle_to_cinn_pass fix_op_run_order_pass) + fix_op_run_order_pass build_cinn_pass) if(NOT APPLE AND NOT WIN32 AND (WITH_GPU OR WITH_ROCM)) set(IR_PASS_DEPS ${IR_PASS_DEPS} fusion_group_pass) endif() diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index a55b809055f3e7..6b6ee408331232 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -74,7 +74,7 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { // Note: This pass is used to enable cinn. 
if (FLAGS_use_cinn) { - AppendPass("paddle_to_cinn_pass"); + AppendPass("build_cinn_pass"); } SetCollectiveContext(); } @@ -486,6 +486,7 @@ USE_PASS(fuse_momentum_op_pass); USE_PASS(fuse_all_reduce_op_pass); USE_PASS(runtime_context_cache_pass); USE_PASS(add_reader_dependency_pass); +USE_PASS(build_cinn_pass); #ifdef PADDLE_WITH_MKLDNN USE_PASS(mkldnn_placement_pass); #endif diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index a2e9fc3a3d9ac5..904450b5b251ee 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -59,7 +59,6 @@ cc_library(placement_pass_base SRCS placement_pass_base.cc DEPS pass) cc_library(coalesce_grad_tensor_pass SRCS coalesce_grad_tensor_pass.cc DEPS graph graph_helper) pass_library(graph_to_program_pass base) -pass_library(paddle_to_cinn_pass base DEPS cinn_runner) pass_library(graph_viz_pass base) pass_library(lock_free_optimize_pass base DEPS string_helper) pass_library(fc_fuse_pass inference) @@ -144,7 +143,6 @@ cc_test(pass_test SRCS pass_test.cc DEPS graph pass graph_helper) cc_test(graph_test SRCS graph_test.cc DEPS graph graph_helper op_registry) cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_registry) cc_test(graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph_to_program_pass) -cc_test(paddle_to_cinn_pass_test SRCS paddle_to_cinn_pass_test.cc DEPS paddle_to_cinn_pass proto_desc) cc_test(cost_model_test SRCS cost_model_test.cc DEPS cost_model op_registry) cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS graph_pattern_detector) cc_test(test_op_compat_sensible_pass SRCS op_compat_sensible_pass_tester.cc DEPS op_compat_sensible_pass) diff --git a/paddle/fluid/framework/ir/paddle_to_cinn_pass.cc b/paddle/fluid/framework/ir/paddle_to_cinn_pass.cc deleted file mode 100644 index fbf2cfb8d41d6a..00000000000000 --- a/paddle/fluid/framework/ir/paddle_to_cinn_pass.cc +++ /dev/null @@ -1,31 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/ir/paddle_to_cinn_pass.h" - -#include "paddle/fluid/framework/paddle2cinn/cinn_runner.h" - -namespace paddle { -namespace framework { -namespace ir { - -void PaddleToCinnPass::ApplyImpl(ir::Graph* graph) const { - paddle2cinn::CinnRunner::GetInstance()->ReplaceWithCinn(graph); -} - -} // namespace ir -} // namespace framework -} // namespace paddle - -REGISTER_PASS(paddle_to_cinn_pass, paddle::framework::ir::PaddleToCinnPass); diff --git a/paddle/fluid/framework/ir/paddle_to_cinn_pass.h b/paddle/fluid/framework/ir/paddle_to_cinn_pass.h deleted file mode 100644 index f3b9bd21ebf9ca..00000000000000 --- a/paddle/fluid/framework/ir/paddle_to_cinn_pass.h +++ /dev/null @@ -1,30 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/fluid/framework/ir/pass.h" - -namespace paddle { -namespace framework { -namespace ir { - -class PaddleToCinnPass : public Pass { - protected: - void ApplyImpl(ir::Graph* graph) const override; -}; - -} // namespace ir -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/ir/paddle_to_cinn_pass_test.cc b/paddle/fluid/framework/ir/paddle_to_cinn_pass_test.cc deleted file mode 100644 index 49d2ce295f3852..00000000000000 --- a/paddle/fluid/framework/ir/paddle_to_cinn_pass_test.cc +++ /dev/null @@ -1,40 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/ir/paddle_to_cinn_pass.h" - -#include "gtest/gtest.h" - -#include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/program_desc.h" - -namespace paddle { -namespace framework { -namespace ir { - -TEST(PaddleToCinnPassTest, TodoTest) { - ProgramDesc program; - Graph graph(program); - - auto pass = paddle::framework::ir::PassRegistry::Instance().Get( - "paddle_to_cinn_pass"); - - pass->Apply(&graph); -} - -} // namespace ir -} // namespace framework -} // namespace paddle - -USE_PASS(paddle_to_cinn_pass); diff --git a/paddle/fluid/framework/paddle2cinn/CMakeLists.txt b/paddle/fluid/framework/paddle2cinn/CMakeLists.txt index 8621c7363a09f1..4a653332177272 100644 --- a/paddle/fluid/framework/paddle2cinn/CMakeLists.txt +++ b/paddle/fluid/framework/paddle2cinn/CMakeLists.txt @@ -1,7 +1,9 @@ cc_library(cinn_cache_key SRCS cinn_cache_key.cc DEPS boost graph graph_helper lod_tensor proto_desc) cc_library(cinn_compiled_object SRCS cinn_compiled_object.cc DEPS feed_fetch_method graph lod_tensor proto_desc) cc_library(cinn_runner SRCS cinn_runner.cc DEPS cinn_cache_key cinn_compiled_object feed_fetch_method graph lod_tensor scope) +cc_library(build_cinn_pass SRCS build_cinn_pass.cc DEPS pass subgraph_detector) cc_test(cinn_cache_key_test SRCS cinn_cache_key_test.cc DEPS cinn_cache_key) cc_test(cinn_runner_test SRCS cinn_runner_test.cc DEPS cinn_runner proto_desc) cc_test(cinn_compiled_object_test SRCS cinn_compiled_object_test.cc DEPS cinn_compiled_object) +cc_test(test_build_cinn_pass SRCS build_cinn_pass_test.cc DEPS build_cinn_pass) diff --git a/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc b/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc new file mode 100644 index 00000000000000..ffdbb46bd7c066 --- /dev/null +++ b/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc @@ -0,0 +1,293 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/paddle2cinn/build_cinn_pass.h" + +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/node.h" +#include "paddle/fluid/framework/ir/subgraph_detector.h" +// #include "cinn/frontend/op_mapper_registry.h" +// #include "cinn/frontend/op_mappers/use_op_mappers.h" + +// TODO(jiangcheng05): just for local compile, remove after +// paddle and CINN have been binded +// The APIs are the same as CINN: +// https://github.com/PaddlePaddle/CINN/blob/develop/cinn/utils/registry.h +namespace cinn { +namespace frontend { +class OpMapperRegistry { + public: + static OpMapperRegistry* Global() { + static OpMapperRegistry inst; + return &inst; + } + + inline const OpMapperRegistry* Find(const std::string& name) { + std::unordered_set fmap_ = {"mul", "add", "relu", "sigmoid", + "softmax"}; + auto p = fmap_.find(name); + if (p != fmap_.end()) { + return this; + } else { + return nullptr; + } + } +}; + +} // namespace frontend +} // namespace cinn + +namespace paddle { +namespace framework { +namespace paddle2cinn { + +using framework::ir::Graph; +using framework::ir::Node; + +using GraphNodeVec = std::vector; +using GraphNodeSet = std::unordered_set; + +// Create new subgraph with and op nodes are cluster nodes, and all +// var node are from internal nodes +std::unique_ptr CreateNewSubGraph( + const GraphNodeSet& cluster, const GraphNodeSet& cluster_internals) { + // Graph's constructor must has one parameter, and in our code, + // the ProgramDesc is useless, so here we pass a temporary object. + auto sub_graph = std::make_unique(framework::ProgramDesc()); + + std::unordered_map old_op2new_op; + for (auto* op : cluster) { + auto sub_node = sub_graph->CreateOpNode(op->Op()); + old_op2new_op[op] = sub_node; + } + + std::unordered_map old_var2new_var; + for (auto* var : cluster_internals) { + auto sub_node = sub_graph->CreateVarNode(var->Var()); + old_var2new_var[var] = sub_node; + } + + // the subgraph is independently, so here we only need link + // to the node in new subgraph, and discard the link to + // out-graph. + for (auto* op : cluster) { + for (auto* var : op->inputs) { + if (cluster_internals.count(var)) { + old_op2new_op[op]->inputs.emplace_back(old_var2new_var[var]); + } + } + for (auto* var : op->outputs) { + if (cluster_internals.count(var)) { + old_op2new_op[op]->outputs.emplace_back(old_var2new_var[var]); + } + } + } + + for (auto* var : cluster_internals) { + for (auto* op : var->inputs) { + if (cluster.count(op)) { + old_var2new_var[var]->inputs.emplace_back(old_op2new_op[op]); + } + } + for (auto* op : var->outputs) { + if (cluster.count(op)) { + old_var2new_var[var]->outputs.emplace_back(old_op2new_op[op]); + } + } + } + + return sub_graph; +} + +// This interface is used to classify all variables involved in a cluster into +// three types: inputs, outputs, and internals. 
+// Specially, the internal node is a node that only used by sub-graph, and +// out-graph should not using this node at all. +// inputs & outputs & internals == NULL +// inputs | outputs | internals == all graph node +void AnalyseClusterVariables(const GraphNodeSet& cluster, + GraphNodeSet* cluster_inputs, + GraphNodeSet* cluster_outputs, + GraphNodeSet* cluster_internals) { + // collecting all input and output of op + for (auto* op_node : cluster) { + for (auto* input_var_node : op_node->inputs) { + cluster_inputs->insert(input_var_node); + } + for (auto* output_var_node : op_node->outputs) { + cluster_outputs->insert(output_var_node); + } + } + // remove output node from cluster_inputs, + // and add cluster_internals node + for (auto* var_node : *cluster_outputs) { + if (cluster_inputs->count(var_node) > 0) { + // if a input node also exists in output list, remove + cluster_inputs->erase(var_node); + + // the internal node is must an output node of sub-graph, + // but not any input node of out-graph. + bool is_only_used_internal = true; + for (auto* next_op_node : var_node->outputs) { + is_only_used_internal &= (cluster.count(next_op_node) > 0); + } + if (is_only_used_internal) { + cluster_internals->insert(var_node); + } + } + } + + // if a output node also exists in input list, remove. + for (auto* var_node : *cluster_inputs) { + cluster_outputs->erase(var_node); + } + // if a output node also exists in internal list, remove. + for (auto* var_node : *cluster_internals) { + cluster_outputs->erase(var_node); + } +} + +Node* AddSpecialOpToGraph(Graph* graph, const GraphNodeSet& cluster_inputs, + const GraphNodeSet& cluster_outputs) { + // add special cinn op + framework::OpDesc special_op_desc; + special_op_desc.SetType(kCinnLaunchOp); + auto* special_op_node = graph->CreateOpNode(&special_op_desc); + special_op_node->inputs.assign(cluster_inputs.begin(), cluster_inputs.end()); + special_op_node->outputs.assign(cluster_outputs.begin(), + cluster_outputs.end()); + return special_op_node; +} + +void AddLinkToSpecialOp(Node* special_op_node, + const GraphNodeSet& cluster_inputs, + const GraphNodeSet& cluster_outputs) { + // add new link from cluster_inputs to special_op_node + for (auto* var_node : cluster_inputs) { + var_node->outputs.push_back(special_op_node); + } + + // add new link from special_op_node to cluster_outputs + for (auto* var_node : cluster_outputs) { + var_node->inputs.push_back(special_op_node); + } +} + +void RemoveLinkFromCluster(const GraphNodeSet& cluster, + const GraphNodeSet& cluster_inputs, + const GraphNodeSet& cluster_outputs) { + // remove all nodes in cluster + auto get_preserved_ops = [&cluster](const GraphNodeVec& ops) { + GraphNodeVec nodes; + for (auto* op_node : ops) { + if (cluster.find(op_node) == cluster.end()) { + nodes.emplace_back(op_node); + } + } + return nodes; + }; + + // removing useless link from cluster_inputs to cluster + for (auto* var_node : cluster_inputs) { + auto preserved_nodes = get_preserved_ops(var_node->outputs); + var_node->outputs.assign(preserved_nodes.begin(), preserved_nodes.end()); + } + + // removing useless link from cluster to cluster_outputs + for (auto* var_node : cluster_outputs) { + auto preserved_nodes = get_preserved_ops(var_node->inputs); + var_node->inputs.assign(preserved_nodes.begin(), preserved_nodes.end()); + } +} + +// Removing cluster node and internals node from Graph +void RemoveSubGraphFromGraph(const GraphNodeSet& cluster, + const GraphNodeSet& cluster_internals, + Graph* graph) { + for (auto* op_node : 
cluster) { + graph->RemoveNode(op_node); + } + for (auto* var_node : cluster_internals) { + graph->RemoveNode(var_node); + } +} + +// Replacing Cinn subgraph to a special op node, whose op_type is +// kCinnLaunchOp, and inputs ares cluster_inputs and outputs are +// cluster_outputs. +// Meanwhile, move all links of cluster to the special op. +void ReplaceSubGraphWithSpecialOpNode(const GraphNodeSet& cluster, + const GraphNodeSet& cluster_inputs, + const GraphNodeSet& cluster_outputs, + const GraphNodeSet& cluster_internals, + Graph* graph) { + // First, add the special op node whose name is "kCinnLaunchOp" into graph + auto special_op_node = + AddSpecialOpToGraph(graph, cluster_inputs, cluster_outputs); + // Second, remove all graph's links which are from or to cluster nodes + RemoveLinkFromCluster(cluster, cluster_inputs, cluster_outputs); + // Third, add new links from or to the the special op node + AddLinkToSpecialOp(special_op_node, cluster_inputs, cluster_outputs); + // Finally, remove the cinn sub graph from graph + RemoveSubGraphFromGraph(cluster, cluster_internals, graph); +} + +// Search all subgraphs which all op node supported by CINN, +// Here we using SubgraphDetector to detecte the subgraph that +// all of op node supported by CINN. We using OpMapperRegistry +// to check whether the op node supported by CINN. +void SearchAllSubgraphs(Graph* graph, + std::vector>* cinn_subgraphs) { + auto teller = [](const Node* node) { + return ::cinn::frontend::OpMapperRegistry::Global()->Find(node->Name()) != + nullptr; + }; + std::vector clusters = + framework::ir::SubgraphDetector(graph, teller)(); + + cinn_subgraphs->clear(); + for (const auto& node_vec : clusters) { + // classify var node to inputs, outputs, and internals. + GraphNodeSet cluster_set(node_vec.begin(), node_vec.end()); + + GraphNodeSet cluster_inputs, cluster_outputs, cluster_internals; + AnalyseClusterVariables(cluster_set, &cluster_inputs, &cluster_outputs, + &cluster_internals); + + cinn_subgraphs->emplace_back( + CreateNewSubGraph(cluster_set, cluster_internals)); + + // replacing subgraph to a new special op node + ReplaceSubGraphWithSpecialOpNode(cluster_set, cluster_inputs, + cluster_outputs, cluster_internals, graph); + } +} + +void BuildCinnPass::ApplyImpl(Graph* graph) const { + auto& cinn_subgraphs = + Get>>("cinn_subgraphs"); + SearchAllSubgraphs(graph, &cinn_subgraphs); +} + +} // namespace paddle2cinn +} // namespace framework +} // namespace paddle + +REGISTER_PASS(build_cinn_pass, paddle::framework::paddle2cinn::BuildCinnPass); diff --git a/paddle/fluid/framework/paddle2cinn/build_cinn_pass.h b/paddle/fluid/framework/paddle2cinn/build_cinn_pass.h new file mode 100644 index 00000000000000..e71160ba108ecf --- /dev/null +++ b/paddle/fluid/framework/paddle2cinn/build_cinn_pass.h @@ -0,0 +1,61 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace paddle2cinn { + +constexpr char kCinnLaunchOp[] = "CinnLaunchOp"; + +// A pass named BuildCinnPass, the function of this pass is: +// +// a) Detect the subgraphs that can be compiled by the CINN compiler. We call a +// detected subgraph a cluster, which is consisted of several op nodes. +// +// b) Call the CINN compiler to compile each original cluster and get the +// compiled cluster, which is consisted of several kCinnLaunchOp. +// +// c) Replace the original cluster with corresponding compiled cluster on the +// original graph. +// +// In this pass, some questions are handled with cautions: +// +// a) How to determine whether two op nodes can be divided into a cluster? +// Firstly, both op nodes should be compile supported. +// Secondly, there should be a direct path between the two op nodes through a +// var node. +// Thirdly, there should be no extral path between the two op nodes through +// unsupported op nodes. +// Lastly, if op nodes a and b can be divied into a cluster, op nodes b and c +// can be devided into a cluster, a and c can also be devided into a cluster. +// The implementation of cluster detection is enclosured in class +// SubGraphDetector. +// +// b) How to deal with the links between the var nodes in global graph and the +// op nodes in a cluster? +// We first add links between the var nodes in global graph and the op nodes in +// the compiled cluster, and then remove useless links between the var nodes in +// global graph and the op nodes in the original cluster. +class BuildCinnPass : public framework::ir::Pass { + protected: + void ApplyImpl(framework::ir::Graph* graph) const override; +}; + +} // namespace paddle2cinn +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc b/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc new file mode 100644 index 00000000000000..883d5c6fbfb391 --- /dev/null +++ b/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc @@ -0,0 +1,442 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
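
The variable classification described above (inputs, outputs, and internals partition every variable touched by a cluster, and an internal variable is one that is produced and consumed only inside the cluster) can be prototyped with a few lines of Python. This is a rough sketch of the rule in AnalyseClusterVariables over a toy op list; the op and variable names are invented for illustration and do not correspond to real graph nodes.

# Each op is (name, inputs, outputs); the whole toy graph goes into one cluster.
ops = [
    ("mul",  ["x", "w"],  ["t1"]),
    ("add",  ["t1", "b"], ["t2"]),
    ("relu", ["t2"],      ["y"]),
]
cluster = {"mul", "add", "relu"}

consumers = {}                                # var -> set of ops (whole graph) that read it
for name, ins, _ in ops:
    for v in ins:
        consumers.setdefault(v, set()).add(name)

inputs, outputs = set(), set()
for name, ins, outs in ops:
    if name in cluster:
        inputs.update(ins)
        outputs.update(outs)

internals = {v for v in inputs & outputs
             if consumers[v] <= cluster}      # produced and consumed only inside the cluster
inputs -= outputs                             # produced by the cluster itself, not a real input
outputs -= internals                          # internals never escape the subgraph

print(sorted(inputs))     # ['b', 'w', 'x']
print(sorted(outputs))    # ['y']
print(sorted(internals))  # ['t1', 't2']

The result matches the invariants stated in the pass: the three sets are pairwise disjoint and together cover every variable the cluster touches, which is exactly the shape the AllOpSupportCinn unit test below asserts on a real graph.
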
*/ + +#include "paddle/fluid/framework/paddle2cinn/build_cinn_pass.h" + +#include +#include + +#include "gtest/gtest.h" + +#include "paddle/fluid/framework/details/build_strategy.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/node.h" +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/var_desc.h" + +namespace paddle { +namespace framework { +namespace paddle2cinn { + +using framework::ir::Graph; +using framework::ir::Node; + +inline bool CheckNodeExisted(const std::unordered_set& nodes, + const std::string& op_name) { + return std::find_if(nodes.begin(), nodes.end(), [&op_name](const Node* node) { + return node->Name() == op_name; + }) != nodes.end(); +} + +inline int CountNode(const std::unordered_set& nodes, + const std::string& op_name) { + return std::count_if( + nodes.begin(), nodes.end(), + [&op_name](const Node* node) { return node->Name() == op_name; }); +} + +inline Node* GetNode(const std::unordered_set& nodes, + const std::string& op_name) { + return *std::find_if( + nodes.begin(), nodes.end(), + [&op_name](const Node* node) { return node->Name() == op_name; }); +} + +std::unique_ptr BuildNoCinnSubgraph() { + ProgramDesc prog; + auto g = std::make_unique(prog); + // var1 -- + // | --> fake1 --> var3 --> fake2 --> var4 + // var2 -- + OpDesc fake1_op; + fake1_op.SetType("fake1"); + OpDesc fake2_op; + fake2_op.SetType("fake2"); + + VarDesc var1("var1"); + VarDesc var2("var2"); + VarDesc var3("var3"); + VarDesc var4("var4"); + + ir::Node* fake1 = g->CreateOpNode(&fake1_op); + ir::Node* fake2 = g->CreateOpNode(&fake2_op); + + ir::Node* v1 = g->CreateVarNode(&var1); + ir::Node* v2 = g->CreateVarNode(&var2); + ir::Node* v3 = g->CreateVarNode(&var3); + ir::Node* v4 = g->CreateVarNode(&var4); + + // fill op node + fake1->inputs = {v1, v2}; + fake1->outputs = {v3}; + fake2->inputs = {v3}; + fake2->outputs = {v4}; + + // fill variable node + v1->outputs = {fake1}; + v2->outputs = {fake1}; + + v3->inputs = {fake1}; + v3->outputs = {fake2}; + + v4->inputs = {fake2}; + + return g; +} + +TEST(BuildCinnPassTest, NoCinnSubgraph) { + auto g = BuildNoCinnSubgraph(); + auto previous_nodes = g->Nodes(); + + auto pass = + paddle::framework::ir::PassRegistry::Instance().Get("build_cinn_pass"); + std::vector> cinn_subgraphs; + pass->SetNotOwned>>("cinn_subgraphs", + &cinn_subgraphs); + pass->Apply(g.get()); + + // After search, origin graph should no change + ASSERT_EQ(previous_nodes, g->Nodes()); + + // After search, there should one cinn subgraph + ASSERT_TRUE(cinn_subgraphs.empty()); +} + +std::unique_ptr BuildAllOpSupportCinnGraph() { + ProgramDesc prog; + auto g = std::make_unique(prog); + + // v1 -- + // | + // | --> mul --> v3 -- + // | | + // v2 -- | --> add --> v5 --> relu --> v6 + // | + // v4 -- + + OpDesc add_op; + add_op.SetType("add"); + OpDesc mul_op; + mul_op.SetType("mul"); + OpDesc relu_op; + relu_op.SetType("relu"); + + VarDesc var1("var1"); + VarDesc var2("var2"); + VarDesc var3("var3"); + VarDesc var4("var4"); + VarDesc var5("var5"); + VarDesc var6("var6"); + + ir::Node* add = g->CreateOpNode(&add_op); + ir::Node* mul = g->CreateOpNode(&mul_op); + ir::Node* relu = g->CreateOpNode(&relu_op); + + ir::Node* v1 = g->CreateVarNode(&var1); + ir::Node* v2 = g->CreateVarNode(&var2); + ir::Node* v3 = g->CreateVarNode(&var3); + ir::Node* v4 = g->CreateVarNode(&var4); + ir::Node* v5 = g->CreateVarNode(&var5); + ir::Node* v6 = g->CreateVarNode(&var6); + + // fill op node + 
mul->inputs = {v1, v2}; + mul->outputs = {v3}; + add->inputs = {v3, v4}; + add->outputs = {v5}; + relu->inputs = {v5}; + relu->outputs = {v6}; + + // fill variable node + v1->outputs = {mul}; + v2->outputs = {mul}; + + v3->inputs = {mul}; + v3->outputs = {add}; + + v4->outputs = {add}; + + v5->inputs = {add}; + v5->outputs = {relu}; + + v6->inputs = {relu}; + + return g; +} + +TEST(BuildCinnPassTest, AllOpSupportCinn) { + auto g = BuildAllOpSupportCinnGraph(); + + auto pass = + paddle::framework::ir::PassRegistry::Instance().Get("build_cinn_pass"); + std::vector> cinn_subgraphs; + pass->SetNotOwned>>("cinn_subgraphs", + &cinn_subgraphs); + pass->Apply(g.get()); + + // After search, the graph should as following + // v1 --| + // v2 --| --> kCinnLaunchOp --> v6 + // v4 --| + const auto& nodes = g->Nodes(); + ASSERT_EQ(nodes.size(), static_cast(5)); + + // A new op named kCinnLaunchOp should be added + ASSERT_TRUE(CheckNodeExisted(nodes, kCinnLaunchOp)); + auto* cinn_op = GetNode(nodes, kCinnLaunchOp); + auto* v1 = GetNode(nodes, "var1"); + auto* v2 = GetNode(nodes, "var2"); + auto* v4 = GetNode(nodes, "var4"); + auto* v6 = GetNode(nodes, "var6"); + + ASSERT_EQ( + std::unordered_set(cinn_op->inputs.begin(), cinn_op->inputs.end()), + std::unordered_set({v1, v2, v4})); + ASSERT_EQ(cinn_op->outputs, std::vector({v6})); + ASSERT_EQ(v1->outputs, std::vector({cinn_op})); + ASSERT_EQ(v6->inputs, std::vector({cinn_op})); + + // previous op (mul, add, relu) should all removed + ASSERT_FALSE(CheckNodeExisted(nodes, "mul")); + ASSERT_FALSE(CheckNodeExisted(nodes, "add")); + ASSERT_FALSE(CheckNodeExisted(nodes, "relu")); + + // After search, there should has just one cinn subgraph + // mul --> v3 --> add --> v5 --> relu + ASSERT_EQ(cinn_subgraphs.size(), static_cast(1)); + const auto& subgraph = cinn_subgraphs.back(); + + const auto& subnodes = subgraph->Nodes(); + ASSERT_EQ(subnodes.size(), static_cast(5)); + + ASSERT_TRUE(CheckNodeExisted(subnodes, "mul")); + ASSERT_TRUE(CheckNodeExisted(subnodes, "add")); + ASSERT_TRUE(CheckNodeExisted(subnodes, "relu")); +} + +std::unique_ptr BuildGraphWithOneCinnSubgraph() { + ProgramDesc prog; + auto g = std::make_unique(prog); + + // fake1 --> v1 -- + // | + // | --> mul --> v3 --> relu --> v4 --> fake2 + // | + // v2 -- + + OpDesc fake1_op; + fake1_op.SetType("fake1"); + OpDesc mul_op; + mul_op.SetType("mul"); + OpDesc relu_op; + relu_op.SetType("relu"); + OpDesc fake2_op; + fake2_op.SetType("fake2"); + + VarDesc var1("var1"); + VarDesc var2("var2"); + VarDesc var3("var3"); + VarDesc var4("var4"); + + ir::Node* fake1 = g->CreateOpNode(&fake1_op); + ir::Node* mul = g->CreateOpNode(&mul_op); + ir::Node* relu = g->CreateOpNode(&relu_op); + ir::Node* fake2 = g->CreateOpNode(&fake2_op); + + ir::Node* v1 = g->CreateVarNode(&var1); + ir::Node* v2 = g->CreateVarNode(&var2); + ir::Node* v3 = g->CreateVarNode(&var3); + ir::Node* v4 = g->CreateVarNode(&var4); + + // fill op node + fake1->outputs = {v1}; + mul->inputs = {v2, v1}; + mul->outputs = {v3}; + relu->inputs = {v3}; + relu->outputs = {v4}; + fake2->inputs = {v4}; + + // fill variable node + v2->outputs = {mul}; + + v1->inputs = {fake1}; + v1->outputs = {mul}; + + v3->inputs = {mul}; + v3->outputs = {relu}; + + v4->inputs = {relu}; + v4->outputs = {fake2}; + + return g; +} + +TEST(BuildCinnPassTest, OneCinnSubgraph) { + auto g = BuildGraphWithOneCinnSubgraph(); + + auto pass = + paddle::framework::ir::PassRegistry::Instance().Get("build_cinn_pass"); + std::vector> cinn_subgraphs; + 
pass->SetNotOwned>>("cinn_subgraphs", + &cinn_subgraphs); + pass->Apply(g.get()); + + // After search, the graph should as following + // fake1 --> v1 -- + // | --> kCinnLaunchOp --> v4 --> fake2 + // v2 -- + const auto& nodes = g->Nodes(); + ASSERT_EQ(nodes.size(), static_cast(6)); + + // A new op named kCinnLaunchOp should be added + ASSERT_TRUE(CheckNodeExisted(nodes, kCinnLaunchOp)); + + // previous op (mul, add, relu) should be removed + ASSERT_FALSE(CheckNodeExisted(nodes, "mul")); + ASSERT_FALSE(CheckNodeExisted(nodes, "relu")); + + // previous op (fake1, fake2) should be preserved + ASSERT_TRUE(CheckNodeExisted(nodes, "fake1")); + ASSERT_TRUE(CheckNodeExisted(nodes, "fake2")); + + // After search, there should has just one cinn subgraph + // mul --> v3 --> relu + ASSERT_EQ(cinn_subgraphs.size(), static_cast(1)); + const auto& subgraph = cinn_subgraphs.back(); + + const auto& subnodes = subgraph->Nodes(); + ASSERT_EQ(subnodes.size(), static_cast(3)); + + ASSERT_TRUE(CheckNodeExisted(subnodes, "mul")); + ASSERT_TRUE(CheckNodeExisted(subnodes, "relu")); +} + +std::unique_ptr BuildGraphWithMultiCinnSubgraph() { + ProgramDesc prog; + auto g = std::make_unique(prog); + + // fake1 --> v1 -- + // | + // | --> mul --> v3 --> fake2 --> v4 --> relu --> v5 --> fake3 + // | + // v2 -- + + OpDesc fake1_op; + fake1_op.SetType("fake1"); + OpDesc mul_op; + mul_op.SetType("mul"); + OpDesc relu_op; + relu_op.SetType("relu"); + OpDesc fake2_op; + fake2_op.SetType("fake2"); + OpDesc fake3_op; + fake3_op.SetType("fake3"); + + VarDesc var1("var1"); + VarDesc var2("var2"); + VarDesc var3("var3"); + VarDesc var4("var4"); + VarDesc var5("var5"); + + ir::Node* fake1 = g->CreateOpNode(&fake1_op); + ir::Node* mul = g->CreateOpNode(&mul_op); + ir::Node* relu = g->CreateOpNode(&relu_op); + ir::Node* fake2 = g->CreateOpNode(&fake2_op); + ir::Node* fake3 = g->CreateOpNode(&fake3_op); + + ir::Node* v1 = g->CreateVarNode(&var1); + ir::Node* v2 = g->CreateVarNode(&var2); + ir::Node* v3 = g->CreateVarNode(&var3); + ir::Node* v4 = g->CreateVarNode(&var4); + ir::Node* v5 = g->CreateVarNode(&var5); + + // fill op node + fake1->outputs = {v1}; + mul->inputs = {v2, v1}; + mul->outputs = {v3}; + fake2->inputs = {v3}; + fake2->outputs = {v4}; + relu->inputs = {v4}; + relu->outputs = {v5}; + fake3->inputs = {v5}; + + // fill variable node + v2->outputs = {mul}; + + v1->inputs = {fake1}; + v1->outputs = {mul}; + + v3->inputs = {mul}; + v3->outputs = {fake2}; + + v4->inputs = {fake2}; + v4->outputs = {relu}; + + v5->inputs = {relu}; + v5->outputs = {fake3}; + + return g; +} + +TEST(BuildCinnPassTest, MultiCinnSubgraph) { + auto g = BuildGraphWithMultiCinnSubgraph(); + + auto pass = + paddle::framework::ir::PassRegistry::Instance().Get("build_cinn_pass"); + std::vector> cinn_subgraphs; + pass->SetNotOwned>>("cinn_subgraphs", + &cinn_subgraphs); + pass->Apply(g.get()); + + // After search, the graph should as following + // fake1 -> v1 - + // | -> CinnOp -> v3 -> fake2 -> v4 -> CinnOp ->v5 -> fake3 + // v2 - + const auto& nodes = g->Nodes(); + ASSERT_EQ(nodes.size(), static_cast(10)); + + // A new op named kCinnLaunchOp should be added + ASSERT_TRUE(CheckNodeExisted(nodes, kCinnLaunchOp)); + ASSERT_EQ(CountNode(nodes, kCinnLaunchOp), 2); + + // previous op (mul, add, relu) should be removed + ASSERT_FALSE(CheckNodeExisted(nodes, "mul")); + ASSERT_FALSE(CheckNodeExisted(nodes, "relu")); + + // previous op (fake1, fake2) should be preserved + ASSERT_TRUE(CheckNodeExisted(nodes, "fake1")); + ASSERT_TRUE(CheckNodeExisted(nodes, 
"fake2")); + ASSERT_TRUE(CheckNodeExisted(nodes, "fake3")); + + // After search, there should has two cinn subgraphs, + // and each of subgraphs just has one node. + ASSERT_EQ(cinn_subgraphs.size(), static_cast(2)); + + // subgraph1: relu + const auto& subgraph1 = cinn_subgraphs[0]; + const auto& subnodes1 = subgraph1->Nodes(); + ASSERT_EQ(subnodes1.size(), static_cast(1)); + + // subgraph2: mul + const auto& subgraph2 = cinn_subgraphs[1]; + const auto& subnodes2 = subgraph2->Nodes(); + ASSERT_EQ(subnodes2.size(), static_cast(1)); +} + +} // namespace paddle2cinn +} // namespace framework +} // namespace paddle + +USE_PASS(build_cinn_pass); diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_run_cinn.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_run_cinn.py index e8b1d838261f45..d4722c2e1819f9 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_run_cinn.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_run_cinn.py @@ -23,7 +23,7 @@ class TestParallelExecutorRunCinn(unittest.TestCase): def test_run_from_cinn(self): - paddle.set_flags({'FLAGS_use_cinn': True}) + paddle.set_flags({'FLAGS_use_cinn': False}) main_program = paddle.static.Program() startup_program = paddle.static.Program() From f45e6cf6f476b25b52c194120401b920e8675785 Mon Sep 17 00:00:00 2001 From: Feiyu Chan Date: Fri, 15 Oct 2021 12:46:24 +0800 Subject: [PATCH 174/298] dynamic load mkl as a fft backend when it is avaialble and requested (#36414) --- paddle/fluid/operators/CMakeLists.txt | 15 ++- paddle/fluid/operators/spectral_op.cc | 113 +++++++++--------- paddle/fluid/platform/dynload/CMakeLists.txt | 6 + .../fluid/platform/dynload/dynamic_loader.cc | 16 +++ .../fluid/platform/dynload/dynamic_loader.h | 1 + paddle/fluid/platform/dynload/mklrt.cc | 51 ++++++++ paddle/fluid/platform/dynload/mklrt.h | 80 +++++++++++++ 7 files changed, 221 insertions(+), 61 deletions(-) create mode 100644 paddle/fluid/platform/dynload/mklrt.cc create mode 100644 paddle/fluid/platform/dynload/mklrt.h diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index b910b4ec73901b..bb31fcf854d88f 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -102,10 +102,21 @@ else() op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale) endif() + if (WITH_GPU AND (NOT WITH_ROCM)) - op_library(spectral_op SRCS spectral_op.cc spectral_op.cu DEPS dynload_cuda ${OP_HEADER_DEPS}) + if (MKL_FOUND AND WITH_ONEMKL) + op_library(spectral_op SRCS spectral_op.cc spectral_op.cu DEPS dynload_cuda dynload_mklrt ${OP_HEADER_DEPS}) + target_include_directories(spectral_op PRIVATE ${MKL_INCLUDE}) + else() + op_library(spectral_op SRCS spectral_op.cc spectral_op.cu DEPS dynload_cuda ${OP_HEADER_DEPS}) + endif() else() - op_library(spectral_op SRCS spectral_op.cc DEPS ${OP_HEADER_DEPS}) + if (MKL_FOUND AND WITH_ONEMKL) + op_library(spectral_op SRCS spectral_op.cc DEPS dynload_mklrt ${OP_HEADER_DEPS}) + target_include_directories(spectral_op PRIVATE ${MKL_INCLUDE}) + else() + op_library(spectral_op SRCS spectral_op.cc DEPS ${OP_HEADER_DEPS}) + endif() endif() op_library(lstm_op DEPS ${OP_HEADER_DEPS} lstm_compute) diff --git a/paddle/fluid/operators/spectral_op.cc b/paddle/fluid/operators/spectral_op.cc index fb50702233b3ba..b5edc1dda533b0 100644 --- a/paddle/fluid/operators/spectral_op.cc +++ b/paddle/fluid/operators/spectral_op.cc @@ -27,7 +27,7 @@ #include "paddle/fluid/platform/complex.h" #if 
defined(PADDLE_WITH_ONEMKL) -#include +#include "paddle/fluid/platform/dynload/mklrt.h" #elif defined(PADDLE_WITH_POCKETFFT) #include "extern_pocketfft/pocketfft_hdronly.h" #endif @@ -357,46 +357,45 @@ FFTNormMode get_norm_from_string(const std::string& norm, bool forward) { // FFT Functors #if defined(PADDLE_WITH_ONEMKL) +#define MKL_DFTI_CHECK(expr) \ + do { \ + MKL_LONG status = (expr); \ + if (!platform::dynload::DftiErrorClass(status, DFTI_NO_ERROR)) \ + PADDLE_THROW(platform::errors::External( \ + platform::dynload::DftiErrorMessage(status))); \ + } while (0); + namespace { -static inline void MKL_DFTI_CHECK(MKL_INT status) { - if (status && !DftiErrorClass(status, DFTI_NO_ERROR)) { - PADDLE_THROW(platform::errors::External(DftiErrorMessage(status))); - } -} struct DftiDescriptorDeleter { void operator()(DFTI_DESCRIPTOR_HANDLE handle) { if (handle != nullptr) { - MKL_DFTI_CHECK(DftiFreeDescriptor(&handle)); + MKL_DFTI_CHECK(platform::dynload::DftiFreeDescriptor(&handle)); } } }; +// A RAII wrapper for MKL_DESCRIPTOR* class DftiDescriptor { public: void init(DFTI_CONFIG_VALUE precision, DFTI_CONFIG_VALUE signal_type, MKL_LONG signal_ndim, MKL_LONG* sizes) { - if (desc_ != nullptr) { - PADDLE_THROW(platform::errors::AlreadyExists( - "DFT DESCRIPTOR can only be initialized once.")); - } + PADDLE_ENFORCE_EQ(desc_.get(), nullptr, + platform::errors::AlreadyExists( + "DftiDescriptor has already been initialized.")); + DFTI_DESCRIPTOR* raw_desc; - if (signal_ndim == 1) { - MKL_DFTI_CHECK( - DftiCreateDescriptor(&raw_desc, precision, signal_type, 1, sizes[0])); - } else { - MKL_DFTI_CHECK(DftiCreateDescriptor(&raw_desc, precision, signal_type, - signal_ndim, sizes)); - } + MKL_DFTI_CHECK(platform::dynload::DftiCreateDescriptorX( + &raw_desc, precision, signal_type, signal_ndim, sizes)); desc_.reset(raw_desc); } DFTI_DESCRIPTOR* get() const { - if (desc_ == nullptr) { - PADDLE_THROW(platform::errors::PreconditionNotMet( - "DFTI DESCRIPTOR has not been initialized.")); - } - return desc_.get(); + DFTI_DESCRIPTOR* raw_desc = desc_.get(); + PADDLE_ENFORCE_NOT_NULL(raw_desc, + platform::errors::PreconditionNotMet( + "DFTI DESCRIPTOR has not been initialized.")); + return raw_desc; } private: @@ -421,7 +420,9 @@ DftiDescriptor _plan_mkl_fft(const framework::proto::VarType::Type& in_dtype, return DFTI_DOUBLE; default: PADDLE_THROW(platform::errors::InvalidArgument( - "Input data type should be FP32, FP64, COMPLEX64 or COMPLEX128.")); + "Invalid input datatype (%s), input data type should be FP32, " + "FP64, COMPLEX64 or COMPLEX128.", + framework::DataTypeToString(in_dtype))); } }(); @@ -430,35 +431,27 @@ DftiDescriptor _plan_mkl_fft(const framework::proto::VarType::Type& in_dtype, const DFTI_CONFIG_VALUE domain = (fft_type == FFTTransformType::C2C) ? DFTI_COMPLEX : DFTI_REAL; - // const bool complex_input = framework::IsComplexType(in_dtype); - // const bool complex_output = framework::IsComplexType(out_dtype); - // const DFTI_CONFIG_VALUE domain = [&] { - // if (forward) { - // return complex_input ? DFTI_COMPLEX : DFTI_REAL; - // } else { - // return complex_output ? 
DFTI_COMPLEX : DFTI_REAL; - // } - // }(); - DftiDescriptor descriptor; std::vector fft_sizes(signal_sizes.cbegin(), signal_sizes.cend()); const MKL_LONG signal_ndim = fft_sizes.size() - 1; descriptor.init(precision, domain, signal_ndim, fft_sizes.data() + 1); // placement inplace or not inplace - MKL_DFTI_CHECK( - DftiSetValue(descriptor.get(), DFTI_PLACEMENT, DFTI_NOT_INPLACE)); + MKL_DFTI_CHECK(platform::dynload::DftiSetValue( + descriptor.get(), DFTI_PLACEMENT, DFTI_NOT_INPLACE)); // number of transformations const MKL_LONG batch_size = fft_sizes[0]; - MKL_DFTI_CHECK( - DftiSetValue(descriptor.get(), DFTI_NUMBER_OF_TRANSFORMS, batch_size)); + MKL_DFTI_CHECK(platform::dynload::DftiSetValue( + descriptor.get(), DFTI_NUMBER_OF_TRANSFORMS, batch_size)); // input & output distance const MKL_LONG idist = in_strides[0]; const MKL_LONG odist = out_strides[0]; - MKL_DFTI_CHECK(DftiSetValue(descriptor.get(), DFTI_INPUT_DISTANCE, idist)); - MKL_DFTI_CHECK(DftiSetValue(descriptor.get(), DFTI_OUTPUT_DISTANCE, odist)); + MKL_DFTI_CHECK(platform::dynload::DftiSetValue(descriptor.get(), + DFTI_INPUT_DISTANCE, idist)); + MKL_DFTI_CHECK(platform::dynload::DftiSetValue(descriptor.get(), + DFTI_OUTPUT_DISTANCE, odist)); // input & output stride std::vector mkl_in_stride(1 + signal_ndim, 0); @@ -467,15 +460,15 @@ DftiDescriptor _plan_mkl_fft(const framework::proto::VarType::Type& in_dtype, mkl_in_stride[i] = in_strides[i]; mkl_out_stride[i] = out_strides[i]; } - MKL_DFTI_CHECK( - DftiSetValue(descriptor.get(), DFTI_INPUT_STRIDES, mkl_in_stride.data())); - MKL_DFTI_CHECK(DftiSetValue(descriptor.get(), DFTI_OUTPUT_STRIDES, - mkl_out_stride.data())); + MKL_DFTI_CHECK(platform::dynload::DftiSetValue( + descriptor.get(), DFTI_INPUT_STRIDES, mkl_in_stride.data())); + MKL_DFTI_CHECK(platform::dynload::DftiSetValue( + descriptor.get(), DFTI_OUTPUT_STRIDES, mkl_out_stride.data())); // conjugate even storage if (!(fft_type == FFTTransformType::C2C)) { - MKL_DFTI_CHECK(DftiSetValue(descriptor.get(), DFTI_CONJUGATE_EVEN_STORAGE, - DFTI_COMPLEX_COMPLEX)); + MKL_DFTI_CHECK(platform::dynload::DftiSetValue( + descriptor.get(), DFTI_CONJUGATE_EVEN_STORAGE, DFTI_COMPLEX_COMPLEX)); } MKL_LONG signal_numel = @@ -496,11 +489,12 @@ DftiDescriptor _plan_mkl_fft(const framework::proto::VarType::Type& in_dtype, return DFTI_BACKWARD_SCALE; } }(); - MKL_DFTI_CHECK(DftiSetValue(descriptor.get(), scale_direction, scale)); + MKL_DFTI_CHECK(platform::dynload::DftiSetValue(descriptor.get(), + scale_direction, scale)); } // commit the descriptor - MKL_DFTI_CHECK(DftiCommitDescriptor(descriptor.get())); + MKL_DFTI_CHECK(platform::dynload::DftiCommitDescriptor(descriptor.get())); return descriptor; } @@ -592,15 +586,16 @@ void exec_fft(const DeviceContext& ctx, const Tensor* x, Tensor* out, collapsed_input.numel(), collapsed_input_conj.data()); for_range(functor); - MKL_DFTI_CHECK(DftiComputeBackward(desc.get(), - collapsed_input_conj.data(), - collapsed_output.data())); + MKL_DFTI_CHECK(platform::dynload::DftiComputeBackward( + desc.get(), collapsed_input_conj.data(), + collapsed_output.data())); } else if (fft_type == FFTTransformType::R2C && !forward) { framework::Tensor collapsed_output_conj(collapsed_output.type()); collapsed_output_conj.mutable_data(collapsed_output.dims(), ctx.GetPlace()); - MKL_DFTI_CHECK(DftiComputeForward(desc.get(), collapsed_input.data(), - collapsed_output_conj.data())); + MKL_DFTI_CHECK(platform::dynload::DftiComputeForward( + desc.get(), collapsed_input.data(), + collapsed_output_conj.data())); // conjugate 
the output platform::ForRange for_range(ctx, collapsed_output.numel()); math::ConjFunctor functor(collapsed_output_conj.data(), @@ -609,13 +604,13 @@ void exec_fft(const DeviceContext& ctx, const Tensor* x, Tensor* out, for_range(functor); } else { if (forward) { - MKL_DFTI_CHECK(DftiComputeForward(desc.get(), - collapsed_input.data(), - collapsed_output.data())); + MKL_DFTI_CHECK(platform::dynload::DftiComputeForward( + desc.get(), collapsed_input.data(), + collapsed_output.data())); } else { - MKL_DFTI_CHECK(DftiComputeBackward(desc.get(), - collapsed_input.data(), - collapsed_output.data())); + MKL_DFTI_CHECK(platform::dynload::DftiComputeBackward( + desc.get(), collapsed_input.data(), + collapsed_output.data())); } } diff --git a/paddle/fluid/platform/dynload/CMakeLists.txt b/paddle/fluid/platform/dynload/CMakeLists.txt index c0d4b349a9e09b..8c64aad46cfc80 100644 --- a/paddle/fluid/platform/dynload/CMakeLists.txt +++ b/paddle/fluid/platform/dynload/CMakeLists.txt @@ -49,3 +49,9 @@ endif() cc_library(dynload_lapack SRCS lapack.cc DEPS dynamic_loader) add_dependencies(dynload_lapack extern_lapack) # TODO(TJ): add iomp, mkldnn? + +if (MKL_FOUND AND WITH_ONEMKL) + message("ONEMKL INCLUDE directory is ${MKL_INCLUDE}") + cc_library(dynload_mklrt SRCS mklrt.cc DEPS dynamic_loader) + target_include_directories(dynload_mklrt PRIVATE ${MKL_INCLUDE}) +endif() diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc index a83f085f7d2d81..0c5c47e38f85ef 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.cc +++ b/paddle/fluid/platform/dynload/dynamic_loader.cc @@ -53,6 +53,12 @@ DEFINE_string(mklml_dir, "", "Specify path for loading libmklml_intel.so."); DEFINE_string(lapack_dir, "", "Specify path for loading liblapack.so."); +DEFINE_string(mkl_dir, "", + "Specify path for loading libmkl_rt.so. " + "For insrance, /opt/intel/oneapi/mkl/latest/lib/intel64/." + "If default, " + "dlopen will search mkl from LD_LIBRARY_PATH"); + DEFINE_string(op_dir, "", "Specify path for loading user-defined op library."); #ifdef PADDLE_WITH_HIP @@ -518,6 +524,16 @@ void* GetCUFFTDsoHandle() { #endif } +void* GetMKLRTDsoHandle() { +#if defined(__APPLE__) || defined(__OSX__) + return GetDsoHandleFromSearchPath(FLAGS_mkl_dir, "libmkl_rt.dylib"); +#elif defined(_WIN32) + return GetDsoHandleFromSearchPath(FLAGS_mkl_dir, "mkl_rt.dll"); +#else + return GetDsoHandleFromSearchPath(FLAGS_mkl_dir, "libmkl_rt.so"); +#endif +} + } // namespace dynload } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/dynload/dynamic_loader.h b/paddle/fluid/platform/dynload/dynamic_loader.h index 82c36d9e224f4e..6260efdf71c590 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.h +++ b/paddle/fluid/platform/dynload/dynamic_loader.h @@ -43,6 +43,7 @@ void* GetLAPACKDsoHandle(); void* GetOpDsoHandle(const std::string& dso_name); void* GetNvtxDsoHandle(); void* GetCUFFTDsoHandle(); +void* GetMKLRTDsoHandle(); void SetPaddleLibPath(const std::string&); } // namespace dynload diff --git a/paddle/fluid/platform/dynload/mklrt.cc b/paddle/fluid/platform/dynload/mklrt.cc new file mode 100644 index 00000000000000..45fad15fb583ed --- /dev/null +++ b/paddle/fluid/platform/dynload/mklrt.cc @@ -0,0 +1,51 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
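
The conjugate-input / conjugate-output handling around DftiComputeForward and DftiComputeBackward in exec_fft above leans on the usual DFT symmetry conj(F(conj(x))) = N * F^{-1}(x), which lets one MKL plan serve the opposite transform direction. A quick NumPy check of that identity, purely for illustration and independent of the MKL code path:

import numpy as np

rng = np.random.default_rng(0)
x = rng.standard_normal(8) + 1j * rng.standard_normal(8)   # any complex signal
n = x.size

lhs = np.conj(np.fft.fft(np.conj(x)))   # conjugate in, run the forward transform, conjugate out
rhs = n * np.fft.ifft(x)                # the unnormalized inverse transform
assert np.allclose(lhs, rhs)
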
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/dynload/mklrt.h" + +namespace paddle { +namespace platform { +namespace dynload { + +std::once_flag mklrt_dso_flag; +void* mklrt_dso_handle = nullptr; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +MKLDFTI_ROUTINE_EACH(DEFINE_WRAP); + +DFTI_EXTERN MKL_LONG DftiCreateDescriptorX(DFTI_DESCRIPTOR_HANDLE* desc, + enum DFTI_CONFIG_VALUE prec, + enum DFTI_CONFIG_VALUE domain, + MKL_LONG dim, MKL_LONG* sizes) { + if (prec == DFTI_SINGLE) { + if (dim == 1) { + return DftiCreateDescriptor_s_1d(desc, domain, sizes[0]); + } else { + return DftiCreateDescriptor_s_md(desc, domain, dim, sizes); + } + } else if (prec == DFTI_DOUBLE) { + if (dim == 1) { + return DftiCreateDescriptor_d_1d(desc, domain, sizes[0]); + } else { + return DftiCreateDescriptor_d_md(desc, domain, dim, sizes); + } + } else { + return DftiCreateDescriptor(desc, prec, domain, dim, sizes); + } +} + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/mklrt.h b/paddle/fluid/platform/dynload/mklrt.h new file mode 100644 index 00000000000000..423cd4d0a254c8 --- /dev/null +++ b/paddle/fluid/platform/dynload/mklrt.h @@ -0,0 +1,80 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include // NOLINT + +#include "paddle/fluid/platform/dynload/dynamic_loader.h" +#include "paddle/fluid/platform/port.h" + +namespace paddle { +namespace platform { +namespace dynload { + +extern std::once_flag mklrt_dso_flag; +extern void* mklrt_dso_handle; + +/** + * The following macro definition can generate structs + * (for each function) to dynamic load mkldfti routine + * via operator overloading. + */ +#define DYNAMIC_LOAD_MKLRT_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) 
{ \ + using mklrtFunc = decltype(&::__name); \ + std::call_once(mklrt_dso_flag, []() { \ + mklrt_dso_handle = paddle::platform::dynload::GetMKLRTDsoHandle(); \ + }); \ + static void* p_##__name = dlsym(mklrt_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name + +// mkl_dfti.h has a macro that shadows the function with the same name +// un-defeine this macro so as to export that function +#undef DftiCreateDescriptor + +#define MKLDFTI_ROUTINE_EACH(__macro) \ + __macro(DftiCreateDescriptor); \ + __macro(DftiCreateDescriptor_s_1d); \ + __macro(DftiCreateDescriptor_d_1d); \ + __macro(DftiCreateDescriptor_s_md); \ + __macro(DftiCreateDescriptor_d_md); \ + __macro(DftiSetValue); \ + __macro(DftiGetValue); \ + __macro(DftiCommitDescriptor); \ + __macro(DftiComputeForward); \ + __macro(DftiComputeBackward); \ + __macro(DftiFreeDescriptor); \ + __macro(DftiErrorClass); \ + __macro(DftiErrorMessage); + +MKLDFTI_ROUTINE_EACH(DYNAMIC_LOAD_MKLRT_WRAP) + +#undef DYNAMIC_LOAD_MKLRT_WRAP + +// define another function to avoid naming conflict +DFTI_EXTERN MKL_LONG DftiCreateDescriptorX(DFTI_DESCRIPTOR_HANDLE* desc, + enum DFTI_CONFIG_VALUE prec, + enum DFTI_CONFIG_VALUE domain, + MKL_LONG dim, MKL_LONG* sizes); + +} // namespace dynload +} // namespace platform +} // namespace paddle From 37257d6a8584b437db36f20c43109b1950474ded Mon Sep 17 00:00:00 2001 From: 0x45f <23097963+0x45f@users.noreply.github.com> Date: Fri, 15 Oct 2021 13:51:52 +0800 Subject: [PATCH 175/298] fix no_grad context error in train mode when using save/load (#36434) * fix no_grad context error in train mode when using save/load * change net to train mode in test case --- python/paddle/fluid/dygraph/io.py | 8 ++++++++ .../fluid/tests/unittests/test_io_save_load.py | 18 ++++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/python/paddle/fluid/dygraph/io.py b/python/paddle/fluid/dygraph/io.py index 2318a08462d5d5..75a27f256962c9 100644 --- a/python/paddle/fluid/dygraph/io.py +++ b/python/paddle/fluid/dygraph/io.py @@ -844,6 +844,8 @@ def _run_dygraph(instance, input, program_holder): continue persistable_var._set_grad_type(grad_var.type()) + drop_scope_if_no_grad(instance, tmp_scope_vec) + # 3. 
prepare output, keep same form with inputs outs = output_vars if len(output_vars) == 1: @@ -851,6 +853,12 @@ def _run_dygraph(instance, input, program_holder): return outs +def drop_scope_if_no_grad(instance, scope_vec): + tracer = framework._dygraph_tracer() + if (not instance._is_test) and (not tracer._has_grad): + scope_vec.value().get_scope().drop_kids() + + def _run_static_graph(input, program_holder, trace_program): main_program = framework.default_main_program() param_var_names = _get_persistable_var_names(trace_program) diff --git a/python/paddle/fluid/tests/unittests/test_io_save_load.py b/python/paddle/fluid/tests/unittests/test_io_save_load.py index c532c1bdbaa051..89ca28510b9b92 100644 --- a/python/paddle/fluid/tests/unittests/test_io_save_load.py +++ b/python/paddle/fluid/tests/unittests/test_io_save_load.py @@ -15,6 +15,7 @@ from __future__ import print_function import unittest +import paddle import paddle.fluid as fluid from paddle.fluid import core @@ -69,5 +70,22 @@ def test_useless_feeded_var_names(self): main_program=main_prog) +class TestWhenTrainWithNoGrad(unittest.TestCase): + def test_when_train_with_no_grad(self): + paddle.disable_static() + net = paddle.nn.Linear(1024, 1) + net = paddle.jit.to_static(net) + x = paddle.rand([1024], 'float32') + net(x) + save_path = './train_with_no_grad' + paddle.jit.save(net, save_path) + net = paddle.jit.load(save_path) + net.train() + + with paddle.no_grad(): + x = paddle.rand([1024], 'float32') + net(x) + + if __name__ == '__main__': unittest.main() From 277c9a5552ca3c58aca4ab76db22ed4a9c7ead1a Mon Sep 17 00:00:00 2001 From: Nyakku Shigure Date: Fri, 15 Oct 2021 14:06:13 +0800 Subject: [PATCH 176/298] add resnext (#36070) * add resnext model * add zh docs * add unittest * test performance Co-authored-by: Ainavo Co-authored-by: pithygit Co-authored-by: Ainavo Co-authored-by: pithygit --- python/paddle/tests/test_pretrained_model.py | 3 +- python/paddle/tests/test_vision_models.py | 18 + python/paddle/vision/__init__.py | 7 + python/paddle/vision/models/__init__.py | 16 +- python/paddle/vision/models/resnext.py | 364 +++++++++++++++++++ 5 files changed, 406 insertions(+), 2 deletions(-) create mode 100644 python/paddle/vision/models/resnext.py diff --git a/python/paddle/tests/test_pretrained_model.py b/python/paddle/tests/test_pretrained_model.py index fba1435c75e9c2..ac2b1194dd8b11 100644 --- a/python/paddle/tests/test_pretrained_model.py +++ b/python/paddle/tests/test_pretrained_model.py @@ -53,7 +53,8 @@ def infer(self, arch): def test_models(self): arches = [ - 'mobilenet_v1', 'mobilenet_v2', 'resnet18', 'vgg16', 'alexnet' + 'mobilenet_v1', 'mobilenet_v2', 'resnet18', 'vgg16', 'alexnet', + 'resnext50_32x4d' ] for arch in arches: self.infer(arch) diff --git a/python/paddle/tests/test_vision_models.py b/python/paddle/tests/test_vision_models.py index ea42c22e289ede..9ef81655085071 100644 --- a/python/paddle/tests/test_vision_models.py +++ b/python/paddle/tests/test_vision_models.py @@ -73,6 +73,24 @@ def test_resnet152(self): def test_alexnet(self): self.models_infer('alexnet') + def test_resnext50_32x4d(self): + self.models_infer('resnext50_32x4d') + + def test_resnext50_64x4d(self): + self.models_infer('resnext50_64x4d') + + def test_resnext101_32x4d(self): + self.models_infer('resnext101_32x4d') + + def test_resnext101_64x4d(self): + self.models_infer('resnext101_64x4d') + + def test_resnext152_32x4d(self): + self.models_infer('resnext152_32x4d') + + def test_resnext152_64x4d(self): + self.models_infer('resnext152_64x4d') + 
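+    # The six ResNeXt cases above reuse the same models_infer helper as the
+    # existing architectures, covering depths 50/101/152 at cardinality 32
+    # and 64.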
def test_vgg16_num_classes(self): vgg16 = models.__dict__['vgg16'](pretrained=False, num_classes=10) diff --git a/python/paddle/vision/__init__.py b/python/paddle/vision/__init__.py index b8ac548a966636..3ea4f5cd2d4de2 100644 --- a/python/paddle/vision/__init__.py +++ b/python/paddle/vision/__init__.py @@ -46,6 +46,13 @@ from .models import LeNet # noqa: F401 from .models import AlexNet # noqa: F401 from .models import alexnet # noqa: F401 +from .models import ResNeXt # noqa: F401 +from .models import resnext50_32x4d # noqa: F401 +from .models import resnext50_64x4d # noqa: F401 +from .models import resnext101_32x4d # noqa: F401 +from .models import resnext101_64x4d # noqa: F401 +from .models import resnext152_32x4d # noqa: F401 +from .models import resnext152_64x4d # noqa: F401 from .transforms import BaseTransform # noqa: F401 from .transforms import Compose # noqa: F401 from .transforms import Resize # noqa: F401 diff --git a/python/paddle/vision/models/__init__.py b/python/paddle/vision/models/__init__.py index b85333614637f0..3f48b1475e23ba 100644 --- a/python/paddle/vision/models/__init__.py +++ b/python/paddle/vision/models/__init__.py @@ -30,6 +30,13 @@ from .lenet import LeNet # noqa: F401 from .alexnet import AlexNet # noqa: F401 from .alexnet import alexnet # noqa: F401 +from .resnext import ResNeXt # noqa: F401 +from .resnext import resnext50_32x4d # noqa: F401 +from .resnext import resnext50_64x4d # noqa: F401 +from .resnext import resnext101_32x4d # noqa: F401 +from .resnext import resnext101_64x4d # noqa: F401 +from .resnext import resnext152_32x4d # noqa: F401 +from .resnext import resnext152_64x4d # noqa: F401 __all__ = [ #noqa 'ResNet', @@ -49,5 +56,12 @@ 'mobilenet_v2', 'LeNet', 'AlexNet', - 'alexnet' + 'alexnet', + 'ResNeXt', + 'resnext50_32x4d', + 'resnext50_64x4d', + 'resnext101_32x4d', + 'resnext101_64x4d', + 'resnext152_32x4d', + 'resnext152_64x4d' ] diff --git a/python/paddle/vision/models/resnext.py b/python/paddle/vision/models/resnext.py new file mode 100644 index 00000000000000..2e1073c8ac5ce2 --- /dev/null +++ b/python/paddle/vision/models/resnext.py @@ -0,0 +1,364 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
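+
+# ResNeXt ("Aggregated Residual Transformations for Deep Neural Networks",
+# Xie et al.) backbones for paddle.vision: depths 50/101/152 combined with
+# cardinality 32 or 64, with pretrained weights resolved through the
+# model_urls table below.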
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.fluid.param_attr import ParamAttr +from paddle.nn import AdaptiveAvgPool2D, BatchNorm, Conv2D, Linear, MaxPool2D +from paddle.nn.initializer import Uniform +from paddle.utils.download import get_weights_path_from_url + +__all__ = [] + +model_urls = { + 'resnext50_32x4d': + ('https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNeXt50_32x4d_pretrained.pdparams', + 'bf04add2f7fd22efcbe91511bcd1eebe'), + "resnext50_64x4d": + ('https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNeXt50_64x4d_pretrained.pdparams', + '46307df0e2d6d41d3b1c1d22b00abc69'), + 'resnext101_32x4d': + ('https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNeXt101_32x4d_pretrained.pdparams', + '078ca145b3bea964ba0544303a43c36d'), + 'resnext101_64x4d': + ('https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNeXt101_64x4d_pretrained.pdparams', + '4edc0eb32d3cc5d80eff7cab32cd5c64'), + 'resnext152_32x4d': + ('https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNeXt152_32x4d_pretrained.pdparams', + '7971cc994d459af167c502366f866378'), + 'resnext152_64x4d': + ('https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNeXt152_64x4d_pretrained.pdparams', + '836943f03709efec364d486c57d132de'), +} + + +class ConvBNLayer(nn.Layer): + def __init__(self, + num_channels, + num_filters, + filter_size, + stride=1, + groups=1, + act=None): + super(ConvBNLayer, self).__init__() + self._conv = Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + bias_attr=False) + self._batch_norm = BatchNorm(num_filters, act=act) + + def forward(self, inputs): + x = self._conv(inputs) + x = self._batch_norm(x) + return x + + +class BottleneckBlock(nn.Layer): + def __init__(self, + num_channels, + num_filters, + stride, + cardinality, + shortcut=True): + super(BottleneckBlock, self).__init__() + self.conv0 = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters, + filter_size=1, + act='relu') + self.conv1 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters, + filter_size=3, + groups=cardinality, + stride=stride, + act='relu') + self.conv2 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters * 2 if cardinality == 32 else num_filters, + filter_size=1, + act=None) + + if not shortcut: + self.short = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters * 2 + if cardinality == 32 else num_filters, + filter_size=1, + stride=stride) + + self.shortcut = shortcut + + def forward(self, inputs): + x = self.conv0(inputs) + conv1 = self.conv1(x) + conv2 = self.conv2(conv1) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + + x = paddle.add(x=short, y=conv2) + x = F.relu(x) + return x + + +class ResNeXt(nn.Layer): + """ResNeXt model from + `"Aggregated Residual Transformations for Deep Neural Networks" `_ + + Args: + depth (int, optional): depth of resnext. Default: 50. + cardinality (int, optional): cardinality of resnext. Default: 32. + num_classes (int, optional): output dim of last fc layer. If num_classes <=0, last fc layer + will not be defined. Default: 1000. + with_pool (bool, optional): use pool before the last fc layer or not. Default: True. + + Examples: + .. 
code-block:: python + + import paddle + from paddle.vision.models import ResNeXt + + resnext50_32x4d = ResNeXt(depth=50, cardinality=32) + + """ + + def __init__(self, + depth=50, + cardinality=32, + num_classes=1000, + with_pool=True): + super(ResNeXt, self).__init__() + + self.depth = depth + self.cardinality = cardinality + self.num_classes = num_classes + self.with_pool = with_pool + + supported_depth = [50, 101, 152] + assert depth in supported_depth, \ + "supported layers are {} but input layer is {}".format( + supported_depth, depth) + supported_cardinality = [32, 64] + assert cardinality in supported_cardinality, \ + "supported cardinality is {} but input cardinality is {}" \ + .format(supported_cardinality, cardinality) + layer_cfg = {50: [3, 4, 6, 3], 101: [3, 4, 23, 3], 152: [3, 8, 36, 3]} + layers = layer_cfg[depth] + num_channels = [64, 256, 512, 1024] + num_filters = [128, 256, 512, + 1024] if cardinality == 32 else [256, 512, 1024, 2048] + + self.conv = ConvBNLayer( + num_channels=3, num_filters=64, filter_size=7, stride=2, act='relu') + self.pool2d_max = MaxPool2D(kernel_size=3, stride=2, padding=1) + + self.block_list = [] + for block in range(len(layers)): + shortcut = False + for i in range(layers[block]): + bottleneck_block = self.add_sublayer( + 'bb_%d_%d' % (block, i), + BottleneckBlock( + num_channels=num_channels[block] if i == 0 else + num_filters[block] * int(64 // self.cardinality), + num_filters=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + cardinality=self.cardinality, + shortcut=shortcut)) + self.block_list.append(bottleneck_block) + shortcut = True + + if with_pool: + self.pool2d_avg = AdaptiveAvgPool2D(1) + + if num_classes > 0: + self.pool2d_avg_channels = num_channels[-1] * 2 + stdv = 1.0 / math.sqrt(self.pool2d_avg_channels * 1.0) + self.out = Linear( + self.pool2d_avg_channels, + num_classes, + weight_attr=ParamAttr(initializer=Uniform(-stdv, stdv))) + + def forward(self, inputs): + with paddle.static.amp.fp16_guard(): + x = self.conv(inputs) + x = self.pool2d_max(x) + for block in self.block_list: + x = block(x) + if self.with_pool: + x = self.pool2d_avg(x) + if self.num_classes > 0: + x = paddle.reshape(x, shape=[-1, self.pool2d_avg_channels]) + x = self.out(x) + return x + + +def _resnext(arch, depth, cardinality, pretrained, **kwargs): + model = ResNeXt(depth=depth, cardinality=cardinality, **kwargs) + if pretrained: + assert arch in model_urls, "{} model do not have a pretrained model now, you should set pretrained=False".format( + arch) + weight_path = get_weights_path_from_url(model_urls[arch][0], + model_urls[arch][1]) + + param = paddle.load(weight_path) + model.set_dict(param) + + return model + + +def resnext50_32x4d(pretrained=False, **kwargs): + """ResNeXt-50 32x4d model from + `"Aggregated Residual Transformations for Deep Neural Networks" `_ + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + + Examples: + .. code-block:: python + + import paddle + from paddle.vision.models import resnext50_32x4d + + # build model + model = resnext50_32x4d() + + # build model and load imagenet pretrained weight + # model = resnext50_32x4d(pretrained=True) + """ + return _resnext('resnext50_32x4d', 50, 32, pretrained, **kwargs) + + +def resnext50_64x4d(pretrained=False, **kwargs): + """ResNeXt-50 64x4d model from + `"Aggregated Residual Transformations for Deep Neural Networks" `_ + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + + Examples: + .. 
code-block:: python + + import paddle + from paddle.vision.models import resnext50_64x4d + + # build model + model = resnext50_64x4d() + + # build model and load imagenet pretrained weight + # model = resnext50_64x4d(pretrained=True) + """ + return _resnext('resnext50_64x4d', 50, 64, pretrained, **kwargs) + + +def resnext101_32x4d(pretrained=False, **kwargs): + """ResNeXt-101 32x4d model from + `"Aggregated Residual Transformations for Deep Neural Networks" `_ + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + + Examples: + .. code-block:: python + + import paddle + from paddle.vision.models import resnext101_32x4d + + # build model + model = resnext101_32x4d() + + # build model and load imagenet pretrained weight + # model = resnext101_32x4d(pretrained=True) + """ + return _resnext('resnext101_32x4d', 101, 32, pretrained, **kwargs) + + +def resnext101_64x4d(pretrained=False, **kwargs): + """ResNeXt-101 64x4d model from + `"Aggregated Residual Transformations for Deep Neural Networks" `_ + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + + Examples: + .. code-block:: python + + import paddle + from paddle.vision.models import resnext101_64x4d + + # build model + model = resnext101_64x4d() + + # build model and load imagenet pretrained weight + # model = resnext101_64x4d(pretrained=True) + """ + return _resnext('resnext101_64x4d', 101, 64, pretrained, **kwargs) + + +def resnext152_32x4d(pretrained=False, **kwargs): + """ResNeXt-152 32x4d model from + `"Aggregated Residual Transformations for Deep Neural Networks" `_ + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + + Examples: + .. code-block:: python + + import paddle + from paddle.vision.models import resnext152_32x4d + + # build model + model = resnext152_32x4d() + + # build model and load imagenet pretrained weight + # model = resnext152_32x4d(pretrained=True) + """ + return _resnext('resnext152_32x4d', 152, 32, pretrained, **kwargs) + + +def resnext152_64x4d(pretrained=False, **kwargs): + """ResNeXt-152 64x4d model from + `"Aggregated Residual Transformations for Deep Neural Networks" `_ + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + + Examples: + .. 
code-block:: python + + import paddle + from paddle.vision.models import resnext152_64x4d + + # build model + model = resnext152_64x4d() + + # build model and load imagenet pretrained weight + # model = resnext152_64x4d(pretrained=True) + """ + return _resnext('resnext152_64x4d', 152, 64, pretrained, **kwargs) From 2de0b58e383b9e9fddef23041ac8470e3191abd6 Mon Sep 17 00:00:00 2001 From: feng_shuai Date: Fri, 15 Oct 2021 14:23:54 +0800 Subject: [PATCH 177/298] feat: Add TRT support for 3D(batch_norm_op and elementwise_add_op) (#36446) --- paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc | 7 ++++--- paddle/fluid/inference/tensorrt/convert/elementwise_op.cc | 4 ++-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc b/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc index 7ea41839cb939f..71a2fa68f1749f 100644 --- a/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc @@ -147,9 +147,10 @@ class BatchNormOpConverter : public OpConverter { X = expand_layer->getOutput(0); } - layer = TRT_ENGINE_ADD_LAYER( - engine_, Scale, *X, nvinfer1::ScaleMode::kCHANNEL, shift_weights.get(), - scale_weights.get(), power_weights.get()); + layer = TRT_ENGINE_ADD_LAYER(engine_, ScaleNd, *X, + nvinfer1::ScaleMode::kCHANNEL, + shift_weights.get(), scale_weights.get(), + power_weights.get(), dynamic_shape_offset); auto output_name = op_desc.Output("Y").front(); engine_->SetWeights(op_desc.Input("Bias").front(), diff --git a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc index 2f802ea8d181ea..8569dd63478529 100644 --- a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc @@ -83,8 +83,8 @@ class ElementwiseWeightOpConverter : public OpConverter { } if (op_type_ == "add") { nvinfer1::IScaleLayer* scale_layer = TRT_ENGINE_ADD_LAYER( - engine_, Scale, *X, scale_mode, shift_weights.get(), - scale_weights.get(), power_weights.get()); + engine_, ScaleNd, *X, scale_mode, shift_weights.get(), + scale_weights.get(), power_weights.get(), dynamic_shape_offset); layer = scale_layer; } else if (op_type_ == "mul") { nvinfer1::IScaleLayer* scale_layer = TRT_ENGINE_ADD_LAYER( From 12882b2f07d728a9d40175c492c523c496372ddd Mon Sep 17 00:00:00 2001 From: Zhang Zheng <32410583+ZzSean@users.noreply.github.com> Date: Fri, 15 Oct 2021 14:27:58 +0800 Subject: [PATCH 178/298] Add ResNetUnit Python API (#35426) --- .../inplace_addto_op_pass.cc | 9 +- .../fluid/operators/fused/resnet_unit_op.cc | 5 +- .../fluid/operators/fused/resnet_unit_op.cu | 19 +- python/paddle/incubate/operators/__init__.py | 1 + .../paddle/incubate/operators/resnet_unit.py | 269 ++++++++++++++++++ 5 files changed, 289 insertions(+), 14 deletions(-) create mode 100644 python/paddle/incubate/operators/resnet_unit.py diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/inplace_addto_op_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/inplace_addto_op_pass.cc index 849d0dabab7796..d09de5be84c358 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/inplace_addto_op_pass.cc +++ b/paddle/fluid/framework/ir/memory_optimize_pass/inplace_addto_op_pass.cc @@ -179,7 +179,8 @@ void InplaceAddToOpPass::Run(Graph *graph) const { out_var_ptr->GeneratedOp()); // NOTE(zhiqiu): currently, only conv2d_grad supports addto strategy - if (right_generated_op->Name() != "conv2d_grad") { + 
if (right_generated_op->Name() != "conv2d_grad" && + right_generated_op->Name() != "resnet_unit_grad") { continue; } @@ -224,11 +225,13 @@ static bool IsValidConv2DGradDataGradNode(const Node &node) { if (node.inputs.empty()) return false; auto *generated_op = node.inputs[0]; auto *op_desc = generated_op->Op(); - if (op_desc == nullptr || op_desc->Type() != "conv2d_grad") { + if (op_desc == nullptr || (op_desc->Type() != "conv2d_grad" && + op_desc->Type() != "resnet_unit_grad")) { return false; } const auto &outputs = op_desc->Outputs(); - auto iter = outputs.find(GradVarName("Input")); + std::string grad_var_name = op_desc->Type() == "conv2d_grad" ? "Input" : "X"; + auto iter = outputs.find(GradVarName(grad_var_name)); return iter != outputs.end() && !iter->second.empty() && iter->second[0] == node.Name() && !op_desc->GetAttrIfExists("use_addto"); diff --git a/paddle/fluid/operators/fused/resnet_unit_op.cc b/paddle/fluid/operators/fused/resnet_unit_op.cc index 062fd3f1cf4088..d2ac089d4d1d21 100644 --- a/paddle/fluid/operators/fused/resnet_unit_op.cc +++ b/paddle/fluid/operators/fused/resnet_unit_op.cc @@ -232,13 +232,14 @@ class ResNetUnitOpMaker : public framework::OpProtoAndCheckerMaker { "(bool, default false) Set to true for inference only, false " "for training. Some layers may run faster when this is true.") .SetDefault(false); + AddAttr("use_addto", "").SetDefault(false); AddAttr("act_type", "The activation type to be fused.") .SetDefault("relu"); AddComment(R"DOC( -Fusion op of the basic unit of resnet block. +Fusion op of the basic unit of resnet block. The implementation is based on the latest fusion op interface in cuDNN v8.0. -For more details: +For more details: https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnFusedOps_t )DOC"); diff --git a/paddle/fluid/operators/fused/resnet_unit_op.cu b/paddle/fluid/operators/fused/resnet_unit_op.cu index a0126e5a9d4283..b121864f80e4d9 100644 --- a/paddle/fluid/operators/fused/resnet_unit_op.cu +++ b/paddle/fluid/operators/fused/resnet_unit_op.cu @@ -55,7 +55,7 @@ class ResNetUnitKernel : public framework::OpKernel { int padding = ctx.Attr("padding"); int stride = ctx.Attr("stride"); int stride_z = ctx.Attr("stride_z"); - int dilate = ctx.Attr("dilate"); + int dilation = ctx.Attr("dilation"); int group = ctx.Attr("group"); double eps = static_cast(ctx.Attr("epsilon")); double momentum = static_cast(ctx.Attr("momentum")); @@ -87,7 +87,7 @@ class ResNetUnitKernel : public framework::OpKernel { sum_x.Resize(param_dims); sum_of_squares_x.Resize(param_dims); CudnnNormConvolution conv_x_op(dev_ctx, input_x_shape, filter_x_shape, - output_shape, padding, stride, dilate, + output_shape, padding, stride, dilation, group); conv_x_op.Forward(dev_ctx, *input_x, *filter_x, conv_out_x, &sum_x, &sum_of_squares_x); @@ -129,8 +129,8 @@ class ResNetUnitKernel : public framework::OpKernel { sum_z.Resize(param_dims); sum_of_squares_z.Resize(param_dims); CudnnNormConvolution conv_z_op(dev_ctx, input_z_shape, filter_z_shape, - output_shape, padding, stride_z, dilate, - group); + output_shape, padding, stride_z, + dilation, group); conv_z_op.Forward(dev_ctx, *input_z, *filter_z, conv_out_z, &sum_z, &sum_of_squares_z); @@ -189,7 +189,7 @@ class ResNetUnitGradKernel : public framework::OpKernel { int padding = ctx.Attr("padding"); int stride = ctx.Attr("stride"); int stride_z = ctx.Attr("stride_z"); - int dilate = ctx.Attr("dilate"); + int dilation = ctx.Attr("dilation"); int group = ctx.Attr("group"); double eps = static_cast(ctx.Attr("epsilon")); 
double momentum = static_cast(ctx.Attr("momentum")); @@ -263,7 +263,7 @@ class ResNetUnitGradKernel : public framework::OpKernel { auto filter_z_shape = framework::vectorize(filter_z->dims()); CudnnNormConvolutionGrad conv_z_op(dev_ctx, z_shape, filter_z_shape, output_shape, padding, stride_z, - dilate, group); + dilation, group); conv_z_op.Backward(dev_ctx, *z, *filter_z, conv_out_z_grad, z_grad, filter_z_grad); } else { @@ -278,11 +278,12 @@ class ResNetUnitGradKernel : public framework::OpKernel { } // 2. Backward of Conv for x, get x_grad and filter_x_grad + bool use_addto = ctx.Attr("use_addto"); CudnnNormConvolutionGrad conv_x_op(dev_ctx, x_shape, filter_x_shape, - output_shape, padding, stride, dilate, - group); + output_shape, padding, stride, + dilation, group); conv_x_op.Backward(dev_ctx, *x, *filter_x, conv_out_x_grad, x_grad, - filter_x_grad); + filter_x_grad, use_addto); } }; diff --git a/python/paddle/incubate/operators/__init__.py b/python/paddle/incubate/operators/__init__.py index 694cde4f28624b..9a6710d0950974 100644 --- a/python/paddle/incubate/operators/__init__.py +++ b/python/paddle/incubate/operators/__init__.py @@ -14,3 +14,4 @@ from .softmax_mask_fuse_upper_triangle import softmax_mask_fuse_upper_triangle # noqa: F401 from .softmax_mask_fuse import softmax_mask_fuse # noqa: F401 +from .resnet_unit import ResNetUnit #noqa: F401 diff --git a/python/paddle/incubate/operators/resnet_unit.py b/python/paddle/incubate/operators/resnet_unit.py new file mode 100644 index 00000000000000..cba1d4863cbd43 --- /dev/null +++ b/python/paddle/incubate/operators/resnet_unit.py @@ -0,0 +1,269 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
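+
+# Python-side wiring for the fused resnet_unit op added above: resnet_unit()
+# appends the op (conv + batch norm + optional shortcut add + activation,
+# backed by the cuDNN v8 fused-op kernels) to the current block, and
+# ResNetUnit wraps it as a Layer that owns the filter, scale/bias and
+# moving-statistics parameters.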
+ +import copy +import collections +import itertools +import six +import math +import sys +import warnings +from functools import partial, reduce + +import numpy as np +import paddle +import paddle.fluid as fluid +from paddle import framework +from paddle.device import get_device, get_cudnn_version +from paddle.nn import initializer as I +from paddle.nn import Layer, LayerList +from paddle.fluid.layers import utils +from paddle.fluid.layer_helper import LayerHelper +from paddle.fluid.layers.utils import map_structure, flatten, pack_sequence_as +from paddle.fluid.data_feeder import convert_dtype +from paddle.fluid.param_attr import ParamAttr +from paddle import _C_ops +__all__ = ['resnet_unit', 'ResNetUnit'] + + +def resnet_unit(x, filter_x, scale_x, bias_x, mean_x, var_x, z, filter_z, + scale_z, bias_z, mean_z, var_z, stride, stride_z, padding, + dilation, groups, momentum, eps, data_format, fuse_add, + has_shortcut, use_global_stats, is_test, act): + + helper = LayerHelper('resnet_unit', **locals()) + bn_param_dtype = fluid.core.VarDesc.VarType.FP32 + bit_mask_dtype = fluid.core.VarDesc.VarType.INT32 + out = helper.create_variable_for_type_inference(x.dtype) + bit_mask = helper.create_variable_for_type_inference( + dtype=bit_mask_dtype, stop_gradient=True) + # intermediate_out for x + conv_x = helper.create_variable_for_type_inference( + dtype=x.dtype, stop_gradient=True) + saved_mean_x = helper.create_variable_for_type_inference( + dtype=bn_param_dtype, stop_gradient=True) + saved_invstd_x = helper.create_variable_for_type_inference( + dtype=bn_param_dtype, stop_gradient=True) + running_mean_x = mean_x + running_var_x = var_x + # intermediate_out for z + conv_z = helper.create_variable_for_type_inference( + dtype=x.dtype, stop_gradient=True) + saved_mean_z = helper.create_variable_for_type_inference( + dtype=bn_param_dtype, stop_gradient=True) + saved_invstd_z = helper.create_variable_for_type_inference( + dtype=bn_param_dtype, stop_gradient=True) + running_mean_z = helper.create_variable_for_type_inference( + dtype=bn_param_dtype, stop_gradient=True) if mean_z is None else mean_z + running_var_z = helper.create_variable_for_type_inference( + dtype=bn_param_dtype, stop_gradient=True) if var_z is None else var_z + + inputs = { + 'X': x, + 'FilterX': filter_x, + 'ScaleX': scale_x, + 'BiasX': bias_x, + 'MeanX': mean_x, + 'VarX': var_x, + 'Z': z, + 'FilterZ': filter_z, + 'ScaleZ': scale_z, + 'BiasZ': bias_z, + 'MeanZ': mean_z, + 'VarZ': var_z + } + + attrs = { + 'stride': stride, + 'stride_z': stride_z, + 'padding': padding, + 'dilation': dilation, + 'group': groups, + 'momentum': momentum, + 'epsilon': eps, + 'data_format': data_format, + 'fuse_add': fuse_add, + 'has_shortcut': has_shortcut, + 'use_global_stats': use_global_stats, + 'is_test': is_test, + 'act_type': act + } + + outputs = { + 'Y': out, + 'BitMask': bit_mask, + 'ConvX': conv_x, + 'SavedMeanX': saved_mean_x, + 'SavedInvstdX': saved_invstd_x, + 'RunningMeanX': running_mean_x, + 'RunningVarX': running_var_x, + 'ConvZ': conv_z, + 'SavedMeanZ': saved_mean_z, + 'SavedInvstdZ': saved_invstd_z, + 'RunningMeanZ': running_mean_z, + 'RunningVarZ': running_var_z, + } + + helper.append_op( + type='resnet_unit', inputs=inputs, outputs=outputs, attrs=attrs) + + return out + + +class ResNetUnit(Layer): + r""" + ******Temporary version******. + ResNetUnit is designed for optimize the performence by using cudnnv8 API. 
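+
+    A minimal usage sketch; the shapes and dtype below are illustrative, and
+    a CUDA place with cuDNN v8 support is assumed.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            from paddle.incubate.operators import ResNetUnit
+
+            # illustrative shapes; input layout is NHWC
+            x = paddle.rand([2, 56, 56, 64]).astype('float16')
+            unit = ResNetUnit(num_channels_x=64, num_filters=64, filter_size=3)
+            y = unit(x)  # fused conv + batch norm + relu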
+ """ + + def __init__(self, + num_channels_x, + num_filters, + filter_size, + stride=1, + momentum=0.9, + eps=1e-5, + data_format='NHWC', + act='relu', + fuse_add=False, + has_shortcut=False, + use_global_stats=False, + is_test=False, + filter_x_attr=None, + scale_x_attr=None, + bias_x_attr=None, + moving_mean_x_name=None, + moving_var_x_name=None, + num_channels_z=1, + stride_z=1, + filter_z_attr=None, + scale_z_attr=None, + bias_z_attr=None, + moving_mean_z_name=None, + moving_var_z_name=None): + super(ResNetUnit, self).__init__() + self._stride = stride + self._stride_z = stride_z + self._dilation = 1 + self._kernel_size = utils.convert_to_list(filter_size, 2, 'kernel_size') + self._padding = (filter_size - 1) // 2 + self._groups = 1 + self._momentum = momentum + self._eps = eps + self._data_format = data_format + self._act = act + self._fuse_add = fuse_add + self._has_shortcut = has_shortcut + self._use_global_stats = use_global_stats + self._is_test = is_test + + # check format + valid_format = {'NHWC'} + if data_format not in valid_format: + raise ValueError( + "conv_format must be one of {}, but got conv_format='{}'". + format(valid_format, data_format)) + + def _get_default_param_initializer(channels): + filter_elem_num = np.prod(self._kernel_size) * channels + std = (2.0 / filter_elem_num)**0.5 + return I.Normal(0.0, std) + + # initial filter + bn_param_dtype = fluid.core.VarDesc.VarType.FP32 + bn_param_shape = [1, 1, 1, num_filters] + filter_x_shape = [num_filters, filter_size, filter_size, num_channels_x] + filter_z_shape = [num_filters, filter_size, filter_size, num_channels_z] + + self.filter_x = self.create_parameter( + shape=filter_x_shape, + attr=filter_x_attr, + default_initializer=_get_default_param_initializer(num_channels_x)) + self.scale_x = self.create_parameter( + shape=bn_param_shape, + attr=scale_x_attr, + dtype=bn_param_dtype, + default_initializer=I.Constant(1.0)) + self.bias_x = self.create_parameter( + shape=bn_param_shape, + attr=bias_x_attr, + dtype=bn_param_dtype, + is_bias=True) + self.mean_x = self.create_parameter( + attr=ParamAttr( + name=moving_mean_x_name, + initializer=I.Constant(0.0), + trainable=False), + shape=bn_param_shape, + dtype=bn_param_dtype) + self.mean_x.stop_gradient = True + self.var_x = self.create_parameter( + attr=ParamAttr( + name=moving_var_x_name, + initializer=I.Constant(1.0), + trainable=False), + shape=bn_param_shape, + dtype=bn_param_dtype) + self.var_x.stop_gradient = True + if has_shortcut: + self.filter_z = self.create_parameter( + shape=filter_z_shape, + attr=filter_z_attr, + default_initializer=_get_default_param_initializer( + num_channels_z)) + self.scale_z = self.create_parameter( + shape=bn_param_shape, + attr=scale_z_attr, + dtype=bn_param_dtype, + default_initializer=I.Constant(1.0)) + self.bias_z = self.create_parameter( + shape=bn_param_shape, + attr=bias_z_attr, + dtype=bn_param_dtype, + is_bias=True) + self.mean_z = self.create_parameter( + attr=ParamAttr( + name=moving_mean_z_name, + initializer=I.Constant(0.0), + trainable=False), + shape=bn_param_shape, + dtype=bn_param_dtype) + self.mean_z.stop_gradient = True + self.var_z = self.create_parameter( + attr=ParamAttr( + name=moving_var_z_name, + initializer=I.Constant(1.0), + trainable=False), + shape=bn_param_shape, + dtype=bn_param_dtype) + self.var_z.stop_gradient = True + else: + self.filter_z = None + self.scale_z = None + self.bias_z = None + self.mean_z = None + self.var_z = None + + def forward(self, x, z=None): + if self._fuse_add and z is None: + 
raise ValueError("z can not be None") + + out = resnet_unit( + x, self.filter_x, self.scale_x, self.bias_x, self.mean_x, + self.var_x, z, self.filter_z, self.scale_z, self.bias_z, + self.mean_z, self.var_z, self._stride, self._stride_z, + self._padding, self._dilation, self._groups, self._momentum, + self._eps, self._data_format, self._fuse_add, self._has_shortcut, + self._use_global_stats, self._is_test, self._act) + return out From e703a2edf459bb3d21f7ee646aac7da6567d0f17 Mon Sep 17 00:00:00 2001 From: duanboqiang Date: Fri, 15 Oct 2021 16:07:19 +0800 Subject: [PATCH 179/298] fix opt-offload save bug (#36433) --- .../paddle/distributed/fleet/meta_optimizers/sharding/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py b/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py index 447b52ace69787..d04a3a53db3e2b 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py @@ -906,7 +906,7 @@ def is_opt_vars(var): "_velocity_0" ] for check in checks: - if var.name.endswith(check): + if var.name.endswith(check) and var.persistable: return True return False From adb8049460b3c14b0d0422fdc2fa10547fc9e912 Mon Sep 17 00:00:00 2001 From: Zeng Jinle <32832641+sneaxiy@users.noreply.github.com> Date: Fri, 15 Oct 2021 22:54:11 +0800 Subject: [PATCH 180/298] Remove wrong __restrict__ of CUDA LarsMomentumOpKernel (#36460) * remove wrong restrict * remove master_param_out __restrict__ * update --- .../operators/optimizers/lars_momentum_op.cu | 104 ++++++------------ 1 file changed, 31 insertions(+), 73 deletions(-) diff --git a/paddle/fluid/operators/optimizers/lars_momentum_op.cu b/paddle/fluid/operators/optimizers/lars_momentum_op.cu index b640e62221f777..89326679d5d501 100644 --- a/paddle/fluid/operators/optimizers/lars_momentum_op.cu +++ b/paddle/fluid/operators/optimizers/lars_momentum_op.cu @@ -84,22 +84,18 @@ class LarsThreadConfig { template __device__ inline void VectorizeLarsUpdate( - const T* __restrict__ grad, const MT* __restrict__ param, - const MT* __restrict__ velocity, T* __restrict__ param_out, - MT* __restrict__ velocity_out, const MT mu, MT local_lr, + const T* __restrict__ grad, const MT* param, const MT* velocity, + T* param_out, MT* velocity_out, const MT mu, MT local_lr, const MT lars_weight_decay, const MT rescale_grad, const int tid, - const int grid_stride, const int numel, - MT* __restrict__ master_param_out = nullptr) { + const int grid_stride, const int numel, MT* master_param_out = nullptr) { using VecType = paddle::platform::AlignedVector; using VecMType = paddle::platform::AlignedVector; int main = numel >> (VecSize >> 1); int tail_offset = main * VecSize; - const VecType* __restrict__ grad_vec = reinterpret_cast(grad); - const VecMType* __restrict__ param_vec = - reinterpret_cast(param); - const VecMType* __restrict__ velocity_vec = - reinterpret_cast(velocity); + const VecType* grad_vec = reinterpret_cast(grad); + const VecMType* param_vec = reinterpret_cast(param); + const VecMType* velocity_vec = reinterpret_cast(velocity); VecType* param_out_vec = reinterpret_cast(param_out); VecMType* velocity_out_vec = reinterpret_cast(velocity_out); @@ -157,66 +153,30 @@ __forceinline__ __device__ void L2NormKernel( template __global__ void L2NormKernel( #endif - const T* __restrict__ p_data, const T* __restrict__ g_data, - MT* __restrict__ p_buffer, MT* __restrict__ g_buffer, const int64_t 
numel, - const int repeat_times, const MT rescale_grad, const int thresh = 0, - MT* __restrict__ p_n = nullptr, MT* __restrict__ g_n = nullptr) { + const T* p_data, const T* __restrict__ g_data, MT* __restrict__ p_buffer, + MT* __restrict__ g_buffer, const int64_t numel, const int repeat_times, + const MT rescale_grad, const int thresh = 0, MT* __restrict__ p_n = nullptr, + MT* __restrict__ g_n = nullptr) { __shared__ MT s_buffer[2]; int tid = threadIdx.x + blockDim.x * blockIdx.x; int grid_stride = LARS_BLOCK_SIZE * gridDim.x; const MT rescale_pow = rescale_grad * rescale_grad; - if (threadIdx.x == 0) { - s_buffer[0] = static_cast(0); - s_buffer[1] = static_cast(0); - } + MT p_tmp = static_cast(0); MT g_tmp = static_cast(0); - - if (repeat_times == 0) { - if (tid < numel) { - p_tmp = static_cast(p_data[tid]); - g_tmp = static_cast(g_data[tid]); - } - MT tmp0 = math::blockReduceSum(p_tmp * p_tmp, FINAL_MASK); - MT tmp1 = math::blockReduceSum(g_tmp * g_tmp, FINAL_MASK); - if (threadIdx.x == 0) { - s_buffer[0] += tmp0; - s_buffer[1] += tmp1; - } - } else { - /* Avoid occupy too much temp buffer. Slice the whole data into 2 parts, - the front of data whose quantity is excatly multiple of grid-thread - number, and delt in for loop, the rest is delt with another step. */ - for (int i = 0; i < repeat_times; ++i) { - p_tmp = static_cast(p_data[tid]); - g_tmp = static_cast(g_data[tid]); - tid += grid_stride; - MT tmp0 = math::blockReduceSum(p_tmp * p_tmp, FINAL_MASK); - MT tmp1 = math::blockReduceSum(g_tmp * g_tmp, FINAL_MASK); - if (threadIdx.x == 0) { - s_buffer[0] += tmp0; - s_buffer[1] += tmp1; - } - __syncthreads(); - } - MT p_val = 0; - MT g_val = 0; - if (tid < numel) { - p_val = static_cast(p_data[tid]); - g_val = static_cast(g_data[tid]); - } - MT tmp0 = math::blockReduceSum(p_val * p_val, FINAL_MASK); - MT tmp1 = math::blockReduceSum(g_val * g_val, FINAL_MASK); - if (threadIdx.x == 0) { - s_buffer[0] += tmp0; - s_buffer[1] += tmp1; - } + while (tid < numel) { + MT tmp0 = static_cast(p_data[tid]); + MT tmp1 = static_cast(g_data[tid]); + p_tmp += (tmp0 * tmp0); + g_tmp += (tmp1 * tmp1); + tid += grid_stride; } - __syncthreads(); + p_tmp = math::blockReduceSum(p_tmp, FINAL_MASK); + g_tmp = math::blockReduceSum(g_tmp, FINAL_MASK); if (threadIdx.x == 0) { - p_buffer[blockIdx.x] = s_buffer[0]; - g_buffer[blockIdx.x] = s_buffer[1]; + p_buffer[blockIdx.x] = p_tmp; + g_buffer[blockIdx.x] = g_tmp; } #if CUDA_VERSION >= 11000 cg->sync(); // Grid sync for writring partial result to gloabl memory @@ -236,10 +196,9 @@ __global__ void L2NormKernel( template __forceinline__ __device__ void MomentumUpdate( - const T* __restrict__ param, const T* __restrict__ grad, - const MT* __restrict__ velocity, T* param_out, MT* velocity_out, - const MT* __restrict__ master_param, MT* __restrict__ master_param_out, - const MT* __restrict__ learning_rate, const MT mu, + const T* param, const T* __restrict__ grad, const MT* velocity, + T* param_out, MT* velocity_out, const MT* master_param, + MT* master_param_out, const MT* __restrict__ learning_rate, const MT mu, const MT lars_weight_decay, const MT lars_coeff, const MT epsilon, const MT rescale_grad, const MT param_norm, const MT grad_norm, const int tid, const int grid_stride, const int64_t numel, @@ -316,14 +275,13 @@ __global__ void MergedMomentumLarsKernel(LarsParamWarpper lars_warpper, template __global__ void MomentumLarsKernel( - const T* __restrict__ param, const T* __restrict__ grad, - const MT* __restrict__ velocity, T* param_out, MT* velocity_out, - const 
MT* __restrict__ master_param, MT* __restrict__ master_param_out, - const MT* __restrict__ learning_rate, MT* __restrict__ p_buffer, - MT* __restrict__ g_buffer, const MT mu, const MT lars_coeff, - const MT lars_weight_decay, const MT epsilon, const MT rescale_grad, - const int repeat_times, const int thresh, const int64_t numel, - const bool is_amp) { + const T* param, const T* __restrict__ grad, const MT* velocity, + T* param_out, MT* velocity_out, const MT* master_param, + MT* master_param_out, const MT* __restrict__ learning_rate, + MT* __restrict__ p_buffer, MT* __restrict__ g_buffer, const MT mu, + const MT lars_coeff, const MT lars_weight_decay, const MT epsilon, + const MT rescale_grad, const int repeat_times, const int thresh, + const int64_t numel, const bool is_amp) { int tid = threadIdx.x + blockIdx.x * blockDim.x; int grid_stride = gridDim.x * LARS_BLOCK_SIZE; #if CUDA_VERSION >= 11000 From 0452f27cba16b6e152ec3a39b581e5588ec74d2b Mon Sep 17 00:00:00 2001 From: Zhang Zheng <32410583+ZzSean@users.noreply.github.com> Date: Sat, 16 Oct 2021 12:48:38 +0800 Subject: [PATCH 181/298] fix the initializer of resnet unit op (#36483) * fix the initializer of resnet unit op * fix the initializer of resnet unit op --- python/paddle/incubate/operators/resnet_unit.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/python/paddle/incubate/operators/resnet_unit.py b/python/paddle/incubate/operators/resnet_unit.py index cba1d4863cbd43..f2f391bdca946a 100644 --- a/python/paddle/incubate/operators/resnet_unit.py +++ b/python/paddle/incubate/operators/resnet_unit.py @@ -187,9 +187,7 @@ def _get_default_param_initializer(channels): filter_z_shape = [num_filters, filter_size, filter_size, num_channels_z] self.filter_x = self.create_parameter( - shape=filter_x_shape, - attr=filter_x_attr, - default_initializer=_get_default_param_initializer(num_channels_x)) + shape=filter_x_shape, attr=filter_x_attr, default_initializer=None) self.scale_x = self.create_parameter( shape=bn_param_shape, attr=scale_x_attr, @@ -220,8 +218,7 @@ def _get_default_param_initializer(channels): self.filter_z = self.create_parameter( shape=filter_z_shape, attr=filter_z_attr, - default_initializer=_get_default_param_initializer( - num_channels_z)) + default_initializer=None) self.scale_z = self.create_parameter( shape=bn_param_shape, attr=scale_z_attr, From 314cc4952474c8105176a1f1988d3ffb812a154d Mon Sep 17 00:00:00 2001 From: Zeng Jinle <32832641+sneaxiy@users.noreply.github.com> Date: Sun, 17 Oct 2021 16:40:05 +0800 Subject: [PATCH 182/298] Revert "fix the initializer of resnet unit op (#36483)" (#36487) This reverts commit 0452f27cba16b6e152ec3a39b581e5588ec74d2b. 
--- python/paddle/incubate/operators/resnet_unit.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/python/paddle/incubate/operators/resnet_unit.py b/python/paddle/incubate/operators/resnet_unit.py index f2f391bdca946a..cba1d4863cbd43 100644 --- a/python/paddle/incubate/operators/resnet_unit.py +++ b/python/paddle/incubate/operators/resnet_unit.py @@ -187,7 +187,9 @@ def _get_default_param_initializer(channels): filter_z_shape = [num_filters, filter_size, filter_size, num_channels_z] self.filter_x = self.create_parameter( - shape=filter_x_shape, attr=filter_x_attr, default_initializer=None) + shape=filter_x_shape, + attr=filter_x_attr, + default_initializer=_get_default_param_initializer(num_channels_x)) self.scale_x = self.create_parameter( shape=bn_param_shape, attr=scale_x_attr, @@ -218,7 +220,8 @@ def _get_default_param_initializer(channels): self.filter_z = self.create_parameter( shape=filter_z_shape, attr=filter_z_attr, - default_initializer=None) + default_initializer=_get_default_param_initializer( + num_channels_z)) self.scale_z = self.create_parameter( shape=bn_param_shape, attr=scale_z_attr, From 4e036fa1a0c21b5b089809f575d37b2a0e6538da Mon Sep 17 00:00:00 2001 From: Zeng Jinle <32832641+sneaxiy@users.noreply.github.com> Date: Sun, 17 Oct 2021 23:01:23 +0800 Subject: [PATCH 183/298] refine rescale_grad (#36490) --- paddle/fluid/operators/optimizers/lars_momentum_op.cu | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/paddle/fluid/operators/optimizers/lars_momentum_op.cu b/paddle/fluid/operators/optimizers/lars_momentum_op.cu index 89326679d5d501..2c27a2135c14b2 100644 --- a/paddle/fluid/operators/optimizers/lars_momentum_op.cu +++ b/paddle/fluid/operators/optimizers/lars_momentum_op.cu @@ -160,7 +160,6 @@ __global__ void L2NormKernel( __shared__ MT s_buffer[2]; int tid = threadIdx.x + blockDim.x * blockIdx.x; int grid_stride = LARS_BLOCK_SIZE * gridDim.x; - const MT rescale_pow = rescale_grad * rescale_grad; MT p_tmp = static_cast(0); MT g_tmp = static_cast(0); @@ -190,7 +189,7 @@ __global__ void L2NormKernel( } __syncthreads(); *p_n = Sqrt(s_buffer[0]); - *g_n = Sqrt(rescale_pow * s_buffer[1]); + *g_n = rescale_grad * Sqrt(s_buffer[1]); #endif } From e496d1e9b05906b38e2e5d424b6d4ad571ff678f Mon Sep 17 00:00:00 2001 From: Haohongxiang <86215757+haohongxiang@users.noreply.github.com> Date: Mon, 18 Oct 2021 10:46:30 +0800 Subject: [PATCH 184/298] modify ut of cond (#36475) --- python/paddle/fluid/tests/unittests/test_linalg_cond.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_linalg_cond.py b/python/paddle/fluid/tests/unittests/test_linalg_cond.py index 237c96430249bc..d13bdd676b48e3 100644 --- a/python/paddle/fluid/tests/unittests/test_linalg_cond.py +++ b/python/paddle/fluid/tests/unittests/test_linalg_cond.py @@ -32,7 +32,8 @@ def test_static_assert_true(self, x_list, p_list): exe = static.Executor() result = exe.run(feed={"X": x}, fetch_list=[output]) expected_output = np.linalg.cond(x, p) - self.assertTrue(np.allclose(result, expected_output)) + np.testing.assert_allclose( + result[0], expected_output, rtol=5e-5) def test_dygraph_assert_true(self, x_list, p_list): @@ -41,7 +42,8 @@ def test_dygraph_assert_true(self, x_list, p_list): input_tensor = paddle.to_tensor(x) output = paddle.linalg.cond(input_tensor, p) expected_output = np.linalg.cond(x, p) - self.assertTrue(np.allclose(output, expected_output)) + np.testing.assert_allclose( + output.numpy(), 
expected_output, rtol=5e-5) def gen_input(): @@ -156,5 +158,4 @@ def test_dygraph_empty_tensor_input(self): if __name__ == "__main__": paddle.enable_static() - # paddle.device.set_device("cpu") unittest.main() From 79dbbcced6da823187432dd5f3a40a95b0e864c7 Mon Sep 17 00:00:00 2001 From: Tongxin Bai Date: Mon, 18 Oct 2021 11:01:59 +0800 Subject: [PATCH 185/298] [autograd.functional] Fix a bug on handling v=None in vjp and jvp (#36445) * autograd.functional passed pylint checker. * autograd.functional: fix import errors. * autograd.functional: fixed unit tests. * autograd.functional minor format change * [autograd.functional] Fixed vjp and jvp's v=None bug. --- python/paddle/autograd/functional.py | 19 +++++++++++------ .../tests/unittests/autograd/test_vjp_jvp.py | 21 +++++++++++++++++++ 2 files changed, 34 insertions(+), 6 deletions(-) diff --git a/python/paddle/autograd/functional.py b/python/paddle/autograd/functional.py index 17c7ad5b18af5f..66ae1562edb68a 100644 --- a/python/paddle/autograd/functional.py +++ b/python/paddle/autograd/functional.py @@ -23,10 +23,11 @@ @contextlib.contextmanager def gradient_scope(*var_lists, create_graph=False, allow_unused=False): - def grad_fn(ys, xs, v, create_graph=create_graph): - assert len(ys) == len(v), ( - f'`v` is expected to be of the same size as the output. ' - f'Here the output is {ys}, and `v` is {v}.') + def grad_fn(ys, xs, v=None, create_graph=create_graph): + if v is not None: + assert len(ys) == len(v), ( + f'The argument {v} is expected to be of the same size as the output. ' + f'Here the output is {ys}, and `v` is {v}.') if allow_unused: ys = [ to_tensor( @@ -49,6 +50,8 @@ def return_fn(out): return out def process(vl): + if vl is None: + return None out = [] # If v is treated as constant in the outer scope, its gradient is guaranteed # not to be taken beyond this scope. 
Within this scope, however, v's gradient @@ -151,7 +154,9 @@ def func_unused(x, y): # [[2., 1.], # [1., 0.]]), None] """ - xs, v = _tensors(inputs, "inputs"), _tensors(v, "v") + xs = _tensors(inputs, "inputs") + if v is not None: + v = _tensors(v, "v") with gradient_scope( xs, v, create_graph=create_graph, @@ -221,7 +226,9 @@ def func(x): # [0., 0.]])] """ - xs, v = _tensors(inputs, "inputs"), _tensors(v, "v") + xs = _tensors(inputs, "inputs") + if v is not None: + v = _tensors(v, "v") with gradient_scope( xs, v, create_graph=create_graph, diff --git a/python/paddle/fluid/tests/unittests/autograd/test_vjp_jvp.py b/python/paddle/fluid/tests/unittests/autograd/test_vjp_jvp.py index f3680ab2a62238..c228ad79321d43 100644 --- a/python/paddle/fluid/tests/unittests/autograd/test_vjp_jvp.py +++ b/python/paddle/fluid/tests/unittests/autograd/test_vjp_jvp.py @@ -205,6 +205,16 @@ def test_vjp_i2o2_no_create_graph(self): vjp_result, grad_result = vjp(), grad() self.check_results(grad_result, vjp_result) + def test_vjp_i2o2_omitting_v_no_create_graph(self): + test_cases = [ + [o2, ['A', 'A']], #noqa + ] #noqa + for f, inputs in test_cases: + inputs = self.gen_inputs(inputs) + vjp, grad = self.gen_test_pairs(f, inputs) + vjp_result, grad_result = vjp(), grad() + self.check_results(grad_result, vjp_result) + def test_vjp_nested_no_create_graph(self): x = self.gen_input('a') test_cases = [ @@ -289,6 +299,17 @@ def test_jvp_i2o2_no_create_graph(self): reverse_jac = jac(vjp, f, inputs) self.check_results(forward_jac, reverse_jac) + def test_jvp_i2o2_omitting_v_no_create_graph(self): + test_cases = [ #noqa + [o2, ['A', 'A']], #noqa + ] #noqa + for f, inputs in test_cases: + inputs = self.gen_inputs(inputs) + results_omitting_v = jvp(f, inputs) + v = [ones_like(x) for x in inputs] + results_with_v = jvp(f, inputs, v) + self.check_results(results_omitting_v, results_with_v) + if __name__ == "__main__": unittest.main() From d3c9394202579ab65bedfb3cbe0cc058a410f600 Mon Sep 17 00:00:00 2001 From: JingZhuangzhuang <75348594+JZZ-NOTE@users.noreply.github.com> Date: Sun, 17 Oct 2021 22:22:30 -0500 Subject: [PATCH 186/298] Fix conv2d op_teller error (#36474) --- paddle/fluid/inference/tensorrt/op_teller.cc | 24 +++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 59368a299c59e2..89159c0bb636c9 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -242,9 +242,31 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, if (desc.HasAttr("padding_algorithm")) { auto padding_algorithm = BOOST_GET_CONST(std::string, desc.GetAttr("padding_algorithm")); - if (padding_algorithm == "SAME" || padding_algorithm == "VALID") { + if (padding_algorithm == "VALID") { return false; } + if (padding_algorithm == "SAME") { + if (desc.HasAttr("dilations")) { + const std::vector dilations = + BOOST_GET_CONST(std::vector, desc.GetAttr("dilations")); + if (dilations[0] != 1 || dilations[1] != 1) { + VLOG(3) << "In Same mode, Dilations must be (1, 1) for " + "tensorRT, but given (" + << dilations[0] << ", " << dilations[1] << ")"; + return false; + } + } + } + } + + if (use_no_calib_int8) { + if (desc.HasAttr("padding_algorithm")) { + auto padding_algorithm = + BOOST_GET_CONST(std::string, desc.GetAttr("padding_algorithm")); + if (padding_algorithm == "SAME") { + return false; + } + } } if (desc.HasAttr("enable_int8")) { From 
d19a9b3954f7e29356410824213806b7e27d37e4 Mon Sep 17 00:00:00 2001 From: taixiurong Date: Mon, 18 Oct 2021 11:24:04 +0800 Subject: [PATCH 187/298] [XPU AMP] 1. xpu support gradient acc 2. xpu support create tensor in dygraph 3. xpu support update weight params in amp (#36439) --- .../fluid/imperative/gradient_accumulator.cc | 47 ++++- .../reduce_ops/reduce_mean_op_xpu.cc | 99 ++++++++-- paddle/fluid/operators/slice_op_xpu.cc | 174 ++++++++---------- paddle/fluid/platform/xpu/xpu2_op_list.h | 11 +- python/paddle/fluid/framework.py | 12 ++ python/paddle/optimizer/adamw.py | 7 - python/paddle/tensor/creation.py | 4 +- 7 files changed, 238 insertions(+), 116 deletions(-) diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc index fbc5453f82146a..fd6a070c3fc529 100644 --- a/paddle/fluid/imperative/gradient_accumulator.cc +++ b/paddle/fluid/imperative/gradient_accumulator.cc @@ -87,9 +87,17 @@ class TensorAddFunctor : public boost::static_visitor<> { #ifdef PADDLE_WITH_XPU void operator()(const platform::XPUPlace& place) { + using XPUType = typename XPUTypeTrait::Type; platform::XPUDeviceContext* ctx = dynamic_cast( platform::DeviceContextPool::Instance().Get(place)); - xpu::add(ctx->x_context(), x_, y_, y_, static_cast(numel_)); + int r = xpu::add( + ctx->x_context(), reinterpret_cast(x_), + reinterpret_cast(y_), reinterpret_cast(y_), + static_cast(numel_)); + PADDLE_ENFORCE_EQ( + r, XPU_SUCCESS, + platform::errors::External("XPU add kernel return wrong value[%d %s]", + r, XPUAPIErrorMsg[r])); } #else void operator()(const platform::XPUPlace& place) { @@ -154,6 +162,24 @@ class TensorAddFunctor : public boost::static_visitor<> { T* y_; }; +#ifdef PADDLE_WITH_XPU +template +void XPUTensorAddFunctor(const platform::Place& place, + const framework::Tensor& src, framework::Tensor* dst) { + using XPUType = typename XPUTypeTrait::Type; + platform::XPUDeviceContext* ctx = dynamic_cast( + platform::DeviceContextPool::Instance().Get(place)); + const XPUType* x = reinterpret_cast(src.data()); + XPUType* y = reinterpret_cast(dst->mutable_data(place)); + int r = xpu::add(ctx->x_context(), x, y, y, + static_cast(src.numel())); + PADDLE_ENFORCE_EQ( + r, XPU_SUCCESS, + platform::errors::External("XPU add kernel return wrong value[%d %s]", r, + XPUAPIErrorMsg[r])); +} +#endif + template void TensorAddImpl(const framework::Tensor& src, framework::Tensor* dst, const platform::Place& place) { @@ -226,7 +252,26 @@ void TensorAdd(const framework::Variable& src, framework::Variable* dst) { return; } #endif + +#ifdef PADDLE_WITH_XPU + if (platform::is_xpu_place(place)) { + if (data_type == framework::DataTypeTrait::DataType()) { + XPUTensorAddFunctor(place, src_tensor, dst_tensor); + } else if (data_type == + framework::DataTypeTrait::DataType()) { + XPUTensorAddFunctor(place, src_tensor, dst_tensor); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Gradient accumulation of data type (%s) on place (%s) is not " + "supported in imperative mode", + framework::DataTypeToString(data_type), place)); + } + return; + } +#endif + PADDLE_TENSOR_ADD(float); + #ifndef PADDLE_WITH_XPU // NOTE(phlrain): xpu only support float PADDLE_TENSOR_ADD(double); diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op_xpu.cc b/paddle/fluid/operators/reduce_ops/reduce_mean_op_xpu.cc index b82ecbbe2fcdcc..d6c1dc5f02d422 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_mean_op_xpu.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_mean_op_xpu.cc @@ -23,30 +23,103 @@ 
namespace paddle { namespace operators { template class ReduceMeanXPUKernel : public framework::OpKernel { + using XPUType = typename XPUTypeTrait::Type; + public: void Compute(const framework::ExecutionContext& context) const override { PADDLE_ENFORCE_EQ( platform::is_xpu_place(context.GetPlace()), true, platform::errors::Unavailable("This kernel only runs on XPU.")); - // bool reduce_all = context.Attr("reduce_all"); + bool reduce_all = context.Attr("reduce_all"); auto* input = context.Input("X"); auto* output = context.Output("Out"); output->mutable_data(context.GetPlace()); auto& dev_ctx = context.template device_context(); - int ndim = input->dims().size(); - std::vector idims; + + std::vector xdims; for (int i = 0; i < input->dims().size(); i++) { - idims.push_back(input->dims()[i]); + xdims.push_back(input->dims()[i]); } - auto dims = context.Attr>("dim"); - int rdim = dims.size(); - int r = - xpu::reduce(dev_ctx.x_context(), input->data(), output->data(), - idims.data(), ndim, dims.data(), rdim, xpu::REDUCE_MEAN); - PADDLE_ENFORCE_EQ(r == xpu::Error_t::SUCCESS, true, - platform::errors::External("XPU kernel error!")); + auto rdims = context.Attr>("dim"); + if (reduce_all) { + rdims.clear(); + for (size_t i = 0; i < xdims.size(); i++) { + rdims.push_back(static_cast(i)); + } + } + int r = xpu::reduce_mean( + dev_ctx.x_context(), reinterpret_cast(input->data()), + reinterpret_cast(output->data()), xdims, rdims); + + PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, + platform::errors::External( + "XPU reduce_mean kernel return wrong value[%d %s]", r, + XPUAPIErrorMsg[r])); } }; + +template +class ReduceMeanGradXPUKernel : public framework::OpKernel { + using XPUType = typename XPUTypeTrait::Type; + + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("X"); + auto* output_grad = ctx.Input(framework::GradVarName("Out")); + auto* input_grad = ctx.Output(framework::GradVarName("X")); + + XPUType* x_data = + reinterpret_cast(input_grad->mutable_data(ctx.GetPlace())); + const XPUType* dy_data = + reinterpret_cast(output_grad->data()); + + bool reduce_all = ctx.Attr("reduce_all"); + auto reduce_dims = ctx.Attr>("dim"); + + std::vector xdims; + for (int i = 0; i < input->dims().size(); i++) { + xdims.push_back(input->dims()[i]); + } + std::vector ydims; + for (int i = 0; i < output_grad->dims().size(); i++) { + ydims.push_back(output_grad->dims()[i]); + } + + int reduce_numel = 1; + if (reduce_all) { + reduce_dims.clear(); + for (size_t d = 0; d < xdims.size(); ++d) { + reduce_dims.push_back(static_cast(d)); + } + } + for (auto& d : reduce_dims) { + if (d < 0) { + d = d + xdims.size(); + } + reduce_numel *= xdims[d]; + } + + float val = 1.0f / static_cast(reduce_numel); + + auto& dev_ctx = ctx.template device_context(); + + int r = xpu::constant(dev_ctx.x_context(), x_data, input->numel(), + static_cast(val)); + + PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, + platform::errors::External( + "XPU constant kernel return wrong value[%d %s]", r, + XPUAPIErrorMsg[r])); + r = xpu::broadcast_mul(dev_ctx.x_context(), x_data, dy_data, x_data, xdims, + ydims); + + PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, + platform::errors::External( + "XPU broadcast_mul kernel return wrong value[%d %s]", + r, XPUAPIErrorMsg[r])); + } +}; + } // namespace operators } // namespace paddle @@ -54,4 +127,8 @@ REGISTER_OP_XPU_KERNEL( reduce_mean, ops::ReduceMeanXPUKernel); +REGISTER_OP_XPU_KERNEL( + reduce_mean_grad, + ops::ReduceMeanGradXPUKernel); + #endif diff --git 
a/paddle/fluid/operators/slice_op_xpu.cc b/paddle/fluid/operators/slice_op_xpu.cc index 5f98efe8e91466..6ac1027b0ce195 100644 --- a/paddle/fluid/operators/slice_op_xpu.cc +++ b/paddle/fluid/operators/slice_op_xpu.cc @@ -27,6 +27,8 @@ using Tensor = framework::Tensor; template class SliceXPUKernel : public framework::OpKernel { + using XPUType = typename XPUTypeTrait::Type; + public: void Compute(const framework::ExecutionContext& ctx) const override { auto in = ctx.Input("Input"); @@ -83,114 +85,93 @@ class SliceXPUKernel : public framework::OpKernel { } auto& dev_ctx = ctx.template device_context(); - auto* in_data = in->data(); - auto* out_data = out->mutable_data(ctx.GetPlace()); - int r = xpu::slice(dev_ctx.x_context(), in_data, out_data, shape, - starts_extension, ends_extension); - PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, - platform::errors::External("XPU slice kernel error!")); + const XPUType* in_data = reinterpret_cast(in->data()); + XPUType* out_data = + reinterpret_cast(out->mutable_data(ctx.GetPlace())); + int r = xpu::slice(dev_ctx.x_context(), in_data, out_data, shape, + starts_extension, ends_extension); + PADDLE_ENFORCE_EQ( + r, XPU_SUCCESS, + platform::errors::External("XPU slice kernel return wrong value[%d %s]", + r, XPUAPIErrorMsg[r])); } }; template class SliceGradXPUKernel : public framework::OpKernel { + using XPUType = typename XPUTypeTrait::Type; + public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* d_out = ctx.Input(framework::GradVarName("Out")); - auto* d_in = ctx.Output(framework::GradVarName("Input")); - d_in->mutable_data(ctx.GetPlace()); - - auto in_dims = d_in->dims(); - auto axes = ctx.Attr>("axes"); - auto starts = ctx.Attr>("starts"); - auto ends = ctx.Attr>("ends"); + auto* input = ctx.Input("Input"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dinput = ctx.Output(framework::GradVarName("Input")); + + auto axes_int = ctx.Attr>("axes"); + auto starts_int = ctx.Attr>("starts"); + auto ends_int = ctx.Attr>("ends"); + std::vector axes(axes_int.begin(), axes_int.end()); + std::vector starts(starts_int.begin(), starts_int.end()); + std::vector ends(ends_int.begin(), ends_int.end()); + + // Get the accurate attribute value of starts and ends + auto starts_tensor_list = ctx.MultiInput("StartsTensorList"); + if (ctx.HasInput("StartsTensor")) { + starts = GetDataFromTensor(ctx.Input("StartsTensor")); + } else if (starts_tensor_list.size() > 0) { + starts = GetDataFromTensorList(starts_tensor_list); + } - // prepare starts, ends on XPU - int dim_value = 0, start = 0, end = 0; - // If a negative value is passed for any of the start or end indices, - // it represents number of elements before the end of that dimension. - // If the value passed to start or end is larger than the n - // (the number of elements in this dimension), it represents n. - for (size_t i = 0; i < axes.size(); ++i) { - dim_value = in_dims[axes[i]]; - start = starts[i]; - end = ends[i]; - start = start < 0 ? (start + dim_value) : start; - end = end < 0 ? 
(end + dim_value) : end; - start = std::max(start, 0); - end = std::max(end, 0); - end = std::min(end, dim_value); - PADDLE_ENFORCE_GT(end, start, platform::errors::InvalidArgument( - "end should greater than start")); - starts[i] = start; - ends[i] = end; + auto ends_tensor_list = ctx.MultiInput("EndsTensorList"); + if (ctx.HasInput("EndsTensor")) { + ends = GetDataFromTensor(ctx.Input("EndsTensor")); + } else if (ends_tensor_list.size() > 0) { + ends = GetDataFromTensorList(ends_tensor_list); } - size_t shape_size = in_dims.size(); - // the slice XPU kernel require that the length of `start`, `end` must be - // equal - // to the dims size of input tensor, therefore, if shape_size > axes.size(), - // the `starts_extension` and `ends_extension` is necessary. - std::vector starts_extension(shape_size, 0); - std::vector ends_extension(shape_size, 0); - if (shape_size > axes.size()) { - for (size_t i = 0; i < shape_size; ++i) { - ends_extension[i] = in_dims[i]; - } - for (size_t i = 0; i < axes.size(); ++i) { - starts_extension[axes[i]] = starts[i]; - ends_extension[axes[i]] = ends[i]; + + const auto& in_dims = input->dims(); + int rank = in_dims.size(); + + std::vector pad_left(rank); + std::vector out_dims(rank); + std::vector pad_right(rank); + int cnt = 0; + for (int i = 0; i < in_dims.size(); ++i) { + int start = 0; + int end = in_dims[i]; + int axis = cnt < static_cast(axes.size()) ? axes[cnt] : -1; + if (axis == i) { + start = starts[cnt]; + if (start < 0) { + start = (start + in_dims[i]); + } + start = std::max(start, static_cast(0)); + end = ends[cnt]; + if (end < 0) { + end = (end + in_dims[i]); + } + end = std::min(end, static_cast(in_dims[i])); + cnt++; } - } - int* starts_device = nullptr; - int* ends_device = nullptr; - int* starts_host = - shape_size > axes.size() ? starts_extension.data() : starts.data(); - int* ends_host = - shape_size > axes.size() ? 
ends_extension.data() : ends.data(); - PADDLE_ENFORCE_EQ(xpu_malloc(reinterpret_cast(&starts_device), - shape_size * sizeof(int)), - XPU_SUCCESS, - platform::errors::External("XPU has no enough memory")); - PADDLE_ENFORCE_EQ(xpu_malloc(reinterpret_cast(&ends_device), - shape_size * sizeof(int)), - XPU_SUCCESS, - platform::errors::External("XPU has no enough memory")); - memory::Copy(BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()), - starts_device, platform::CPUPlace(), starts_host, - shape_size * sizeof(int)); - memory::Copy(BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()), - ends_device, platform::CPUPlace(), ends_host, - shape_size * sizeof(int)); - // prepare shape on XPU - std::vector shape(shape_size, 0); - for (size_t i = 0; i < shape_size; ++i) { - shape[i] = in_dims[i]; + pad_left[i] = start; + out_dims[i] = end - start; + pad_right[i] = in_dims[i] - out_dims[i] - pad_left[i]; } - int* shape_device = nullptr; - PADDLE_ENFORCE_EQ(xpu_malloc(reinterpret_cast(&shape_device), - shape_size * sizeof(int)), - XPU_SUCCESS, - platform::errors::External("XPU has no enough memory")); - memory::Copy(BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()), - shape_device, platform::CPUPlace(), shape.data(), - shape_size * sizeof(int)); auto& dev_ctx = ctx.template device_context(); - int r = - xpu::slice_backward(dev_ctx.x_context(), shape_device, starts_device, - ends_device, shape_size, d_out->data(), - d_in->data(), d_in->numel(), d_out->numel()); - PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, - platform::errors::External("xpu slice kernel error")); - dev_ctx.Wait(); - // free device data - xpu_free(shape_device); - xpu_free(starts_device); - xpu_free(ends_device); + const XPUType* dout_data = + reinterpret_cast(dout->data()); + XPUType* din_data = + reinterpret_cast(dinput->mutable_data(ctx.GetPlace())); + int r = xpu::pad(dev_ctx.x_context(), dout_data, din_data, + out_dims, pad_left, pad_right, XPUType(0)); + PADDLE_ENFORCE_EQ( + r, XPU_SUCCESS, + platform::errors::External("XPU pad kernel return wrong value[%d %s]", + r, XPUAPIErrorMsg[r])); } }; - } // namespace operators } // namespace paddle @@ -198,8 +179,13 @@ namespace ops = paddle::operators; REGISTER_OP_XPU_KERNEL( slice, ops::SliceXPUKernel, - ops::SliceXPUKernel); + ops::SliceXPUKernel, + ops::SliceXPUKernel); REGISTER_OP_XPU_KERNEL( slice_grad, - ops::SliceGradXPUKernel); + ops::SliceGradXPUKernel, + ops::SliceGradXPUKernel, + ops::SliceGradXPUKernel); #endif diff --git a/paddle/fluid/platform/xpu/xpu2_op_list.h b/paddle/fluid/platform/xpu/xpu2_op_list.h index 651243a4dfe667..5d45e5d9d5050e 100644 --- a/paddle/fluid/platform/xpu/xpu2_op_list.h +++ b/paddle/fluid/platform/xpu/xpu2_op_list.h @@ -109,7 +109,16 @@ XPUOpMap& get_kl2_ops() { pOpKernelType(vartype::FP16, XPUPlace())})}, {"iou_similarity", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"arg_max", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})} + {"arg_max", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"reduce_mean", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"reduce_mean_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"slice", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace())})}, + {"slice_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace())})}, // AddMore }; diff --git a/python/paddle/fluid/framework.py 
b/python/paddle/fluid/framework.py index c6367911b88f82..156ba07a4ce08b 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -313,6 +313,18 @@ def _current_expected_place(): "You are using GPU version Paddle, but your CUDA device is not set properly. CPU device will be used by default." ) _global_expected_place_ = core.CPUPlace() + elif core.is_compiled_with_xpu(): + try: + device_count = core.get_xpu_device_count() + except Exception as e: + device_count = 0 + if device_count > 0: + _global_expected_place_ = core.XPUPlace(0) + else: + warnings.warn( + "You are using XPU version Paddle, but your XPU device is not set properly. CPU device will be used by default." + ) + _global_expected_place_ = core.CPUPlace() else: _global_expected_place_ = core.CPUPlace() diff --git a/python/paddle/optimizer/adamw.py b/python/paddle/optimizer/adamw.py index f26ee80d0af607..55aaac8dc48524 100644 --- a/python/paddle/optimizer/adamw.py +++ b/python/paddle/optimizer/adamw.py @@ -190,9 +190,6 @@ def __init__(self, self.type = "adamw" - if core.is_compiled_with_xpu(): - self.type = "adam" - # Use _auxiliary_vars together with _set_auxiliary_var/_get_auxiliary_var to achieve that. self._auxiliary_vars = dict() @@ -259,10 +256,6 @@ def _append_decoupled_weight_decay(self, block, param_and_grad): paddle.fluid.layers.assign(input=scaled_param, output=param) def _append_optimize_op(self, block, param_and_grad): - if paddle.is_compiled_with_xpu(): - self._append_decoupled_weight_decay(block, param_and_grad) - return super(AdamW, self)._append_optimize_op(block, param_and_grad) - assert isinstance(block, framework.Block) if isinstance(param_and_grad, dict): param_and_grad = self._update_param_group(param_and_grad) diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py index 71968d67ed693c..72b6bd29fd9e78 100644 --- a/python/paddle/tensor/creation.py +++ b/python/paddle/tensor/creation.py @@ -104,9 +104,9 @@ def to_tensor(data, dtype=None, place=None, stop_gradient=True): if place is None: place = _current_expected_place() elif not isinstance(place, (core.Place, core.CPUPlace, core.CUDAPinnedPlace, - core.CUDAPlace, core.NPUPlace)): + core.CUDAPlace, core.NPUPlace, core.XPUPlace)): raise ValueError( - "'place' must be any of paddle.Place, paddle.CPUPlace, paddle.CUDAPinnedPlace, paddle.CUDAPlace, paddle.NPUPlace" + "'place' must be any of paddle.Place, paddle.CPUPlace, paddle.CUDAPinnedPlace, paddle.CUDAPlace, paddle.NPUPlace, paddle.XPUPlace" ) #Todo(zhouwei): Support allocate tensor on any other specified card From 623e36b0d8869691b5eb05652134310462a641cc Mon Sep 17 00:00:00 2001 From: Wangzheee <634486483@qq.com> Date: Mon, 18 Oct 2021 13:46:10 +0800 Subject: [PATCH 188/298] add IPluginV2Layer: AddPluginV2Ext (#36493) --- paddle/fluid/inference/tensorrt/engine.cc | 13 +++++++------ paddle/fluid/inference/tensorrt/engine.h | 6 ++++++ 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc index d075656d15747c..24644645eee49b 100644 --- a/paddle/fluid/inference/tensorrt/engine.cc +++ b/paddle/fluid/inference/tensorrt/engine.cc @@ -135,12 +135,6 @@ void TensorRTEngine::FreezeNetwork() { } for (int j = 0; j < layer->getNbOutputs(); j++) { auto *temp_out = layer->getOutput(j); - if (temp_out->isNetworkOutput()) { - VLOG(1) << "Layer(Name: " << layer->getName() - << ") is set to float32 because its output(" - << temp_out->getName() << ") is the output of the network."; 
- return false; - } if (!temp_out->dynamicRangeIsSet()) { VLOG(1) << "Layer(Name: " << layer->getName() << ") is set to float32 because its output(" @@ -357,6 +351,13 @@ nvinfer1::IPluginV2Layer *TensorRTEngine::AddPluginV2Ext( return network()->addPluginV2(inputs, num_inputs, *plugin); } +nvinfer1::IPluginV2Layer *TensorRTEngine::AddPluginV2IOExt( + nvinfer1::ITensor *const *inputs, int num_inputs, + nvinfer1::IPluginV2IOExt *plugin) { + owned_plugin_v2ioext_.emplace_back(plugin); + return network()->addPluginV2(inputs, num_inputs, *plugin); +} + void TensorRTEngine::freshDeviceId() { int count; cudaGetDeviceCount(&count); diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h index e22c2488d3b8b6..edf69dc7aa2b5f 100644 --- a/paddle/fluid/inference/tensorrt/engine.h +++ b/paddle/fluid/inference/tensorrt/engine.h @@ -323,6 +323,10 @@ class TensorRTEngine { int num_inputs, plugin::PluginTensorRTV2Ext* plugin); + nvinfer1::IPluginV2Layer* AddPluginV2IOExt(nvinfer1::ITensor* const* inputs, + int num_inputs, + nvinfer1::IPluginV2IOExt* plugin); + void SetTensorDynamicRange(nvinfer1::ITensor* tensor, float range) { quant_dynamic_range_[tensor] = range; } @@ -429,6 +433,7 @@ class TensorRTEngine { bool with_ernie() { return with_ernie_; } bool disable_trt_plugin_fp16() { return disable_trt_plugin_fp16_; } bool with_dynamic_shape() { return with_dynamic_shape_; } + AnalysisConfig::Precision precision() { return precision_; } #if IS_TRT_VERSION_GE(6000) nvinfer1::IPluginV2Layer* AddDynamicPlugin( @@ -550,6 +555,7 @@ class TensorRTEngine { std::vector> owned_plugin_; std::vector> owned_plugin_v2ext_; + std::vector> owned_plugin_v2ioext_; // TensorRT related internal members template From 051544b6e8af9cef61ba9870b4ab39af40875ce3 Mon Sep 17 00:00:00 2001 From: ceci3 Date: Mon, 18 Oct 2021 14:19:16 +0800 Subject: [PATCH 189/298] quant support matmul_v2 (#36469) * quant support matmul_v2 * fix format --- .../fluid/contrib/slim/quantization/quantization_pass.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py index e89db1fb1da05b..dc355fec0d362a 100644 --- a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py +++ b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py @@ -51,6 +51,7 @@ "depthwise_conv2d", "mul", "matmul", + "matmul_v2", "relu", "leaky_relu", "relu6", @@ -91,6 +92,7 @@ "conv2d_transpose": [["Input", "Filter"], ["Output"]], "mul": [["X", "Y"], ["Out"]], "matmul": [["X", "Y"], ["Out"]], + "matmul_v2": [["X", "Y"], ["Out"]], "pool2d": [["X"], ["Out"]], "elementwise_add": [["X", "Y"], ["Out"]], "concat": [["X"], ["Out"]], @@ -139,7 +141,9 @@ _conv_ops = ['conv2d', 'depthwise_conv2d', 'conv2d_transpose'] -_channelwise_quant_axis1_ops = ['conv2d_transpose', 'mul'] +_channelwise_quant_axis1_ops = [ + 'conv2d_transpose', 'mul', 'matmul', 'matmul_v2' +] def _get_op_input_var_names(op): @@ -1785,7 +1789,8 @@ class AddQuantDequantPass(object): "bilinear_interp", "nearest_interp", "trilinear_interp", "slice", "squeeze", "elementwise_sub", "mul", "matmul", "relu", "relu6", "leaky_relu", "tanh", "swish", "scale", "transpose", "transpose2", - "sigmoid", "pad2d", "flatten", "flatten2", "batch_norm", "layer_norm" + "sigmoid", "pad2d", "flatten", "flatten2", "batch_norm", "layer_norm", + "matmul_v2" ] # To be compatible with PaddleSlim, not remove _activation_type for now From 
3845afff784453547b59a82e926b17d865550051 Mon Sep 17 00:00:00 2001 From: Siming Dai <908660116@qq.com> Date: Mon, 18 Oct 2021 14:50:59 +0800 Subject: [PATCH 190/298] Add operators for async read & async write (#36333) * fix async_read bug * change index place to cpu * add tensor size judge * add async_read & async_write test * fix bug in async_write * fix mac py3 ci * fix bug for cpu version paddle * fix windows ci bug * change input argument error type * change const_cast to mutable_data * add async_write out-of-bound check and consumate error hint * fix a small bug for dst_tensor * add docs and refine codes * refine docs * notest,test=windows_ci * fix windows ci * fix require * fix code-block * add core.is_compiled_with_cuda() --- paddle/fluid/pybind/imperative.cc | 337 +++++++++++++++++++ python/paddle/tests/test_async_read_write.py | 109 ++++++ 2 files changed, 446 insertions(+) create mode 100644 python/paddle/tests/test_async_read_write.py diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 2e22ee90133a86..f94afaa56b8dfd 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -2249,6 +2249,343 @@ void BindImperative(py::module *m_ptr) { const py::args args, const py::kwargs kwargs) { return imperative::PyLayerApply(place, cls, args, kwargs); }); + +#if defined(PADDLE_WITH_CUDA) + m.def( + "async_write", + [](const imperative::VarBase &src, imperative::VarBase &dst, + const imperative::VarBase &offset, const imperative::VarBase &count) { + PADDLE_ENFORCE_EQ( + platform::is_gpu_place(src.Place()), true, + platform::errors::InvalidArgument( + "Required `src` device should be CUDAPlace, but received %d. ", + src.Place())); + PADDLE_ENFORCE_EQ( + platform::is_cuda_pinned_place(dst.Place()), true, + platform::errors::InvalidArgument( + "Required `dst` device should be CUDAPinnedPlace, " + "but received %d. ", + dst.Place())); + PADDLE_ENFORCE_EQ( + platform::is_cpu_place(offset.Place()), true, + platform::errors::InvalidArgument("Required `offset` device should " + "be CPUPlace, but received %d. ", + offset.Place())); + PADDLE_ENFORCE_EQ( + platform::is_cpu_place(count.Place()), true, + platform::errors::InvalidArgument( + "Required `count` device should be CPUPlace, but received %d. ", + count.Place())); + + // TODO(daisiming): In future, add index as arguments following + // async_read. 
+ auto &src_tensor = src.Var().Get(); + auto *dst_tensor = dst.MutableVar()->GetMutable(); + auto &offset_tensor = offset.Var().Get(); + auto &count_tensor = count.Var().Get(); + const auto &deviceId = paddle::platform::GetCurrentDeviceId(); + + PADDLE_ENFORCE_EQ(offset_tensor.dims().size(), 1, + platform::errors::InvalidArgument( + "`offset` tensor should be one-dimensional.")); + PADDLE_ENFORCE_EQ(count_tensor.dims().size(), 1, + platform::errors::InvalidArgument( + "`count` tensor should be one-dimensional.")); + PADDLE_ENFORCE_EQ(offset_tensor.numel(), count_tensor.numel(), + platform::errors::InvalidArgument( + "`offset` and `count` tensor size dismatch.")); + PADDLE_ENFORCE_EQ( + src_tensor.dims().size(), dst_tensor->dims().size(), + platform::errors::InvalidArgument( + "`src` and `dst` should have the same tensor shape, " + "except for the first dimension.")); + for (int i = 1; i < src_tensor.dims().size(); i++) { + PADDLE_ENFORCE_EQ( + src_tensor.dims()[i], dst_tensor->dims()[i], + platform::errors::InvalidArgument( + "`src` and `dst` should have the same tensor shape, " + "except for the first dimension.")); + } + + auto stream = paddle::platform::stream::get_current_stream(deviceId) + ->raw_stream(); + + int64_t size = src_tensor.numel() / src_tensor.dims()[0]; + auto *src_data = src_tensor.data(); + auto *dst_data = dst_tensor->mutable_data(dst.Place()); + const int64_t *offset_data = offset_tensor.data(); + const int64_t *count_data = count_tensor.data(); + int64_t src_offset = 0, dst_offset, c; + for (int64_t i = 0; i < offset_tensor.numel(); i++) { + dst_offset = offset_data[i], c = count_data[i]; + PADDLE_ENFORCE_LE(src_offset + c, src_tensor.dims()[0], + platform::errors::InvalidArgument( + "Invalid offset or count index")); + PADDLE_ENFORCE_LE(dst_offset + c, dst_tensor->dims()[0], + platform::errors::InvalidArgument( + "Invalid offset or count index")); + cudaMemcpyAsync( + dst_data + (dst_offset * size), src_data + (src_offset * size), + c * size * sizeof(float), cudaMemcpyDeviceToHost, stream); + src_offset += c; + } + }, + R"DOC( + This api provides a way to write pieces of source tensor to destination tensor + inplacely and asynchronously. In which, we use `offset` and `count` to determine + where to copy. `offset` means the begin points of the copy pieces of `src`, and + `count` means the lengths of the copy pieces of `src`. To be noted, the copy process + will run asynchronously from cuda to pin memory. We can simply remember this as + "gpu async_write to pin_memory". + + Arguments: + + src (Tensor): The source tensor, and the data type should be `float32` currently. + Besides, `src` should be placed on CUDAPlace. + + dst (Tensor): The destination tensor, and the data type should be `float32` currently. + Besides, `dst` should be placed on CUDAPinnedPlace. The shape of `dst` + should be the same with `src` except for the first dimension. + + offset (Tensor): The offset tensor, and the data type should be `int64` currently. + Besides, `offset` should be placed on CPUPlace. The shape of `offset` + should be one-dimensional. + + count (Tensor): The count tensor, and the data type should be `int64` currently. + Besides, `count` should be placed on CPUPlace. The shape of `count` + should be one-dimensinal. + + Examples: + .. 
code-block:: python + + import numpy as np + import paddle + from paddle.fluid import core + from paddle.device import cuda + + if core.is_compiled_with_cuda(): + src = paddle.rand(shape=[100, 50, 50]) + dst = paddle.emtpy(shape=[200, 50, 50]).pin_memory() + offset = paddle.to_tensor( + np.array([0, 60], dtype="int64"), place=paddle.CPUPlace()) + count = paddle.to_tensor( + np.array([40, 60], dtype="int64"), place=paddle.CPUPlace()) + + stream = cuda.Stream() + with cuda.stream_guard(stream): + core.async_write(src, dst, offset, count) + + offset_a = paddle.gather(dst, paddle.to_tensor(np.arange(0, 40))) + offset_b = paddle.gather(dst, paddle.to_tensor(np.arange(60, 120))) + offset_array = paddle.concat([offset_a, offset_b], axis=0) + print(np.allclose(src.numpy(), offset_array.numpy())) # True +)DOC"); + + m.def( + "async_read", + [](const imperative::VarBase &src, imperative::VarBase &dst, + const imperative::VarBase &index, imperative::VarBase &buffer, + const imperative::VarBase &offset, const imperative::VarBase &count) { + PADDLE_ENFORCE_EQ(platform::is_cuda_pinned_place(src.Place()), true, + platform::errors::InvalidArgument( + "Required `src` device should be " + "CUDAPinnedPlace, but received %d.", + src.Place())); + PADDLE_ENFORCE_EQ( + platform::is_gpu_place(dst.Place()), true, + platform::errors::InvalidArgument( + "Required `dst` device should be CUDAPlace, but received %d.", + dst.Place())); + PADDLE_ENFORCE_EQ( + platform::is_cpu_place(index.Place()), true, + platform::errors::InvalidArgument( + "Required `index` device should be CPUPlace, but received %d.", + index.Place())); + PADDLE_ENFORCE_EQ( + platform::is_cuda_pinned_place(buffer.Place()), true, + platform::errors::InvalidArgument( + "Required `buffer` device should be CUDAPinnedPlace, " + "but received %d.", + buffer.Place())); + PADDLE_ENFORCE_EQ( + platform::is_cpu_place(offset.Place()), true, + platform::errors::InvalidArgument( + "Required `offset` device should be CPUPlace, but received %d.", + offset.Place())); + PADDLE_ENFORCE_EQ( + platform::is_cpu_place(count.Place()), true, + platform::errors::InvalidArgument( + "Required `count` device should be CPUPlace, but received %d.", + count.Place())); + + auto &src_tensor = src.Var().Get(); + auto *dst_tensor = dst.MutableVar()->GetMutable(); + auto &index_tensor = index.Var().Get(); + auto *buffer_tensor = + buffer.MutableVar()->GetMutable(); + auto &offset_tensor = offset.Var().Get(); + auto &count_tensor = count.Var().Get(); + auto *dst_data = dst_tensor->mutable_data(dst.Place()); + const auto &deviceId = paddle::platform::GetCurrentDeviceId(); + + PADDLE_ENFORCE_EQ(src_tensor.dims().size(), dst_tensor->dims().size(), + platform::errors::InvalidArgument( + "`src` and `dst` should have same tensor shape, " + "except for the first dimension.")); + PADDLE_ENFORCE_EQ( + src_tensor.dims().size(), buffer_tensor->dims().size(), + platform::errors::InvalidArgument( + "`src` and `buffer` should have same tensor shape, " + "except for the first dimension.")); + for (int i = 1; i < src_tensor.dims().size(); i++) { + PADDLE_ENFORCE_EQ( + src_tensor.dims()[i], dst_tensor->dims()[i], + platform::errors::InvalidArgument( + "`src` and `dst` should have the same tensor shape, " + "except for the first dimension.")); + PADDLE_ENFORCE_EQ( + src_tensor.dims()[i], buffer_tensor->dims()[i], + platform::errors::InvalidArgument( + "`src` and `buffer` should have the same tensor shape, " + "except for the first dimension.")); + } + PADDLE_ENFORCE_EQ(index_tensor.dims().size(), 1, + 
platform::errors::InvalidArgument( + "`index` tensor should be one-dimensional.")); + + auto stream = paddle::platform::stream::get_current_stream(deviceId) + ->raw_stream(); + + int64_t numel = 0; // total copy length + int64_t copy_flag = offset_tensor.dims()[0]; + int64_t size = src_tensor.numel() / src_tensor.dims()[0]; + + if (copy_flag != 0) { + PADDLE_ENFORCE_EQ(offset_tensor.dims().size(), 1, + platform::errors::InvalidArgument( + "`offset` tensor should be one-dimensional.")); + PADDLE_ENFORCE_EQ(count_tensor.dims().size(), 1, + platform::errors::InvalidArgument( + "`count` tensor should be one-dimensional.")); + PADDLE_ENFORCE_EQ(offset_tensor.numel(), count_tensor.numel(), + platform::errors::InvalidArgument( + "`offset` and `count` tensor size dismatch.")); + auto *offset_data = offset_tensor.data(); + auto *count_data = count_tensor.data(); + for (int64_t i = 0; i < count_tensor.numel(); i++) { + numel += count_data[i]; + } + PADDLE_ENFORCE_LE(numel + index_tensor.numel(), + buffer_tensor->dims()[0], + platform::errors::InvalidArgument( + "Buffer tensor size is too small.")); + PADDLE_ENFORCE_LE(numel + index_tensor.numel(), dst_tensor->dims()[0], + platform::errors::InvalidArgument( + "Target tensor size is too small.")); + + int64_t src_offset, dst_offset = 0, c; + auto *src_data = src_tensor.data(); + for (int64_t i = 0; i < offset_tensor.numel(); i++) { + src_offset = offset_data[i], c = count_data[i]; + PADDLE_ENFORCE_LE(src_offset + c, src_tensor.dims()[0], + platform::errors::InvalidArgument( + "Invalid offset or count index.")); + PADDLE_ENFORCE_LE(dst_offset + c, dst_tensor->dims()[0], + platform::errors::InvalidArgument( + "Invalid offset or count index.")); + cudaMemcpyAsync( + dst_data + (dst_offset * size), src_data + (src_offset * size), + c * size * sizeof(float), cudaMemcpyHostToDevice, stream); + dst_offset += c; + } + } else { + PADDLE_ENFORCE_LE(index_tensor.numel(), buffer_tensor->dims()[0], + platform::errors::InvalidArgument( + "Buffer tensor size is too small.")); + } + + // Select the index data to the buffer + auto index_select = [](const framework::Tensor &src_tensor, + const framework::Tensor &index_tensor, + framework::Tensor *buffer_tensor) { + auto *src_data = src_tensor.data(); + auto *index_data = index_tensor.data(); + auto *buffer_data = + buffer_tensor->mutable_data(buffer_tensor->place()); + const int &slice_size = src_tensor.numel() / src_tensor.dims()[0]; + const int ©_bytes = slice_size * sizeof(float); + int64_t c = 0; + for (int64_t i = 0; i < index_tensor.numel(); i++) { + std::memcpy(buffer_data + c * slice_size, + src_data + index_data[i] * slice_size, copy_bytes); + c += 1; + } + }; + index_select(src_tensor, index_tensor, buffer_tensor); + + // Copy the data to device memory + cudaMemcpyAsync(dst_data + (numel * size), buffer_tensor->data(), + index_tensor.numel() * size * sizeof(float), + cudaMemcpyHostToDevice, stream); + }, + R"DOC( + This api provides a way to read from pieces of source tensor to destination tensor + asynchronously. In which, we use `index`, `offset` and `count` to determine where + to read. `index` means the index position of src tensor we want to read. `offset` + and count means the begin points and length of pieces of src tensor we want to read. + To be noted, the copy process will run asynchronously from pin memory to cuda place. + We can simply remember this as "cuda async_read from pin_memory". + + Arguments: + + src (Tensor): The source tensor, and the data type should be `float32` currently. 
+ Besides, `src` should be placed on CUDAPinnedPlace. + + dst (Tensor): The destination tensor, and the data type should be `float32` currently. + Besides, `dst` should be placed on CUDAPlace. The shape of `dst` should + be the same with `src` except for the first dimension. + + index (Tensor): The index tensor, and the data type should be `int64` currently. + Besides, `index` should be on CPUplace. The shape of `index` should + be one-dimensional. + + buffer (Tensor): The buffer tensor, used to buffer index copy tensor temporarily. + The data type should be `float32` currently, and should be placed + on CUDAPinnedPlace. The shape of `buffer` should be the same with `src` except for the first dimension. + + offset (Tensor): The offset tensor, and the data type should be `int64` currently. + Besides, `offset` should be placed on CPUPlace. The shape of `offset` + should be one-dimensional. + + count (Tensor): The count tensor, and the data type should be `int64` currently. + Besides, `count` should be placed on CPUPlace. The shape of `count` + should be one-dimensinal. + + Examples: + .. code-block:: python + + import numpy as np + import paddle + from paddle.fluid import core + from paddle.device import cuda + + if core.is_compiled_with_cuda(): + src = paddle.rand(shape=[100, 50, 50], dtype="float32").pin_memory() + dst = paddle.empty(shape=[100, 50, 50], dtype="float32") + offset = paddle.to_tensor( + np.array([0, 60], dtype="int64"), place=paddle.CPUPlace()) + count = paddle.to_tensor( + np.array([40, 60], dtype="int64"), place=paddle.CPUPlace()) + buffer = paddle.empty(shape=[50, 50, 50], dtype="float32").pin_memory() + index = paddle.to_tensor( + np.array([1, 3, 5, 7, 9], dtype="int64")).cpu() + + stream = cuda.Stream() + with cuda.stream_guard(stream): + core.async_read(src, dst, index, buffer, offset, count) + +)DOC"); +#endif } } // namespace pybind diff --git a/python/paddle/tests/test_async_read_write.py b/python/paddle/tests/test_async_read_write.py new file mode 100644 index 00000000000000..91875b446aba4d --- /dev/null +++ b/python/paddle/tests/test_async_read_write.py @@ -0,0 +1,109 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
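+#
+# The tests below stage data through CUDA pinned memory and invoke
+# core.async_read / core.async_write inside a cuda.stream_guard, then compare
+# the asynchronously copied slices against paddle.gather over the same
+# index/offset/count arguments.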
+ +import unittest +import numpy as np + +import paddle +from paddle.fluid import core +from paddle.device import cuda + + +class TestAsyncRead(unittest.TestCase): + def setUp(self): + self.empty = paddle.to_tensor( + np.array( + [], dtype="int64"), place=paddle.CPUPlace()) + data = np.random.randn(100, 50, 50).astype("float32") + self.src = paddle.to_tensor(data, place=paddle.CUDAPinnedPlace()) + self.dst = paddle.empty(shape=[100, 50, 50], dtype="float32") + self.index = paddle.to_tensor( + np.array( + [1, 3, 5, 7, 9], dtype="int64")).cpu() + self.buffer = paddle.empty( + shape=[50, 50, 50], dtype="float32").pin_memory() + self.stream = cuda.Stream() + + def test_async_read_empty_offset_and_count(self): + with cuda.stream_guard(self.stream): + core.async_read(self.src, self.dst, self.index, self.buffer, + self.empty, self.empty) + array1 = paddle.gather(self.src, self.index) + array2 = self.dst[:len(self.index)] + + self.assertTrue(np.allclose(array1.numpy(), array2.numpy())) + + def test_async_read_success(self): + offset = paddle.to_tensor( + np.array( + [10, 20], dtype="int64"), place=paddle.CPUPlace()) + count = paddle.to_tensor( + np.array( + [5, 10], dtype="int64"), place=paddle.CPUPlace()) + with cuda.stream_guard(self.stream): + core.async_read(self.src, self.dst, self.index, self.buffer, offset, + count) + + # index data + index_array1 = paddle.gather(self.src, self.index) + count_numel = paddle.sum(count).numpy()[0] + index_array2 = self.dst[count_numel:count_numel + len(self.index)] + self.assertTrue(np.allclose(index_array1.numpy(), index_array2.numpy())) + + # offset, count + offset_a = paddle.gather(self.src, paddle.to_tensor(np.arange(10, 15))) + offset_b = paddle.gather(self.src, paddle.to_tensor(np.arange(20, 30))) + offset_array1 = paddle.concat([offset_a, offset_b], axis=0) + offset_array2 = self.dst[:count_numel] + self.assertTrue( + np.allclose(offset_array1.numpy(), offset_array2.numpy())) + + def test_async_read_only_1dim(self): + src = paddle.rand([40], dtype="float32").pin_memory() + dst = paddle.empty([40], dtype="float32") + buffer_ = paddle.empty([20]).pin_memory() + with cuda.stream_guard(self.stream): + core.async_read(src, dst, self.index, buffer_, self.empty, + self.empty) + array1 = paddle.gather(src, self.index) + array2 = dst[:len(self.index)] + self.assertTrue(np.allclose(array1.numpy(), array2.numpy())) + + +class TestAsyncWrite(unittest.TestCase): + def setUp(self): + self.src = paddle.rand(shape=[100, 50, 50, 5], dtype="float32") + self.dst = paddle.empty( + shape=[200, 50, 50, 5], dtype="float32").pin_memory() + self.stream = cuda.Stream() + + def test_async_write_success(self): + offset = paddle.to_tensor( + np.array( + [0, 60], dtype="int64"), place=paddle.CPUPlace()) + count = paddle.to_tensor( + np.array( + [40, 60], dtype="int64"), place=paddle.CPUPlace()) + with cuda.stream_guard(self.stream): + core.async_write(self.src, self.dst, offset, count) + + offset_a = paddle.gather(self.dst, paddle.to_tensor(np.arange(0, 40))) + offset_b = paddle.gather(self.dst, paddle.to_tensor(np.arange(60, 120))) + offset_array = paddle.concat([offset_a, offset_b], axis=0) + self.assertTrue(np.allclose(self.src.numpy(), offset_array.numpy())) + + +if __name__ == "__main__": + if core.is_compiled_with_cuda(): + unittest.main() From 8757fc5b24f0884df57719690d2b0c3fd860d0b6 Mon Sep 17 00:00:00 2001 From: Qi Li Date: Mon, 18 Oct 2021 15:09:46 +0800 Subject: [PATCH 191/298] [NPU] fix dtype for arg_max, test=develop (#36457) --- 
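A minimal usage sketch of what this fix covers, assuming an NPU build where
`paddle.set_device("npu")` is available: the argmax `dtype` argument is now
mapped to a proper ACL data-type attribute (see AddAttrDataType below), so both
int32 and int64 outputs work on NPU.

    import paddle

    paddle.set_device("npu")                           # assumes an NPU device/build
    x = paddle.rand([4, 5, 6])
    out_i64 = paddle.argmax(x, axis=2)                 # default int64 indices
    out_i32 = paddle.argmax(x, axis=2, dtype="int32")  # dtype honoured by ArgMaxV2
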
paddle/fluid/operators/arg_max_op_npu.cc | 57 ++++++++----- paddle/fluid/operators/npu_op_runner.cc | 15 ++++ paddle/fluid/operators/npu_op_runner.h | 6 ++ .../unittests/npu/test_arg_max_op_npu.py | 83 ++++++++++++++++--- python/paddle/nn/functional/loss.py | 15 ++-- 5 files changed, 139 insertions(+), 37 deletions(-) diff --git a/paddle/fluid/operators/arg_max_op_npu.cc b/paddle/fluid/operators/arg_max_op_npu.cc index 38f9813ad02b40..8b70332c651c8b 100644 --- a/paddle/fluid/operators/arg_max_op_npu.cc +++ b/paddle/fluid/operators/arg_max_op_npu.cc @@ -17,30 +17,49 @@ limitations under the Licnse. */ namespace paddle { namespace operators { + using Tensor = framework::Tensor; +using NPUDeviceContext = platform::NPUDeviceContext; -template -class ArgMaxNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - int64_t axis = ctx.Attr("axis"); - auto dtype = ctx.Attr("dtype"); +template +struct VisitDataArgNPUMaxFunctor { + const framework::ExecutionContext& ctx; - auto* out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); + explicit VisitDataArgNPUMaxFunctor(const framework::ExecutionContext& ctx) + : ctx(ctx) {} + template + void apply() const { + auto& x = *(ctx.Input("X")); + auto& out = *(ctx.Output("Out")); + out.template mutable_data(ctx.GetPlace()); + auto axis = ctx.Attr("axis"); + auto dtype = ctx.Attr("dtype"); + auto stream = ctx.template device_context().stream(); NpuOpRunner runner; runner.SetType("ArgMaxV2") - .AddInput(*x) + .AddInput(x) .AddInput(std::vector{axis}) - .AddOutput(*out) - .AddAttr("dtype", dtype); + .AddOutput(out) + .AddAttrDataType("dtype", dtype) + .Run(stream); + } +}; - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); +template +class ArgMaxNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& dtype = ctx.Attr("dtype"); + if (dtype < 0) { + framework::VisitDataTypeTiny(static_cast( + framework::proto::VarType::INT64), + VisitDataArgNPUMaxFunctor(ctx)); + return; + } + framework::VisitDataTypeTiny( + static_cast(dtype), + VisitDataArgNPUMaxFunctor(ctx)); } }; @@ -48,7 +67,5 @@ class ArgMaxNPUKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_NPU_KERNEL( - arg_max, ops::ArgMaxNPUKernel, - ops::ArgMaxNPUKernel); +REGISTER_OP_NPU_KERNEL(arg_max, ops::ArgMaxNPUKernel, + ops::ArgMaxNPUKernel); diff --git a/paddle/fluid/operators/npu_op_runner.cc b/paddle/fluid/operators/npu_op_runner.cc index d10e94962d6a6d..830e18cb8a14c0 100644 --- a/paddle/fluid/operators/npu_op_runner.cc +++ b/paddle/fluid/operators/npu_op_runner.cc @@ -188,6 +188,21 @@ NpuOpRunner &NpuOpRunner::AddAttr(const std::string &name, return *this; } +NpuOpRunner &NpuOpRunner::AddAttrDataType(const std::string &name, + const NPUAttribute &attr) { + PADDLE_ENFORCE_EQ( + (attr.type() == typeid(int)), true, + platform::errors::InvalidArgument( + "Attr type is NOT equal to framework::proto::VarType::Type.")); + if (!attr_) { + attr_ = aclopCreateAttr(); + } + auto dtype = ConvertToNpuDtype( + static_cast(BOOST_GET_CONST(int, attr))); + PADDLE_ENFORCE_NPU_SUCCESS(aclopSetAttrDataType(attr_, name.c_str(), dtype)); + return *this; +} + NpuOpRunner &NpuOpRunner::AddAttrs(const NPUAttributeMap &attrs) { for (const auto &pair : attrs) { AddAttr(pair.first, pair.second); diff --git a/paddle/fluid/operators/npu_op_runner.h 
b/paddle/fluid/operators/npu_op_runner.h index 45e973970a956d..6db5f17d671181 100644 --- a/paddle/fluid/operators/npu_op_runner.h +++ b/paddle/fluid/operators/npu_op_runner.h @@ -58,6 +58,12 @@ class NpuOpRunner { NpuOpRunner &AddAttr(const std::string &name, const NPUAttribute &attr); + // NOTE(qili93): need to add indivisual api for aclopSetAttrDataType + // as typeid(aclDataType) and typeid(framework::proto::VarType::Type) + // always go to attr.type() == typeid(int) to call aclopSetAttrInt + NpuOpRunner &AddAttrDataType(const std::string &name, + const NPUAttribute &attr); + NpuOpRunner &AddAttrs(const NPUAttributeMap &attrs); NpuOpRunner &AddInput(const Tensor &tensor); diff --git a/python/paddle/fluid/tests/unittests/npu/test_arg_max_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_arg_max_op_npu.py index 9bc46697c0dfc0..85ade1179b7d61 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_arg_max_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_arg_max_op_npu.py @@ -1,10 +1,10 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, @@ -20,30 +20,31 @@ sys.path.append("..") from op_test import OpTest import paddle +import paddle.fluid as fluid import paddle.fluid.core as core +from paddle.fluid import Program, program_guard paddle.enable_static() class BaseTestCase(OpTest): + def set_npu(self): + self.__class__.use_npu = True + self.place = paddle.NPUPlace(0) + def initTestCase(self): self.op_type = 'arg_max' - self.dims = (3, 4) + self.dims = (3, 4, 5) self.dtype = 'float32' - self.axis = 1 + self.axis = 0 def setUp(self): + self.set_npu() self.initTestCase() - self.__class__.use_npu = True - self.place = paddle.NPUPlace(0) - np.random.seed(2021) - self.x = (np.random.random(self.dims)).astype(self.dtype) + self.x = (1000 * np.random.random(self.dims)).astype(self.dtype) self.inputs = {'X': self.x} self.attrs = {'axis': self.axis} - if self.op_type == "arg_min": - self.outputs = {'Out': np.argmin(self.x, axis=self.axis)} - else: - self.outputs = {'Out': np.argmax(self.x, axis=self.axis)} + self.outputs = {'Out': np.argmax(self.x, axis=self.axis)} def test_check_output(self): self.check_output_with_place(self.place) @@ -211,6 +212,64 @@ def initTestCase(self): self.axis = 0 +class BaseTestComplex1_1(OpTest): + def set_npu(self): + self.__class__.use_npu = True + self.place = paddle.NPUPlace(0) + + def initTestCase(self): + self.op_type = 'arg_max' + self.dims = (4, 5, 6) + self.dtype = 'float32' + self.axis = 2 + + def setUp(self): + self.set_npu() + self.initTestCase() + self.x = (np.random.random(self.dims)).astype(self.dtype) + self.inputs = {'X': self.x} + self.attrs = { + 'axis': self.axis, + 'dtype': int(core.VarDesc.VarType.INT32) + } + self.outputs = { + 'Out': np.argmax( + self.x, axis=self.axis).astype("int32") + } + + def test_check_output(self): + self.check_output_with_place(self.place) + + +class BaseTestComplex1_2(OpTest): + def set_npu(self): + self.__class__.use_npu = True + self.place = paddle.NPUPlace(0) + + def initTestCase(self): + self.op_type = 'arg_max' + 
self.dims = (4, 5, 6) + self.dtype = 'float16' + self.axis = 2 + + def setUp(self): + self.set_npu() + self.initTestCase() + self.x = (np.random.random(self.dims)).astype(self.dtype) + self.inputs = {'X': self.x} + self.attrs = { + 'axis': self.axis, + 'dtype': int(core.VarDesc.VarType.INT32) + } + self.outputs = { + 'Out': np.argmax( + self.x, axis=self.axis).astype("int32") + } + + def test_check_output(self): + self.check_output_with_place(self.place) + + class TestArgMaxAPI(unittest.TestCase): def initTestCase(self): self.dims = (3, 4, 5) diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index b1db45ad506695..adf93b24d3926b 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -1675,11 +1675,16 @@ def cross_entropy(input, raise ValueError( "Target({}) is out of class_dimension's upper bound({})". format(invalid_label[0], input.shape[axis] - 1)) - - _, out = _C_ops.softmax_with_cross_entropy( - input, label, 'soft_label', soft_label, 'ignore_index', - ignore_index, 'numeric_stable_mode', True, 'axis', axis, - 'use_softmax', use_softmax) + if core.is_compiled_with_npu(): + _, _, out = _C_ops.softmax_with_cross_entropy( + input, label, 'soft_label', soft_label, 'ignore_index', + ignore_index, 'numeric_stable_mode', True, 'axis', axis, + 'use_softmax', use_softmax) + else: + _, out = _C_ops.softmax_with_cross_entropy( + input, label, 'soft_label', soft_label, 'ignore_index', + ignore_index, 'numeric_stable_mode', True, 'axis', axis, + 'use_softmax', use_softmax) if weight is not None: From cbd15f7d00b4e639b2b115d4aee61a8b48faa9ce Mon Sep 17 00:00:00 2001 From: Qi Li Date: Mon, 18 Oct 2021 15:10:07 +0800 Subject: [PATCH 192/298] [NPU] add kernels for elementwise_add gather_nd tile, test=develop (#36464) --- .../elementwise/elementwise_add_op_npu.cc | 3 ++ paddle/fluid/operators/gather_nd_op_npu.cc | 36 +++++++++--------- paddle/fluid/operators/tile_op_npu.cc | 38 +++++++++++-------- .../npu/test_elementwise_add_op_npu.py | 15 +++++--- .../unittests/npu/test_gather_nd_op_npu.py | 16 ++++---- .../tests/unittests/npu/test_tile_op_npu.py | 20 +++++++++- 6 files changed, 80 insertions(+), 48 deletions(-) diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc index cd1d50a017c363..41d5d718c24209 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc @@ -146,6 +146,9 @@ namespace ops = paddle::operators; namespace plat = paddle::platform; REGISTER_OP_NPU_KERNEL(elementwise_add, ops::ElementwiseAddNPUKernel, +#ifdef PADDLE_WITH_ASCEND_INT64 + ops::ElementwiseAddNPUKernel, +#endif ops::ElementwiseAddNPUKernel); REGISTER_OP_NPU_KERNEL(elementwise_add_grad, diff --git a/paddle/fluid/operators/gather_nd_op_npu.cc b/paddle/fluid/operators/gather_nd_op_npu.cc index d04e0bce36fab2..8102322bd3b0ce 100644 --- a/paddle/fluid/operators/gather_nd_op_npu.cc +++ b/paddle/fluid/operators/gather_nd_op_npu.cc @@ -18,7 +18,10 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -template +using Tensor = framework::Tensor; +using NPUDeviceContext = platform::NPUDeviceContext; + +template class GatherNdNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { @@ -49,14 +52,12 @@ class GatherNdNPUKernel : public framework::OpKernel { framework::proto::VarType::INT64))); const auto &runner = NpuOpRunner("GatherNd", {*x, *index}, {*out}, {}); - auto stream = - ctx.template device_context() - .stream(); + auto stream = ctx.template device_context().stream(); runner.Run(stream); } }; -template +template class GatherNdGradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { @@ -91,10 +92,7 @@ class GatherNdGradNPUKernel : public framework::OpKernel { dout = &tmp_tensor2; } - auto stream = - ctx.template device_context() - .stream(); - + auto stream = ctx.template device_context().stream(); platform::NPUMemsetAsync(static_cast(p), 0, dx->numel() * sizeof(T), stream); @@ -108,13 +106,13 @@ class GatherNdGradNPUKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_NPU_KERNEL( - gather_nd, ops::GatherNdNPUKernel, - ops::GatherNdNPUKernel); - -REGISTER_OP_NPU_KERNEL( - gather_nd_grad, - ops::GatherNdGradNPUKernel, - ops::GatherNdGradNPUKernel); +REGISTER_OP_NPU_KERNEL(gather_nd, + ops::GatherNdNPUKernel, +#ifdef PADDLE_WITH_ASCEND_INT64 + ops::GatherNdNPUKernel, +#endif + ops::GatherNdNPUKernel); + +REGISTER_OP_NPU_KERNEL(gather_nd_grad, + ops::GatherNdGradNPUKernel, + ops::GatherNdGradNPUKernel); diff --git a/paddle/fluid/operators/tile_op_npu.cc b/paddle/fluid/operators/tile_op_npu.cc index c85a1cbc671af1..95d7cb9e362c78 100644 --- a/paddle/fluid/operators/tile_op_npu.cc +++ b/paddle/fluid/operators/tile_op_npu.cc @@ -16,7 +16,11 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -template + +using Tensor = framework::Tensor; +using NPUDeviceContext = platform::NPUDeviceContext; + +template class TileNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -92,18 +96,21 @@ class TileNPUKernel : public framework::OpKernel { std::vector temp(repeat_times.size(), 1); if (repeat_times == temp) { - framework::TensorCopy( - *in0, context.GetPlace(), - context.template device_context(), out0); + framework::TensorCopy(*in0, context.GetPlace(), + context.template device_context(), + out0); return; } - const auto& runner = - NpuOpRunner("TileD", {*in0}, {*out0}, {{"multiples", repeat_times}}); - auto stream = - context.template device_context() - .stream(); - runner.Run(stream); + // const auto& runner = + // NpuOpRunner("TileD", {*in0}, {*out0}, {{"multiples", repeat_times}}); + auto stream = context.template device_context().stream(); + NpuOpRunner runner; + runner.SetType("Tile") + .AddInput(*in0) + .AddInput(std::move(repeat_times)) + .AddOutput(*out0) + .Run(stream); } }; @@ -111,8 +118,9 @@ class TileNPUKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_NPU_KERNEL( - tile, ops::TileNPUKernel, - ops::TileNPUKernel, - ops::TileNPUKernel); +REGISTER_OP_NPU_KERNEL(tile, ops::TileNPUKernel, ops::TileNPUKernel, +#ifdef PADDLE_WITH_ASCEND_INT64 + ops::TileNPUKernel, +#endif + ops::TileNPUKernel, + ops::TileNPUKernel); diff --git a/python/paddle/fluid/tests/unittests/npu/test_elementwise_add_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_elementwise_add_op_npu.py index 9b27e75e37d255..75c70e0a131ac9 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_elementwise_add_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_elementwise_add_op_npu.py @@ -65,7 +65,7 @@ def test_check_output(self): self.check_output_with_place(self.place) def test_check_grad_normal(self): - if self.dtype == np.float16: + if self.dtype == np.float16 or self.dtype == np.int64: return self.check_grad_with_place( @@ -75,7 +75,7 @@ def test_check_grad_normal(self): max_relative_error=0.006, ) def test_check_grad_ingore_x(self): - if self.dtype == np.float16: + if self.dtype == np.float16 or self.dtype == np.int64: return self.check_grad_with_place( @@ -86,7 +86,7 @@ def test_check_grad_ingore_x(self): max_relative_error=0.006, ) def test_check_grad_ingore_y(self): - if self.dtype == np.float16: + if self.dtype == np.float16 or self.dtype == np.int64: return self.check_grad_with_place( @@ -102,6 +102,11 @@ def init_dtype(self): self.dtype = np.float16 +class TestINT64ElementwiseAddOp(TestElementwiseAddOp): + def init_dtype(self): + self.dtype = np.int64 + + @skip_check_grad_ci( reason="[skip shape check] Use y_shape(1) to test broadcast.") class TestElementwiseAddOp_scalar(TestElementwiseAddOp): @@ -507,8 +512,8 @@ def gen_data(): def test_dygraph(self): with fluid.dygraph.guard(paddle.NPUPlace(0)): - np_x = np.array([2, 3, 4]).astype('float64') - np_y = np.array([1, 5, 2]).astype('float64') + np_x = np.array([2, 3, 4]).astype('float32') + np_y = np.array([1, 5, 2]).astype('float32') x = fluid.dygraph.to_variable(np_x) y = fluid.dygraph.to_variable(np_y) z = self._executed_api(x, y) diff --git a/python/paddle/fluid/tests/unittests/npu/test_gather_nd_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_gather_nd_op_npu.py index b124a546241717..acb4ffd686fa26 100644 --- 
a/python/paddle/fluid/tests/unittests/npu/test_gather_nd_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_gather_nd_op_npu.py @@ -61,7 +61,7 @@ def test_check_output(self): self.check_output_with_place(self.place) def test_check_grad(self): - if typename == "float16": + if typename == "float16" or typename == "int64": self.__class__.no_need_check_grad = True else: self.check_grad_with_place(self.place, ['X'], 'Out') @@ -88,7 +88,7 @@ def test_check_output(self): self.check_output_with_place(self.place) def test_check_grad(self): - if typename == "float16": + if typename == "float16" or typename == "int64": self.__class__.no_need_check_grad = True else: self.check_grad_with_place(self.place, ['X'], 'Out') @@ -120,7 +120,7 @@ def test_check_output(self): self.check_output_with_place(self.place) def test_check_grad(self): - if typename == "float16": + if typename == "float16" or typename == "int64": self.__class__.no_need_check_grad = True else: self.check_grad_with_place( @@ -153,7 +153,7 @@ def test_check_output(self): self.check_output_with_place(self.place) def test_check_grad(self): - if typename == "float16": + if typename == "float16" or typename == "int64": self.__class__.no_need_check_grad = True else: self.check_grad_with_place(self.place, ['X'], 'Out') @@ -184,7 +184,7 @@ def test_check_output(self): self.check_output_with_place(self.place) def test_check_grad(self): - if typename == "float16": + if typename == "float16" or typename == "int64": self.__class__.no_need_check_grad = True else: self.check_grad_with_place(self.place, ['X'], 'Out') @@ -217,7 +217,7 @@ def test_check_output(self): self.check_output_with_place(self.place) def test_check_grad(self): - if typename == "float16": + if typename == "float16" or typename == "int64": self.__class__.no_need_check_grad = True else: self.check_grad_with_place(self.place, ['X'], 'Out') @@ -252,7 +252,7 @@ def test_check_output(self): self.check_output_with_place(self.place) def test_check_grad(self): - if typename == "float16": + if typename == "float16" or typename == "int64": self.__class__.no_need_check_grad = True else: self.check_grad_with_place(self.place, ['X'], 'Out') @@ -276,7 +276,7 @@ def test_imperative(self): paddle.enable_static() -for _typename in {'float16', 'float32'}: +for _typename in {'float16', 'float32', 'int64'}: test_class1('gather_nd', _typename) test_class2('gather_nd', _typename) test_class3('gather_nd', _typename) diff --git a/python/paddle/fluid/tests/unittests/npu/test_tile_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_tile_op_npu.py index 0da80189f7d406..0e61fa00fdf28b 100755 --- a/python/paddle/fluid/tests/unittests/npu/test_tile_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_tile_op_npu.py @@ -206,7 +206,7 @@ def setUp(self): self.op_type = "tile" self.inputs = { 'X': np.random.randint( - 10, size=(2, 4, 5)).astype("int32") + 10, size=(2, 4, 5)).astype("int64") } self.attrs = {'repeat_times': [2, 1, 4]} output = np.tile(self.inputs['X'], (2, 1, 4)) @@ -219,6 +219,24 @@ def test_check_output(self): self.check_output_with_place(self.place) +# Situation 6: input x is Bool +class TestTileOpBool(OpTest): + def setUp(self): + self.set_npu() + self.place = paddle.NPUPlace(0) + self.op_type = "tile" + self.inputs = {'X': np.random.randint(1, size=(2, 4, 5)).astype("bool")} + self.attrs = {'repeat_times': [2, 1, 4]} + output = np.tile(self.inputs['X'], (2, 1, 4)) + self.outputs = {'Out': output} + + def set_npu(self): + self.__class__.use_npu = True + + def 
test_check_output(self): + self.check_output_with_place(self.place) + + # Test python API class TestTileAPI(unittest.TestCase): def test_api(self): From b7f7664764840d3192de81b5d601f17db10310f2 Mon Sep 17 00:00:00 2001 From: xiaoxiaohehe001 <49090790+xiaoxiaohehe001@users.noreply.github.com> Date: Mon, 18 Oct 2021 15:39:47 +0800 Subject: [PATCH 193/298] Add quant axis (#36467) * add_quant_axis * add_quant_axis * --amend * Update quant_conv2d_dequant_fuse_pass.cc --- .../ir/quant_conv2d_dequant_fuse_pass.cc | 36 ++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc b/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc index 1864899b07e018..22babcc719aeb4 100644 --- a/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc +++ b/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc @@ -437,7 +437,11 @@ void QuantDequantFusePass::FuseDequant(ir::Graph* graph, Scope* scope, BOOST_GET_CONST(int, quantized_op_node->Op()->GetAttr("bit_length")); int range = ((1 << (bit_length - 1)) - 1); std::vector weight_scale; - + int quant_axis = 0; + if (dequant_op_node->Op()->HasAttr("quant_axis")) { + quant_axis = + BOOST_GET_CONST(int, dequant_op_node->Op()->GetAttr("quant_axis")); + } // Get weight scale if (dequant_type == "fake_channel_wise_dequantize_max_abs") { Node* dequant_channel_scale_node = @@ -488,6 +492,16 @@ void QuantDequantFusePass::FuseDequant(ir::Graph* graph, Scope* scope, } } if (dequant_type == "fake_channel_wise_dequantize_max_abs") { + if (quant_axis == 0) { + } else { + PADDLE_ENFORCE_EQ( + quant_axis == 1, true, + platform::errors::InvalidArgument( + "'quant_axis' of mul/matmul/fc op weight dequantized by " + "[fake_channel_wise_dequantize_max_abs]should be 1, but " + "the received is %d", + quant_axis)); + } PADDLE_ENFORCE_EQ( weight_scale.size(), static_cast(w_dims[1]), platform::errors::InvalidArgument( @@ -511,6 +525,16 @@ void QuantDequantFusePass::FuseDequant(ir::Graph* graph, Scope* scope, "model, please set the 'weight_quantize_type' params as " "'channel_wise_abs_max' and generate the quantized model again.", dequant_type)); + if (quant_axis == 0) { + } else { + PADDLE_ENFORCE_EQ( + quant_axis == 0, true, + platform::errors::InvalidArgument( + "'quant_axis' of conv2d/depthwise_conv2d op weight dequantized " + "by [fake_channel_wise_dequantize_max_abs]should be 0, but " + "the received is %d", + quant_axis)); + } PADDLE_ENFORCE_EQ( weight_scale.size(), static_cast(w_dims[0]), platform::errors::InvalidArgument( @@ -528,6 +552,16 @@ void QuantDequantFusePass::FuseDequant(ir::Graph* graph, Scope* scope, "conv2d_transpose must be dequantized by " "[fake_channel_wise_dequantize_max_abs], but got %s", dequant_type)); + if (quant_axis == 0) { + } else { + PADDLE_ENFORCE_EQ( + quant_axis == 1, true, + platform::errors::InvalidArgument( + "'quant_axis' of conv2d_transpose op weight dequantized by " + "[fake_channel_wise_dequantize_max_abs]should be 1, but " + "the received is %d", + quant_axis)); + } PADDLE_ENFORCE_EQ( weight_scale.size(), static_cast(w_dims[1]), platform::errors::InvalidArgument( From 4c0ad7727efd5cf9d1d1bac3364f0ae487359e5c Mon Sep 17 00:00:00 2001 From: levi131 <83750468+levi131@users.noreply.github.com> Date: Mon, 18 Oct 2021 16:10:52 +0800 Subject: [PATCH 194/298] Lml/vhp (#36146) * init functional jacobian api * finish test with dtype float32 * add float64 test case * polish code * use atol=1e-5 with dtype float64 * fix for ci * set timeout for 
test_jacobian * init hessian API * save status * polish API docstring * modify docstring * add utils.py * save status * fix dygraph double grad dtype error when calling for high differential senario * reinvoke ci * test_hessian.py is ok * polish hessian API * init vhp * Revert "init vhp" This reverts commit cbd4d3b66abe82b0ac10721b9eddeb7d82e0a1c8. * init vhp * finish vhp API logically * add test for partial_engine.cc * modify numerical_delta with dtype float32 * merge fix for dtype float64 * spell fix * save status * polish code * rm _stop_gradient_pre_process * save status * add example for vhp interface * add _compute_numerical_vjp and _compute_numerical_vhp * test is ok * vhp is ok * add testVHPFloat64 * modify for comments * modify format * modify format * save status * test_vhp is ok * finish code polish * small modify for v is None Co-authored-by: JiabinYang <360788950@qq.com> --- python/paddle/autograd/__init__.py | 2 +- python/paddle/autograd/functional.py | 112 ++++++++++- python/paddle/autograd/utils.py | 4 +- .../tests/unittests/autograd/CMakeLists.txt | 1 + .../tests/unittests/autograd/test_vhp.py | 182 ++++++++++++++++++ .../fluid/tests/unittests/autograd/utils.py | 26 +++ 6 files changed, 319 insertions(+), 8 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/autograd/test_vhp.py diff --git a/python/paddle/autograd/__init__.py b/python/paddle/autograd/__init__.py index cffc18e95e5ab3..bbfb9f22fc1cb4 100644 --- a/python/paddle/autograd/__init__.py +++ b/python/paddle/autograd/__init__.py @@ -18,6 +18,6 @@ from .py_layer import PyLayer, PyLayerContext # noqa: F401 from ..framework import set_grad_enabled # noqa: F401 from ..fluid.dygraph.base import no_grad_ as no_grad # noqa: F401 -from .functional import vjp, jvp, jacobian, hessian # noqa: F401 +from .functional import vjp, jvp, jacobian, hessian, vhp # noqa: F401 __all__ = ['backward', 'PyLayer', 'PyLayerContext'] diff --git a/python/paddle/autograd/functional.py b/python/paddle/autograd/functional.py index 66ae1562edb68a..c6235877f5b2d4 100644 --- a/python/paddle/autograd/functional.py +++ b/python/paddle/autograd/functional.py @@ -247,9 +247,9 @@ def func(x): def jacobian(func, inputs, create_graph=False, allow_unused=False): ''' .. note:: - **This API is ONLY available in imperative mode.** + **This API is ONLY available in the imperative mode.** - This API computes the Jacobian matrix of `func` with respect to `inputs`. + This function computes the Jacobian matrix of `func` with respect to `inputs`. Parameters: func (function): a Python function that takes a Tensor or a Tensor @@ -389,9 +389,9 @@ def func(x, y): def hessian(func, inputs, create_graph=False, allow_unused=False): ''' .. note:: - **This API is ONLY available in imperative mode.** + **This API is ONLY available in the imperative mode.** - This API computes the Hessian matrix of `func` with respect to `inputs`. + This function computes the Hessian matrix of `func` with respect to `inputs`. Parameters: func (function): a Python function that takes a Tensor or a Tensor @@ -509,3 +509,107 @@ def jac_func(*ins): return jacobian( jac_func, inputs, create_graph=create_graph, allow_unused=allow_unused) + + +@framework.dygraph_only +def vhp(func, inputs, v=None, create_graph=False, allow_unused=False): + ''' + .. note:: + **This API is ONLY available in the imperative mode.** + + This function computes the product between a vector ``v`` and the + Hessian matrix of `func` with respect to `inputs`. 
+ + Parameters: + func (function): a Python function that takes a Tensor or a Tensor + list/tuple as inputs and returns a Tensor with a single element. + inputs (Tensor|list(Tensor)|tuple(Tensor)): the input Tensor or + Tensor list/tuple of the function ``func``. + v (Tensor|list(Tensor)|tuple(Tensor)|None, optional): the vector used + to compute vector hessian product. ``v`` should have same shape + and dtype with ``inputs``. If ``v`` is None, it will be set as + Tensor|list(Tensor) with all elements 1. Defaults to "None". + create_graph (bool, optional): whether to create the gradient graphs + of the computing process. When it is True, higher order derivatives + are supported to compute; when it is False, the gradient graphs of + the computing process would be discarded. Defaults to ``False``. + allow_unused (bool, optional): whether to raise error or return None if + some Tensors of `inputs` are unreachable in the graph. Error would + be raised if allow_unused=False, and None would be returned as + their gradients if allow_unused=True. Default False. + Returns: + output (tuple): tuple with: + func_output (Tensor): output of ``func(inputs)`` + vhp (list(Tensor)): result of the vector hessian product + with the same shape and dtype as the inputs. + Examples 1: + .. code-block:: python + import paddle + def func(x): + return paddle.sum(paddle.matmul(x, x)) + + x = paddle.ones(shape=[2, 2], dtype='float32') + x.stop_gradient = False + vx = paddle.ones(shape=[2, 2], dtype='float32') * 2 + vhp_rslt = paddle.autograd.vhp(func, x, v=vx) + print(vhp_rslt) + # (Tensor(shape=[1], dtype=float32, place=CUDAPlace(0), stop_gradient=False, + # [8.]), + # Tensor(shape=[2, 2], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # [[8., 8.], + # [8., 8.]])) + + Examples 2: + .. code-block:: python + import paddle + def func(x): + return paddle.sum(paddle.matmul(x, x)) + + x = paddle.ones(shape=[2, 2], dtype='float32') + x.stop_gradient = False + vhp_rslt = paddle.autograd.vhp(func, x) + print(vhp_rslt) + # (Tensor(shape=[1], dtype=float32, place=CUDAPlace(0), stop_gradient=False, + # [8.]), + # Tensor(shape=[2, 2], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # [[4., 4.], + # [4., 4.]])) + + Examples 3: + .. 
code-block:: python + import paddle + def func(x, y): + return paddle.sum(paddle.matmul(x, x)) + + x = paddle.ones(shape=[2, 2], dtype='float32') + x.stop_gradient = False + y = paddle.ones(shape=[2, 2], dtype='float32') + y.stop_gradient = False + vx = paddle.ones(shape=[2, 2], dtype='float32') * 2 + vy = paddle.ones(shape=[2, 2], dtype='float32') * 3 + vhp_rslt = paddle.autograd.vhp(func, [x, y], v=[vx, vy], allow_unused=True) + print(vhp_rslt) + # (Tensor(shape=[1], dtype=float32, place=CUDAPlace(0), stop_gradient=False, + # [8.]), + # [Tensor(shape=[2, 2], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # [[8., 8.], + # [8., 8.]]), None]) + ''' + xs = _tensors(inputs, "inputs") + if v is not None: + v = _tensors(v, "v") + + with gradient_scope( + xs, v, create_graph=create_graph, + allow_unused=allow_unused) as [xs, v, grad_fn, return_fn]: + outputs = func(*xs) + ys = _tensors(outputs, "outputs") + assert len(ys) == 1 and isinstance( + ys[0], paddle.Tensor + ) and ys[0].shape == [ + 1 + ], "The function to compute vhp should return a Tensor with a single element" + jac = grad_fn(ys, xs, create_graph=True) + vhp = grad_fn(jac, xs, v) + outputs, vhp = return_fn(outputs), return_fn(vhp) + return outputs, vhp diff --git a/python/paddle/autograd/utils.py b/python/paddle/autograd/utils.py index 81fe19c1688c12..710c9ee18dfbfd 100644 --- a/python/paddle/autograd/utils.py +++ b/python/paddle/autograd/utils.py @@ -25,9 +25,7 @@ def _tensors(ts, name): name) return list(ts) else: - assert isinstance( - ts, paddle.Tensor - ) or ts is None, "{} must be Tensor or list of Tensor".format(name) + assert isinstance(ts, paddle.Tensor), "{} must be Tensor".format(name) return [ts] diff --git a/python/paddle/fluid/tests/unittests/autograd/CMakeLists.txt b/python/paddle/fluid/tests/unittests/autograd/CMakeLists.txt index 369134c8989a0e..30d87e2c9b2b61 100644 --- a/python/paddle/fluid/tests/unittests/autograd/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/autograd/CMakeLists.txt @@ -8,3 +8,4 @@ endforeach(TEST_OP) set_tests_properties(test_jacobian PROPERTIES TIMEOUT 20) set_tests_properties(test_hessian PROPERTIES TIMEOUT 50) +set_tests_properties(test_vhp PROPERTIES TIMEOUT 50) diff --git a/python/paddle/fluid/tests/unittests/autograd/test_vhp.py b/python/paddle/fluid/tests/unittests/autograd/test_vhp.py new file mode 100644 index 00000000000000..09b25203e04a48 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/autograd/test_vhp.py @@ -0,0 +1,182 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
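# A minimal sketch of the identity the vhp tests below rely on, assuming a
# single Tensor input and a scalar-valued func: paddle.autograd.vhp(func, x, v)
# returns (func(x), H(x) @ v), and the same product can be reproduced by
# differentiating the inner product <grad(func)(x), v> once more. The helper
# name manual_vhp is illustrative only and is not part of the tested API.
import paddle


def manual_vhp(func, x, v):
    x.stop_gradient = False
    y = func(x)  # scalar-valued output
    gx, = paddle.grad([y], [x], create_graph=True)  # first-order gradient
    hv, = paddle.grad([paddle.sum(gx * v)], [x])  # d/dx <gx, v> equals H(x) @ v
    return y, hv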
+ +import unittest +import numpy as np +import paddle +import paddle.compat as cpt +import paddle.nn.functional as F +from utils import _compute_numerical_vhp + + +class TestVHP(unittest.TestCase): + @classmethod + def setUpClass(self): + self.shape = (2, 2) + self.dtype = 'float32' + self.np_dtype = np.float32 + self.numerical_delta = 1e-2 + self.rtol = 1e-2 + self.atol = 1e-2 + self.x = paddle.rand(shape=self.shape, dtype=self.dtype) + self.y = paddle.rand(shape=self.shape, dtype=self.dtype) + self.vx = paddle.rand(shape=self.shape, dtype=self.dtype) + self.vy = paddle.rand(shape=self.shape, dtype=self.dtype) + + def test_single_input(self): + def func(x): + return paddle.sum(paddle.matmul(x, x)) + + numerical_func_output = func(self.x).numpy() + numerical_vhp = _compute_numerical_vhp( + func, self.x, self.vx, self.numerical_delta, self.np_dtype) + + self.x.stop_gradient = False + func_output, vhp = paddle.autograd.vhp(func, self.x, self.vx) + assert np.allclose(func_output.numpy(), numerical_func_output, + self.rtol, self.atol) + assert np.allclose(vhp[0].numpy(), numerical_vhp[0], self.rtol, + self.atol) + + def test_multi_input(self): + def func(x, y): + return paddle.sum(paddle.matmul(x, y)) + + numerical_func_output = func(self.x, self.y).numpy() + numerical_vhp = _compute_numerical_vhp( + func, [self.x, self.y], [self.vx, self.vy], self.numerical_delta, + self.np_dtype) + + self.x.stop_gradient = False + self.y.stop_gradient = False + func_output, vhp = paddle.autograd.vhp(func, [self.x, self.y], + [self.vx, self.vy]) + assert np.allclose(func_output.numpy(), numerical_func_output, + self.rtol, self.atol) + for i in range(len(vhp)): + assert np.allclose(vhp[i].numpy(), numerical_vhp[i], self.rtol, + self.atol) + + def test_v_default(self): + def func(x, y): + return paddle.sum(paddle.matmul(x, y)) + + numerical_func_output = func(self.x, self.y).numpy() + vx = paddle.ones(self.vx.shape, dtype=self.vx.dtype) + vy = paddle.ones(self.vy.shape, dtype=self.vy.dtype) + numerical_vhp = _compute_numerical_vhp(func, [self.x, self.y], + [vx, vy], self.numerical_delta, + self.np_dtype) + + self.x.stop_gradient = False + self.y.stop_gradient = False + func_output, vhp = paddle.autograd.vhp(func, [self.x, self.y]) + assert np.allclose(func_output.numpy(), numerical_func_output, + self.rtol, self.atol) + for i in range(len(vhp)): + assert np.allclose(vhp[i].numpy(), numerical_vhp[i], self.rtol, + self.atol) + + def test_allow_unused_false(self): + def func(x, y): + return paddle.sum(paddle.matmul(x, x)) + + try: + self.x.stop_gradient = False + self.y.stop_gradient = False + _ = paddle.autograd.vhp(func, [self.x, self.y]) + except ValueError as e: + error_msg = cpt.get_exception_message(e) + assert error_msg.find("allow_unused") > 0 + + def test_allow_unused_true(self): + def func(x, y): + return paddle.sum(paddle.matmul(x, x)) + + numerical_func_output = func(self.x, self.y).numpy() + numerical_vhp = _compute_numerical_vhp( + func, [self.x, self.y], [self.vx, self.vy], self.numerical_delta, + self.np_dtype) + + self.x.stop_gradient = False + self.y.stop_gradient = False + func_output, vhp = paddle.autograd.vhp(func, [self.x, self.y], + [self.vx, self.vy], + allow_unused=True) + assert np.allclose(func_output.numpy(), numerical_func_output, + self.rtol, self.atol) + assert np.allclose(vhp[0].numpy(), numerical_vhp[0], self.rtol, + self.atol) + assert vhp[1] is None + + def test_create_graph_false(self): + def func(x): + return paddle.sum(F.sigmoid(x)) + + numerical_func_output = 
func(self.x).numpy() + numerical_vhp = _compute_numerical_vhp( + func, self.x, self.vx, self.numerical_delta, self.np_dtype) + + self.x.stop_gradient = False + func_output, vhp = paddle.autograd.vhp(func, self.x, self.vx) + assert np.allclose(func_output.numpy(), numerical_func_output, + self.rtol, self.atol) + assert vhp[0].stop_gradient == True + assert np.allclose(vhp[0].numpy(), numerical_vhp[0], self.rtol, + self.atol) + try: + paddle.grad(vhp, self.x) + except RuntimeError as e: + error_msg = cpt.get_exception_message(e) + assert error_msg.find("has no gradient") > 0 + + def test_create_graph_true(self): + def func(x): + return paddle.sum(F.sigmoid(x)) + + numerical_func_output = func(self.x).numpy() + numerical_vhp = _compute_numerical_vhp( + func, self.x, self.vx, self.numerical_delta, self.np_dtype) + + self.x.stop_gradient = False + func_output, vhp = paddle.autograd.vhp(func, + self.x, + self.vx, + create_graph=True) + assert np.allclose(func_output.numpy(), numerical_func_output, + self.rtol, self.atol) + assert vhp[0].stop_gradient == False + assert np.allclose(vhp[0].numpy(), numerical_vhp[0], self.rtol, + self.atol) + triple_grad = paddle.grad(vhp, self.x) + assert triple_grad is not None + + +class TestVHPFloat64(TestVHP): + @classmethod + def setUpClass(self): + self.shape = (2, 2) + self.dtype = 'float64' + self.np_dtype = np.float64 + self.numerical_delta = 1e-5 + self.rtol = 1e-5 + self.atol = 1e-5 + self.x = paddle.rand(shape=self.shape, dtype=self.dtype) + self.y = paddle.rand(shape=self.shape, dtype=self.dtype) + self.vx = paddle.rand(shape=self.shape, dtype=self.dtype) + self.vy = paddle.rand(shape=self.shape, dtype=self.dtype) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/autograd/utils.py b/python/paddle/fluid/tests/unittests/autograd/utils.py index 3087e932051d8e..402e89ae476617 100644 --- a/python/paddle/fluid/tests/unittests/autograd/utils.py +++ b/python/paddle/fluid/tests/unittests/autograd/utils.py @@ -105,3 +105,29 @@ def _compute_numerical_hessian(func, xs, delta, np_dtype): jacobian_pos[0][i][0][p] - jacobian_neg[0][i][0][p] ) / delta / 2. 
return hessian + + +def _compute_numerical_vjp(func, xs, v, delta, np_dtype): + xs = _tensors(xs, "xs") + jacobian = np.array(_compute_numerical_jacobian(func, xs, delta, np_dtype)) + flat_v = np.array([v_el.numpy().reshape(-1) for v_el in v]) + vjp = [np.zeros((_product(x.shape)), dtype=np_dtype) for x in xs] + for j in range(len(xs)): + for q in range(_product(xs[j].shape)): + vjp[j][q] = np.sum(jacobian[:, j, :, q].reshape(flat_v.shape) * + flat_v) + vjp = [vjp[j].reshape(xs[j].shape) for j in range(len(xs))] + return vjp + + +def _compute_numerical_vhp(func, xs, v, delta, np_dtype): + xs = _tensors(xs, "xs") + hessian = np.array(_compute_numerical_hessian(func, xs, delta, np_dtype)) + flat_v = np.array([v_el.numpy().reshape(-1) for v_el in v]) + vhp = [np.zeros((_product(x.shape)), dtype=np_dtype) for x in xs] + for j in range(len(xs)): + for q in range(_product(xs[j].shape)): + vhp[j][q] = np.sum(hessian[:, j, :, q].reshape(flat_v.shape) * + flat_v) + vhp = [vhp[j].reshape(xs[j].shape) for j in range(len(xs))] + return vhp From bdac9ff6650d30f8b4fe0334e39c0a506757ea67 Mon Sep 17 00:00:00 2001 From: jakpiase <62569058+jakpiase@users.noreply.github.com> Date: Mon, 18 Oct 2021 12:38:24 +0200 Subject: [PATCH 195/298] Added softplus FP32 FWD OneDNN kernel (#36382) * added softplus * refactored softplus op * deleted unnecessary file * added missing file * added formatting * disabled tests if GPU is used * added reviewer suggestion * unified softplus kernel --- .../operators/mkldnn/activation_mkldnn_op.cc | 13 +++ .../operators/mkldnn/softplus_mkldnn_op.h | 94 +++++++++++++++++++ .../mkldnn/test_softplus_mkldnn_op.py | 78 +++++++++++++++ 3 files changed, 185 insertions(+) create mode 100644 paddle/fluid/operators/mkldnn/softplus_mkldnn_op.h create mode 100644 python/paddle/fluid/tests/unittests/mkldnn/test_softplus_mkldnn_op.py diff --git a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc index 603a70458b0ceb..29106dc30498e8 100644 --- a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc @@ -13,6 +13,7 @@ limitations under the License. */ #include "paddle/fluid/operators/activation_op.h" +#include "paddle/fluid/operators/mkldnn/softplus_mkldnn_op.h" #include "paddle/fluid/platform/mkldnn_reuse.h" namespace paddle { @@ -169,6 +170,13 @@ struct GeluMKLDNNGradFunctor : public BaseActivationFunctor { } }; +template +struct SoftplusMKLDNNFunctor : public BaseActivationFunctor { + void operator()(const framework::ExecutionContext &ctx) const { + custom_softplus_eltwise_forward(ctx); + } +}; + template using ReluMKLDNNFunctor = MKLDNNActivationFunc; @@ -272,3 +280,8 @@ REGISTER_ACTIVATION_MKLDNN_BF16_KERNEL(gelu, GeluMKLDNNFunctor, GeluMKLDNNGradFunctor); REGISTER_ACTIVATION_MKLDNN_BF16_KERNEL(sigmoid, SigmoidMKLDNNFunctor, SigmoidMKLDNNGradFunctor); + +namespace ops = paddle::operators; +REGISTER_OP_KERNEL( + softplus, MKLDNN, paddle::platform::CPUPlace, + ops::MKLDNNActivationKernel>); diff --git a/paddle/fluid/operators/mkldnn/softplus_mkldnn_op.h b/paddle/fluid/operators/mkldnn/softplus_mkldnn_op.h new file mode 100644 index 00000000000000..fdb2c534e03634 --- /dev/null +++ b/paddle/fluid/operators/mkldnn/softplus_mkldnn_op.h @@ -0,0 +1,94 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/mkldnn_reuse.h" + +namespace paddle { +namespace operators { + +using paddle::framework::Tensor; + +template +class SoftplusMKLDNNHandler + : public platform::MKLDNNHandlerNoCachingT { + public: + SoftplusMKLDNNHandler(const Tensor* x, const float beta, + const mkldnn::engine engine, platform::Place cpu_place) + : platform::MKLDNNHandlerNoCachingT(engine, cpu_place) { + auto x_tz = framework::vectorize(x->dims()); + auto x_md = + dnnl::memory::desc(x_tz, platform::MKLDNNGetDataType(), x->format()); + + auto beta_tz = std::vector(x_tz.size(), 1); + auto beta_md = dnnl::memory::desc(beta_tz, platform::MKLDNNGetDataType(), + x->format()); + + dnnl::post_ops post_ops; + post_ops.append_eltwise(1.0f, dnnl::algorithm::eltwise_soft_relu, 0.0f, + 0.0f); + if (beta != 1.0f) { + post_ops.append_eltwise(1.0f, dnnl::algorithm::eltwise_linear, + 1.0f / beta, 0.0f); + } + + dnnl::primitive_attr attrs; + attrs.set_post_ops(post_ops); + + this->AcquireForwardPrimitiveDescriptor(attrs, dnnl::algorithm::binary_mul, + x_md, beta_md, x_md); + } + + std::shared_ptr AcquireBetaMemory(const float* beta) { + return this->AcquireMemoryFromPrimitive( + this->fwd_pd_->src1_desc(), platform::to_void_cast(beta)); + } +}; + +template +void custom_softplus_eltwise_forward(const framework::ExecutionContext& ctx) { + const auto& dev_ctx = + ctx.template device_context(); + const auto& mkldnn_engine = dev_ctx.GetEngine(); + + const auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); + + bool is_inplaced = x->IsSharedBufferWith(*out); + + const float beta = ctx.Attr("beta"); + + SoftplusMKLDNNHandler handler(x, beta, mkldnn_engine, ctx.GetPlace()); + + auto src_memory_p = handler.AcquireSrcMemory(x); + + auto beta_memory_p = handler.AcquireBetaMemory(&beta); + auto dst_memory_p = + is_inplaced ? src_memory_p : handler.AcquireDstMemory(out); + auto binary_p = handler.AcquireForwardPrimitive(); + + auto& astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream(); + + const std::unordered_map args = { + {DNNL_ARG_SRC_0, *src_memory_p}, + {DNNL_ARG_SRC_1, *beta_memory_p}, + {DNNL_ARG_DST, *dst_memory_p}}; + + binary_p->execute(astream, args); + astream.wait(); + + out->set_layout(framework::DataLayout::kMKLDNN); + out->set_format(platform::GetMKLDNNFormat(*dst_memory_p)); +} +} // namespace operators +} // namespace paddle diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_softplus_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_softplus_mkldnn_op.py new file mode 100644 index 00000000000000..92699cdbd27092 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_softplus_mkldnn_op.py @@ -0,0 +1,78 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +from paddle.fluid.tests.unittests.op_test import OpTest, OpTestTool +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.framework import _current_expected_place + + +def ref_softplus(x, beta, threshold): + x_beta = beta * x + out = np.select([x_beta <= threshold, x_beta > threshold], + [np.log(1 + np.exp(x_beta)) / beta, x]) + return out + + +@OpTestTool.skip_if(not (isinstance(_current_expected_place(), core.CPUPlace)), + "GPU is not supported") +class TestSoftplusOneDNNOp(OpTest): + def setUp(self): + self.op_type = "softplus" + self.beta = 1 + self.threshold = 20 + self.config() + self.attrs = {'use_mkldnn': True, 'beta': self.beta} + self.inputs = {'X': np.random.random(self.x_shape).astype(np.float32)} + self.outputs = { + 'Out': ref_softplus(self.inputs['X'], self.beta, self.threshold) + } + + def config(self): + self.x_shape = (10, 10) + + def test_check_output(self): + self.check_output() + + +class TestSoftplus4DOneDNNOp(TestSoftplusOneDNNOp): + def config(self): + self.x_shape = (10, 5, 4, 2) + + +class TestSoftplus6DOneDNNOp(TestSoftplusOneDNNOp): + def config(self): + self.x_shape = (3, 2, 2, 5, 4, 2) + + +class TestSoftplus6DExtendedFunctorOneDNNOp(TestSoftplusOneDNNOp): + def config(self): + self.x_shape = (3, 5, 2, 5, 4, 2) + self.beta = 2.5 + + +class TestSoftplus3DExtendedFunctorOneDNNOp(TestSoftplusOneDNNOp): + def config(self): + self.x_shape = (20, 4, 2) + self.beta = 0.4 + + +if __name__ == "__main__": + paddle.enable_static() + unittest.main() From 10f0a0f6c8f71436bad715b0f74329e89ea076f9 Mon Sep 17 00:00:00 2001 From: Haohongxiang <86215757+haohongxiang@users.noreply.github.com> Date: Mon, 18 Oct 2021 20:02:20 +0800 Subject: [PATCH 196/298] [HybridParallel]Support fp16 in dygraph hybrid parallel (#36420) * [HybridParallel]Support fp16 in dygraph hybrid parallel * update * update * update for recompute * add unittest of pp+fp16 * add unittest of recompute+fp16 * update * modify ut --- .../distributed/fleet/base/fleet_base.py | 40 ++++- .../fleet/meta_parallel/pipeline_parallel.py | 37 +++-- .../fleet/meta_parallel/pp_utils/utils.py | 13 +- .../distributed/fleet/utils/recompute.py | 15 +- python/paddle/fluid/framework.py | 2 +- .../unittests/hybrid_parallel_pp_fp16.py | 138 ++++++++++++++++++ .../tests/unittests/test_dygraph_recompute.py | 38 ++++- ...test_parallel_dygraph_pipeline_parallel.py | 5 +- 8 files changed, 257 insertions(+), 31 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/hybrid_parallel_pp_fp16.py diff --git a/python/paddle/distributed/fleet/base/fleet_base.py b/python/paddle/distributed/fleet/base/fleet_base.py index 544c79a0b39691..571199b99b0d94 100755 --- a/python/paddle/distributed/fleet/base/fleet_base.py +++ b/python/paddle/distributed/fleet/base/fleet_base.py @@ -35,6 +35,8 @@ from ..meta_parallel import PipelineParallel, ShardingParallel from ..meta_optimizers import HybridParallelOptimizer from paddle import _C_ops +from paddle.fluid import core +from paddle.fluid.dygraph import to_variable 
__all__ = [] @@ -1548,26 +1550,52 @@ def unscale_method(self, optimizer): if getattr(optimizer, '_param_groups', None) and isinstance( optimizer._param_groups[0], dict): param_grads = [] + param_grads_fp16 = [] + param_grads_fp32 = [] for group in optimizer._param_groups: for param in group['params']: if param._grad_ivar() is not None: param_grads.append(param._grad_ivar()) + if param._grad_ivar( + ).dtype == core.VarDesc.VarType.FP16: + param_grads_fp16.append(param._grad_ivar()) + else: + param_grads_fp32.append(param._grad_ivar()) else: param_grads = [ param._grad_ivar() for param in optimizer._parameter_list if param._grad_ivar() is not None ] - _C_ops.check_finite_and_unscale(param_grads, self._scale, - param_grads, self._found_inf) - - self._found_inf = paddle.cast(self._found_inf, dtype="int32") + param_grads_fp16 = [ + param._grad_ivar() for param in optimizer._parameter_list + if (param._grad_ivar() is not None) and (param._grad_ivar( + ).dtype == core.VarDesc.VarType.FP16) + ] + param_grads_fp32 = [ + param._grad_ivar() for param in optimizer._parameter_list + if (param._grad_ivar() is not None) and (param._grad_ivar( + ).dtype == core.VarDesc.VarType.FP32) + ] + temp_found_inf_fp16 = to_variable(np.array([0]).astype(np.bool)) + temp_found_inf_fp32 = to_variable(np.array([0]).astype(np.bool)) + if len(param_grads_fp16): + _C_ops.check_finite_and_unscale(param_grads_fp16, self._scale, + param_grads_fp16, + temp_found_inf_fp16) + if len(param_grads_fp32): + _C_ops.check_finite_and_unscale(param_grads_fp32, self._scale, + param_grads_fp32, + temp_found_inf_fp32) + self._found_inf = 1 if temp_found_inf_fp16 or temp_found_inf_fp32 else 0 # TODO(shenliang03) Since dp allreduce in the optimizer is # after the gradscaler, check_finite needs to synchronize global # information. In the future, we should use check_group to speed. 
paddle.distributed.all_reduce( - self._found_inf, op=paddle.distributed.ReduceOp.MAX, group=None) - self._found_inf = paddle.cast(self._found_inf, dtype="bool") + paddle.to_tensor( + [self._found_inf], dtype="int32"), + op=paddle.distributed.ReduceOp.MAX, + group=None) # Only tensor_parallel and pipeline_parallel need to modify scaler if self._hcg.get_parallel_mode() in (ParallelMode.TENSOR_PARALLEL, diff --git a/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py b/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py index 90960973972777..7c7637a90fec03 100755 --- a/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py +++ b/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py @@ -145,9 +145,8 @@ def forward_backward_pipeline(self, data, scaler=None): p2p.send_backward(input_tensor_grad) self._layers.allreduce_shared_weight_gradients() - - train_loss = self._broadcast_final_loss() - + with paddle.amp.auto_cast(enable=False): + train_loss = self._broadcast_final_loss() return train_loss def train_batch(self, data, optimizer, lr_scheduler=None, scaler=None): @@ -172,7 +171,8 @@ def train_batch(self, data, optimizer, lr_scheduler=None, scaler=None): train_loss = self.forward_backward_pipeline(data, scaler) # optimizer - self._optimizer_step() + with paddle.amp.auto_cast(enable=False): + self._optimizer_step() return train_loss @@ -242,12 +242,13 @@ def _forward_step(self, input_tensor): output_tensor, paddle.Tensor ), "Currently, loss_fn should obtain Paddle.Tensor dtype" - if self.accumulate_steps > 1: - output_tensor = output_tensor / self.accumulate_steps + with paddle.amp.auto_cast(enable=False): + if self.accumulate_steps > 1: + output_tensor = output_tensor / self.accumulate_steps - if self.total_loss is None: - self.total_loss = paddle.zeros_like(output_tensor) - self.total_loss += output_tensor.detach() + if self.total_loss is None: + self.total_loss = paddle.zeros_like(output_tensor) + self.total_loss += output_tensor.detach() self.micro_batch_id += 1 return output_tensor @@ -321,13 +322,29 @@ def _broadcast_final_loss(self): if self.is_last_stage: assert self.total_loss is not None, "train_batch() in last stage should obtain vaild loss" loss = self.total_loss.detach() + is_fp32 = paddle.to_tensor( + 1) if loss.dtype == paddle.float32 else paddle.to_tensor(0) + paddle.distributed.broadcast( + is_fp32, + src=self.global_rank, + use_calc_stream=True, + group=self.pp_group) paddle.distributed.broadcast( loss, src=self.global_rank, use_calc_stream=True, group=self.pp_group) else: - loss = paddle.zeros(shape=[1], dtype="float32") + is_fp32 = paddle.to_tensor(1) + paddle.distributed.broadcast( + is_fp32, + src=self._hcg.get_rank_from_stage(self.num_stages - 1), + use_calc_stream=True, + group=self.pp_group) + loss = paddle.zeros( + shape=[1], + dtype="float32") if is_fp32.numpy()[0] else paddle.zeros( + shape=[1], dtype="float16") paddle.distributed.broadcast( loss, src=self._hcg.get_rank_from_stage(self.num_stages - 1), diff --git a/python/paddle/distributed/fleet/meta_parallel/pp_utils/utils.py b/python/paddle/distributed/fleet/meta_parallel/pp_utils/utils.py index 08266096548c4a..7224ba6dedda0b 100644 --- a/python/paddle/distributed/fleet/meta_parallel/pp_utils/utils.py +++ b/python/paddle/distributed/fleet/meta_parallel/pp_utils/utils.py @@ -198,11 +198,14 @@ def forward(ctx, run_function, all_outputs, *args): # TODO support AMP tracer = framework._dygraph_tracer() - if tracer._amp_level == core.AmpLevel.O0: - ctx.is_fw_autocast 
= False + ctx.is_fw_autocast = False if tracer._amp_level == core.AmpLevel.O0 else True + if tracer._amp_level == core.AmpLevel.O2: + ctx.amp_level = 'O2' + elif tracer._amp_level in (core.AmpLevel.O1, core.AmpLevel.O0): + ctx.amp_level = 'O1' else: - ctx.is_fw_autocast = True - ctx.amp_mode = 'O1' + raise ValueError("unsupported amp level: {}".format( + tracer._amp_level)) ctx.amp_white_list, ctx.amp_black_list = tracer._get_amp_op_list() with paddle.no_grad(): @@ -263,7 +266,7 @@ def backward(ctx, *args): enable=ctx.is_fw_autocast, custom_white_list=ctx.amp_white_list, custom_black_list=ctx.amp_black_list, - level=ctx.amp_mode): + level=ctx.amp_level): detached_inputs = detach_variable(tuple(inputs)) outputs = ctx.run_function(*detached_inputs) diff --git a/python/paddle/distributed/fleet/utils/recompute.py b/python/paddle/distributed/fleet/utils/recompute.py index 56a64049b16e15..2d1db5db945c3f 100755 --- a/python/paddle/distributed/fleet/utils/recompute.py +++ b/python/paddle/distributed/fleet/utils/recompute.py @@ -98,11 +98,14 @@ def forward(ctx, run_function, preserve_rng_state, *args): # TODO support AMP tracer = framework._dygraph_tracer() - if tracer._amp_level == core.AmpLevel.O0: - ctx.is_fw_autocast = False + ctx.is_fw_autocast = False if tracer._amp_level == core.AmpLevel.O0 else True + if tracer._amp_level == core.AmpLevel.O2: + ctx.amp_level = 'O2' + elif tracer._amp_level in (core.AmpLevel.O1, core.AmpLevel.O0): + ctx.amp_level = 'O1' else: - ctx.is_fw_autocast = True - ctx.amp_mode = 'O1' + raise ValueError("unsupported amp level: {}".format( + tracer._amp_level)) ctx.amp_white_list, ctx.amp_black_list = tracer._get_amp_op_list() with paddle.no_grad(): @@ -133,7 +136,7 @@ def backward(ctx, *args): enable=ctx.is_fw_autocast, custom_white_list=ctx.amp_white_list, custom_black_list=ctx.amp_black_list, - level=ctx.amp_mode): + level=ctx.amp_level): detached_inputs = detach_variable(tuple(inputs)) outputs = ctx.run_function(*detached_inputs) else: @@ -141,7 +144,7 @@ def backward(ctx, *args): enable=ctx.is_fw_autocast, custom_white_list=ctx.amp_white_list, custom_black_list=ctx.amp_black_list, - level=ctx.amp_mode): + level=ctx.amp_level): detached_inputs = detach_variable(tuple(inputs)) outputs = ctx.run_function(*detached_inputs) diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 156ba07a4ce08b..60e00238f6cc99 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -6097,7 +6097,7 @@ def __init__(self, shape, dtype, **kwargs): self.need_clip = kwargs.get('need_clip', True) - self.is_distributed = False + self.is_distributed = kwargs.get('is_distributed', False) # self.block = default_main_program().global_block() @property diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_fp16.py b/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_fp16.py new file mode 100644 index 00000000000000..571459365addfc --- /dev/null +++ b/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_fp16.py @@ -0,0 +1,138 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import division +from __future__ import print_function + +import unittest +import paddle +import numpy as np +import random +import paddle +import paddle.distributed as dist +import paddle.distributed.fleet as fleet +from hybrid_parallel_pp_layer import AlexNetPipeDesc, AlexNet + + +def set_random_seed(seed, dp_id, rank_id): + """Set random seed for reproducability.""" + random.seed(seed) + np.random.seed(seed + dp_id) + paddle.seed(seed + dp_id) + + +batch_size = 4 +micro_batch_size = 2 + + +class TestDistPPTraning(unittest.TestCase): + def setUp(self): + strategy = fleet.DistributedStrategy() + self.model_parallel_size = 1 + self.data_parallel_size = 1 + self.pipeline_parallel_size = 2 + strategy.hybrid_configs = { + "dp_degree": self.data_parallel_size, + "mp_degree": self.model_parallel_size, + "pp_degree": self.pipeline_parallel_size, + } + strategy.pipeline_configs = { + "accumulate_steps": batch_size // micro_batch_size, + "micro_batch_size": micro_batch_size + } + fleet.init(is_collective=True, strategy=strategy) + + def test_pp_model(self): + hcg = fleet.get_hybrid_communicate_group() + word_size = hcg.get_model_parallel_world_size() + dp_id = hcg.get_data_parallel_rank() + pp_id = hcg.get_stage_id() + rank_id = dist.get_rank() + set_random_seed(1024, dp_id, rank_id) + + #construct model a + model_a = AlexNet(10) + scheduler_a = paddle.optimizer.lr.PiecewiseDecay( + boundaries=[2], values=[0.001, 0.002], verbose=True) + optimizer_a = paddle.optimizer.SGD(learning_rate=scheduler_a, + parameters=model_a.parameters()) + + scaler_a = paddle.amp.GradScaler(init_loss_scaling=2**5) + + # construct model b + model_b = AlexNetPipeDesc(num_stages=self.pipeline_parallel_size) + scheduler_b = paddle.optimizer.lr.PiecewiseDecay( + boundaries=[2], values=[0.001, 0.002], verbose=True) + optimizer_b = paddle.optimizer.SGD(learning_rate=scheduler_b, + parameters=model_b.parameters()) + + param_len = len(model_a.parameters()) + parameters = [] + for param in model_a.parameters(): + parameters.append(param.numpy()) + + for idx, param in enumerate(model_b.parameters()): + param.set_value(parameters[idx + pp_id * (param_len // 2)]) + + model_a, optimizer_a = paddle.amp.decorate( + models=model_a, + optimizers=optimizer_a, + level='O2', + save_dtype='float32') + model_b, optimizer_b = paddle.amp.decorate( + models=model_b, + optimizers=optimizer_b, + level='O2', + save_dtype='float32') + + model_b = fleet.distributed_model(model_b) + optimizer_b = fleet.distributed_optimizer(optimizer_b) + scaler_b = paddle.amp.GradScaler(init_loss_scaling=2**5) + scaler_b = fleet.distributed_scaler(scaler_b) + + # construct reader + train_reader = paddle.batch( + paddle.dataset.mnist.train(), batch_size=batch_size, drop_last=True) + + for step_id, data in enumerate(train_reader()): + x_data = np.array([x[0] for x in data]).astype('float32').reshape( + batch_size, 1, 28, 28) + y_data = np.array([x[1] for x in data]).astype('int64').reshape( + batch_size, 1) + img = paddle.to_tensor(x_data) + label = paddle.to_tensor(y_data) + img.stop_gradient = True + label.stop_gradient = True + + if 
step_id >= 5: + return True + + with paddle.amp.auto_cast(enable=True, level='O2'): + loss_a = model_a(img, label) + scaler_a.scale(loss_a).backward() + with paddle.amp.auto_cast(enable=False): + scaler_a.minimize(optimizer_a, loss_a) + optimizer_a.clear_grad() + scheduler_a.step() + + loss_b = model_b.train_batch( + [img, label], optimizer_b, scheduler_b, scaler=scaler_b) + + print("loss: ", loss_a.numpy(), loss_b.numpy()) + np.testing.assert_allclose( + loss_a.numpy(), loss_b.numpy(), rtol=5e-3) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_recompute.py b/python/paddle/fluid/tests/unittests/test_dygraph_recompute.py index 332603b8129550..4a4bcd2b8163c8 100755 --- a/python/paddle/fluid/tests/unittests/test_dygraph_recompute.py +++ b/python/paddle/fluid/tests/unittests/test_dygraph_recompute.py @@ -92,7 +92,10 @@ def forward(self, inputs): return inputs -def run_model(recompute_block=[], recompute_kwargs={}, enable_autocast=False): +def run_model(recompute_block=[], + recompute_kwargs={}, + enable_autocast=False, + pure_fp16=False): gen = paddle.seed(10) gen.manual_seed(10) np.random.seed(10) @@ -118,7 +121,8 @@ def run_model(recompute_block=[], recompute_kwargs={}, enable_autocast=False): x_data = np.random.randn(batch_size, input_size).astype(np.float32) x = paddle.to_tensor(x_data) # x.stop_gradient = False - with paddle.amp.auto_cast(True): + level = 'O2' if pure_fp16 else 'O1' + with paddle.amp.auto_cast(True, level=level): y_pred = model(x) loss = y_pred.mean() if enable_autocast: @@ -196,6 +200,36 @@ def check_identical(loss_ref, param_ref, grad_ref, loss, param, grad): recompute_block=[1, 3], enable_autocast=True) check_identical(loss_ref, param_ref, grad_ref, loss, param, grad) + def test_fc_net_with_fp16(self): + def check_identical(loss_ref, param_ref, grad_ref, loss, param, grad): + self.assertEqual(loss_ref, loss) + self.assertEqual(param_ref, param) + self.assertEqual(grad_ref, grad) + + # without recompute + loss_ref, param_ref, grad_ref = run_model( + recompute_block=[], enable_autocast=True, pure_fp16=True) + + # recompute second block + loss, param, grad = run_model( + recompute_block=[1], enable_autocast=True, pure_fp16=True) + check_identical(loss_ref, param_ref, grad_ref, loss, param, grad) + + # recompute fourth block + loss, param, grad = run_model( + recompute_block=[3], enable_autocast=True, pure_fp16=True) + check_identical(loss_ref, param_ref, grad_ref, loss, param, grad) + + # recompute second to fourth block + loss, param, grad = run_model( + recompute_block=[1, 2, 3], enable_autocast=True, pure_fp16=True) + check_identical(loss_ref, param_ref, grad_ref, loss, param, grad) + + # recompute second & fourth block + loss, param, grad = run_model( + recompute_block=[1, 3], enable_autocast=True, pure_fp16=True) + check_identical(loss_ref, param_ref, grad_ref, loss, param, grad) + def test_recompute_kwargs(self): paddle.set_device("gpu") kwargs = {"is_test": False} diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_pipeline_parallel.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_pipeline_parallel.py index f54aa1bb6e5561..71c254dabb9e16 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_pipeline_parallel.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_pipeline_parallel.py @@ -30,9 +30,12 @@ def test_hybrid_parallel_pp_tuple_inputs(self): def test_hybrid_parallel_shared_weight(self): 
self.run_mnist_2gpu('hybrid_parallel_shared_weight.py') - def test_pipeline_parallel(self): + def test_pipeline_parallel_amp(self): self.run_mnist_2gpu('hybrid_parallel_pp_amp.py') + def test_pipeline_parallel_fp16(self): + self.run_mnist_2gpu('hybrid_parallel_pp_fp16.py') + def test_hybrid_parallel_transformer(self): self.run_mnist_2gpu('hybrid_parallel_pp_transformer.py') From 305b99a0c1be76ed33490231d41cba2057b57eaa Mon Sep 17 00:00:00 2001 From: Zeng Jinle <32832641+sneaxiy@users.noreply.github.com> Date: Tue, 19 Oct 2021 10:30:42 +0800 Subject: [PATCH 197/298] Add pow2_decay_with_linear_warmup op (#36421) * add pow2_warmup op * remove contrib __all__ * add AttrT * rename * follow comments * fix duplicate PADDLE_RESTRICT --- .../pow2_decay_with_linear_warmup_op.cc | 90 +++++++++++++ .../pow2_decay_with_linear_warmup_op.cu | 24 ++++ .../pow2_decay_with_linear_warmup_op.h | 119 ++++++++++++++++++ python/paddle/fluid/contrib/layers/nn.py | 36 ++++++ .../test_pow2_decay_with_linear_warmup_op.py | 90 +++++++++++++ 5 files changed, 359 insertions(+) create mode 100644 paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.cc create mode 100644 paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.cu create mode 100644 paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.h create mode 100644 python/paddle/fluid/tests/unittests/test_pow2_decay_with_linear_warmup_op.py diff --git a/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.cc b/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.cc new file mode 100644 index 00000000000000..12362b1bc6401c --- /dev/null +++ b/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.cc @@ -0,0 +1,90 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/float16.h" + +namespace paddle { +namespace operators { + +class Pow2DecayWithLinearWarmupOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext *ctx) const override { + auto dim = framework::make_ddim({1}); + ctx->SetOutputDim("LearningRateOut", dim); + ctx->SetOutputDim("StepOut", dim); + } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + auto data_type = + OperatorWithKernel::IndicateVarDataType(ctx, "LearningRate"); + return framework::OpKernelType(data_type, ctx.device_context()); + } +}; + +class Pow2DecayWithLinearWarmupOpMaker + : public framework::OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("LearningRate", "(Tensor) The input learning rate Tensor."); + AddInput("Step", "(Tensor) The input global step Tensor."); + AddOutput("LearningRateOut", + "(Tensor) The output learning rate Tensor. 
Same with " + "Input(LearningRate)."); + AddOutput( + "StepOut", + "(Tensor) The output learning rate Tensor. Same with Input(Step)."); + AddAttr("warmup_steps", "(int64_t) The warmup steps."); + AddAttr( + "total_steps", + "(int64_t) The total steps for changing the learning rate."); + AddAttr("start_lr", + "(float) The initial value of the learning rate."); + AddAttr("base_lr", + "(float) The final learning rate value after warmup."); + AddAttr("end_lr", + "(float) The final learning rate value after total_steps."); + AddComment(R"DOC( +The Pow2DecayWithLinearWarmup learning rate scheduler. + +When step_num < warmup_steps, lr = (base_lr - start_lr) * step_num / warmup_steps + start_lr + +When warmup_steps <= step_num <= total_steps, + factor = 1 - (step_num - warmup_steps) / (total_steps - warmup_steps) + lr = (base_lr - end_lr) * factor * factor + end_lr + +When step_num > total_steps, lr = end_lr + +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_WITHOUT_GRADIENT(pow2_decay_with_linear_warmup, + ops::Pow2DecayWithLinearWarmupOp, + ops::Pow2DecayWithLinearWarmupOpMaker); +REGISTER_OP_CPU_KERNEL( + pow2_decay_with_linear_warmup, + ops::Pow2DecayWithLinearWarmupOpKernel, + ops::Pow2DecayWithLinearWarmupOpKernel); diff --git a/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.cu b/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.cu new file mode 100644 index 00000000000000..6695778dbac063 --- /dev/null +++ b/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.cu @@ -0,0 +1,24 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.h" + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL( + pow2_decay_with_linear_warmup, + ops::Pow2DecayWithLinearWarmupOpKernel, + ops::Pow2DecayWithLinearWarmupOpKernel); diff --git a/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.h b/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.h new file mode 100644 index 00000000000000..41e07b0343e728 --- /dev/null +++ b/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.h @@ -0,0 +1,119 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/for_range.h" +#include "paddle/fluid/platform/macros.h" + +namespace paddle { +namespace operators { + +template +struct Pow2DecayWithLinearWarmupFunctor { + template + using RestrictPtr = U *PADDLE_RESTRICT; + + public: + HOSTDEVICE Pow2DecayWithLinearWarmupFunctor( + RestrictPtr lr, RestrictPtr step, size_t warmup_steps, + size_t total_steps, AttrT start_lr, AttrT base_lr, AttrT end_lr) + : lr_(lr), + step_(step), + warmup_steps_(warmup_steps), + total_steps_(total_steps), + start_lr_(start_lr), + base_lr_(base_lr), + end_lr_(end_lr) {} + + HOSTDEVICE void operator()(size_t) const { + size_t step = static_cast(*step_); + *step_ = static_cast(step + 1); + if (step < warmup_steps_) { + auto new_lr = + static_cast(base_lr_ - start_lr_) * step / warmup_steps_ + + start_lr_; + *lr_ = static_cast(new_lr); + } else if (step < total_steps_) { + auto factor = 1 - + static_cast(step - warmup_steps_) / + (total_steps_ - warmup_steps_); + auto new_lr = + static_cast(base_lr_ - end_lr_) * factor * factor + end_lr_; + *lr_ = static_cast(new_lr); + } else { + *lr_ = static_cast(end_lr_); + } + } + + private: + RestrictPtr lr_; + RestrictPtr step_; + size_t warmup_steps_; + size_t total_steps_; + AttrT start_lr_; + AttrT base_lr_; + AttrT end_lr_; +}; + +template +class Pow2DecayWithLinearWarmupOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const { + const auto *lr = ctx.Input("LearningRate"); + const auto *step = ctx.Input("Step"); + auto *lr_out = ctx.Output("LearningRateOut"); + auto *step_out = ctx.Output("StepOut"); + PADDLE_ENFORCE_EQ( + lr, lr_out, platform::errors::InvalidArgument("Input(LearningRate) and " + "Output(LearningRateOut) " + "must be the same.")); + PADDLE_ENFORCE_NOT_NULL(lr, + platform::errors::InvalidArgument( + "Input(LearingRate) should not be nullptr.")); + PADDLE_ENFORCE_EQ(step, step_out, + platform::errors::InvalidArgument( + "Input(Step) and Output(StepOut) must be the same.")); + PADDLE_ENFORCE_NOT_NULL(step, platform::errors::InvalidArgument( + "Input(Step) should not be nullptr.")); + PADDLE_ENFORCE_EQ( + step->IsInitialized(), true, + platform::errors::InvalidArgument("Input(Step) must be initialized.")); + + auto warmup_steps = static_cast(ctx.Attr("warmup_steps")); + auto total_steps = static_cast(ctx.Attr("total_steps")); + PADDLE_ENFORCE_LE(warmup_steps, total_steps, + platform::errors::InvalidArgument( + "warmup_steps must not be larger than total_steps.")); + auto start_lr = ctx.Attr("start_lr"); + auto base_lr = ctx.Attr("base_lr"); + auto end_lr = ctx.Attr("end_lr"); + + auto *lr_data = lr_out->data(); + auto *step_data = step_out->data(); + auto &dev_ctx = ctx.template device_context(); + platform::ForRange for_range(dev_ctx, 1); + using AttrT = float; + Pow2DecayWithLinearWarmupFunctor functor( + lr_data, step_data, warmup_steps, total_steps, + static_cast(start_lr), static_cast(base_lr), + static_cast(end_lr)); + for_range(functor); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/python/paddle/fluid/contrib/layers/nn.py b/python/paddle/fluid/contrib/layers/nn.py index 99ede353c1081e..0d0addb17e9ae6 100644 --- a/python/paddle/fluid/contrib/layers/nn.py +++ b/python/paddle/fluid/contrib/layers/nn.py @@ -1932,3 +1932,39 @@ def 
build_program(main_program, startup_program): attrs=attrs) return batch_norm_out + + +def pow2_decay_with_linear_warmup(warmup_steps, + total_steps, + start_lr, + base_lr, + end_lr, + dtype='float32', + name=None): + if paddle.fluid.in_dygraph_mode(): + raise NotImplementedError( + "pow2_warmup does not support dygraph mode yet.") + + helper = LayerHelper("pow2_decay_with_linear_warmup", **locals()) + lr = helper.create_global_variable(persistable=True, dtype=dtype, shape=[1]) + helper.set_variable_initializer(lr, Constant(value=start_lr)) + + step = helper.create_global_variable( + persistable=True, dtype='int64', shape=[1]) + helper.set_variable_initializer(step, Constant(value=0)) + assert warmup_steps <= total_steps, "warmup_steps cannot be larger than total_steps" + + helper.append_op( + type="pow2_decay_with_linear_warmup", + inputs={"LearningRate": lr, + "Step": step}, + outputs={"LearningRateOut": lr, + "StepOut": step}, + attrs={ + "warmup_steps": warmup_steps, + "total_steps": total_steps, + "start_lr": start_lr, + "base_lr": base_lr, + "end_lr": end_lr, + }) + return lr diff --git a/python/paddle/fluid/tests/unittests/test_pow2_decay_with_linear_warmup_op.py b/python/paddle/fluid/tests/unittests/test_pow2_decay_with_linear_warmup_op.py new file mode 100644 index 00000000000000..641ea3eccf8d2b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_pow2_decay_with_linear_warmup_op.py @@ -0,0 +1,90 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
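# A plain-Python sketch of the schedule exercised below, mirroring the
# piecewise rule stated in the op's DOC comment: linear warmup from start_lr
# to base_lr, then quadratic (power-2) decay from base_lr to end_lr, then a
# constant end_lr. The name pow2_warmup_value is illustrative only.
def pow2_warmup_value(step, warmup_steps, total_steps, start_lr, base_lr,
                      end_lr):
    if step < warmup_steps:
        return (base_lr - start_lr) * step / warmup_steps + start_lr
    if step < total_steps:
        factor = 1.0 - float(step - warmup_steps) / (total_steps - warmup_steps)
        return (base_lr - end_lr) * factor * factor + end_lr
    return end_lr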
+ +import paddle +from paddle.fluid.contrib.layers.nn import pow2_decay_with_linear_warmup +from paddle.optimizer.lr import LinearWarmup +from paddle.optimizer.lr import PolynomialDecay +import unittest + + +def gen_pow2_warmup_op_lr(warmup_steps, total_steps, start_lr, base_lr, end_lr, + place): + main = paddle.static.Program() + startup = paddle.static.Program() + with paddle.static.program_guard(main, startup): + lr = pow2_decay_with_linear_warmup(warmup_steps, total_steps, start_lr, + base_lr, end_lr) + exe = paddle.static.Executor(place) + with paddle.static.scope_guard(paddle.static.Scope()): + exe.run(startup) + while True: + lr_np = exe.run(main, fetch_list=[lr])[0] + yield lr_np[0] + + +class Pow2Warmup(LinearWarmup): + def __init__(self, warmup_steps, total_steps, start_lr, base_lr, end_lr): + assert total_steps > warmup_steps + lr_sch = PolynomialDecay( + learning_rate=base_lr, + decay_steps=total_steps - warmup_steps, + end_lr=end_lr, + power=2) + + super(Pow2Warmup, self).__init__( + learning_rate=lr_sch, + warmup_steps=warmup_steps, + start_lr=start_lr, + end_lr=base_lr) + + +def gen_pow2_warmup_py_lr(warmup_steps, total_steps, start_lr, base_lr, end_lr, + place): + lr_sch = Pow2Warmup(warmup_steps, total_steps, start_lr, base_lr, end_lr) + while True: + yield lr_sch() + lr_sch.step() + + +class TestPow2WarmupLRScheduler(unittest.TestCase): + def setUp(self): + paddle.enable_static() + self.params = { + 'warmup_steps': 30, + 'total_steps': 100, + 'start_lr': 0.01, + 'base_lr': 0.02, + 'end_lr': 0.001, + } + self.step_num = 1000 + + def check_with_place(self, place): + kwargs = dict(self.params) + kwargs['place'] = place + lr_sch_op = gen_pow2_warmup_op_lr(**kwargs) + lr_sch_py = gen_pow2_warmup_py_lr(**kwargs) + for i, (lr_op, lr_py) in enumerate(zip(lr_sch_op, lr_sch_py)): + self.assertLess(abs(lr_op - lr_py), 1e-6) + if i > self.step_num: + break + + def test_main(self): + self.check_with_place(paddle.CPUPlace()) + if paddle.is_compiled_with_cuda(): + self.check_with_place(paddle.CUDAPlace(0)) + + +if __name__ == "__main__": + unittest.main() From a7830a293224c21742c892aadefe9971e498952e Mon Sep 17 00:00:00 2001 From: zmx Date: Tue, 19 Oct 2021 10:37:42 +0800 Subject: [PATCH 198/298] bug fix for DeserializeSelectedRows. 
test=develop (#36520) --- paddle/fluid/distributed/service/brpc_utils.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/distributed/service/brpc_utils.cc b/paddle/fluid/distributed/service/brpc_utils.cc index a356b77e73733e..376e820cb7a741 100644 --- a/paddle/fluid/distributed/service/brpc_utils.cc +++ b/paddle/fluid/distributed/service/brpc_utils.cc @@ -273,8 +273,8 @@ void DeserializeSelectedRows(framework::Variable* var, const VarMsg& msg, auto* slr = var->GetMutable(); framework::Tensor* tensor = slr->mutable_value(); slr->set_height(msg.slr_height()); - std::vector tmp_rows(msg.slr_height()); - memcpy(&tmp_rows[0], msg.data().data(), msg.slr_height() * sizeof(int64_t)); + std::vector tmp_rows(msg.dims()[0]); + memcpy(tmp_rows.data(), msg.data().data(), msg.dims()[0] * sizeof(int64_t)); slr->set_rows(tmp_rows); std::vector vec_dim; for (auto& x : msg.dims()) { From 77f4597f81b075e01d98bcde0a25d03e5a390366 Mon Sep 17 00:00:00 2001 From: xiaoting <31891223+tink2123@users.noreply.github.com> Date: Tue, 19 Oct 2021 10:56:15 +0800 Subject: [PATCH 199/298] fix out of range for area interp, test=develop (#36466) --- python/paddle/nn/functional/common.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/paddle/nn/functional/common.py b/python/paddle/nn/functional/common.py index fdd370d7f81e72..7362b284eaefee 100644 --- a/python/paddle/nn/functional/common.py +++ b/python/paddle/nn/functional/common.py @@ -296,7 +296,8 @@ def interpolate(x, ) if resample == 'AREA': - if isinstance(size, list) or isinstance(size, tuple): + if isinstance(size, list) or isinstance(size, tuple) or isinstance( + size, Variable): if len(size) == 0: raise ValueError("output size can not be empty") if len(x.shape) == 3: From 1d5746bd022c1c7bc3e35eb727559f30baaf3b0f Mon Sep 17 00:00:00 2001 From: Xiaoxu Chen Date: Tue, 19 Oct 2021 13:13:16 +0800 Subject: [PATCH 200/298] add rocm support for fft api (#36415) --- paddle/fluid/operators/CMakeLists.txt | 3 +- paddle/fluid/operators/spectral_helper.h | 261 ++++++++ paddle/fluid/operators/spectral_op.cu | 614 +++++++----------- paddle/fluid/platform/dynload/CMakeLists.txt | 2 +- .../fluid/platform/dynload/dynamic_loader.cc | 10 + .../fluid/platform/dynload/dynamic_loader.h | 1 + paddle/fluid/platform/dynload/hipfft.cc | 30 + paddle/fluid/platform/dynload/hipfft.h | 124 ++++ paddle/fluid/platform/enforce.h | 10 + paddle/fluid/platform/enforce_test.cc | 4 + 10 files changed, 679 insertions(+), 380 deletions(-) create mode 100644 paddle/fluid/operators/spectral_helper.h create mode 100644 paddle/fluid/platform/dynload/hipfft.cc create mode 100644 paddle/fluid/platform/dynload/hipfft.h diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index bb31fcf854d88f..78cbc7e8a583b8 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -102,8 +102,7 @@ else() op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale) endif() - -if (WITH_GPU AND (NOT WITH_ROCM)) +if (WITH_GPU OR WITH_ROCM) if (MKL_FOUND AND WITH_ONEMKL) op_library(spectral_op SRCS spectral_op.cc spectral_op.cu DEPS dynload_cuda dynload_mklrt ${OP_HEADER_DEPS}) target_include_directories(spectral_op PRIVATE ${MKL_INCLUDE}) diff --git a/paddle/fluid/operators/spectral_helper.h b/paddle/fluid/operators/spectral_helper.h new file mode 100644 index 00000000000000..9c34d500eac92a --- /dev/null +++ b/paddle/fluid/operators/spectral_helper.h @@ -0,0 +1,261 @@ +// Copyright 
(c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/operators/spectral_op.h" + +#ifdef PADDLE_WITH_HIP +#include "paddle/fluid/platform/dynload/hipfft.h" +#endif + +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/dynload/cufft.h" +#endif + +namespace paddle { +namespace operators { +using ScalarType = framework::proto::VarType::Type; +const int64_t kMaxCUFFTNdim = 3; +const int64_t kMaxDataNdim = kMaxCUFFTNdim + 1; +// This struct is used to easily compute hashes of the +// parameters. It will be the **key** to the plan cache. +struct PlanKey { + // between 1 and kMaxCUFFTNdim, i.e., 1 <= signal_ndim <= 3 + int64_t signal_ndim_; + // These include additional batch dimension as well. + int64_t sizes_[kMaxDataNdim]; + int64_t input_shape_[kMaxDataNdim]; + int64_t output_shape_[kMaxDataNdim]; + FFTTransformType fft_type_; + ScalarType value_type_; + + PlanKey() = default; + + PlanKey(const std::vector& in_shape, + const std::vector& out_shape, + const std::vector& signal_size, FFTTransformType fft_type, + ScalarType value_type) { + // Padding bits must be zeroed for hashing + memset(this, 0, sizeof(*this)); + signal_ndim_ = signal_size.size() - 1; + fft_type_ = fft_type; + value_type_ = value_type; + + std::copy(signal_size.cbegin(), signal_size.cend(), sizes_); + std::copy(in_shape.cbegin(), in_shape.cend(), input_shape_); + std::copy(out_shape.cbegin(), out_shape.cend(), output_shape_); + } +}; + +#if defined(PADDLE_WITH_CUDA) +// An RAII encapsulation of cuFFTHandle +class CuFFTHandle { + ::cufftHandle handle_; + + public: + CuFFTHandle() { + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cufftCreate(&handle_)); + } + + ::cufftHandle& get() { return handle_; } + const ::cufftHandle& get() const { return handle_; } + + ~CuFFTHandle() { + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cufftDestroy(handle_)); + } +}; + +using plan_size_type = long long int; // NOLINT +// This class contains all the information needed to execute a cuFFT plan: +// 1. the plan +// 2. the workspace size needed +class CuFFTConfig { + public: + // Only move semantics is enought for this class. Although we already use + // unique_ptr for the plan, still remove copy constructor and assignment op so + // we don't accidentally copy and take perf hit. 
+ explicit CuFFTConfig(const PlanKey& plan_key) + : CuFFTConfig( + std::vector(plan_key.sizes_, + plan_key.sizes_ + plan_key.signal_ndim_ + 1), + plan_key.signal_ndim_, plan_key.fft_type_, plan_key.value_type_) {} + + // sizes are full signal, including batch size and always two-sided + CuFFTConfig(const std::vector& sizes, const int64_t signal_ndim, + FFTTransformType fft_type, ScalarType dtype) + : fft_type_(fft_type), value_type_(dtype) { + // signal sizes (excluding batch dim) + std::vector signal_sizes(sizes.begin() + 1, sizes.end()); + + // input batch size + const auto batch = static_cast(sizes[0]); + // const int64_t signal_ndim = sizes.size() - 1; + PADDLE_ENFORCE_EQ(signal_ndim, sizes.size() - 1, + platform::errors::InvalidArgument( + "The signal_ndim must be equal to sizes.size() - 1," + "But signal_ndim is: [%d], sizes.size() - 1 is: [%d]", + signal_ndim, sizes.size() - 1)); + + cudaDataType itype, otype, exec_type; + const auto complex_input = has_complex_input(fft_type); + const auto complex_output = has_complex_output(fft_type); + if (dtype == framework::proto::VarType::FP32) { + itype = complex_input ? CUDA_C_32F : CUDA_R_32F; + otype = complex_output ? CUDA_C_32F : CUDA_R_32F; + exec_type = CUDA_C_32F; + } else if (dtype == framework::proto::VarType::FP64) { + itype = complex_input ? CUDA_C_64F : CUDA_R_64F; + otype = complex_output ? CUDA_C_64F : CUDA_R_64F; + exec_type = CUDA_C_64F; + } else if (dtype == framework::proto::VarType::FP16) { + itype = complex_input ? CUDA_C_16F : CUDA_R_16F; + otype = complex_output ? CUDA_C_16F : CUDA_R_16F; + exec_type = CUDA_C_16F; + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "cuFFT only support transforms of type float16, float32 and " + "float64")); + } + + // disable auto allocation of workspace to use allocator from the framework + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cufftSetAutoAllocation( + plan(), /* autoAllocate */ 0)); + + size_t ws_size_t; + + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cufftXtMakePlanMany( + plan(), signal_ndim, signal_sizes.data(), + /* inembed */ nullptr, /* base_istride */ 1, /* idist */ 1, itype, + /* onembed */ nullptr, /* base_ostride */ 1, /* odist */ 1, otype, + batch, &ws_size_t, exec_type)); + + ws_size = ws_size_t; + } + + const cufftHandle& plan() const { return plan_ptr.get(); } + + FFTTransformType transform_type() const { return fft_type_; } + ScalarType data_type() const { return value_type_; } + size_t workspace_size() const { return ws_size; } + + private: + CuFFTHandle plan_ptr; + size_t ws_size; + FFTTransformType fft_type_; + ScalarType value_type_; +}; + +#elif defined(PADDLE_WITH_HIP) +// An RAII encapsulation of cuFFTHandle +class HIPFFTHandle { + ::hipfftHandle handle_; + + public: + HIPFFTHandle() { + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::hipfftCreate(&handle_)); + } + + ::hipfftHandle& get() { return handle_; } + const ::hipfftHandle& get() const { return handle_; } + + ~HIPFFTHandle() { + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::hipfftDestroy(handle_)); + } +}; +using plan_size_type = int; +// This class contains all the information needed to execute a cuFFT plan: +// 1. the plan +// 2. the workspace size needed +class HIPFFTConfig { + public: + // Only move semantics is enought for this class. Although we already use + // unique_ptr for the plan, still remove copy constructor and assignment op so + // we don't accidentally copy and take perf hit. 
+ explicit HIPFFTConfig(const PlanKey& plan_key) + : HIPFFTConfig( + std::vector(plan_key.sizes_, + plan_key.sizes_ + plan_key.signal_ndim_ + 1), + plan_key.signal_ndim_, plan_key.fft_type_, plan_key.value_type_) {} + + // sizes are full signal, including batch size and always two-sided + HIPFFTConfig(const std::vector& sizes, const int64_t signal_ndim, + FFTTransformType fft_type, ScalarType dtype) + : fft_type_(fft_type), value_type_(dtype) { + // signal sizes (excluding batch dim) + std::vector signal_sizes(sizes.begin() + 1, sizes.end()); + + // input batch size + const auto batch = static_cast(sizes[0]); + // const int64_t signal_ndim = sizes.size() - 1; + PADDLE_ENFORCE_EQ(signal_ndim, sizes.size() - 1, + platform::errors::InvalidArgument( + "The signal_ndim must be equal to sizes.size() - 1," + "But signal_ndim is: [%d], sizes.size() - 1 is: [%d]", + signal_ndim, sizes.size() - 1)); + + hipfftType exec_type = [&] { + if (dtype == framework::proto::VarType::FP32) { + switch (fft_type) { + case FFTTransformType::C2C: + return HIPFFT_C2C; + case FFTTransformType::R2C: + return HIPFFT_R2C; + case FFTTransformType::C2R: + return HIPFFT_C2R; + } + } else if (dtype == framework::proto::VarType::FP64) { + switch (fft_type) { + case FFTTransformType::C2C: + return HIPFFT_Z2Z; + case FFTTransformType::R2C: + return HIPFFT_D2Z; + case FFTTransformType::C2R: + return HIPFFT_Z2D; + } + } + PADDLE_THROW(platform::errors::InvalidArgument( + "hipFFT only support transforms of type float32 and float64")); + }(); + + // disable auto allocation of workspace to use allocator from the framework + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::hipfftSetAutoAllocation( + plan(), /* autoAllocate */ 0)); + + size_t ws_size_t; + + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::hipfftMakePlanMany( + plan(), signal_ndim, signal_sizes.data(), + /* inembed */ nullptr, /* base_istride */ 1, /* idist */ 1, + /* onembed */ nullptr, /* base_ostride */ 1, /* odist */ 1, exec_type, + batch, &ws_size_t)); + + ws_size = ws_size_t; + } + + const hipfftHandle& plan() const { return plan_ptr.get(); } + + FFTTransformType transform_type() const { return fft_type_; } + ScalarType data_type() const { return value_type_; } + size_t workspace_size() const { return ws_size; } + + private: + HIPFFTHandle plan_ptr; + size_t ws_size; + FFTTransformType fft_type_; + ScalarType value_type_; +}; +#endif +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/spectral_op.cu b/paddle/fluid/operators/spectral_op.cu index 24dffaad41b5fc..e8a4fac2915d7c 100644 --- a/paddle/fluid/operators/spectral_op.cu +++ b/paddle/fluid/operators/spectral_op.cu @@ -8,10 +8,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - -#include -#include - #include #include #include @@ -24,311 +20,246 @@ #include #include "paddle/fluid/operators/conj_op.h" +#include "paddle/fluid/operators/spectral_helper.h" #include "paddle/fluid/operators/spectral_op.h" #include "paddle/fluid/operators/transpose_op.h" -#include "paddle/fluid/platform/dynload/cufft.h" +#include "paddle/fluid/platform/enforce.h" namespace paddle { namespace operators { namespace { -using ScalarType = framework::proto::VarType::Type; -const int64_t kMaxCUFFTNdim = 3; -const int64_t kMaxDataNdim = kMaxCUFFTNdim + 1; - -static inline std::string get_cufft_error_info(cufftResult error) { - switch (error) { - case CUFFT_SUCCESS: - return "CUFFT_SUCCESS"; - case CUFFT_INVALID_PLAN: - return "CUFFT_INVALID_PLAN"; - case CUFFT_ALLOC_FAILED: - return "CUFFT_ALLOC_FAILED"; - case CUFFT_INVALID_TYPE: - return "CUFFT_INVALID_TYPE"; - case CUFFT_INVALID_VALUE: - return "CUFFT_INVALID_VALUE"; - case CUFFT_INTERNAL_ERROR: - return "CUFFT_INTERNAL_ERROR"; - case CUFFT_EXEC_FAILED: - return "CUFFT_EXEC_FAILED"; - case CUFFT_SETUP_FAILED: - return "CUFFT_SETUP_FAILED"; - case CUFFT_INVALID_SIZE: - return "CUFFT_INVALID_SIZE"; - case CUFFT_UNALIGNED_DATA: - return "CUFFT_UNALIGNED_DATA"; - case CUFFT_INCOMPLETE_PARAMETER_LIST: - return "CUFFT_INCOMPLETE_PARAMETER_LIST"; - case CUFFT_INVALID_DEVICE: - return "CUFFT_INVALID_DEVICE"; - case CUFFT_PARSE_ERROR: - return "CUFFT_PARSE_ERROR"; - case CUFFT_NO_WORKSPACE: - return "CUFFT_NO_WORKSPACE"; - case CUFFT_NOT_IMPLEMENTED: - return "CUFFT_NOT_IMPLEMENTED"; -#ifndef __HIPCC__ - case CUFFT_LICENSE_ERROR: - return "CUFFT_LICENSE_ERROR"; -#endif - case CUFFT_NOT_SUPPORTED: - return "CUFFT_NOT_SUPPORTED"; - default: - std::ostringstream ss; - ss << "unknown error " << error; - return ss.str(); +// Calculates the normalization constant +double fft_normalization_scale(FFTNormMode normalization, + const std::vector& sizes, + const std::vector& dims) { + // auto norm = static_cast(normalization); + if (normalization == FFTNormMode::none) { + return static_cast(1.0); } -} -static inline void CUFFT_CHECK(cufftResult error) { - PADDLE_ENFORCE_CUDA_SUCCESS(error); + int64_t signal_numel = 1; + for (auto dim : dims) { + signal_numel *= sizes[dim]; + } + const double scale_denom = (normalization == FFTNormMode::by_sqrt_n) + ? std::sqrt(signal_numel) + : static_cast(signal_numel); + return static_cast(1.0 / scale_denom); } -// This struct is used to easily compute hashes of the -// parameters. It will be the **key** to the plan cache. -struct PlanKey { - // between 1 and kMaxCUFFTNdim, i.e., 1 <= signal_ndim <= 3 - int64_t signal_ndim_; - // These include additional batch dimension as well. 
- int64_t sizes_[kMaxDataNdim]; - int64_t input_shape_[kMaxDataNdim]; - int64_t output_shape_[kMaxDataNdim]; - FFTTransformType fft_type_; - ScalarType value_type_; - - PlanKey() = default; - - PlanKey(const std::vector& in_shape, - const std::vector& out_shape, - const std::vector& signal_size, FFTTransformType fft_type, - ScalarType value_type) { - // Padding bits must be zeroed for hashing - memset(this, 0, sizeof(*this)); - signal_ndim_ = signal_size.size() - 1; - fft_type_ = fft_type; - value_type_ = value_type; - - std::copy(signal_size.cbegin(), signal_size.cend(), sizes_); - std::copy(in_shape.cbegin(), in_shape.cend(), input_shape_); - std::copy(out_shape.cbegin(), out_shape.cend(), output_shape_); +template +void exec_normalization(const DeviceContext& ctx, const Tensor* in, Tensor* out, + FFTNormMode normalization, + const std::vector& sizes, + const std::vector& axes) { + double scale = fft_normalization_scale(normalization, sizes, axes); + if (scale != 1.0) { + auto eigen_out = framework::EigenVector::Flatten(*out); + auto eigen_in = framework::EigenVector::Flatten(*in); + auto dev = ctx.eigen_device(); + EigenScale::Eval(*dev, eigen_out, eigen_in, + static_cast(scale), + static_cast(0), false); + } else { + framework::TensorCopy(*in, ctx.GetPlace(), out); } -}; - -// An RAII encapsulation of cuFFTHandle -class CuFFTHandle { - ::cufftHandle handle_; - - public: - CuFFTHandle() { CUFFT_CHECK(platform::dynload::cufftCreate(&handle_)); } +} - ::cufftHandle& get() { return handle_; } - const ::cufftHandle& get() const { return handle_; } +#if defined(PADDLE_WITH_CUDA) +CuFFTConfig create_cufft_config(const framework::Tensor& input, + const framework::Tensor& output, + int signal_ndim) { + // Create the transform plan (either from cache or locally) + const auto value_type = framework::IsComplexType(input.type()) + ? framework::ToRealType(input.type()) + : input.type(); + auto fft_type = GetFFTTransformType(input.type(), output.type()); + // signal sizes + std::vector signal_size(signal_ndim + 1); - ~CuFFTHandle() { -// Not using fftDestroy() for rocFFT to work around double freeing of handles -#ifndef __HIPCC__ - CUFFT_CHECK(platform::dynload::cufftDestroy(handle_)); -#endif + signal_size[0] = input.dims()[0]; + for (int64_t i = 1; i <= signal_ndim; ++i) { + auto in_size = input.dims()[i]; + auto out_size = output.dims()[i]; + signal_size[i] = std::max(in_size, out_size); } -}; + PlanKey key(framework::vectorize(input.dims()), + framework::vectorize(output.dims()), signal_size, fft_type, + value_type); -#ifdef __HIPCC__ -using plan_size_type = int; -#else -using plan_size_type = long long int; // NOLINT -#endif + return CuFFTConfig(key); +} -// This class contains all the information needed to execute a cuFFT plan: -// 1. the plan -// 2. the workspace size needed -class CuFFTConfig { - public: - // Only move semantics is enought for this class. Although we already use - // unique_ptr for the plan, still remove copy constructor and assignment op so - // we don't accidentally copy and take perf hit. 
- CuFFTConfig(const CuFFTConfig&) = delete; - CuFFTConfig& operator=(CuFFTConfig const&) = delete; - - explicit CuFFTConfig(const PlanKey& plan_key) - : CuFFTConfig( - std::vector(plan_key.sizes_, - plan_key.sizes_ + plan_key.signal_ndim_ + 1), - plan_key.signal_ndim_, plan_key.fft_type_, plan_key.value_type_) {} - - // sizes are full signal, including batch size and always two-sided - CuFFTConfig(const std::vector& sizes, const int64_t signal_ndim, - FFTTransformType fft_type, ScalarType dtype) - : fft_type_(fft_type), value_type_(dtype) { - // signal sizes (excluding batch dim) - std::vector signal_sizes(sizes.begin() + 1, sizes.end()); - - // input batch size - const auto batch = static_cast(sizes[0]); - // const int64_t signal_ndim = sizes.size() - 1; - PADDLE_ENFORCE_EQ(signal_ndim, sizes.size() - 1, - platform::errors::InvalidArgument( - "The signal_ndim must be equal to sizes.size() - 1," - "But signal_ndim is: [%d], sizes.size() - 1 is: [%d]", - signal_ndim, sizes.size() - 1)); - -#ifdef __HIPCC__ - hipfftType exec_type = [&] { - if (dtype == framework::proto::VarType::FP32) { - switch (fft_type) { - case FFTTransformType::C2C: - return HIPFFT_C2C; - case FFTTransformType::R2C: - return HIPFFT_R2C; - case FFTTransformType::C2R: - return HIPFFT_C2R; - } - } else if (dtype == framework::proto::VarType::FP64) { - switch (fft_type) { - case FFTTransformType::C2C: - return HIPFFT_Z2Z; - case FFTTransformType::R2C: - return HIPFFT_D2Z; - case FFTTransformType::C2R: - return HIPFFT_Z2D; - } - } - PADDLE_THROW(platform::errors::InvalidArgument( - "hipFFT only support transforms of type float32 and float64")); - }(); -#else - cudaDataType itype, otype, exec_type; - const auto complex_input = has_complex_input(fft_type); - const auto complex_output = has_complex_output(fft_type); - if (dtype == framework::proto::VarType::FP32) { - itype = complex_input ? CUDA_C_32F : CUDA_R_32F; - otype = complex_output ? CUDA_C_32F : CUDA_R_32F; - exec_type = CUDA_C_32F; - } else if (dtype == framework::proto::VarType::FP64) { - itype = complex_input ? CUDA_C_64F : CUDA_R_64F; - otype = complex_output ? CUDA_C_64F : CUDA_R_64F; - exec_type = CUDA_C_64F; - } else if (dtype == framework::proto::VarType::FP16) { - itype = complex_input ? CUDA_C_16F : CUDA_R_16F; - otype = complex_output ? CUDA_C_16F : CUDA_R_16F; - exec_type = CUDA_C_16F; - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "cuFFT only support transforms of type float16, float32 and " - "float64")); - } -#endif +// Execute a pre-planned transform +static void exec_cufft_plan_raw(const CuFFTConfig& config, void* in_data, + void* out_data, bool forward) { + auto& plan = config.plan(); - // disable auto allocation of workspace to use allocator from the framework - CUFFT_CHECK(platform::dynload::cufftSetAutoAllocation( - plan(), /* autoAllocate */ 0)); - - size_t ws_size_t; - -// make plan -#ifdef __HIPCC__ - CUFFT_CHECK(hipfftMakePlanMany( - plan(), signal_ndim, signal_sizes.data(), - /* inembed */ nullptr, /* base_istride */ 1, /* idist */ 1, - /* onembed */ nullptr, /* base_ostride */ 1, /* odist */ 1, exec_type, - batch, &ws_size_t)); -#else - - CUFFT_CHECK(platform::dynload::cufftXtMakePlanMany( - plan(), signal_ndim, signal_sizes.data(), - /* inembed */ nullptr, /* base_istride */ 1, /* idist */ 1, itype, - /* onembed */ nullptr, /* base_ostride */ 1, /* odist */ 1, otype, - batch, &ws_size_t, exec_type)); -#endif + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cufftXtExec( + plan, in_data, out_data, forward ? 
CUFFT_FORWARD : CUFFT_INVERSE)); +} - ws_size = ws_size_t; +template +void exec_cufft_plan(const DeviceContext& ctx, const CuFFTConfig& config, + framework::Tensor* input, framework::Tensor* output, + bool forward) { + // execute transform plan + auto fft_type = config.transform_type(); + if (fft_type == FFTTransformType::C2R && forward) { + forward = false; + framework::Tensor input_conj(input->type()); + input_conj.mutable_data(input->dims(), ctx.GetPlace()); + platform::ForRange for_range(ctx, input->numel()); + math::ConjFunctor functor(input->data(), input->numel(), + input_conj.data()); + for_range(functor); + exec_cufft_plan_raw(config, input_conj.data(), output->data(), + forward); + } else if (fft_type == FFTTransformType::R2C && !forward) { + forward = true; + framework::Tensor out_conj(output->type()); + out_conj.mutable_data(output->dims(), ctx.GetPlace()); + exec_cufft_plan_raw(config, input->data(), out_conj.data(), + forward); + + platform::ForRange for_range(ctx, output->numel()); + math::ConjFunctor functor(out_conj.data(), output->numel(), + output->data()); + for_range(functor); + } else { + exec_cufft_plan_raw(config, input->data(), output->data(), + forward); } +} - const cufftHandle& plan() const { return plan_ptr.get(); } +#elif defined(PADDLE_WITH_HIP) - FFTTransformType transform_type() const { return fft_type_; } - ScalarType data_type() const { return value_type_; } - size_t workspace_size() const { return ws_size; } +HIPFFTConfig create_hipfft_config(const framework::Tensor& input, + const framework::Tensor& output, + int signal_ndim) { + // Create the transform plan (either from cache or locally) + const auto value_type = framework::IsComplexType(input.type()) + ? framework::ToRealType(input.type()) + : input.type(); + auto fft_type = GetFFTTransformType(input.type(), output.type()); + // signal sizes + std::vector signal_size(signal_ndim + 1); - private: - CuFFTHandle plan_ptr; - size_t ws_size; - FFTTransformType fft_type_; - ScalarType value_type_; -}; + signal_size[0] = input.dims()[0]; + for (int64_t i = 1; i <= signal_ndim; ++i) { + auto in_size = input.dims()[i]; + auto out_size = output.dims()[i]; + signal_size[i] = std::max(in_size, out_size); + } + PlanKey key(framework::vectorize(input.dims()), + framework::vectorize(output.dims()), signal_size, fft_type, + value_type); + + return HIPFFTConfig(key); +} // Execute a pre-planned transform -static void exec_cufft_plan(const CuFFTConfig& config, void* in_data, - void* out_data, bool forward) { +static void exec_hipfft_plan_raw(const HIPFFTConfig& config, void* in_data, + void* out_data, bool forward) { auto& plan = config.plan(); -#ifdef __HIPCC__ + auto value_type = config.data_type(); if (value_type == framework::proto::VarType::FP32) { switch (config.transform_type()) { case FFTTransformType::C2C: { - CUFFT_CHECK(hipfftExecC2C(plan, static_cast(in_data), - static_cast(out_data), - forward ? HIPFFT_FORWARD : HIPFFT_BACKWARD)); + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::hipfftExecC2C( + plan, static_cast(in_data), + static_cast(out_data), + forward ? 
HIPFFT_FORWARD : HIPFFT_BACKWARD)); return; } case FFTTransformType::R2C: { - CUFFT_CHECK(hipfftExecR2C(plan, static_cast(in_data), - static_cast(out_data))); + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::hipfftExecR2C( + plan, static_cast(in_data), + static_cast(out_data))); return; } case FFTTransformType::C2R: { - CUFFT_CHECK(hipfftExecC2R(plan, static_cast(in_data), - static_cast(out_data))); + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::hipfftExecC2R( + plan, static_cast(in_data), + static_cast(out_data))); return; } } } else if (value_type == framework::proto::VarType::FP64) { switch (config.transform_type()) { case FFTTransformType::C2C: { - CUFFT_CHECK(hipfftExecZ2Z(plan, - static_cast(in_data), - static_cast(out_data), - forward ? HIPFFT_FORWARD : HIPFFT_BACKWARD)); + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::hipfftExecZ2Z( + plan, static_cast(in_data), + static_cast(out_data), + forward ? HIPFFT_FORWARD : HIPFFT_BACKWARD)); return; } case FFTTransformType::R2C: { - CUFFT_CHECK(hipfftExecD2Z(plan, static_cast(in_data), - static_cast(out_data))); + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::hipfftExecD2Z( + plan, static_cast(in_data), + static_cast(out_data))); return; } case FFTTransformType::C2R: { - CUFFT_CHECK(hipfftExecZ2D(plan, - static_cast(in_data), - static_cast(out_data))); + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::hipfftExecZ2D( + plan, static_cast(in_data), + static_cast(out_data))); return; } } } PADDLE_THROW(platform::errors::InvalidArgument( "hipFFT only support transforms of type float32 and float64")); -#else - CUFFT_CHECK(platform::dynload::cufftXtExec( - plan, in_data, out_data, forward ? CUFFT_FORWARD : CUFFT_INVERSE)); -#endif } +template +void exec_hipfft_plan(const DeviceContext& ctx, const HIPFFTConfig& config, + framework::Tensor* input, framework::Tensor* output, + bool forward) { + auto fft_type = config.transform_type(); + if (fft_type == FFTTransformType::C2R && forward) { + forward = false; + framework::Tensor input_conj(input->type()); + input_conj.mutable_data(input->dims(), ctx.GetPlace()); + platform::ForRange for_range(ctx, input->numel()); + math::ConjFunctor functor(input->data(), input->numel(), + input_conj.data()); + for_range(functor); + exec_hipfft_plan_raw(config, input_conj.data(), output->data(), + forward); + } else if (fft_type == FFTTransformType::R2C && !forward) { + forward = true; + framework::Tensor out_conj(output->type()); + out_conj.mutable_data(output->dims(), ctx.GetPlace()); + exec_hipfft_plan_raw(config, input->data(), out_conj.data(), + forward); + + platform::ForRange for_range(ctx, output->numel()); + math::ConjFunctor functor(out_conj.data(), output->numel(), + output->data()); + for_range(functor); + } else { + exec_hipfft_plan_raw(config, input->data(), output->data(), + forward); + } +} + +#endif + // Execute a general unnormalized fft operation (can be c2c, onesided r2c or // onesided c2r) template void exec_fft(const DeviceContext& ctx, const Tensor* X, Tensor* out, const std::vector& dim, bool forward) { const auto x_dims = framework::vectorize(X->dims()); - const auto out_dims = framework::vectorize(out->dims()); const int64_t ndim = static_cast(X->dims().size()); - const int64_t signal_ndim = static_cast(dim.size()); - const int64_t batch_dims = ndim - signal_ndim; auto tensor_place = ctx.GetPlace(); - // Transpose batch dimensions first, then with transforming dims + // make a dim permutation std::vector dim_permute(ndim); - std::vector reverse_dim_permute(ndim); - std::vector 
trans_dims(ndim); std::iota(dim_permute.begin(), dim_permute.end(), int{0}); std::vector is_transformed_dim(ndim); for (const auto& d : dim) { @@ -340,160 +271,89 @@ void exec_fft(const DeviceContext& ctx, const Tensor* X, Tensor* out, std::sort(dim_permute.begin(), batch_end); std::copy(dim.cbegin(), dim.cend(), batch_end); - for (size_t i = 0; i < ndim; i++) { - trans_dims[i] = x_dims[dim_permute[i]]; // shape of input transpose - reverse_dim_permute[dim_permute[i]] = - static_cast(i); // reverse of dim permute - } - framework::Tensor input; - input.Resize(framework::make_ddim(trans_dims)); - input.mutable_data(tensor_place); - /* - auto in_ret = TransposeSimple::run(ctx, *X, dim_permute, input); - if (!in_ret) { - TransCompute(ndim, ctx, *X, input, dim_permute); - } - */ - TransCompute(ndim, ctx, *X, &input, dim_permute); + // transpose input according to dim permutation + auto transposed_input_shape = X->dims().transpose(dim_permute); + framework::Tensor transposed_input; + transposed_input.Resize(transposed_input_shape); + transposed_input.mutable_data(tensor_place); + TransCompute(ndim, ctx, *X, &transposed_input, + dim_permute); // Reshape batch dimensions into a single dimension - std::vector batched_sizes(signal_ndim + 1); + const int64_t signal_ndim = static_cast(dim.size()); + std::vector collapsed_input_shape(signal_ndim + 1); + + auto transposed_input_shape_ = framework::vectorize(transposed_input_shape); + const int64_t batch_dims = ndim - signal_ndim; auto batch_size = - std::accumulate(trans_dims.begin(), trans_dims.begin() + batch_dims, + std::accumulate(transposed_input_shape_.begin(), + transposed_input_shape_.begin() + batch_dims, static_cast(1), std::multiplies()); - batched_sizes[0] = batch_size; - std::copy(trans_dims.begin() + batch_dims, trans_dims.end(), - batched_sizes.begin() + 1); - input.Resize(framework::make_ddim(batched_sizes)); + collapsed_input_shape[0] = batch_size; - // Check the shape of transforming dims with input and output - std::vector signal_size(signal_ndim + 1); - signal_size[0] = batch_size; - for (int64_t i = 0; i < signal_ndim; ++i) { - auto in_size = input.dims()[i + 1]; - auto out_size = out_dims[dim[i]]; - signal_size[i + 1] = std::max(in_size, out_size); - PADDLE_ENFORCE_EQ( - (in_size == signal_size[i + 1] || - in_size == (signal_size[i + 1] / 2) + 1), - true, - platform::errors::InvalidArgument( - "The dimension[%d] of Input size: [%d] must be equal or half to " - "The dimension[%d] of Output size: [%d]", - dim[i], in_size, dim[i], out_size)); - PADDLE_ENFORCE_EQ( - (out_size == signal_size[i + 1] || - out_size == (signal_size[i + 1] / 2) + 1), - true, - platform::errors::InvalidArgument( - "The dimension[%d] of Output size: [%d] must be equal or half to " - "The dimension[%d] of Input size: [%d]", - dim[i], out_size, dim[i], in_size)); - } + std::copy(transposed_input_shape_.begin() + batch_dims, + transposed_input_shape_.end(), collapsed_input_shape.begin() + 1); - std::vector reshape_out_sizes(ndim); - for (size_t i = 0; i < ndim; ++i) { - reshape_out_sizes[i] = out_dims[dim_permute[i]]; - } - std::vector batched_out_sizes(batched_sizes.begin(), - batched_sizes.end()); + framework::Tensor& collapsed_input = transposed_input; + collapsed_input.Resize(framework::make_ddim(collapsed_input_shape)); + + // make a collpased output + const auto out_dims = framework::vectorize(out->dims()); + std::vector collapsed_output_shape(1 + signal_ndim); + collapsed_output_shape[0] = batch_size; for (size_t i = 0; i < dim.size(); ++i) { - 
batched_out_sizes[i + 1] = out_dims[dim[i]]; + collapsed_output_shape[i + 1] = out_dims[dim[i]]; } - - // output - framework::Tensor output; - output.Resize(framework::make_ddim(batched_out_sizes)); - output.mutable_data(tensor_place); - - // Create the transform plan (either from cache or locally) - const auto value_type = framework::IsComplexType(input.type()) - ? framework::ToRealType(input.type()) - : input.type(); - auto fft_type = GetFFTTransformType(input.type(), output.type()); - - PlanKey Key(framework::vectorize(input.dims()), - framework::vectorize(output.dims()), signal_size, fft_type, - value_type); - CuFFTConfig uncached_plan(Key); - CuFFTConfig* config = &uncached_plan; - auto& plan = config->plan(); - + framework::Tensor collapsed_output; + collapsed_output.Resize(framework::make_ddim(collapsed_output_shape)); + collapsed_output.mutable_data(tensor_place); + +#if defined(PADDLE_WITH_CUDA) + // create plan + CuFFTConfig config = + create_cufft_config(collapsed_input, collapsed_output, signal_ndim); // prepare cufft for execution - CUFFT_CHECK(platform::dynload::cufftSetStream(plan, ctx.stream())); + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cufftSetStream(config.plan(), ctx.stream())); framework::Tensor workspace_tensor; - workspace_tensor.mutable_data(tensor_place, config->workspace_size()); - CUFFT_CHECK( - platform::dynload::cufftSetWorkArea(plan, workspace_tensor.data())); + workspace_tensor.mutable_data(tensor_place, config.workspace_size()); + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cufftSetWorkArea( + config.plan(), workspace_tensor.data())); + // execute transform plan + exec_cufft_plan(ctx, config, &collapsed_input, + &collapsed_output, forward); +#elif defined(PADDLE_WITH_HIP) + // create plan + HIPFFTConfig config = + create_hipfft_config(collapsed_input, collapsed_output, signal_ndim); + // prepare cufft for execution + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::hipfftSetStream(config.plan(), ctx.stream())); + framework::Tensor workspace_tensor; + workspace_tensor.mutable_data(tensor_place, config.workspace_size()); + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::hipfftSetWorkArea( + config.plan(), workspace_tensor.data())); // execute transform plan - if (fft_type == FFTTransformType::C2R && forward) { - forward = false; - framework::Tensor input_conj(input.type()); - input_conj.mutable_data(input.dims(), ctx.GetPlace()); - platform::ForRange for_range(ctx, input.numel()); - math::ConjFunctor functor(input.data(), input.numel(), - input_conj.data()); - for_range(functor); - exec_cufft_plan(*config, input_conj.data(), output.data(), - forward); - } else if (fft_type == FFTTransformType::R2C && !forward) { - forward = true; - framework::Tensor out_conj(output.type()); - out_conj.mutable_data(output.dims(), ctx.GetPlace()); - exec_cufft_plan(*config, input.data(), out_conj.data(), - forward); - - platform::ForRange for_range(ctx, output.numel()); - math::ConjFunctor functor(out_conj.data(), output.numel(), - output.data()); - for_range(functor); - } else { - exec_cufft_plan(*config, input.data(), output.data(), forward); - } + exec_hipfft_plan(ctx, config, &collapsed_input, + &collapsed_output, forward); +#endif // Inverting output by reshape and transpose to original batch and dimension - output.Resize(framework::make_ddim(reshape_out_sizes)); - out->Resize(framework::make_ddim(out_dims)); - TransCompute(ndim, ctx, output, out, reverse_dim_permute); -} + auto transposed_out_shape = out->dims().transpose(dim_permute); -// Calculates the 
normalization constant -double fft_normalization_scale(FFTNormMode normalization, - const std::vector& sizes, - const std::vector& dims) { - // auto norm = static_cast(normalization); - if (normalization == FFTNormMode::none) { - return static_cast(1.0); - } + collapsed_output.Resize(transposed_out_shape); + auto& transposed_output = collapsed_output; - int64_t signal_numel = 1; - for (auto dim : dims) { - signal_numel *= sizes[dim]; + std::vector reverse_dim_permute(ndim); + for (size_t i = 0; i < ndim; i++) { + reverse_dim_permute[dim_permute[i]] = i; } - const double scale_denom = (normalization == FFTNormMode::by_sqrt_n) - ? std::sqrt(signal_numel) - : static_cast(signal_numel); - return static_cast(1.0 / scale_denom); -} -template -void exec_normalization(const DeviceContext& ctx, const Tensor* in, Tensor* out, - FFTNormMode normalization, - const std::vector& sizes, - const std::vector& axes) { - double scale = fft_normalization_scale(normalization, sizes, axes); - if (scale != 1.0) { - auto eigen_out = framework::EigenVector::Flatten(*out); - auto eigen_in = framework::EigenVector::Flatten(*in); - auto dev = ctx.eigen_device(); - EigenScale::Eval(*dev, eigen_out, eigen_in, - static_cast(scale), - static_cast(0), false); - } else { - framework::TensorCopy(*in, ctx.GetPlace(), out); - } + TransCompute(ndim, ctx, transposed_output, out, + reverse_dim_permute); } + } // anonymous namespace // Use the optimized path to perform single R2C or C2R if transformation dim is diff --git a/paddle/fluid/platform/dynload/CMakeLists.txt b/paddle/fluid/platform/dynload/CMakeLists.txt index 8c64aad46cfc80..6e90ccfc51e1b6 100644 --- a/paddle/fluid/platform/dynload/CMakeLists.txt +++ b/paddle/fluid/platform/dynload/CMakeLists.txt @@ -7,7 +7,7 @@ if (NOT WITH_NV_JETSON) endif() if (WITH_ROCM) - list(APPEND HIP_SRCS rocblas.cc miopen.cc hiprand.cc) + list(APPEND HIP_SRCS rocblas.cc miopen.cc hiprand.cc hipfft.cc) endif() # There is no macOS version of NCCL. diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc index 0c5c47e38f85ef..1bfd48b1339071 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.cc +++ b/paddle/fluid/platform/dynload/dynamic_loader.cc @@ -356,6 +356,16 @@ void* GetCurandDsoHandle() { #endif } +#ifdef PADDLE_WITH_HIP +void* GetROCFFTDsoHandle() { +#if defined(__APPLE__) || defined(__OSX__) + return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "librocfft.dylib"); +#else + return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "librocfft.so"); +#endif +} +#endif + void* GetNvjpegDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvjpeg.dylib"); diff --git a/paddle/fluid/platform/dynload/dynamic_loader.h b/paddle/fluid/platform/dynload/dynamic_loader.h index 6260efdf71c590..1a66f4b979207e 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.h +++ b/paddle/fluid/platform/dynload/dynamic_loader.h @@ -44,6 +44,7 @@ void* GetOpDsoHandle(const std::string& dso_name); void* GetNvtxDsoHandle(); void* GetCUFFTDsoHandle(); void* GetMKLRTDsoHandle(); +void* GetROCFFTDsoHandle(); void SetPaddleLibPath(const std::string&); } // namespace dynload diff --git a/paddle/fluid/platform/dynload/hipfft.cc b/paddle/fluid/platform/dynload/hipfft.cc new file mode 100644 index 00000000000000..767d2161be9d8d --- /dev/null +++ b/paddle/fluid/platform/dynload/hipfft.cc @@ -0,0 +1,30 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/dynload/hipfft.h" + +namespace paddle { +namespace platform { +namespace dynload { + +std::once_flag hipfft_dso_flag; +void *hipfft_dso_handle; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +HIPFFT_FFT_ROUTINE_EACH(DEFINE_WRAP); + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/hipfft.h b/paddle/fluid/platform/dynload/hipfft.h new file mode 100644 index 00000000000000..50c25935e41b7e --- /dev/null +++ b/paddle/fluid/platform/dynload/hipfft.h @@ -0,0 +1,124 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once +#ifdef PADDLE_WITH_HIP +#include + +#include // NOLINT + +#include "paddle/fluid/platform/dynload/dynamic_loader.h" +#include "paddle/fluid/platform/port.h" + +namespace paddle { +namespace platform { +namespace dynload { +extern std::once_flag hipfft_dso_flag; +extern void *hipfft_dso_handle; + +#define DECLARE_DYNAMIC_LOAD_HIPFFT_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) 
{ \ + using hipfftFunc = decltype(&::__name); \ + std::call_once(hipfft_dso_flag, []() { \ + hipfft_dso_handle = paddle::platform::dynload::GetROCFFTDsoHandle(); \ + }); \ + static void *p_##__name = dlsym(hipfft_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name + +#define HIPFFT_FFT_ROUTINE_EACH(__macro) \ + __macro(hipfftPlan1d); \ + __macro(hipfftPlan2d); \ + __macro(hipfftPlan3d); \ + __macro(hipfftPlanMany); \ + __macro(hipfftMakePlan1d); \ + __macro(hipfftMakePlanMany); \ + __macro(hipfftMakePlanMany64); \ + __macro(hipfftGetSizeMany64); \ + __macro(hipfftEstimate1d); \ + __macro(hipfftEstimate2d); \ + __macro(hipfftEstimate3d); \ + __macro(hipfftEstimateMany); \ + __macro(hipfftCreate); \ + __macro(hipfftGetSize1d); \ + __macro(hipfftGetSizeMany); \ + __macro(hipfftGetSize); \ + __macro(hipfftSetWorkArea); \ + __macro(hipfftSetAutoAllocation); \ + __macro(hipfftExecC2C); \ + __macro(hipfftExecR2C); \ + __macro(hipfftExecC2R); \ + __macro(hipfftExecZ2Z); \ + __macro(hipfftExecD2Z); \ + __macro(hipfftExecZ2D); \ + __macro(hipfftSetStream); \ + __macro(hipfftDestroy); \ + __macro(hipfftGetVersion); \ + __macro(hipfftGetProperty); + +HIPFFT_FFT_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_HIPFFT_WRAP); + +inline const char *hipfftGetErrorString(hipfftResult_t status) { + switch (status) { + case HIPFFT_SUCCESS: + return "'HIPFFT_SUCCESS'. The hipFFT operation was successful."; + case HIPFFT_INVALID_PLAN: + return "'HIPFFT_INVALID_PLAN'. hipFFT was passed an invalid plan handle."; + case HIPFFT_ALLOC_FAILED: + return "'HIPFFT_ALLOC_FAILED'. hipFFT failed to allocate GPU or CPU " + "memory."; + case HIPFFT_INVALID_TYPE: + return "'HIPFFT_INVALID_TYPE'. No longer used."; + case HIPFFT_INVALID_VALUE: + return "'HIPFFT_INVALID_VALUE'. User specified an invalid pointer or " + "parameter."; + case HIPFFT_INTERNAL_ERROR: + return "'HIPFFT_INTERNAL_ERROR'. Driver or internal hipFFT library " + "error."; + case HIPFFT_EXEC_FAILED: + return "'HIPFFT_EXEC_FAILED'. Failed to execute an FFT on the GPU."; + case HIPFFT_SETUP_FAILED: + return "'HIPFFT_SETUP_FAILED'. The hipFFT library failed to initialize."; + case HIPFFT_INVALID_SIZE: + return "'HIPFFT_INVALID_SIZE'. User specified an invalid transform size."; + case HIPFFT_UNALIGNED_DATA: + return "'HIPFFT_UNALIGNED_DATA'. No longer used."; + case HIPFFT_INCOMPLETE_PARAMETER_LIST: + return "'HIPFFT_INCOMPLETE_PARAMETER_LIST'. Missing parameters in call."; + case HIPFFT_INVALID_DEVICE: + return "'HIPFFT_INVALID_DEVICE'. Execution of a plan was on different " + "GPU than plan creation."; + case HIPFFT_PARSE_ERROR: + return "'HIPFFT_PARSE_ERROR'. Internal plan database error."; + case HIPFFT_NO_WORKSPACE: + return "'HIPFFT_NO_WORKSPACE'. No workspace has been provided prior to " + "plan execution."; + case HIPFFT_NOT_IMPLEMENTED: + return "'HIPFFT_NOT_IMPLEMENTED'. Function does not implement " + "functionality for parameters given."; + case HIPFFT_NOT_SUPPORTED: + return "'HIPFFT_NOT_SUPPORTED'. Operation is not supported for " + "parameters given."; + default: + return "HIPFFT_STATUS_UNKNOWN_ERROR"; + } +} +} // namespace dynload +} // namespace platform +} // namespace paddle + +#endif diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index 7427060add8b10..caa495bb7f8c52 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -86,6 +86,7 @@ limitations under the License. 
*/ #endif // PADDLE_WITH_CUDA #ifdef PADDLE_WITH_HIP +#include "paddle/fluid/platform/dynload/hipfft.h" #include "paddle/fluid/platform/dynload/hiprand.h" #include "paddle/fluid/platform/dynload/miopen.h" #include "paddle/fluid/platform/dynload/rocblas.h" @@ -1113,6 +1114,14 @@ inline std::string build_rocm_error_msg(ncclResult_t nccl_result) { } #endif // not(__APPLE__) and PADDLE_WITH_NCCL +/***** HIPFFT ERROR *****/ +inline bool is_error(hipfftResult_t stat) { return stat != HIPFFT_SUCCESS; } + +inline std::string build_rocm_error_msg(hipfftResult_t stat) { + std::string msg(" HIPFFT error, "); + return msg + platform::dynload::hipfftGetErrorString(stat) + " "; +} + namespace details { template @@ -1129,6 +1138,7 @@ DEFINE_EXTERNAL_API_TYPE(hipError_t, hipSuccess); DEFINE_EXTERNAL_API_TYPE(hiprandStatus_t, HIPRAND_STATUS_SUCCESS); DEFINE_EXTERNAL_API_TYPE(miopenStatus_t, miopenStatusSuccess); DEFINE_EXTERNAL_API_TYPE(rocblas_status, rocblas_status_success); +DEFINE_EXTERNAL_API_TYPE(hipfftResult_t, HIPFFT_SUCCESS); #if !defined(__APPLE__) && defined(PADDLE_WITH_RCCL) DEFINE_EXTERNAL_API_TYPE(ncclResult_t, ncclSuccess); diff --git a/paddle/fluid/platform/enforce_test.cc b/paddle/fluid/platform/enforce_test.cc index c6d5f171ddce4d..6ff9e6ea903cd3 100644 --- a/paddle/fluid/platform/enforce_test.cc +++ b/paddle/fluid/platform/enforce_test.cc @@ -331,6 +331,10 @@ TEST(enforce, hip_success) { CheckCudaStatusFailure(rocblas_status_invalid_handle, "Rocblas error")); EXPECT_TRUE( CheckCudaStatusFailure(rocblas_status_invalid_value, "Rocblas error")); + EXPECT_TRUE(CheckCudaStatusSuccess(HIPFFT_SUCCESS)); + EXPECT_TRUE(CheckCudaStatusFailure(HIPFFT_INVALID_PLAN, "HIPFFT error")); + EXPECT_TRUE(CheckCudaStatusFailure(HIPFFT_ALLOC_FAILED, "HIPFFT error")); + #if !defined(__APPLE__) && defined(PADDLE_WITH_RCCL) EXPECT_TRUE(CheckCudaStatusSuccess(ncclSuccess)); EXPECT_TRUE(CheckCudaStatusFailure(ncclUnhandledCudaError, "Rccl error")); From a573a7ed7f4113cc7658b38f889e442bc805171e Mon Sep 17 00:00:00 2001 From: YipZLF <22539457+YipZLF@users.noreply.github.com> Date: Tue, 19 Oct 2021 14:03:46 +0800 Subject: [PATCH 201/298] Add auto parallel cost model and unittests (#36363) * Add auto parallel cost model and unittests * Fixed code styles. * Fixed bugs and codes style * fixed typo * Improved code style: object encapsulation. * Fixed codes. 
* Refractored estimate_cost * Fixed typo --- .../distributed/auto_parallel/__init__.py | 1 + .../distributed/auto_parallel/cost_model.py | 741 ++++++++++++++++++ .../fluid/tests/unittests/CMakeLists.txt | 3 + .../test_auto_parallel_cost_model.py | 236 ++++++ 4 files changed, 981 insertions(+) create mode 100644 python/paddle/distributed/auto_parallel/cost_model.py create mode 100644 python/paddle/fluid/tests/unittests/test_auto_parallel_cost_model.py diff --git a/python/paddle/distributed/auto_parallel/__init__.py b/python/paddle/distributed/auto_parallel/__init__.py index 31f92e2575a1f8..2779a9feb0b833 100644 --- a/python/paddle/distributed/auto_parallel/__init__.py +++ b/python/paddle/distributed/auto_parallel/__init__.py @@ -21,5 +21,6 @@ from .completion import complete_annotation # noqa: F401 from .completion import complete_backward_annotation # noqa: F401 from .reshard import reshard # noqa: F401 +from .cost_model import estimate_cost __all__ = [] diff --git a/python/paddle/distributed/auto_parallel/cost_model.py b/python/paddle/distributed/auto_parallel/cost_model.py new file mode 100644 index 00000000000000..3fd438e2a624a7 --- /dev/null +++ b/python/paddle/distributed/auto_parallel/cost_model.py @@ -0,0 +1,741 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
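+# Overview of the model below: each rank's sub-program is parsed into a
+# graph of variable and op cost nodes; communication ops are priced with a
+# simple bandwidth model and computation ops with per-op standalone cost
+# data; linear chains and parallel branches are merged to shrink the
+# runtime graph; the surviving costs are accumulated into per-stage
+# forward/backward/optimizer times, a liveness-based memory estimate, and
+# a pipeline-schedule simulation.
+#
+# Rough usage sketch, inferred from the methods in this file (argument
+# values are illustrative; the exported estimate_cost helper defined
+# further below is the intended entry point and may sequence these calls
+# differently):
+#
+#   cm = CostModel(batch_size=4, microbatch_num=2,
+#                  standalone_cost_data=cost_data,
+#                  pipeline_config=pp2rank)
+#   cm.parse_program(distributed_programs)
+#   cm.build_op_graph()
+#   cm.merge_comm()
+#   cm.build_runtime_graph()
+#   while cm.merge_linear() + cm.merge_branch() > 0:
+#       pass
+#   fwd, bwd, optim = cm.get_runtime_cost()
+#   static_mem, peak_mem = cm.get_mem()
+#   total_time = cm.get_pipeline_time()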
+import numpy as np +import json +import queue +import copy +from enum import Enum +import paddle + +SUCC = 0 # successor +PRED = 1 # predecessor + + +class CostNodeType(Enum): + DEFAULT = 0 + COMPUTATION = 1 + COMMUNICATION = 2 + VARIABLE = 3 + MERGED = 4 + NOP = 5 + + +class Cost(object): + def __init__(self): + self.runtime = None + self.static_mem = None + self.peak_mem = None + + +class CostModelMode(Enum): + DEFAULT = 0 + BENCHMARKING = 1 # costs based on trial runs + ANALYSIS = 2 # costs based on analysis + MIXED = 3 + + +class CostNode(object): + def __init__(self, node, node_type, id=None): + self.id = id + self.node = node + self.type = node_type + self._cost = 0 + self.is_optim = False + self.is_bwd = False + + @property + def cost(self): + return self._cost + + @cost.setter + def cost(self, cost): + if cost < 0: + raise ValueError('Cost must be above 0.') + self._cost = cost + + +class MergedOpsCostNode(CostNode): + def __init__(self, node_type, id=None, base_node_list=None, is_bwd=False): + super(MergedOpsCostNode, self).__init__(None, node_type, id) + self.node_list = base_node_list + self.is_bwd = is_bwd + + +class CommOpCostNode(CostNode): + def __init__(self, + node, + node_type, + id=None, + comm_node_list=None, + is_bwd=False): + super(CommOpCostNode, self).__init__(node, node_type, id) + self.node_list = comm_node_list + self.ranks = [] + self.comm_type = node.type + self.is_bwd = is_bwd + + def set_ranks(self, ranks): + self.ranks = ranks + + def set_shapes(self, input_shape, output_shape): + self.input_shape = input_shape + self.output_shape = output_shape + + def init_comm_cost(self, cluster=None): + # ref: https://github.com/NVIDIA/nccl-tests/blob/master/doc/PERFORMANCE.md + # should get from `cluster` + BANDWIDTH = 32 * 1024 / 1000 # MB/ms, V100 PCIe + num_ranks = len(self.ranks) + comm_volumn = np.prod(self.input_shape) * 4 + + if 'allreduce' in self.comm_type: + self._cost = comm_volumn / (BANDWIDTH * num_ranks / + (2 * (num_ranks - 1))) + elif 'gather' in self.comm_type: + self._cost = comm_volumn / (BANDWIDTH * num_ranks / (num_ranks - 1)) + elif 'broadcast' in self.comm_type: + self._cost = comm_volumn / BANDWIDTH + elif 'send' in self.comm_type or 'recv' in self.comm_type: + self._cost = comm_volumn / BANDWIDTH + else: + self._cost = 0 + + +class TensorCostNode(CostNode): + def __init__(self, + node, + node_type, + id=None, + base_node_list=None, + batch_size=None, + shared_node_id=None): + super(TensorCostNode, self).__init__(node, node_type, id) + self.shape = node.shape + self.dtype = node.dtype + self.dtype_factor = 1 + self.persistable = None + self.shared_node_id = shared_node_id + if self.dtype == paddle.float32 or node.dtype == paddle.int32: + self.dtype_factor *= 4 + elif node.dtype == paddle.int64: + self.dtype_factor *= 8 + else: + raise NotImplementedError("{} not counted".format(v.node.dtype)) + + self.batch_size = None + if batch_size is not None: + self.batch_size = batch_size + + def get_size(self): + p = 1 + for i in self.node.shape: + if i == -1: # deal with placeholder + assert self.batch_size is not None, "Batch size not decided." 
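+                # A -1 entry is a placeholder for the batch axis; replace it
+                # with the configured batch size so the element-count product
+                # below is well defined.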
+ i = self.batch_size + p *= i + return p + + +class CompOpCostNode(CostNode): + def __init__(self, node, node_type, id=None, is_bwd=False, is_optim=False): + super(CompOpCostNode, self).__init__(node, node_type, id) + self.is_bwd = is_bwd + self.is_optim = is_optim + + def init_comp_cost(self, cost_data): + # TODO: improve fluid.CostModel for more specific cost_data + op_name = self.node.type + if op_name in cost_data.keys(): + self.cost = cost_data[op_name] + else: + self.cost = 0.0 + + +class PipeEvent(object): + def __init__(self, stage_id, event_name, duration, start_time=-1): + self.stage_id = stage_id + self.name = event_name + self.duration = duration + self.s_time = start_time + self.e_time = -1 + + +class CostModel(object): + def __init__(self, + mode=CostModelMode.BENCHMARKING, + cluster=None, + batch_size=1, + microbatch_num=1, + opcall_overhead=0, + standalone_cost_data=None, + pipeline_config=None): + self.mode = mode + + # parameters + self.opcall_overhead = opcall_overhead + self.batch_size = batch_size + self.microbatch_num = microbatch_num + + self.nodes = {} # name -> node + + self.origin_graph = {} # original graph + self.op_graph = {} # op graph (no variables nodes) + self.runtime_graph = {} # runtime graph, for simulation + + self.cluster = cluster + self.cost_data = standalone_cost_data + self.pp2rank = pipeline_config + if self.pp2rank is not None: + self.rank2pp = {} + for stage_idx, ranks in enumerate(self.pp2rank): + for rank in ranks: + self.rank2pp[rank] = stage_idx + else: + self.rank2pp = None + + self.ring2rank = {} + + self.fwd_time = [] + self.bwd_time = [] + self.optim_time = [] + + def _parse_sub_program(self, program, nodes, graph, cost_data, sub_idx): + assert len( + program.blocks) == 1, "Program more than 1 block not supported." 
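+        # Build the per-rank graph: one TensorCostNode per variable and one
+        # Comp/CommOpCostNode per op, linked through PRED/SUCC edge lists via
+        # the variable nodes. Communication ops record their ring membership
+        # so their participating ranks can be resolved later, and a final
+        # pass renames persistable variables that are written in place so the
+        # resulting graph stays a DAG.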
+ block = program.blocks[0] + + for var in block.vars.values(): + var_id = var.name + nodes[var_id] = TensorCostNode(var, CostNodeType.VARIABLE, var_id) + graph[var_id] = [[], []] + + for op in block.ops: + op_id = op.type + "_" + str(op.idx) + if op.type.startswith('c_') or op.type.startswith( + 'send') or op.type.startswith('recv'): + is_bwd = False + if op.type.startswith('c_'): + ring_id = op.attr('ring_id') + if ring_id not in self.ring2rank: + self.ring2rank[ring_id] = set() + self.ring2rank[ring_id].add(sub_idx) + is_bwd = '@GRAD' in op.output('Out')[0] + elif op.type.startswith('recv'): + is_bwd = '@GRAD' in op.output('Out')[0] + elif op.type.startswith('send'): + is_bwd = '@GRAD' in op.input('X')[0] + op_node = CommOpCostNode(op, CostNodeType.COMMUNICATION, op_id, + is_bwd) + else: + is_bwd = '_grad' in op.type + is_optim = 'LearningRate' in op.input_names + op_node = CompOpCostNode(op, CostNodeType.COMPUTATION, op_id, + is_bwd, is_optim) + op_node.init_comp_cost(cost_data) + + nodes[op_id] = op_node + graph[op_id] = [[], []] + + comm_input_shape = [0] + comm_output_shape = [0] + for i in range(len(op.input_names)): + try: + var_id = op.input(op.input_names[i])[0] + var_node = nodes[var_id] + graph[op_id][PRED].append(var_node.id) + graph[var_id][SUCC].append(op_node.id) + comm_input_shape = var_node.shape + except: + continue + for i in range(len(op.output_names)): + try: + var_id = op.output(op.output_names[i])[0] + var_node = nodes[var_id] + graph[op_id][SUCC].append(var_node.id) + graph[var_id][PRED].append(op_node.id) + comm_output_shape = var_node.shape + except: + continue + if op_node.type == CostNodeType.COMMUNICATION: + op_node.set_shapes(comm_input_shape, comm_output_shape) + + # resolve hazard: rename the r/w hazard variable nodes to ensure self.origin_graph is a DAG + new_var_dict = {} + for node_id, node in nodes.items(): + if node.type == CostNodeType.VARIABLE and node.node.persistable: + write_op_cnt = 0 + for pred_id in graph[node_id][PRED]: + pred = nodes[pred_id] + if pred.type == CostNodeType.COMPUTATION and ( + pred_id in graph[node_id][SUCC]): + + graph[pred_id][SUCC].remove(node_id) + graph[node_id][PRED].remove(pred_id) + + write_op_cnt += 1 + new_var_id = node_id + '_write_{}'.format(write_op_cnt) + new_var = TensorCostNode( + node.node, + CostNodeType.VARIABLE, + new_var_id, + shared_node_id=node_id) + + graph[new_var_id] = [[], []] + graph[pred_id][SUCC].append(new_var_id) + graph[new_var_id][PRED].append(pred_id) + + new_var_dict[new_var_id] = new_var + for k, v in new_var_dict.items(): + nodes[k] = v + return nodes + + def parse_program(self, distributed_program): + self.distributed_program = distributed_program + self.total_rank = len(self.distributed_program) + sub_prog_cnt = len(distributed_program) + self.nodes = [] * sub_prog_cnt + self.origin_graph = [] * sub_prog_cnt # original graph + self.op_graph = [] * sub_prog_cnt # op graph (no variables nodes) + self.runtime_graph = [] * sub_prog_cnt # runtime graph, for simulation + + for sub_idx, sub_prog in enumerate(distributed_program): + self.nodes.append({}) + self.origin_graph.append({}) + self.op_graph.append({}) + self.runtime_graph.append({}) + self._parse_sub_program( + sub_prog, self.nodes[sub_idx], self.origin_graph[sub_idx], + self.cost_data[0 if self.rank2pp is None else self.rank2pp[ + sub_idx]], sub_idx) + return self.nodes + + def _find_succ_op(self, node_id, sub_idx=0): + succ_ops_id = [] + for succ_id in self.origin_graph[sub_idx][node_id][SUCC]: + succ = self.nodes[sub_idx][succ_id] 
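+            # Ops are linked only through variable nodes, so step over
+            # variables recursively and collect the nearest downstream op(s).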
+ if succ.type == CostNodeType.COMMUNICATION or \ + succ.type == CostNodeType.COMPUTATION: + succ_ops_id.append(succ_id) + elif succ.type == CostNodeType.VARIABLE: + succ_ops_id = succ_ops_id + self._find_succ_op(succ_id, sub_idx) + else: + raise NotImplementedError( + 'This type of node not supported yet:{}'.format(succ.type)) + return succ_ops_id + + def build_op_graph(self): + for sub_idx in range(self.total_rank): + op_nodes_id = [] + for node_id, node in self.nodes[sub_idx].items(): + if node.type == CostNodeType.VARIABLE: + continue + self.op_graph[sub_idx][node_id] = [[], []] + op_nodes_id.append(node_id) + for op_id in op_nodes_id: + succ_nodes_id = self._find_succ_op(op_id, sub_idx) + + self.op_graph[sub_idx][op_id][SUCC] = succ_nodes_id + for succ_id in succ_nodes_id: + self.op_graph[sub_idx][succ_id][PRED].append(op_id) + + def build_runtime_graph(self): + self.runtime_graph = copy.deepcopy(self.op_graph) + + def eliminate_multi_edges(self, graph=None): + for node_id, edges in graph.items(): + graph[node_id][PRED] = list(set(edges[PRED])) + graph[node_id][SUCC] = list(set(edges[SUCC])) + + def merge_comm(self): + for sub_idx in range(self.total_rank): + for node_id, edges in self.op_graph[sub_idx].items(): + node = self.nodes[sub_idx][node_id] + if node_id.startswith('c_'): + ring_id = node.node.attr('ring_id') + node.set_ranks(list(self.ring2rank[ring_id])) + node.init_comm_cost(self.cluster) + elif node_id.startswith('send') or node_id.startswith('recv'): + peer_rank = node.node.attr('peer') + node.set_ranks([sub_idx, peer_rank]) + node.init_comm_cost(self.cluster) + else: + pass # Not communication op + + def _merge_node(self, to_merge_node_list, merge_type='linear', nodes=None): + nodes_list = [] + node_cost = 0 + for node in to_merge_node_list: + if isinstance(node, MergedOpsCostNode): + nodes_list += node.node_list + else: + nodes_list.append(node.id) + if merge_type == 'linear': + node_cost += node.cost + elif merge_type == 'branch': + node_cost = max(node_cost, node.cost) + else: + raise NotImplementedError( + 'This type of merging is not supported:{}'.format( + merge_type)) + merged_node_id = 'merged_' + str(len(nodes)) + is_bwd = to_merge_node_list[0].is_bwd + merged_node = MergedOpsCostNode( + CostNodeType.MERGED, + id=merged_node_id, + base_node_list=nodes_list, + is_bwd=is_bwd) + merged_node.cost = node_cost + return merged_node_id, merged_node + + def merge_linear(self): + ''' + This method does the following: + If X depends on Y only, they must be run sequentially. + [ e.g. A ->- C ->- D D and E depends on C only.] + [ B ->-/ \->- E C depends on A and B. ] + We merge X and Y into a new node and sum up their cost time. + ''' + cnt = 0 + for sub_idx in range(self.total_rank): + cnt += self._merge_linear( + self.nodes[sub_idx], self.runtime_graph[sub_idx], is_bwd=False) + cnt += self._merge_linear( + self.nodes[sub_idx], self.runtime_graph[sub_idx], is_bwd=True) + return cnt + + def merge_branch(self): + ''' + This method does the following: + If a node has more than one successor, there is *branch*. + [ e.g. 
A ->- B ->- D ] + [ \->- C ->- / , B and C can be run at the same time ] + case 1: if B or C is null (or D is directly dependent on A), + it's equivalent to A->C->D or A->B->D, fall back to self.merge_linear + case 2: if both B and C are some op, + merged_cost = max(cost(B), cost(C)) + ''' + cnt = 0 + for sub_idx in range(self.total_rank): + cnt += self._merge_branch( + self.nodes[sub_idx], self.runtime_graph[sub_idx], is_bwd=False) + cnt += self._merge_branch( + self.nodes[sub_idx], self.runtime_graph[sub_idx], is_bwd=True) + return cnt + + def _merge_linear(self, nodes, runtime_graph, is_bwd=False): + reduct_cnt = 0 + rt_nodes_id = list(runtime_graph.keys()) + for node_id in rt_nodes_id: + if node_id not in runtime_graph.keys(): + continue + node = nodes[node_id] + if not is_bwd == node.is_bwd or node.is_optim: + continue + edges = runtime_graph[node_id] + ind = len(edges[PRED]) # in_degree + if ind == 1: # only depend on one node + pred_id = edges[PRED][0] + pred = nodes[pred_id] + merged_node_id, merged_node = self._merge_node( + [node, pred], merge_type='linear', nodes=nodes) + nodes[merged_node_id] = merged_node + runtime_graph[merged_node_id] = [[], []] + + # delete edges and add new edges + succ = None + runtime_graph[merged_node_id][SUCC] = copy.deepcopy(edges[SUCC]) + if len(runtime_graph[pred_id][SUCC]) > 1: + # predecessor has more than 1 successor + # the merged_node is to inherit the rest of its successors + succ = runtime_graph[pred_id][SUCC] + succ.remove(node_id) + runtime_graph[merged_node_id][SUCC] += succ + runtime_graph[merged_node_id][PRED] = runtime_graph[pred_id][ + PRED] + for i in runtime_graph[pred_id][PRED]: + runtime_graph[i][SUCC].remove(pred_id) + runtime_graph[i][SUCC].append(merged_node_id) + + for i in edges[SUCC]: + runtime_graph[i][PRED].remove(node_id) + runtime_graph[i][PRED].append(merged_node_id) + if succ is not None: + for i in succ: + runtime_graph[i][PRED].remove(pred_id) + runtime_graph[i][PRED].append(merged_node_id) + + runtime_graph.pop(node_id) + runtime_graph.pop(pred_id) + reduct_cnt += 1 + self.eliminate_multi_edges(runtime_graph) + return reduct_cnt # the number of nodes that have been reduced + + def _merge_branch(self, nodes, runtime_graph, is_bwd=False): + reduct_cnt = 0 + rt_nodes_id = list(runtime_graph.keys()) + for node_id in rt_nodes_id: + node = nodes[node_id] + if not is_bwd == node.is_bwd or node.is_optim: + continue + edges = runtime_graph[node_id] + outd = len(edges[SUCC]) # out_degree + if outd > 1: # branch out + succ_nodes_id = edges[SUCC] + + succ_to_elim = [] + for succ_id in succ_nodes_id: + for succ_2_id in succ_nodes_id: + tmp = runtime_graph[succ_2_id][SUCC] + if succ_id in tmp: + succ_to_elim.append(succ_id) + break + for id in succ_to_elim: + edges[SUCC].remove(id) + runtime_graph[id][PRED].remove(node_id) + reduct_cnt += 1 + + to_merge = True + if len(edges[SUCC]) < 1 or len(runtime_graph[edges[SUCC][0]][ + SUCC]) < 1: + continue + end_node_id = runtime_graph[edges[SUCC][0]][SUCC][0] + for i in succ_nodes_id: + if len(runtime_graph[i][SUCC]) != 1 or \ + runtime_graph[i][SUCC][0] != end_node_id: + to_merge = False # if branches has different end node, we don't merge them + break + if to_merge: + to_merge_node_list = [nodes[i] for i in succ_nodes_id] + merged_node_id, merged_node = self._merge_node( + to_merge_node_list, merge_type='branch', nodes=nodes) + nodes[merged_node_id] = merged_node + runtime_graph[merged_node_id] = [[], []] + + # delete edges and add new edges + runtime_graph[merged_node_id][SUCC] = 
[end_node_id] + runtime_graph[merged_node_id][PRED] = edges[PRED] + + runtime_graph[end_node_id][PRED] = [merged_node_id] + runtime_graph[node_id][SUCC] = [merged_node_id] + + for i in succ_nodes_id: + runtime_graph.pop(i) + reduct_cnt += len(to_merge_node_list) - 1 + return reduct_cnt + + def get_runtime_cost(self): + def get_node_cost(node): + node_cost = node.cost + self.opcall_overhead + if isinstance(node, MergedOpsCostNode): + for it in node.node_list: + node_cost += self.opcall_overhead + return node_cost + + for sub_idx in range(self.total_rank): + fwd_cost = 0 + bwd_cost = 0 + optim_cost = 0 + for node_id in self.runtime_graph[sub_idx].keys(): + node = self.nodes[sub_idx][node_id] + if node.is_optim: + optim_cost += get_node_cost(node) + elif node.is_bwd: + bwd_cost += get_node_cost(node) + else: + fwd_cost += get_node_cost(node) + self.fwd_time.append(fwd_cost) + self.bwd_time.append(bwd_cost) + self.optim_time.append(optim_cost) + return self.fwd_time, self.bwd_time, self.optim_time + + def get_mem(self): + static_list = [] + top_list = [] + for sub_idx in range(self.total_rank): + static_mem, cur_mem, top_mem = self._simulate_mem( + self.nodes[sub_idx], self.origin_graph[sub_idx]) + static_list.append(static_mem) + top_list.append(top_mem) + return static_list, top_list + + def _simulate_mem(self, nodes, origin_graph): + q = queue.Queue(1024) + sim_graph = copy.deepcopy(origin_graph) + for node_id, node in nodes.items(): + if len(sim_graph[node_id][PRED]) == 0: + q.put(node_id) + + q.put('nop') + cur_mem = 0 + top_mem = -1 + static_mem = 0 + while not q.empty(): + node_id = q.get() + node = None + size = 0 + if node_id == 'nop': + top_mem = max(cur_mem, top_mem) + if q.empty(): + break + else: + q.put(node_id) + continue + else: + node = nodes[node_id] + if node.type == CostNodeType.VARIABLE: + size = node.get_size() + if node.node.persistable: + static_mem += size + cur_mem += size + edges = sim_graph[node_id] + if not (node.type == CostNodeType.VARIABLE and + node.node.persistable): + for succ_id in edges[SUCC]: + sim_graph[succ_id][PRED].remove(node_id) + if len(sim_graph[succ_id][PRED]) == 0: + q.put(succ_id) + for pred_id in edges[PRED]: + pred = nodes + if pred.type == CostNodeType.VARIABLE: + sim_graph[pred_id][SUCC].remove(node_id) + if len(sim_graph[pred_id][ + SUCC]) == 0 and not pred.node.persistable: + cur_mem -= pred.get_size() + return static_mem, cur_mem, top_mem + + def get_pipeline_time(self): + if self.total_rank <= 1: + return self.fwd_time[0] + self.bwd_time[0] + self.optim_time[0] + else: + return self._simulate_pipeline() + + def _simulate_pipeline(self): + stage_num = len(self.pp2rank) + event_list = [] + global_time = [0] * stage_num + total_time = 0 + fwd_cnt = list(range(stage_num, 0, -1)) + bwd_cnt = [self.microbatch_num] * stage_num + q = queue.Queue(1024) + + for i in range(self.microbatch_num): + q.put(PipeEvent(0, 'fwd', self.fwd_time[0])) + + while not q.empty(): + e = q.get() + stid = e.stage_id + if e.name == 'fwd': + if fwd_cnt[stid] > 0: + e.s_time = max(global_time[stid], e.s_time) + e.e_time = e.s_time + e.duration + event_list.append(e) + if stid != stage_num - 1: + q.put( + PipeEvent( + stid + 1, + 'fwd', + self.fwd_time[stid + 1], + start_time=e.e_time)) + else: + q.put( + PipeEvent( + stid, + 'bwd', + self.bwd_time[stid], + start_time=e.e_time)) + fwd_cnt[stid] -= 1 + global_time[stid] = e.e_time + else: + q.put(e) + elif e.name == 'bwd': + e.s_time = max(global_time[stid], e.s_time) + e.e_time = e.s_time + e.duration + 
event_list.append(e) + if stid != 0: + q.put( + PipeEvent( + stid - 1, + 'bwd', + self.bwd_time[stid - 1], + start_time=e.e_time)) + fwd_cnt[stid] += 1 + bwd_cnt[stid] -= 1 + if bwd_cnt[stid] == 0: + q.put( + PipeEvent( + stid, + 'optim', + self.optim_time[stid], + start_time=e.e_time)) + global_time[stid] = e.e_time + elif e.name == 'optim': + e.s_time = max(global_time[stid], e.s_time) + e.e_time = e.s_time + e.duration + event_list.append(e) + global_time[stid] = e.e_time + else: + raise NotImplementedError( + 'This type of pipe event is not supported yet.{}'.format( + e.name)) + + for t in global_time: + total_time = max(total_time, t) + return total_time + + def get_cost(self): + cost = Cost() + static_mem, peak_mem = self.get_mem() + cost.static_mem = static_mem + cost.peak_mem = peak_mem + self.merge_comm() + while True: + cnt = 0 + cnt += self.merge_linear() + cnt += self.merge_branch() + if cnt == 0: # can't be further merged + break + self.get_runtime_cost() + cost.runtime = self.get_pipeline_time() + return cost + + def init(self, distributed_program): + self.parse_program(distributed_program) + self.build_op_graph() + for sub_idx in range(self.total_rank): + self.eliminate_multi_edges(self.op_graph[sub_idx]) + self.build_runtime_graph() + + +def estimate_cost(distributed_program, cluster, pipeline_config, + standalone_cost_data, batch_size): + """ + Estimated cost from distributed program, cluster model and distributed settings. + + Args: + distributed_program(list): list of paddle programs + cluster(Cluster): cluster model + standalone_cost_data(CostData): cost data given by paddle.core + batch_size(int): batch size of the training workload + pipeline_config(list): configuration of pipeline stage allocation + """ + # the following line is left for now, cluster model will be involved in the future + assert cluster is None, "For now, cluster remains None" + cm_ctx = CostModel( + cluster=cluster, + batch_size=batch_size, + standalone_cost_data=standalone_cost_data, + pipeline_config=pipeline_config) + cm_ctx.init(distributed_program) + cost = cm_ctx.get_cost() + return cost diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index f883d7a80a4122..90f59758a2faf9 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -91,6 +91,7 @@ list(APPEND MIXED_DIST_TEST_OPS test_auto_parallel_reshard) list(APPEND MIXED_DIST_TEST_OPS test_auto_parallel_reshard_serial) list(APPEND MIXED_DIST_TEST_OPS test_auto_parallel_reshard_mppp) list(APPEND MIXED_DIST_TEST_OPS test_auto_parallel_reshard_dpmppp) +list(APPEND MIXED_DIST_TEST_OPS test_auto_parallel_cost_model) foreach(TEST_OP ${MIXED_DIST_TEST_OPS}) list(REMOVE_ITEM TEST_OPS ${TEST_OP}) endforeach() @@ -234,6 +235,7 @@ if ((NOT WITH_GPU) AND (NOT WITH_ROCM)) LIST(REMOVE_ITEM TEST_OPS test_auto_parallel_reshard_serial) LIST(REMOVE_ITEM TEST_OPS test_auto_parallel_reshard_mppp) LIST(REMOVE_ITEM TEST_OPS test_auto_parallel_reshard_dpmppp) + LIST(REMOVE_ITEM TEST_OPS test_auto_parallel_cost_model) LIST(REMOVE_ITEM TEST_OPS test_auto_parallel_data_unshard) elseif(WITH_GPU) if (${CUDNN_VERSION} VERSION_LESS 7100) @@ -608,6 +610,7 @@ if(WITH_DISTRIBUTE) py_test_modules(test_auto_parallel_reshard_serial MODULES test_auto_parallel_reshard_serial ENVS ${dist_ENVS}) py_test_modules(test_auto_parallel_reshard_mppp MODULES test_auto_parallel_reshard_mppp ENVS ${dist_ENVS}) py_test_modules(test_auto_parallel_reshard_dpmppp 
MODULES test_auto_parallel_reshard_dpmppp ENVS ${dist_ENVS}) + py_test_modules(test_auto_parallel_cost_model MODULES test_auto_parallel_cost_model ENVS ${dist_ENVS}) endif(NOT WIN32) endif(NOT APPLE) if(WITH_DGC) diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_cost_model.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_cost_model.py new file mode 100644 index 00000000000000..58d033ad658315 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_cost_model.py @@ -0,0 +1,236 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest + +import paddle +import paddle.nn as nn +import paddle.static as static +import paddle.nn.functional as F +import paddle.utils as utils +import paddle.distributed.auto_parallel as auto +from paddle.distributed.auto_parallel.context import DistributedContext +from paddle.distributed import fleet +from paddle.distributed.auto_parallel.partitioner import Partitioner +from paddle.distributed.auto_parallel.completion import complete_backward_annotation +from paddle.distributed.auto_parallel.reshard import reshard +from paddle.distributed.auto_parallel.cost_model import estimate_cost +import paddle.fluid.core as core + +paddle.enable_static() +_global_parallel_strategy = "dp_mp_pp" +ROOT_MESH = auto.ProcessMesh([[[0, 1], [4, 5]], [[2, 3], [6, 7]]]) +_global_process_mesh = auto.ProcessMesh( + [[[0, 1], [4, 5]], [[2, 3], [6, 7]]], parent=ROOT_MESH) +PP_MESH_0 = auto.ProcessMesh([[0, 1], [4, 5]], parent=ROOT_MESH) +PP_MESH_1 = auto.ProcessMesh([[2, 3], [6, 7]], parent=ROOT_MESH) +NUM_RANKS = 8 +STAGE_0_CNT = 5 +STAGE_1_CNT = 10 +pp_cfg = [[0, 1, 4, 5], [2, 3, 6, 7]] + +device = "gpu" if core.is_compiled_with_cuda() else "cpu" + + +class MLPLayer(nn.Layer): + def __init__(self, + hidden_size=256, + intermediate_size=4 * 256, + initializer_range=0.02, + is_distributed=True): + super(MLPLayer, self).__init__() + d_model = hidden_size + dim_feedforward = intermediate_size + weight_attr = paddle.ParamAttr(initializer=nn.initializer.Normal( + mean=0.0, std=initializer_range)) + bias_attr = None + + self.linear0 = nn.Linear( + d_model, dim_feedforward, weight_attr, bias_attr=bias_attr) + self.linear1 = nn.Linear( + dim_feedforward, d_model, weight_attr, bias_attr=bias_attr) + self.norm = nn.LayerNorm(d_model, epsilon=1e-5) + + self.is_distributed = is_distributed + + def forward(self, input): + if self.is_distributed: + auto.shard_tensor( + self.linear0.weight, PP_MESH_0, dim_mapping=[-1, 1]) + auto.shard_tensor( + self.linear1.weight, PP_MESH_1, dim_mapping=[1, -1]) + + out = self.norm(input) + out = self.linear0(out) + out = F.gelu(out, approximate=True) + out = self.linear1(out) + + return out + + +def get_single_node_data(): + train_program = paddle.static.Program() + startup_program = paddle.static.Program() + + loss, train_program, startup_program = mlp_forward( + train_program, startup_program, 
is_distributed=False) + + cost_model = core.CostModel() + cost_data = cost_model.profile_measure(train_program, startup_program, + device, ["time"]) + + op_name2cost = [{}, {}] + for idx, op in enumerate(train_program.blocks[0].ops): + if idx <= STAGE_0_CNT: + op_name2cost[0][op.type] = cost_data.get_op_time_ms(idx) + elif idx <= STAGE_1_CNT: + op_name2cost[1][op.type] = cost_data.get_op_time_ms(idx) + return op_name2cost + + +def mlp_forward(train_program, start_program, is_distributed=True): + with static.program_guard(train_program, + start_program), utils.unique_name.guard(): + batch_size = 4 + hidden_size = 256 + sequence_len = 128 + if is_distributed: + input = static.data( + name="input", shape=[batch_size, hidden_size], dtype='float32') + label = static.data( + name="label", shape=[batch_size, 1], dtype='float32') + else: + input = paddle.ones( + name="input", shape=[batch_size, hidden_size], dtype='float32') + label = paddle.ones( + name="label", shape=[batch_size, 1], dtype='float32') + + if is_distributed: + auto.shard_tensor(input, PP_MESH_0, dim_mapping=[0, -1]) + auto.shard_tensor(label, PP_MESH_1, dim_mapping=[0, -1]) + + mlp = MLPLayer( + hidden_size=hidden_size, + intermediate_size=4 * hidden_size, + initializer_range=0.02, + is_distributed=is_distributed) + + predict = mlp(input) + error_cost = paddle.nn.functional.square_error_cost(predict, label) + loss = paddle.mean(error_cost) + + return loss, train_program, start_program + + +def get_dist_prog(train_program, startup_program, dist_context, rank_id): + global _global_process_mesh + dist_context.set_process_mesh(_global_process_mesh) + loss, train_program, startup_program = mlp_forward(train_program, + startup_program) + + # auto completion + complete_train_program = auto.complete_annotation(train_program, + dist_context) + + dist_strategy = fleet.DistributedStrategy() + dist_main_prog = [] + dist_startup_prog = [] + for rank_id in range(NUM_RANKS): + partitioner = Partitioner(dist_strategy, dist_context, rank_id) + # logical partition + auto_parallel_main_prog, auto_parallel_startup_prog = partitioner.transpile_forward( + complete_train_program, startup_program) + dist_params_grads = partitioner.apply_backward( + loss, complete_train_program, startup_program, + auto_parallel_main_prog, auto_parallel_startup_prog) + optimizer = paddle.fluid.optimizer.AdamOptimizer() + opt_ops = partitioner.apply_optimize(optimizer, dist_params_grads, + auto_parallel_main_prog, + auto_parallel_startup_prog) + dist_main_prog.append(auto_parallel_main_prog) + dist_startup_prog.append(auto_parallel_startup_prog) + return dist_main_prog, dist_startup_prog + + +def check_runtime_estimation(cost): + return cost.runtime > 0 + + +def check_memory_estimation(cost): + for i in range(NUM_RANKS): + if cost.static_mem[i] <= 0 or cost.peak_mem[i] <= 0: + return False + if cost.static_mem[i] > cost.peak_mem[i]: + return False + return True + + +def check_empty_program_runtime(cost): + return cost.runtime == 0 + + +def check_empty_program_memory(cost): + for mem in cost.peak_mem: + if mem > 0: + return False + for mem in cost.static_mem: + if mem > 0: + return False + return True + + +class TestCostModel(unittest.TestCase): + def test_empty_program_cost_model(self): + empty_program = paddle.static.Program() + startup_program = paddle.static.Program() + standalone_cost_data = [{}] + empty_pp_cfg = None + cluster = None + cost = estimate_cost( + [empty_program], + cluster=cluster, + pipeline_config=empty_pp_cfg, + 
standalone_cost_data=standalone_cost_data, + batch_size=1) + + self.assertTrue(check_empty_program_runtime(cost)) + self.assertTrue(check_empty_program_memory(cost)) + + def test_auto_parallel_cost_model(self): + train_program = paddle.static.Program() + startup_program = paddle.static.Program() + dist_context = DistributedContext() + standalone_cost_data = get_single_node_data() + distributed_program, dist_startup_prog = get_dist_prog( + train_program, startup_program, dist_context, 0) + for rank_id in range(NUM_RANKS): + complete_backward_annotation(distributed_program[rank_id], + dist_context) + reshard(distributed_program[rank_id], dist_startup_prog[rank_id], + rank_id, dist_context) + cluster = None + cost = estimate_cost( + distributed_program, + cluster=cluster, + pipeline_config=pp_cfg, + standalone_cost_data=standalone_cost_data, + batch_size=4) + self.assertTrue(check_runtime_estimation(cost)) + self.assertTrue(check_memory_estimation(cost)) + + +if __name__ == "__main__": + unittest.main() From 34d785c22803db1d45148f8dfd175cbaae05a485 Mon Sep 17 00:00:00 2001 From: Yulong Ao Date: Tue, 19 Oct 2021 14:10:27 +0800 Subject: [PATCH 202/298] [paddle.linalg.qr] Add the Qr Operator (#35742) * Add QR decomposition op * Change codes to adapt to new svd_helper * Update linalg.py Restore the deleted comma * Restore the deleted line * Update linalg.py * Update linalg.py * Improve the qr code by reviews * Update QR based on CI results * Update qr doc, test=document_fix * Change unsafe and ill-formed codes --- cmake/operators.cmake | 1 + paddle/fluid/operators/qr_op.cc | 152 +++++++++ paddle/fluid/operators/qr_op.cu | 309 ++++++++++++++++++ paddle/fluid/operators/qr_op.h | 135 ++++++++ paddle/fluid/operators/svd_helper.h | 13 + paddle/fluid/platform/dynload/cusolver.h | 18 +- .../fluid/tests/unittests/test_qr_op.py | 173 ++++++++++ python/paddle/linalg.py | 2 + python/paddle/tensor/__init__.py | 2 + python/paddle/tensor/linalg.py | 66 +++- 10 files changed, 869 insertions(+), 2 deletions(-) create mode 100644 paddle/fluid/operators/qr_op.cc create mode 100644 paddle/fluid/operators/qr_op.cu create mode 100644 paddle/fluid/operators/qr_op.h create mode 100644 python/paddle/fluid/tests/unittests/test_qr_op.py diff --git a/cmake/operators.cmake b/cmake/operators.cmake index 228da9f77739d7..5eecbefa2fcfb9 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -185,6 +185,7 @@ function(op_library TARGET) list(REMOVE_ITEM hip_srcs "cholesky_op.cu") list(REMOVE_ITEM hip_srcs "matrix_rank_op.cu") list(REMOVE_ITEM hip_srcs "svd_op.cu") + list(REMOVE_ITEM hip_srcs "qr_op.cu") list(REMOVE_ITEM hip_srcs "eigh_op.cu") list(REMOVE_ITEM hip_srcs "multinomial_op.cu") list(REMOVE_ITEM hip_srcs "decode_jpeg_op.cu") diff --git a/paddle/fluid/operators/qr_op.cc b/paddle/fluid/operators/qr_op.cc new file mode 100644 index 00000000000000..f612bb9e31f930 --- /dev/null +++ b/paddle/fluid/operators/qr_op.cc @@ -0,0 +1,152 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/qr_op.h" +#include +#include +#include +#include +#include "paddle/fluid/framework/ddim.h" +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/platform/mkldnn_helper.h" +#endif + +namespace paddle { +namespace operators { +using DDim = framework::DDim; + +class QrOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "qr"); + OP_INOUT_CHECK(ctx->HasOutput("Q"), "Output", "Q", "qr"); + OP_INOUT_CHECK(ctx->HasOutput("R"), "Output", "R", "qr"); + + auto x_dims = ctx->GetInputDim("X"); + int x_rank = x_dims.size(); + PADDLE_ENFORCE_GE(x_dims.size(), 2, + platform::errors::InvalidArgument( + "the rank of input must greater than 2")); + bool compute_q; + bool reduced_mode; + int m = x_dims[x_rank - 2]; + int n = x_dims[x_rank - 1]; + int min_mn = std::min(m, n); + std::string mode = ctx->Attrs().Get("mode"); + std::tie(compute_q, reduced_mode) = _parse_qr_mode(mode); + + if (compute_q) { + int k = reduced_mode ? min_mn : m; + auto q_dims_vec = framework::vectorize(x_dims); + q_dims_vec[q_dims_vec.size() - 1] = k; + ctx->SetOutputDim("Q", framework::make_ddim(q_dims_vec)); + } else { + ctx->SetOutputDim("Q", framework::make_ddim({0})); + } + + int k = reduced_mode ? min_mn : m; + auto r_dims_vec = framework::vectorize(x_dims); + r_dims_vec[r_dims_vec.size() - 2] = k; + r_dims_vec[r_dims_vec.size() - 1] = n; + ctx->SetOutputDim("R", framework::make_ddim(r_dims_vec)); + + ctx->ShareLoD("X", /*->*/ "Q"); + ctx->ShareLoD("X", /*->*/ "R"); + } +}; + +class QrOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor), The input tensor of qr op."); + AddOutput("Q", "(Tensor), The output Q tensor of qr op."); + AddOutput("R", "(Tensor), The output R tensor of qr op."); + AddAttr( + "mode", + "(string, default \"reduced\"). " + "If mode is \"reduced\", Qr op will return reduced Q and R matrices. " + "If mode is \"complete\", Qr op will return complete Q and R matrices. " + "If mode is \"r\", Qr op will only return reduced R matrix.") + .SetDefault("reduced"); + AddComment(R"DOC( +Qr Operator. + +This operator is used to perform QR operation for batched matrics $X$. 
+$$Q, R = qr(X)$$ + +)DOC"); + } +}; + +class QrGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Q")), "Input", + "Q@Grad", "QrGrad"); + OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("R")), "Input", + "R@Grad", "QrGrad"); + OP_INOUT_CHECK(ctx->HasInput("Q"), "Input", "Q", "QrGrad"); + OP_INOUT_CHECK(ctx->HasInput("R"), "Input", "R", "QrGrad"); + OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X")), "Output", + "X@Grad", "QrGrad"); + + auto x_dims = ctx->GetInputDim(("X")); + ctx->SetOutputDim(framework::GradVarName("X"), x_dims); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto dtype = OperatorWithKernel::IndicateVarDataType(ctx, "X"); + return framework::OpKernelType(dtype, ctx.GetPlace()); + } +}; + +template +class QrGradMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + void Apply(GradOpPtr retv) const override { + retv->SetType("qr_grad"); + retv->SetInput(framework::GradVarName("Q"), this->OutputGrad("Q")); + retv->SetInput(framework::GradVarName("R"), this->OutputGrad("R")); + retv->SetInput("Q", this->Output("Q")); + retv->SetInput("R", this->Output("R")); + retv->SetInput("X", this->Input("X")); + retv->SetAttrMap(this->Attrs()); + retv->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(qr, ops::QrOp, ops::QrOpMaker, + ops::QrGradMaker, + ops::QrGradMaker); + +REGISTER_OPERATOR(qr_grad, ops::QrGradOp); + +REGISTER_OP_CPU_KERNEL(qr, ops::QrCPUKernel, ops::QrCPUKernel); + +REGISTER_OP_CPU_KERNEL( + qr_grad, ops::QrGradKernel, + ops::QrGradKernel); diff --git a/paddle/fluid/operators/qr_op.cu b/paddle/fluid/operators/qr_op.cu new file mode 100644 index 00000000000000..992df172ace0c7 --- /dev/null +++ b/paddle/fluid/operators/qr_op.cu @@ -0,0 +1,309 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifndef PADDLE_WITH_HIP +// HIP not support cusolver + +#include +#include +#include +#include "paddle/fluid/memory/memory.h" +#include "paddle/fluid/operators/qr_op.h" +#include "paddle/fluid/platform/dynload/cusolver.h" + +// Reuse some helper functions from svd +#include "paddle/fluid/operators/svd_helper.h" + +namespace paddle { +namespace operators { + +template +class QrGPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + bool compute_q; + bool reduced_mode; + auto& dev_ctx = + context.template device_context(); + const Tensor& x = *context.Input("X"); + Tensor& q = *context.Output("Q"); + Tensor& r = *context.Output("R"); + const std::string mode = context.Attr("mode"); + std::tie(compute_q, reduced_mode) = _parse_qr_mode(mode); + + auto numel = x.numel(); + PADDLE_ENFORCE_GT(numel, 0, platform::errors::PreconditionNotMet( + "The input of QR is empty.")); + auto x_dims = x.dims(); + int x_rank = x_dims.size(); + int m = x_dims[x_rank - 2]; + int n = x_dims[x_rank - 1]; + int min_mn = std::min(m, n); + int k = reduced_mode ? min_mn : m; + int batch_size = numel / (m * n); + int qr_stride = m * n; + int tau_stride = min_mn; + + if (compute_q) { + q.mutable_data>( + context.GetPlace(), + size_t(batch_size * m * k * sizeof(math::Real))); + } + r.mutable_data>( + context.GetPlace(), size_t(batch_size * k * n * sizeof(math::Real))); + + auto dito = + math::DeviceIndependenceTensorOperations(context); + + // Note: allocate temporary tensors because of lacking in-place operatios. + // Prepare qr + Tensor qr; + qr.mutable_data>( + context.GetPlace(), size_t(batch_size * m * n * sizeof(math::Real))); + // BatchedGeqrf performs computation in-place and 'qr' must be a copy of + // input + TensorCopy(x, context.GetPlace(), &qr); + + // Prepare tau + auto tau_dims_vec = framework::vectorize(x_dims); + tau_dims_vec.pop_back(); + tau_dims_vec[tau_dims_vec.size() - 1] = min_mn; + Tensor tau = dito.Fill(tau_dims_vec, 0); + + // Transpose 'qr' to conform the column-major order + auto tmp_qr = dito.Transpose(qr); + framework::TensorCopy(tmp_qr, qr.place(), &qr); + auto qr_data = qr.mutable_data(context.GetPlace()); + auto tau_data = tau.mutable_data(context.GetPlace()); + + BatchedGeqrf(dev_ctx, batch_size, m, n, qr_data, m, tau_data, qr_stride, + tau_stride); + + if (reduced_mode) { + auto trans_qr = dito.Transpose(qr); + auto sliced_qr = dito.Slice(trans_qr, {-2}, {0}, {min_mn}); + auto tmp_r = dito.TrilTriu(sliced_qr, 0, false); + // Transpose 'tmp_r' to retore the original row-major order + framework::TensorCopy(tmp_r, r.place(), &r); + } else { + auto trans_qr = dito.Transpose(qr); + auto tmp_r = dito.TrilTriu(trans_qr, 0, false); + // Transpose 'tmp_r' to retore the original row-major order + framework::TensorCopy(tmp_r, r.place(), &r); + } + + if (compute_q) { + // Perform QRGQR for Q using the result from GEQRF + // Transpose 'q' to retore the original row-major order + if (reduced_mode) { + BatchedOrgqr(dev_ctx, batch_size, m, min_mn, min_mn, qr_data, m, + tau_data, qr_stride, tau_stride); + auto trans_q = dito.Transpose(qr); + auto sliced_q = dito.Slice(trans_q, {-1}, {0}, {min_mn}); + framework::TensorCopy(sliced_q, q.place(), &q); + } else { + if (m > n) { + auto new_qr_dims_vec = framework::vectorize(x_dims); + new_qr_dims_vec[new_qr_dims_vec.size() - 1] = m; + Tensor new_qr = dito.Fill(new_qr_dims_vec, 0); + auto new_qr_data = new_qr.mutable_data(context.GetPlace()); + auto new_qr_stride = m * m; + for (int 
i = 0; i < batch_size; ++i) { + memory::Copy( + BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), + (new_qr_data + i * new_qr_stride), + BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), + (qr_data + i * qr_stride), qr_stride * sizeof(math::Real), + dev_ctx.stream()); + } + BatchedOrgqr(dev_ctx, batch_size, m, m, min_mn, new_qr_data, m, + tau_data, new_qr_stride, tau_stride); + auto trans_q = dito.Transpose(new_qr); + framework::TensorCopy(trans_q, q.place(), &q); + } else { + BatchedOrgqr(dev_ctx, batch_size, m, m, min_mn, qr_data, m, tau_data, + qr_stride, tau_stride); + auto trans_q = dito.Transpose(qr); + auto sliced_q = dito.Slice(trans_q, {-1}, {0}, {m}); + framework::TensorCopy(sliced_q, q.place(), &q); + } + } + } + } + + void BatchedGeqrf(const platform::CUDADeviceContext& dev_ctx, int batch_size, + int m, int n, float* a, int lda, float* tau, int a_stride, + int tau_stride) const; + + void BatchedGeqrf(const platform::CUDADeviceContext& dev_ctx, int batch_size, + int m, int n, double* a, int lda, double* tau, int a_stride, + int tau_stride) const; + + void BatchedOrgqr(const platform::CUDADeviceContext& dev_ctx, int batch_size, + int m, int n, int k, float* a, int lda, float* tau, + int a_stride, int tau_stride) const; + + void BatchedOrgqr(const platform::CUDADeviceContext& dev_ctx, int batch_size, + int m, int n, int k, double* a, int lda, double* tau, + int a_stride, int tau_stride) const; +}; + +template <> +void QrGPUKernel::BatchedGeqrf( + const platform::CUDADeviceContext& dev_ctx, int batch_size, int m, int n, + float* a, int lda, float* tau, int a_stride, int tau_stride) const { + int lwork = 0; + + auto handle = dev_ctx.cusolver_dn_handle(); + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnSgeqrf_bufferSize( + handle, m, n, a, lda, &lwork)); + auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(float)); + float* workspace_ptr = reinterpret_cast(workspace->ptr()); + auto info = memory::Alloc(dev_ctx, sizeof(int)); + int* info_d = reinterpret_cast(info->ptr()); + + for (int i = 0; i < batch_size; ++i) { + float* a_working_ptr = &a[i * a_stride]; + float* tau_working_ptr = &tau[i * tau_stride]; + // compute geqrf + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnSgeqrf( + handle, m, n, a_working_ptr, lda, tau_working_ptr, workspace_ptr, lwork, + info_d)); + // Do we need synchronized here? + // check the error info + int info_h; + memory::Copy(platform::CPUPlace(), &info_h, + BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), + info_d, sizeof(int), dev_ctx.stream()); + PADDLE_ENFORCE_EQ( + info_h, 0, + platform::errors::PreconditionNotMet( + "For batch [%d]: CUSolver geqrf is not zero. 
[%d]", i, info_h)); + } +} + +template <> +void QrGPUKernel::BatchedGeqrf( + const platform::CUDADeviceContext& dev_ctx, int batch_size, int m, int n, + double* a, int lda, double* tau, int a_stride, int tau_stride) const { + int lwork = 0; + + auto handle = dev_ctx.cusolver_dn_handle(); + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnDgeqrf_bufferSize( + handle, m, n, a, lda, &lwork)); + auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(double)); + double* workspace_ptr = reinterpret_cast(workspace->ptr()); + auto info = memory::Alloc(dev_ctx, sizeof(int)); + int* info_d = reinterpret_cast(info->ptr()); + + for (int i = 0; i < batch_size; ++i) { + double* a_working_ptr = &a[i * a_stride]; + double* tau_working_ptr = &tau[i * tau_stride]; + // compute geqrf + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnDgeqrf( + handle, m, n, a_working_ptr, lda, tau_working_ptr, workspace_ptr, lwork, + info_d)); + // Do we need synchronized here? + // check the error info + int info_h; + memory::Copy(platform::CPUPlace(), &info_h, + BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), + info_d, sizeof(int), dev_ctx.stream()); + PADDLE_ENFORCE_EQ( + info_h, 0, + platform::errors::PreconditionNotMet( + "For batch [%d]: CUSolver geqrf is not zero. [%d]", i, info_h)); + } +} + +template <> +void QrGPUKernel::BatchedOrgqr( + const platform::CUDADeviceContext& dev_ctx, int batch_size, int m, int n, + int k, float* a, int lda, float* tau, int a_stride, int tau_stride) const { + int lwork = 0; + + auto handle = dev_ctx.cusolver_dn_handle(); + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnSorgqr_bufferSize( + handle, m, n, k, a, lda, tau, &lwork)); + auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(float)); + float* workspace_ptr = reinterpret_cast(workspace->ptr()); + auto info = memory::Alloc(dev_ctx, sizeof(int)); + int* info_d = reinterpret_cast(info->ptr()); + + for (int i = 0; i < batch_size; ++i) { + float* a_working_ptr = &a[i * a_stride]; + float* tau_working_ptr = &tau[i * tau_stride]; + // compute orggr + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnSorgqr( + handle, m, n, k, a_working_ptr, lda, tau_working_ptr, workspace_ptr, + lwork, info_d)); + // Do we need synchronized here? + // check the error info + int info_h; + memory::Copy(platform::CPUPlace(), &info_h, + BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), + info_d, sizeof(int), dev_ctx.stream()); + PADDLE_ENFORCE_EQ( + info_h, 0, + platform::errors::PreconditionNotMet( + "For batch [%d]: CUSolver QR is not zero. 
[%d]", i, info_h)); + } +} + +template <> +void QrGPUKernel::BatchedOrgqr( + const platform::CUDADeviceContext& dev_ctx, int batch_size, int m, int n, + int k, double* a, int lda, double* tau, int a_stride, + int tau_stride) const { + int lwork = 0; + + auto handle = dev_ctx.cusolver_dn_handle(); + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnDorgqr_bufferSize( + handle, m, n, k, a, lda, tau, &lwork)); + auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(double)); + double* workspace_ptr = reinterpret_cast(workspace->ptr()); + auto info = memory::Alloc(dev_ctx, sizeof(int)); + int* info_d = reinterpret_cast(info->ptr()); + + for (int i = 0; i < batch_size; ++i) { + double* a_working_ptr = &a[i * a_stride]; + double* tau_working_ptr = &tau[i * tau_stride]; + // compute orggr + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnDorgqr( + handle, m, n, k, a_working_ptr, lda, tau_working_ptr, workspace_ptr, + lwork, info_d)); + // Do we need synchronized here? + // check the error info + int info_h; + memory::Copy(platform::CPUPlace(), &info_h, + BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), + info_d, sizeof(int), dev_ctx.stream()); + PADDLE_ENFORCE_EQ( + info_h, 0, + platform::errors::PreconditionNotMet( + "For batch [%d]: CUSolver QR is not zero. [%d]", i, info_h)); + } +} + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(qr, ops::QrGPUKernel, ops::QrGPUKernel); +REGISTER_OP_CUDA_KERNEL( + qr_grad, ops::QrGradKernel, + ops::QrGradKernel); + +#endif // not PADDLE_WITH_HIP diff --git a/paddle/fluid/operators/qr_op.h b/paddle/fluid/operators/qr_op.h new file mode 100644 index 00000000000000..73ba52f590c0d7 --- /dev/null +++ b/paddle/fluid/operators/qr_op.h @@ -0,0 +1,135 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/math/complex_functors.h" +#include "paddle/fluid/platform/for_range.h" + +namespace paddle { +namespace operators { +using Tensor = framework::Tensor; +using DDim = framework::DDim; + +static inline std::tuple _parse_qr_mode(std::string mode) { + bool compute_q; + bool reduced; + if (mode == "reduced") { + compute_q = true; + reduced = true; + } else if (mode == "complete") { + compute_q = true; + reduced = false; + } else if (mode == "r") { + compute_q = false; + reduced = true; + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "QR received unrecognized mode '%s'" + " but expected one of 'reduced' (default), 'r', or 'complete'", + mode)); + } + return std::make_tuple(compute_q, reduced); +} + +template +class QrCPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + bool compute_q; + bool reduced_mode; + const Tensor& x = *context.Input("X"); + Tensor& q = *context.Output("Q"); + Tensor& r = *context.Output("R"); + std::string mode = context.Attr("mode"); + std::tie(compute_q, reduced_mode) = _parse_qr_mode(mode); + + auto numel = x.numel(); + PADDLE_ENFORCE_GT(numel, 0, platform::errors::PreconditionNotMet( + "The input of QR is empty.")); + auto x_dims = x.dims(); + int x_rank = x_dims.size(); + int m = x_dims[x_rank - 2]; + int n = x_dims[x_rank - 1]; + int min_mn = std::min(m, n); + int k = reduced_mode ? min_mn : m; + int batch_size = numel / (m * n); + int x_stride = m * n; + int q_stride = m * k; + int r_stride = k * n; + + auto* x_data = x.data>(); + T* q_data = nullptr; + if (compute_q) { + q_data = q.mutable_data>( + context.GetPlace(), + size_t(batch_size * m * k * sizeof(math::Real))); + } + auto* r_data = r.mutable_data>( + context.GetPlace(), size_t(batch_size * k * n * sizeof(math::Real))); + + // Implement QR by calling Eigen + for (int i = 0; i < batch_size; ++i) { + const T* x_matrix_ptr = x_data + i * x_stride; + T* r_matrix_ptr = r_data + i * r_stride; + using EigenDynamicMatrix = + Eigen::Matrix; + auto x_matrix = Eigen::Map(x_matrix_ptr, m, n); + Eigen::HouseholderQR qr(x_matrix); + if (reduced_mode) { + auto qr_top_matrix = qr.matrixQR().block(0, 0, min_mn, n); + auto r_matrix_view = + qr_top_matrix.template triangularView(); + auto r_matrix = EigenDynamicMatrix(r_matrix_view); + memcpy(r_matrix_ptr, r_matrix.data(), r_matrix.size() * sizeof(T)); + } else { + auto r_matrix_view = + qr.matrixQR().template triangularView(); + auto r_matrix = EigenDynamicMatrix(r_matrix_view); + memcpy(r_matrix_ptr, r_matrix.data(), r_matrix.size() * sizeof(T)); + } + + if (compute_q) { + T* q_matrix_ptr = q_data + i * q_stride; + if (reduced_mode) { + auto q_matrix = + qr.householderQ() * EigenDynamicMatrix::Identity(m, min_mn); + q_matrix.transposeInPlace(); + memcpy(q_matrix_ptr, q_matrix.data(), q_matrix.size() * sizeof(T)); + } else { + auto q_matrix = + qr.householderQ() * EigenDynamicMatrix::Identity(m, m); + q_matrix.transposeInPlace(); + memcpy(q_matrix_ptr, q_matrix.data(), q_matrix.size() * sizeof(T)); + } + } + } + } +}; + +template +class QrGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const { + PADDLE_THROW(platform::errors::InvalidArgument( + "QR doesn't have the backward kernel now and will be supported soon.")); + } +}; + +} // namespace operators +} // 
namespace paddle diff --git a/paddle/fluid/operators/svd_helper.h b/paddle/fluid/operators/svd_helper.h index 9ba7c9a3062a04..6b2584682277e5 100644 --- a/paddle/fluid/operators/svd_helper.h +++ b/paddle/fluid/operators/svd_helper.h @@ -502,6 +502,19 @@ struct DeviceIndependenceTensorOperations { return ret; } + framework::Tensor TrilTriu(const framework::Tensor& x, int diagonal, + bool lower) { + framework::AttributeMap attrs; + attrs["diagonal"] = diagonal; + attrs["lower"] = lower; + NameInTensorMap inputs({{"X", {&x}}}); + int x_rank = x.dims().size(); + PADDLE_ENFORCE_GE(x_rank, 2, platform::errors::InvalidArgument( + "Rank must be at least 2.")); + std::vector out_shape = framework::vectorize(x.dims()); + return CreateOpRunAndReturnTensor("tril_triu", inputs, attrs, out_shape); + } + Tensor Conj(const Tensor& x) { Tensor out; auto* out_data = out.mutable_data(x.dims(), context.GetPlace()); diff --git a/paddle/fluid/platform/dynload/cusolver.h b/paddle/fluid/platform/dynload/cusolver.h index a8ce1cc9d3a354..4c018908b5945b 100644 --- a/paddle/fluid/platform/dynload/cusolver.h +++ b/paddle/fluid/platform/dynload/cusolver.h @@ -65,11 +65,27 @@ CUSOLVER_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP); __macro(cusolverDnSpotrfBatched); \ __macro(cusolverDnDpotrfBatched); \ __macro(cusolverDnSgesvdj_bufferSize); \ + __macro(cusolverDnSgeqrf_bufferSize); \ + __macro(cusolverDnDgeqrf_bufferSize); \ + __macro(cusolverDnCgeqrf_bufferSize); \ + __macro(cusolverDnZgeqrf_bufferSize); \ + __macro(cusolverDnSorgqr_bufferSize); \ + __macro(cusolverDnDorgqr_bufferSize); \ + __macro(cusolverDnCungqr_bufferSize); \ + __macro(cusolverDnZungqr_bufferSize); \ __macro(cusolverDnDestroyGesvdjInfo); \ __macro(cusolverDnCreateGesvdjInfo); \ __macro(cusolverDnDgesvdj_bufferSize); \ __macro(cusolverDnSgesvdj); \ - __macro(cusolverDnDgesvdj); + __macro(cusolverDnDgesvdj); \ + __macro(cusolverDnSgeqrf); \ + __macro(cusolverDnDgeqrf); \ + __macro(cusolverDnCgeqrf); \ + __macro(cusolverDnZgeqrf); \ + __macro(cusolverDnSorgqr); \ + __macro(cusolverDnDorgqr); \ + __macro(cusolverDnCungqr); \ + __macro(cusolverDnZungqr); CUSOLVER_ROUTINE_EACH_R1(DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP) #endif diff --git a/python/paddle/fluid/tests/unittests/test_qr_op.py b/python/paddle/fluid/tests/unittests/test_qr_op.py new file mode 100644 index 00000000000000..ea2aaf3f00d5be --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_qr_op.py @@ -0,0 +1,173 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
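+# Illustrative sketch (not part of this test): the CUDA kernel above performs
+# QR in two LAPACK-style steps, geqrf followed by orgqr. Assuming SciPy's
+# LAPACK wrappers are available, the same reduced-mode workflow can be
+# reproduced as follows; this is a hedged example, not Paddle API.
+import numpy as np
+from scipy.linalg import lapack
+
+_x = np.random.rand(5, 3)
+_m, _n = _x.shape
+# geqrf packs R into the upper triangle and the Householder reflectors into
+# the lower part plus `tau`.
+_qr_raw, _tau, _, _info = lapack.dgeqrf(_x)
+assert _info == 0
+_r = np.triu(_qr_raw[:_n, :])  # reduced R, shape (n, n)
+# orgqr expands the reflectors into an explicit reduced Q of shape (m, n).
+_q, _, _info = lapack.dorgqr(_qr_raw, _tau)
+assert _info == 0
+np.testing.assert_allclose(np.matmul(_q, _r), _x, atol=1e-10)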
+ +from __future__ import print_function + +import unittest +import itertools +import numpy as np +import paddle +import paddle.fluid as fluid +import paddle.fluid.layers as layers +import paddle.fluid.core as core + + +class TestQrAPI(unittest.TestCase): + def test_dygraph(self): + paddle.disable_static() + + def run_qr_dygraph(shape, mode, dtype): + if dtype == "float32": + np_dtype = np.float32 + elif dtype == "float64": + np_dtype = np.float64 + a = np.random.rand(*shape).astype(np_dtype) + m = a.shape[-2] + n = a.shape[-1] + min_mn = min(m, n) + if mode == "reduced" or mode == "r": + k = min_mn + else: + k = m + np_q_shape = list(a.shape[:-2]) + np_q_shape.extend([m, k]) + np_r_shape = list(a.shape[:-2]) + np_r_shape.extend([k, n]) + np_q = np.zeros(np_q_shape).astype(np_dtype) + np_r = np.zeros(np_r_shape).astype(np_dtype) + places = [] + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + for place in places: + batch_size = a.size // (a.shape[-1] * a.shape[-2]) + for i in range(batch_size): + coord = np.unravel_index(i, a.shape[:-2]) + if mode == "r": + tmp_r = np.linalg.qr(a[coord], mode=mode) + np_r[coord] = tmp_r + else: + tmp_q, tmp_r = np.linalg.qr(a[coord], mode=mode) + np_q[coord] = tmp_q + np_r[coord] = tmp_r + + x = paddle.to_tensor(a, dtype=dtype) + if mode == "r": + r = paddle.linalg.qr(x, mode=mode) + self.assertTrue(np.allclose(r, np_r, atol=1e-5)) + else: + q, r = paddle.linalg.qr(x, mode=mode) + self.assertTrue(np.allclose(q, np_q, atol=1e-5)) + self.assertTrue(np.allclose(r, np_r, atol=1e-5)) + + tensor_shapes = [ + (3, 5), + (5, 5), + (5, 3), # 2-dim Tensors + (2, 3, 5), + (3, 5, 5), + (4, 5, 3), # 3-dim Tensors + (2, 5, 3, 5), + (3, 5, 5, 5), + (4, 5, 5, 3) # 4-dim Tensors + ] + modes = ["reduced", "complete", "r"] + dtypes = ["float32", "float64"] + for tensor_shape, mode, dtype in itertools.product(tensor_shapes, modes, + dtypes): + run_qr_dygraph(tensor_shape, mode, dtype) + + def test_static(self): + paddle.enable_static() + + def run_qr_static(shape, mode, dtype): + if dtype == "float32": + np_dtype = np.float32 + elif dtype == "float64": + np_dtype = np.float64 + a = np.random.rand(*shape).astype(np_dtype) + m = a.shape[-2] + n = a.shape[-1] + min_mn = min(m, n) + if mode == "reduced" or mode == "r": + k = min_mn + else: + k = m + np_q_shape = list(a.shape[:-2]) + np_q_shape.extend([m, k]) + np_r_shape = list(a.shape[:-2]) + np_r_shape.extend([k, n]) + np_q = np.zeros(np_q_shape).astype(np_dtype) + np_r = np.zeros(np_r_shape).astype(np_dtype) + places = [] + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + for place in places: + with fluid.program_guard(fluid.Program(), fluid.Program()): + batch_size = a.size // (a.shape[-1] * a.shape[-2]) + for i in range(batch_size): + coord = np.unravel_index(i, a.shape[:-2]) + if mode == "r": + tmp_r = np.linalg.qr(a[coord], mode=mode) + np_r[coord] = tmp_r + else: + tmp_q, tmp_r = np.linalg.qr(a[coord], mode=mode) + np_q[coord] = tmp_q + np_r[coord] = tmp_r + x = paddle.fluid.data( + name="input", shape=shape, dtype=dtype) + if mode == "r": + r = paddle.linalg.qr(x, mode=mode) + exe = fluid.Executor(place) + fetches = exe.run(fluid.default_main_program(), + feed={"input": a}, + fetch_list=[r]) + self.assertTrue( + np.allclose( + fetches[0], np_r, atol=1e-5)) + else: + q, r = paddle.linalg.qr(x, mode=mode) + exe = fluid.Executor(place) + fetches = exe.run(fluid.default_main_program(), + feed={"input": a}, + fetch_list=[q, 
r]) + self.assertTrue( + np.allclose( + fetches[0], np_q, atol=1e-5)) + self.assertTrue( + np.allclose( + fetches[1], np_r, atol=1e-5)) + + tensor_shapes = [ + (3, 5), + (5, 5), + (5, 3), # 2-dim Tensors + (2, 3, 5), + (3, 5, 5), + (4, 5, 3), # 3-dim Tensors + (2, 5, 3, 5), + (3, 5, 5, 5), + (4, 5, 5, 3) # 4-dim Tensors + ] + modes = ["reduced", "complete", "r"] + dtypes = ["float32", "float64"] + for tensor_shape, mode, dtype in itertools.product(tensor_shapes, modes, + dtypes): + run_qr_static(tensor_shape, mode, dtype) + + +if __name__ == "__main__": + paddle.enable_static() + unittest.main() diff --git a/python/paddle/linalg.py b/python/paddle/linalg.py index 726355379e7b63..06b512150cee88 100644 --- a/python/paddle/linalg.py +++ b/python/paddle/linalg.py @@ -23,6 +23,7 @@ from .tensor.linalg import multi_dot # noqa: F401 from .tensor.linalg import matrix_rank from .tensor.linalg import svd +from .tensor.linalg import qr from .tensor.linalg import eigh # noqa: F401 from .tensor.linalg import det from .tensor.linalg import slogdet @@ -38,6 +39,7 @@ 'multi_dot', 'matrix_rank', 'svd', + 'qr', 'matrix_power', 'det', 'slogdet', diff --git a/python/paddle/tensor/__init__.py b/python/paddle/tensor/__init__.py index c8f897c21648f5..b898b60fe47126 100755 --- a/python/paddle/tensor/__init__.py +++ b/python/paddle/tensor/__init__.py @@ -47,6 +47,7 @@ from .linalg import mv # noqa: F401 from .linalg import eig # noqa: F401 from .linalg import matrix_power # noqa: F401 +from .linalg import qr # noqa: F401 from .linalg import eigvals # noqa: F401 from .linalg import multi_dot # noqa: F401 from .linalg import svd # noqa: F401 @@ -237,6 +238,7 @@ 'histogram', 'mv', 'matrix_power', + 'qr', 'eigvals', 'abs', 'acos', diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py index f112603fbb60f1..6853d904adbf6e 100644 --- a/python/paddle/tensor/linalg.py +++ b/python/paddle/tensor/linalg.py @@ -1594,6 +1594,70 @@ def matrix_power(x, n, name=None): return out +def qr(x, mode="reduced", name=None): + r""" + Computes the QR decomposition of one matrix or batches of matrice (backward is unsupported now). + + Args: + x (Tensor): The input tensor. Its shape should be `[..., M, N]`, + where ... is zero or more batch dimensions. M and N can be arbitrary + positive number. The data type of x should be float32 or float64. + mode (str, optional): A flag to control the behavior of qr, the default is "reduced". + Suppose x's shape is `[..., M, N]` and denoting `K = min(M, N)`: + If mode = "reduced", qr op will return reduced Q and R matrices, + which means Q's shape is `[..., M, K]` and R's shape is `[..., K, N]`. + If mode = "complete", qr op will return complete Q and R matrices, + which means Q's shape is `[..., M, M]` and R's shape is `[..., M, N]`. + If mode = "r", qr op will only return reduced R matrix, which means + R's shape is `[..., K, N]`. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + If mode = "reduced" or mode = "complete", qr will return a two tensor-tuple, which represents Q and R. + If mode = "r", qr will return a tensor which represents R. + + Examples: + .. code-block:: python + + import paddle + + x = paddle.to_tensor([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]]).astype('float64') + q, r = paddle.linalg.qr(x) + print (q) + print (r) + + # Q = [[-0.16903085, 0.89708523], + # [-0.50709255, 0.27602622], + # [-0.84515425, -0.34503278]]) + + # R = [[-5.91607978, -7.43735744], + # [ 0. 
, 0.82807867]]) + + # one can verify : X = Q * R ; + """ + if in_dygraph_mode(): + q, r = _C_ops.qr(x, 'mode', mode) + if mode == "r": + return r + else: + return q, r + check_variable_and_dtype(x, 'dtype', ['float32', 'float64'], 'qr') + check_type(mode, 'mode', str, 'qr') + helper = LayerHelper('qr', **locals()) + q = helper.create_variable_for_type_inference(dtype=x.dtype) + r = helper.create_variable_for_type_inference(dtype=x.dtype) + attrs = dict() + attrs['mode'] = mode + helper.append_op( + type='qr', inputs={'X': [x]}, outputs={'Q': q, + 'R': r}, attrs=attrs) + if mode == "r": + return r + else: + return q, r + + def eig(x, name=None): """ This API performs the eigenvalue decomposition of a square matrix or a batch of square matrices. @@ -1674,7 +1738,7 @@ def eigvals(x, name=None): Its data type should be float32, float64, complex64, or complex128. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. - + Returns: Tensor: A tensor containing the unsorted eigenvalues which has the same batch dimensions with `x`. The eigenvalues are complex-valued even when `x` is real. From 7edcc4fbbe3f90aecba0cc0197c1f89d2368a17b Mon Sep 17 00:00:00 2001 From: xiongkun Date: Tue, 19 Oct 2021 14:45:01 +0800 Subject: [PATCH 203/298] catch the generatorfunction and intercept it. (#35369) * catch the generatorfunction and intercept it. * add test generator * add test case * refine the testcase --- .../dygraph_to_static/convert_call_func.py | 11 +++++ .../test_convert_call_generator.py | 49 +++++++++++++++++++ 2 files changed, 60 insertions(+) create mode 100644 python/paddle/fluid/tests/unittests/dygraph_to_static/test_convert_call_generator.py diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/convert_call_func.py b/python/paddle/fluid/dygraph/dygraph_to_static/convert_call_func.py index b62c16989fbe78..300586969ff65b 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/convert_call_func.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/convert_call_func.py @@ -167,6 +167,17 @@ def dyfunc(x): if is_builtin(func) or is_unsupported(func): return func + if inspect.isgeneratorfunction(func): + # NOTE(xiongkun03): inspect.isfunction() will return True even though func is a generator function. + # If we don't deal generatorfunction here, we will regard it as normal function and get errors in some + # occasion. + number_of_stars = 30 + translator_logger.warn( + "\n\n" + "*" * number_of_stars + + "\nYour function:`{}` doesn't support to transform to static function because it is a generator function, it will be run as-is." + .format(func.__name__) + "\n" + "*" * number_of_stars + "\n\n") + return func + if inspect.isfunction(func): # TODO(liym27): If func is a lambda function, special conversion is needed. if func.__name__ == '': diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_convert_call_generator.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_convert_call_generator.py new file mode 100644 index 00000000000000..cfe9e191ed486f --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_convert_call_generator.py @@ -0,0 +1,49 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest + +import logging +import numpy as np + +import paddle +import paddle.fluid as fluid +from paddle.fluid.dygraph import ProgramTranslator +from paddle.fluid.dygraph.dygraph_to_static.convert_call_func import CONVERSION_OPTIONS +from test_program_translator import get_source_code +from paddle.jit import to_static + + +def dyfunc_generator(): + for i in range(100): + yield paddle.to_tensor([i] * 10) + + +def main_func(): + """ Error will raise, but we only report a warning not intercept + """ + for i in dyfunc_generator(): + print(i) + + +class TestConvertGenerator(unittest.TestCase): + def test_raise_error(self): + with self.assertRaises(Exception): + to_static(main_func)() + + +if __name__ == '__main__': + unittest.main() From d89a759bba8dacd2da2a27e8142e4b37bbfd3954 Mon Sep 17 00:00:00 2001 From: littletomatodonkey Date: Tue, 19 Oct 2021 14:57:23 +0800 Subject: [PATCH 204/298] fix replicate pad when input size is 0 (#36510) * fix replicate pad when input size is 0 * add unit test --- paddle/fluid/operators/pad3d_op.cc | 12 +++++------- paddle/fluid/operators/pad3d_op.cu | 12 +++++------- python/paddle/fluid/tests/unittests/test_pad3d_op.py | 10 ++++++++++ 3 files changed, 20 insertions(+), 14 deletions(-) diff --git a/paddle/fluid/operators/pad3d_op.cc b/paddle/fluid/operators/pad3d_op.cc index c2be9ac97ff89b..e84b5a9d9baaeb 100644 --- a/paddle/fluid/operators/pad3d_op.cc +++ b/paddle/fluid/operators/pad3d_op.cc @@ -565,13 +565,11 @@ class Pad3dCPUKernel : public framework::OpKernel { " in reflect mode" ", but received depth(%d) and pad_right(%d).", in_width, pads[1])); - } - - if (mode == "circular") { - PADDLE_ENFORCE_NE( - in_depth * in_height * in_width, 0, - platform::errors::InvalidArgument( - "The input tensor size can not be 0 for circular padding mode.")); + } else if (mode == "circular" || mode == "replicate") { + PADDLE_ENFORCE_NE(in_depth * in_height * in_width, 0, + platform::errors::InvalidArgument( + "The input tensor size can not be 0 for circular " + "or replicate padding mode.")); } const int pad_left = pads[0]; diff --git a/paddle/fluid/operators/pad3d_op.cu b/paddle/fluid/operators/pad3d_op.cu index ed936c10755f07..f243a78e5578bb 100644 --- a/paddle/fluid/operators/pad3d_op.cu +++ b/paddle/fluid/operators/pad3d_op.cu @@ -618,13 +618,11 @@ class Pad3dCUDAKernel : public framework::OpKernel { " in reflect mode" ", but received depth(%d) and pad_right(%d).", in_width, pads[1])); - } - - if (mode == "circular") { - PADDLE_ENFORCE_NE( - in_depth * in_height * in_width, 0, - platform::errors::InvalidArgument( - "The input tensor size can not be 0 for circular padding mode.")); + } else if (mode == "circular" || mode == "replicate") { + PADDLE_ENFORCE_NE(in_depth * in_height * in_width, 0, + platform::errors::InvalidArgument( + "The input tensor size can not be 0 for circular " + "or replicate padding mode.")); } const int pad_left = pads[0]; diff --git a/python/paddle/fluid/tests/unittests/test_pad3d_op.py b/python/paddle/fluid/tests/unittests/test_pad3d_op.py index 5ec7bdc66fe495..7abc314bc1ba01 100644 --- 
a/python/paddle/fluid/tests/unittests/test_pad3d_op.py +++ b/python/paddle/fluid/tests/unittests/test_pad3d_op.py @@ -732,6 +732,15 @@ def test_circular_1(): mode='circular', data_format="NCDHW") + def test_replicate_1(): + input_shape = (1, 2, 0, 4, 5) + data = np.random.rand(*input_shape).astype(np.float32) + x = paddle.to_tensor(data) + y = F.pad(x, + pad=[1, 1, 1, 1, 2, 3], + mode='replicate', + data_format="NCDHW") + paddle.disable_static() for place in self.places: self.assertRaises(ValueError, test_variable) @@ -739,6 +748,7 @@ def test_circular_1(): self.assertRaises(Exception, test_reflect_2) self.assertRaises(Exception, test_reflect_3) self.assertRaises(Exception, test_circular_1) + self.assertRaises(Exception, test_replicate_1) paddle.enable_static() From 8cc8e411121649be36af8396536502e7ef7539b7 Mon Sep 17 00:00:00 2001 From: WangXi Date: Tue, 19 Oct 2021 14:59:38 +0800 Subject: [PATCH 205/298] [hybrid] static model parallel dropout support deterministic RandomSeedGenerator (#36228) --- paddle/fluid/framework/generator.cc | 37 +++++ paddle/fluid/framework/generator.h | 6 + paddle/fluid/operators/dropout_impl_util.h | 10 +- paddle/fluid/operators/seed_op.cc | 11 ++ paddle/fluid/operators/seed_op.cu | 11 +- paddle/fluid/operators/seed_op.h | 34 +++-- paddle/fluid/pybind/generator_py.cc | 2 + .../meta_parallel/parallel_layers/random.py | 137 ++++++++++++++++++ python/paddle/fluid/backward.py | 6 +- .../fluid/tests/unittests/test_dropout_op.py | 44 ++++++ .../fluid/tests/unittests/test_optimizer.py | 48 +++++- .../fluid/tests/unittests/test_seed_op.py | 32 +++- python/paddle/framework/random.py | 8 + 13 files changed, 354 insertions(+), 32 deletions(-) diff --git a/paddle/fluid/framework/generator.cc b/paddle/fluid/framework/generator.cc index 4b64722a7abf5a..154154fc795179 100644 --- a/paddle/fluid/framework/generator.cc +++ b/paddle/fluid/framework/generator.cc @@ -63,6 +63,43 @@ const std::shared_ptr& DefaultCPUGenerator() { return default_cpu_generator; } +using RNGMap = std::unordered_map>; + +static RNGMap& GetRandomSeedGeneratorMap() { + static auto random_seed_generator_map = RNGMap(); + return random_seed_generator_map; +} + +const std::shared_ptr& SetRandomSeedGenerator( + const std::string& name, uint64_t seed) { + auto& rng_map = GetRandomSeedGeneratorMap(); + auto iter = rng_map.find(name); + PADDLE_ENFORCE_EQ(iter == rng_map.end(), true, + platform::errors::AlreadyExists( + "%s RandomSeedGenerator is already exist", name)); + + auto generator = std::make_shared(seed); + bool emplace_success = rng_map.emplace(name, generator).second; + PADDLE_ENFORCE_EQ( + emplace_success, true, + platform::errors::PermissionDenied( + "SetRandomSeedGenerator cannot emplace %s RandomSeedGenerator", + name)); + return rng_map[name]; +} + +const std::shared_ptr& GetRandomSeedGenerator( + const std::string& name) { + auto& rng_map = GetRandomSeedGeneratorMap(); + auto iter = rng_map.find(name); + PADDLE_ENFORCE_EQ(iter != rng_map.end(), true, + platform::errors::NotFound( + "%s RandomSeedGenerator is not found, please " + "use `set_random_seed_generator` to set rng first", + name)); + return iter->second; +} + std::shared_ptr OpDefaultCPUEngine() { static auto op_default_cpu_engine = std::make_shared(); return op_default_cpu_engine; diff --git a/paddle/fluid/framework/generator.h b/paddle/fluid/framework/generator.h index 862e63c4c6af5a..d0a5b4443e3f49 100644 --- a/paddle/fluid/framework/generator.h +++ b/paddle/fluid/framework/generator.h @@ -126,5 +126,11 @@ std::shared_ptr 
GetCPURandomEngine(uint64_t); const std::shared_ptr& GetDefaultCUDAGenerator( int64_t device_id = -1); +const std::shared_ptr& SetRandomSeedGenerator( + const std::string& name, uint64_t seed); + +const std::shared_ptr& GetRandomSeedGenerator( + const std::string& name); + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/operators/dropout_impl_util.h b/paddle/fluid/operators/dropout_impl_util.h index a7188efe7139c7..f2038d12528c49 100644 --- a/paddle/fluid/operators/dropout_impl_util.h +++ b/paddle/fluid/operators/dropout_impl_util.h @@ -29,7 +29,7 @@ inline void GetSeedDataAndIncrement(const platform::CUDADeviceContext& dev_ctx, BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()).GetDeviceId(); auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); - if ((seed) && platform::is_gpu_place(seed->place())) { + if (seed) { framework::Tensor seed_cpu_tensor; TensorCopySync(*seed, platform::CPUPlace(), &seed_cpu_tensor); *seed_data = static_cast(seed_cpu_tensor.data()[0]); @@ -39,12 +39,8 @@ inline void GetSeedDataAndIncrement(const platform::CUDADeviceContext& dev_ctx, *seed_data = seed_offset.first; *increment = seed_offset.second; } else { - if (seed) { - *seed_data = *(seed->data()); - } else { - std::random_device rnd; - *seed_data = is_fix_seed ? seed_val : rnd(); - } + std::random_device rnd; + *seed_data = is_fix_seed ? seed_val : rnd(); *increment = offset; } } diff --git a/paddle/fluid/operators/seed_op.cc b/paddle/fluid/operators/seed_op.cc index 32daa8c3934aed..837ccae0284f5e 100644 --- a/paddle/fluid/operators/seed_op.cc +++ b/paddle/fluid/operators/seed_op.cc @@ -39,6 +39,17 @@ class SeedOpMaker : public framework::OpProtoAndCheckerMaker { void Make() override { AddOutput("Out", "The output of seed op."); AddAttr("seed", "Dropout random seed.").SetDefault(0); + AddAttr("deterministic", + "(bool, default false) Whether to use deterministic " + "RandomSeedGenerator which " + "generate by `set_random_seed_generator`") + .SetDefault(false) + .AsExtra(); + AddAttr( + "rng_name", + "use deterministic RandomSeedGenerator which name is `rng_name`") + .SetDefault("") + .AsExtra(); AddAttr("force_cpu", "(bool, default false) Force fill output variable to cpu " "memory. Otherwise, fill output variable to the running " diff --git a/paddle/fluid/operators/seed_op.cu b/paddle/fluid/operators/seed_op.cu index 4593b88019621a..4ca75bcf76e513 100644 --- a/paddle/fluid/operators/seed_op.cu +++ b/paddle/fluid/operators/seed_op.cu @@ -23,16 +23,9 @@ class GPUSeedKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { auto *out = context.Output("Out"); - int user_seed = context.Attr("seed"); - auto force_cpu = context.Attr("force_cpu"); - std::random_device rnd; - int seed; - if (user_seed != 0) { - seed = user_seed; - } else { - seed = rnd(); - } + int seed = get_seed(context); + auto force_cpu = context.Attr("force_cpu"); bool cpu_place = force_cpu || context.GetPlace() == platform::CPUPlace(); if (cpu_place) { platform::DeviceContextPool &pool = diff --git a/paddle/fluid/operators/seed_op.h b/paddle/fluid/operators/seed_op.h index 671f397d4eaffc..202f25e0b4cd12 100644 --- a/paddle/fluid/operators/seed_op.h +++ b/paddle/fluid/operators/seed_op.h @@ -13,6 +13,7 @@ // limitations under the License. 
#pragma once +#include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" @@ -20,24 +21,37 @@ namespace paddle { namespace operators { using Tensor = framework::Tensor; -template -class CPUSeedKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* out = context.Output("Out"); - auto* out_data = out->mutable_data(context.GetPlace()); - int user_seed = context.Attr("seed"); +static int get_seed(const framework::ExecutionContext& context) { + int user_seed = context.Attr("seed"); + bool deterministic = context.Attr("deterministic"); + int seed = 0; + if (!deterministic) { // NOTE: fixed seed should only be used in unittest or for debug. // Guarantee to use random seed in training. - std::random_device rnd; - int seed; if (user_seed != 0) { seed = user_seed; } else { + std::random_device rnd; seed = rnd(); } - out_data[0] = seed; + } else { + std::string name = context.Attr("rng_name"); + auto rng = framework::GetRandomSeedGenerator(name); + do { // NOTE(wangxi): cpu dropout will use random seed if seed == 0 + seed = static_cast(rng->Random64()); + } while (seed == 0); + } + return seed; +} + +template +class CPUSeedKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* out = context.Output("Out"); + auto* out_data = out->mutable_data(context.GetPlace()); + out_data[0] = get_seed(context); } }; diff --git a/paddle/fluid/pybind/generator_py.cc b/paddle/fluid/pybind/generator_py.cc index 67121e24089f7c..fa924ce6581257 100644 --- a/paddle/fluid/pybind/generator_py.cc +++ b/paddle/fluid/pybind/generator_py.cc @@ -60,6 +60,8 @@ void BindGenerator(py::module* m_ptr) { &framework::Generator::SetIsInitPy); m.def("default_cpu_generator", &framework::DefaultCPUGenerator); m.def("default_cuda_generator", &framework::GetDefaultCUDAGenerator); + m.def("set_random_seed_generator", &framework::SetRandomSeedGenerator); + m.def("get_random_seed_generator", &framework::GetRandomSeedGenerator); } } // namespace pybind } // namespace paddle diff --git a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/random.py b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/random.py index ec80ba71036c06..0a96745c2a4a1f 100644 --- a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/random.py +++ b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/random.py @@ -15,6 +15,11 @@ import paddle import contextlib import numpy as np +from paddle import _C_ops +from paddle.fluid import core +from paddle.fluid.data_feeder import check_variable_and_dtype +from paddle.fluid.framework import in_dygraph_mode, default_main_program +from paddle.fluid.layer_helper import LayerHelper __all__ = [] @@ -93,3 +98,135 @@ def model_parallel_random_seed(seed=None): RNG_STATE_TRACKER.reset() RNG_STATE_TRACKER.add(MODEL_PARALLEL_RNG, local_seed) paddle.seed(global_seed) + + +def determinate_seed(rng_name): + assert rng_name is not None and rng_name != "" + helper = LayerHelper('seed', **locals()) + out = helper.create_variable_for_type_inference(dtype=paddle.int32) + # set force_cpu to reduce sync copy from CPU->GPU->CPU, and reduce pipeline hang + helper.append_op( + type='seed', + outputs={'Out': out}, + attrs={'deterministic': True, + 'rng_name': rng_name, + 'force_cpu': True}) + return out + + +def dropout(x, + p=0.5, + axis=None, + rng_name=None, + 
training=True, + mode="upscale_in_train", + name=None): + """ + Dropout is a regularization technique for reducing overfitting by preventing + neuron co-adaption during training. The dropout operator randomly sets the + outputs of some units to zero, while upscale others according to the given + dropout probability. + + Args: + x (Tensor): The input tensor. The data type is float32 or float64. + p (float|int): Probability of setting units to zero. Default 0.5. + axis (int|list|tuple): The axis along which the dropout is performed. Default None. + rng_name (str): The random seed generator name, which used to obtain deterministic results. + training (bool): A flag indicating whether it is in train phrase or not. Default True. + mode(str): ['upscale_in_train'(default) | 'downscale_in_infer']. + + 1. upscale_in_train(default), upscale the output at training time + + - train: out = input * mask / ( 1.0 - dropout_prob ) + - inference: out = input + + 2. downscale_in_infer, downscale the output at inference + + - train: out = input * mask + - inference: out = input * (1.0 - dropout_prob) + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. + + Returns: + A Tensor representing the dropout, has same shape and data type as `x` . + + + Examples: + We use ``p=0.5`` in the following description for simplicity. + + 1. When ``axis=None`` , this is commonly used dropout, which dropout each element of x randomly. + + .. code-block:: text + + Let's see a simple case when x is a 2d tensor with shape 2*3: + [[1 2 3] + [4 5 6]] + we generate mask with the same shape as x, which is 2*3. The value of mask is + sampled from a Bernoulli distribution randomly. For example, we may get such mask: + [[0 1 0] + [1 0 1]] + So the output is obtained from elementwise multiply of x and mask: + [[0 2 0] + [4 0 6]] + Using default setting, i.e. ``mode='upscale_in_train'`` , + if in training phase, the final upscale output is: + [[0 4 0 ] + [8 0 12]] + if in test phase, the output is the same as input: + [[1 2 3] + [4 5 6]] + we can also set ``mode='downscale_in_infer'`` , then + if in training phase, the final output is: + [[0 2 0] + [4 0 6]] + if in test phase, the scale output is: + [[0.5 1. 1.5] + [2. 2.5 3. 
]] + + """ + if rng_name is None: + return paddle.nn.functional.dropout(x, p, axis, training, mode, name) + + # fast return for p == 0 + if p == 0: return x + + assert isinstance(p, (float, int)), \ + TypeError("p argument should be a number") + assert 0 <= p <= 1, ValueError("p argument should between 0 and 1") + assert mode in ('downscale_in_infer', 'upscale_in_train'), \ + ValueError( + "mode argument should be 'downscale_in_infer' or 'upscale_in_train'") + + assert axis is None, \ + TypeError("unsupport axis when using random seed generator") + + mode = 'downgrade_in_infer' if mode == 'downscale_in_infer' else mode #semantic transfer + + # dygraph using tracker, doesn't need determinate seed + if in_dygraph_mode(): + out, mask = _C_ops.dropout(x, 'dropout_prob', p, 'is_test', + not training, 'fix_seed', False, 'seed', 0, + 'dropout_implementation', mode) + return out + + seed = determinate_seed(rng_name) + + helper = LayerHelper('dropout', **locals()) + check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], + 'dropout') + + out = helper.create_variable_for_type_inference(dtype=x.dtype) + mask = helper.create_variable_for_type_inference( + dtype=core.VarDesc.VarType.UINT8, stop_gradient=True) + + helper.append_op( + type='dropout', + inputs={'X': [x], + 'Seed': seed}, + outputs={'Out': [out], + 'Mask': [mask]}, + attrs={ + 'dropout_prob': p, + 'is_test': not training, + 'dropout_implementation': mode, + }) + return out diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py index 7ab060be6df291..d62f7b5941126b 100755 --- a/python/paddle/fluid/backward.py +++ b/python/paddle/fluid/backward.py @@ -175,11 +175,15 @@ def modify_forward_desc_for_recompute(self): return op_idx = 0 - while (op_idx < len(self.ops)): + while op_idx < len(self.ops): op = self.ops[op_idx] if op.desc.type() != "dropout": op_idx += 1 continue + # already insert seed op before dropout + if op.input('Seed') is not None and len(op.input('Seed')) == 1: + op_idx += 1 + continue # add a seed op so that the two dropout op can generate same output op_unique_name = unique_name.generate("seed") var_unique_name = unique_name.generate_with_ignorable_key(".".join( diff --git a/python/paddle/fluid/tests/unittests/test_dropout_op.py b/python/paddle/fluid/tests/unittests/test_dropout_op.py index 396d55b3d0a8b5..bf10e07ba0d6fc 100644 --- a/python/paddle/fluid/tests/unittests/test_dropout_op.py +++ b/python/paddle/fluid/tests/unittests/test_dropout_op.py @@ -19,6 +19,7 @@ import paddle.fluid.core as core from op_test import OpTest, skip_check_grad_ci import paddle +import paddle.static as static import paddle.fluid as fluid from paddle.fluid import Program, program_guard @@ -856,5 +857,48 @@ def test_dygraph(self): self.assertTrue(np.allclose(result.numpy(), result_np)) +class TestDropoutWithDeterminateSeedGenerator(unittest.TestCase): + def setUp(self): + paddle.framework.random.set_random_seed_generator('seed0', 123) + paddle.framework.random.set_random_seed_generator('seed1', 123) + rng0 = paddle.framework.random.get_random_seed_generator('seed0') + rng1 = paddle.framework.random.get_random_seed_generator('seed1') + self.places = [paddle.CPUPlace()] + if paddle.is_compiled_with_cuda(): + self.places.append(paddle.CUDAPlace(0)) + + def check_static_result(self, place): + from paddle.distributed.fleet.meta_parallel.parallel_layers.random import dropout + with static.program_guard(static.Program(), static.Program()): + input = static.data(name="input", shape=[40, 40], dtype="float32") + res1 = 
dropout( + input, + p=0.3, + training=True, + mode='upscale_in_train', + rng_name='seed0') + res2 = dropout( + input, + p=0.3, + training=True, + mode='upscale_in_train', + rng_name='seed1') + res3 = dropout(input, p=0.3) + + in_np = np.random.random([40, 40]).astype("float32") + + exe = static.Executor(place) + res_list = [res1, res2] + for i in range(2): + out1, out2 = exe.run(static.default_main_program(), + feed={"input": in_np}, + fetch_list=res_list) + self.assertTrue(np.allclose(out1, out2)) + + def test_static(self): + for place in self.places: + self.check_static_result(place=place) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_optimizer.py b/python/paddle/fluid/tests/unittests/test_optimizer.py index 31704ebcd91920..89c7be18a7dfaf 100644 --- a/python/paddle/fluid/tests/unittests/test_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_optimizer.py @@ -619,7 +619,7 @@ def test_lookahead_optimizer(self): class TestRecomputeOptimizer(unittest.TestCase): - def net(self, return_input=False, with_dropout=False): + def net(self, return_input=False, with_dropout=False, with_seed=False): program = framework.Program() block = program.global_block() mul_x = block.create_parameter( @@ -628,7 +628,8 @@ def net(self, return_input=False, with_dropout=False): dtype="float32", shape=[10, 8], lod_level=0, name="mul.y") mul_out = block.create_var( dtype="float32", shape=[5, 8], lod_level=0, name="mul.out") - if with_dropout == True: + + if with_dropout is True: mul_out_drop = block.create_var( dtype="float32", shape=[5, 8], @@ -636,6 +637,10 @@ def net(self, return_input=False, with_dropout=False): name="mul.out.dropout") mul_out_mask = block.create_var( dtype="uint8", shape=[5, 8], lod_level=0, name="mul.out.mask") + if with_seed is True: + seed_out = block.create_var( + dtype="int32", shape=[1], name="seed.out") + b1 = block.create_parameter( dtype="float32", shape=[5, 8], lod_level=0, name="b1") b1_out = block.create_var( @@ -652,10 +657,23 @@ def net(self, return_input=False, with_dropout=False): "Y": mul_y}, outputs={"Out": mul_out}, attrs={"x_num_col_dims": 1}) - if with_dropout == True: + + if with_dropout is True: + dropout_inputs = {'X': [mul_out]} + if with_seed is True: + block.append_op( + type='seed', + outputs={'Out': seed_out}, + attrs={ + 'deterministic': True, + 'rng_name': 'rng0', + 'force_cpu': True + }) + dropout_inputs = {'X': [mul_out], 'Seed': [seed_out]} + block.append_op( type='dropout', - inputs={'X': [mul_out]}, + inputs=dropout_inputs, outputs={'Out': [mul_out_drop], 'Mask': [mul_out_mask]}, attrs={'dropout_prob': 0.5, }) @@ -670,6 +688,7 @@ def net(self, return_input=False, with_dropout=False): inputs={"X": mul_out, "Y": b1}, outputs={"Out": b1_out}) + block.append_op( type="elementwise_add", inputs={"X": b1_out, @@ -864,6 +883,27 @@ def test_dropout(self): "sgd", "sgd", "sgd" ]) + def test_dropout_with_determinate_seed(self): + mul_out, b1_out, b2_out, mean_out = self.net(with_dropout=True, + with_seed=True) + self.assertEqual(len(mean_out.block.ops), 6) + self.assertEqual([op.type for op in mean_out.block.ops], [ + "mul", "seed", "dropout", "elementwise_add", "elementwise_add", + "mean" + ]) + sgd_optimizer = optimizer.SGD(learning_rate=1.0) + recompute_optimizer = optimizer.RecomputeOptimizer(sgd_optimizer) + recompute_optimizer._set_checkpoints([b1_out]) + opts, params_grads = recompute_optimizer.minimize(mean_out) + + self.assertEqual(len(mean_out.block.ops), 17) + self.assertEqual([op.type for op in 
mean_out.block.ops], [ + "mul", "seed", "dropout", "elementwise_add", "elementwise_add", + "mean", "fill_constant", "mean_grad", "elementwise_add_grad", "mul", + "dropout", "elementwise_add_grad", "dropout_grad", "mul_grad", + "sgd", "sgd", "sgd" + ]) + def test_dropout_with_seed(self): """ when we recompute a dropout op, make sure that the recomputed one diff --git a/python/paddle/fluid/tests/unittests/test_seed_op.py b/python/paddle/fluid/tests/unittests/test_seed_op.py index 08478d7140d434..0dcc197ece7ed0 100644 --- a/python/paddle/fluid/tests/unittests/test_seed_op.py +++ b/python/paddle/fluid/tests/unittests/test_seed_op.py @@ -17,7 +17,10 @@ import unittest import numpy as np from op_test import OpTest -import paddle.fluid as fluid +import paddle +import paddle.static as static + +paddle.enable_static() class TestSeedOpFixSeed(OpTest): @@ -42,5 +45,32 @@ def test_check_output(self): self.check_output(no_check_set=["Out"]) +class TestDropoutWithRandomSeedGenerator(unittest.TestCase): + def setUp(self): + paddle.framework.random.set_random_seed_generator('seed0', 123) + paddle.framework.random.set_random_seed_generator('seed1', 123) + self.rng0 = paddle.framework.random.get_random_seed_generator('seed0') + self.rng1 = paddle.framework.random.get_random_seed_generator('seed1') + self.places = [paddle.CPUPlace()] + if paddle.is_compiled_with_cuda(): + self.places.append(paddle.CUDAPlace(0)) + + def check_static_result(self, place): + import paddle.distributed.fleet.meta_parallel.parallel_layers.random as random + with static.program_guard(static.Program(), static.Program()): + res1 = random.determinate_seed('seed0') + + exe = static.Executor(place) + res_list = [res1] + for i in range(2): + out1, = exe.run(static.default_main_program(), + fetch_list=res_list) + self.assertEqual(out1, np.cast['int32'](self.rng1.random())) + + def test_static(self): + for place in self.places: + self.check_static_result(place=place) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/framework/random.py b/python/paddle/framework/random.py index 701f8b5352c3d4..a560072cf5a7b7 100644 --- a/python/paddle/framework/random.py +++ b/python/paddle/framework/random.py @@ -122,3 +122,11 @@ def _manual_program_seed(seed): fluid.default_startup_program().random_seed = seed program = fluid.Program() program.global_seed(seed) + + +def set_random_seed_generator(name, seed): + core.set_random_seed_generator(name, seed) + + +def get_random_seed_generator(name): + return core.get_random_seed_generator(name) From 7b67f398c33e03930aea8cfb0d330c2c28757100 Mon Sep 17 00:00:00 2001 From: wangxinxin08 <69842442+wangxinxin08@users.noreply.github.com> Date: Tue, 19 Oct 2021 15:06:48 +0800 Subject: [PATCH 206/298] add nearest_interp_v2 trt plugin (#34126) * add nearest_interp_v2 trt plugin --- .../fluid/inference/api/analysis_predictor.cc | 1 + .../inference/tensorrt/convert/CMakeLists.txt | 1 + .../tensorrt/convert/nearest_interp_v2_op.cc | 108 +++++++++++++ .../convert/test_nearest_interp_v2_op.cc | 54 +++++++ paddle/fluid/inference/tensorrt/op_teller.cc | 30 +++- .../tests/infer_ut/test_det_mv3_db.cc | 41 +---- .../unittests/ir/inference/CMakeLists.txt | 1 + .../test_trt_convert_nearest_interp_v2.py | 101 ++++++++++++ .../test_trt_nearest_interp_v2_op.py | 151 ++++++++++++++++++ 9 files changed, 450 insertions(+), 38 deletions(-) create mode 100644 paddle/fluid/inference/tensorrt/convert/nearest_interp_v2_op.cc create mode 100644 paddle/fluid/inference/tensorrt/convert/test_nearest_interp_v2_op.cc create 
mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_nearest_interp_v2.py create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_trt_nearest_interp_v2_op.py diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 3136e53e74d090..dfa27037205f15 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -1403,6 +1403,7 @@ USE_TRT_CONVERTER(roi_align); USE_TRT_CONVERTER(affine_channel); USE_TRT_CONVERTER(multiclass_nms); USE_TRT_CONVERTER(nearest_interp); +USE_TRT_CONVERTER(nearest_interp_v2); USE_TRT_CONVERTER(reshape); USE_TRT_CONVERTER(reduce_sum); USE_TRT_CONVERTER(gather_nd); diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index f2c7a4b62bbbb3..ef12cb6b366177 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -18,6 +18,7 @@ nv_library(tensorrt_converter tile_op.cc conv3d_op.cc mish_op.cc + nearest_interp_v2_op.cc DEPS tensorrt_engine tensorrt_plugin operator scope framework_proto op_registry) nv_test(test_op_converter SRCS test_op_converter.cc DEPS diff --git a/paddle/fluid/inference/tensorrt/convert/nearest_interp_v2_op.cc b/paddle/fluid/inference/tensorrt/convert/nearest_interp_v2_op.cc new file mode 100644 index 00000000000000..f2e0e0c09c5efb --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/nearest_interp_v2_op.cc @@ -0,0 +1,108 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/data_layout.h" +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" + +namespace paddle { +namespace framework { +class Scope; +namespace proto { +class OpDesc; +} // namespace proto +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace inference { +namespace tensorrt { + +class NearestInterpolateV2OpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + VLOG(3) << "convert a fluid nearest_interp_v2 op"; + + framework::OpDesc op_desc(op, nullptr); + + std::string input_name = op_desc.Input("X").front(); + std::string output_name = op_desc.Output("Out").front(); + + auto input = engine_->GetITensor(input_name); + + auto data_layout = framework::StringToDataLayout( + BOOST_GET_CONST(std::string, op_desc.GetAttr("data_layout"))); + auto interp_method = + BOOST_GET_CONST(std::string, op_desc.GetAttr("interp_method")); + bool align_corners = + BOOST_GET_CONST(bool, op_desc.GetAttr("align_corners")); + + auto input_names = op_desc.Input("X"); + auto scale = BOOST_GET_CONST(std::vector, op_desc.GetAttr("scale")); + auto out_h = BOOST_GET_CONST(int, op_desc.GetAttr("out_h")); + auto out_w = BOOST_GET_CONST(int, op_desc.GetAttr("out_w")); + + auto layer = TRT_ENGINE_ADD_LAYER(engine_, Resize, *input); + layer->setAlignCorners(align_corners); + + auto in_dim = input->getDimensions(); + + float scale_h = 1.f; + float scale_w = 1.f; + + std::vector scales; + + if (out_h > 0 && out_w > 0) { + // axis are different in static/dynamic mode + bool with_dynamic = engine_->with_dynamic_shape(); + + int h_axis = (data_layout == framework::DataLayout::kNCHW) + with_dynamic; + int w_axis = + (data_layout == framework::DataLayout::kNCHW) + 1 + with_dynamic; + + scale_h = + static_cast(out_h) / static_cast(in_dim.d[h_axis]); + scale_w = + static_cast(out_w) / static_cast(in_dim.d[w_axis]); + } else { + scale_h = scale[0]; + scale_w = scale[1]; + } + + if (engine_->with_dynamic_shape()) { + scales.push_back(1.f); + } + + if (data_layout == framework::DataLayout::kNCHW) { + scales.push_back(1.f); + scales.push_back(scale_h); + scales.push_back(scale_w); + } else if (data_layout == framework::DataLayout::kNHWC) { + // NHWC + scales.push_back(scale_h); + scales.push_back(scale_w); + scales.push_back(1.f); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Data layout must be NCHW or NHWC.")); + } + layer->setScales(scales.data(), scales.size()); + + RreplenishLayerAndOutput(layer, "nearest_interp_v2", {output_name}, + test_mode); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(nearest_interp_v2, NearestInterpolateV2OpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/test_nearest_interp_v2_op.cc b/paddle/fluid/inference/tensorrt/convert/test_nearest_interp_v2_op.cc new file mode 100644 index 00000000000000..f5ab6a99249314 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/test_nearest_interp_v2_op.cc @@ -0,0 +1,54 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +TEST(nearest_interp_v2_op, test_swish) { + std::unordered_set parameters; + framework::Scope scope; + TRTConvertValidation validator(10, parameters, scope, 1000); + validator.DeclInputVar("interp-X", nvinfer1::Dims3(3, 32, 32)); + validator.DeclOutputVar("interp-Out", nvinfer1::Dims3(3, 64, 64)); + + // Prepare Op description + framework::OpDesc desc; + desc.SetType("nearest_interp_v2"); + desc.SetInput("X", {"interp-X"}); + desc.SetOutput("Out", {"interp-Out"}); + + std::vector scale({2.f, 2.f}); + + desc.SetAttr("data_layout", "NCHW"); + desc.SetAttr("interp_method", "nearest"); + desc.SetAttr("align_corners", false); + desc.SetAttr("scale", scale); + desc.SetAttr("out_h", 0); + desc.SetAttr("out_w", 0); + + validator.SetOp(*desc.Proto()); + + validator.Execute(1); +} + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +USE_OP(nearest_interp_v2); diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 89159c0bb636c9..e7318d07611ea0 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -141,7 +141,8 @@ struct SimpleOpTypeSetTeller : public Teller { "reduce_mean", "conv3d", "conv3d_transpose", - "mish"}; + "mish", + "nearest_interp_v2"}; }; bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, @@ -599,6 +600,33 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, } } + if (op_type == "nearest_interp_v2") { + std::vector attrs{"data_layout", "interp_method", + "align_corners", "scale", + "out_h", "out_w"}; + for (auto const attr : attrs) { + if (!desc.HasAttr(attr)) return false; + } + auto data_layout = framework::StringToDataLayout( + BOOST_GET_CONST(std::string, desc.GetAttr("data_layout"))); + if (data_layout != framework::DataLayout::kNCHW && + data_layout != framework::DataLayout::kNHWC) + return false; + auto interp_method = + BOOST_GET_CONST(std::string, desc.GetAttr("interp_method")); + if (interp_method != "nearest") return false; + auto scale = BOOST_GET_CONST(std::vector, desc.GetAttr("scale")); + auto out_h = BOOST_GET_CONST(int, desc.GetAttr("out_h")); + auto out_w = BOOST_GET_CONST(int, desc.GetAttr("out_w")); + if (!(out_h > 0 && out_w > 0)) { + if (scale[0] <= 0.f || scale[1] <= 0.f) { + VLOG(3) << "scale factor must be greater than 0 if out_h or out_w is " + "not set."; + return false; + } + } + } + if (op_type == "roi_align") { if (!with_dynamic_shape) return false; diff --git a/paddle/fluid/inference/tests/infer_ut/test_det_mv3_db.cc b/paddle/fluid/inference/tests/infer_ut/test_det_mv3_db.cc index 67c2eeb0be5f94..cf3398b49ee9b9 100644 --- a/paddle/fluid/inference/tests/infer_ut/test_det_mv3_db.cc +++ b/paddle/fluid/inference/tests/infer_ut/test_det_mv3_db.cc @@ -35,44 +35,11 @@ paddle::test::Record PrepareInput(int batch_size, int image_shape = 640) { void 
PrepareDynamicShape(paddle_infer::Config* config, int max_batch_size = 4) { // set dynamic shape range std::map> min_input_shape = { - {"x", {1, 3, 50, 50}}, - {"conv2d_92.tmp_0", {1, 120, 20, 20}}, - {"conv2d_91.tmp_0", {1, 24, 10, 10}}, - {"conv2d_59.tmp_0", {1, 96, 20, 20}}, - {"nearest_interp_v2_1.tmp_0", {1, 256, 10, 10}}, - {"nearest_interp_v2_2.tmp_0", {1, 256, 20, 20}}, - {"conv2d_124.tmp_0", {1, 256, 20, 20}}, - {"nearest_interp_v2_3.tmp_0", {1, 64, 20, 20}}, - {"nearest_interp_v2_4.tmp_0", {1, 64, 20, 20}}, - {"nearest_interp_v2_5.tmp_0", {1, 64, 20, 20}}, - {"elementwise_add_7", {1, 56, 2, 2}}, - {"nearest_interp_v2_0.tmp_0", {1, 256, 2, 2}}}; + {"x", {1, 3, 50, 50}}}; std::map> max_input_shape = { - {"x", {max_batch_size, 3, 2000, 2000}}, - {"conv2d_92.tmp_0", {max_batch_size, 120, 400, 400}}, - {"conv2d_91.tmp_0", {max_batch_size, 24, 200, 200}}, - {"conv2d_59.tmp_0", {max_batch_size, 96, 400, 400}}, - {"nearest_interp_v2_1.tmp_0", {max_batch_size, 256, 200, 200}}, - {"nearest_interp_v2_2.tmp_0", {max_batch_size, 256, 400, 400}}, - {"conv2d_124.tmp_0", {max_batch_size, 256, 400, 400}}, - {"nearest_interp_v2_3.tmp_0", {max_batch_size, 64, 400, 400}}, - {"nearest_interp_v2_4.tmp_0", {max_batch_size, 64, 400, 400}}, - {"nearest_interp_v2_5.tmp_0", {max_batch_size, 64, 400, 400}}, - {"elementwise_add_7", {max_batch_size, 56, 400, 400}}, - {"nearest_interp_v2_0.tmp_0", {max_batch_size, 256, 400, 400}}}; + {"x", {max_batch_size, 3, 1600, 1600}}}; std::map> opt_input_shape = { - {"x", {1, 3, 640, 640}}, - {"conv2d_92.tmp_0", {1, 120, 160, 160}}, - {"conv2d_91.tmp_0", {1, 24, 80, 80}}, - {"conv2d_59.tmp_0", {1, 96, 160, 160}}, - {"nearest_interp_v2_1.tmp_0", {1, 256, 80, 80}}, - {"nearest_interp_v2_2.tmp_0", {1, 256, 160, 160}}, - {"conv2d_124.tmp_0", {1, 256, 160, 160}}, - {"nearest_interp_v2_3.tmp_0", {1, 64, 160, 160}}, - {"nearest_interp_v2_4.tmp_0", {1, 64, 160, 160}}, - {"nearest_interp_v2_5.tmp_0", {1, 64, 160, 160}}, - {"elementwise_add_7", {1, 56, 40, 40}}, - {"nearest_interp_v2_0.tmp_0", {1, 256, 40, 40}}}; + {"x", {1, 3, 640, 640}}}; config->SetTRTDynamicShapeInfo(min_input_shape, max_input_shape, opt_input_shape); } @@ -123,7 +90,7 @@ TEST(tensorrt_tester_det_mv3_db, multi_thread2_trt_fp32_dynamic_shape_bz2) { FLAGS_modeldir + "/inference.pdiparams"); config.EnableUseGpu(100, 0); config.EnableTensorRtEngine( - 1 << 20, 2, 3, paddle_infer::PrecisionType::kFloat32, true, false); + 1 << 20, 2, 3, paddle_infer::PrecisionType::kFloat32, false, false); PrepareDynamicShape(&config, 4); // get groudtruth by disbale ir paddle_infer::services::PredictorPool pred_pool_no_ir(config_no_ir, 1); diff --git a/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt b/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt index 54229533935a42..b951afdfad5ead 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt @@ -68,4 +68,5 @@ set_tests_properties(test_trt_conv_quant_dequant_pass PROPERTIES TIMEOUT 100) set_tests_properties(test_trt_matmul_quant_dequant PROPERTIES TIMEOUT 100) set_tests_properties(test_trt_conv3d_op PROPERTIES TIMEOUT 60) set_tests_properties(test_trt_conv3d_transpose_op PROPERTIES TIMEOUT 60) +set_tests_properties(test_trt_nearest_interp_v2_op PROPERTIES TIMEOUT 30) endif() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_nearest_interp_v2.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_nearest_interp_v2.py new 
file mode 100644 index 00000000000000..0c7715c957085a --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_nearest_interp_v2.py @@ -0,0 +1,101 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from trt_layer_auto_scan_test import TrtLayerAutoScanTest, SkipReasons +from program_config import TensorConfig, ProgramConfig +import numpy as np +import paddle.inference as paddle_infer +from functools import partial +from typing import Optional, List, Callable, Dict, Any, Set + + +class TrtConvertNearestInterpV2Test(TrtLayerAutoScanTest): + def is_program_valid(self, program_config: ProgramConfig) -> bool: + return True + + def sample_program_configs(self): + def generate_input(): + return np.ones([1, 3, 32, 32]).astype(np.float32) + + ops_config = [{ + "op_type": "nearest_interp_v2", + "op_inputs": { + "X": ["input_data"] + }, + "op_outputs": { + "Out": ["interp_output_data"] + }, + "op_attrs": { + "data_layout": "NCHW", + "interp_method": "nearest", + "align_corners": False, + "scale": [2., 2.], + "out_h": 0, + "out_w": 0 + } + }] + + ops = self.generate_op_config(ops_config) + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={"input_data": TensorConfig(data_gen=generate_input)}, + outputs=["interp_output_data"]) + + yield program_config + + def sample_predictor_configs( + self, program_config) -> (paddle_infer.Config, List[int], float): + def generate_dynamic_shape(attrs): + self.dynamic_shape.min_input_shape = {"input_data": [1, 3, 32, 32]} + self.dynamic_shape.max_input_shape = {"input_data": [4, 3, 64, 64]} + self.dynamic_shape.opt_input_shape = {"input_data": [1, 3, 64, 64]} + + def clear_dynamic_shape(): + self.dynamic_shape.min_input_shape = {} + self.dynamic_shape.max_input_shape = {} + self.dynamic_shape.opt_input_shape = {} + + def generate_trt_nodes_num(attrs, dynamic_shape): + return 1, 2 + + attrs = [ + program_config.ops[i].attrs + for i in range(len(program_config.ops)) + ] + + # for static_shape + clear_dynamic_shape() + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), 1e-2 + + # for dynamic_shape + generate_dynamic_shape(attrs) + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num(attrs, + True), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num(attrs, + True), 1e-2 + + def test(self): + self.run_test() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_nearest_interp_v2_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_nearest_interp_v2_op.py new file mode 100644 index 
00000000000000..101ace6cd54a83 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_nearest_interp_v2_op.py @@ -0,0 +1,151 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +from inference_pass_test import InferencePassTest +import paddle.fluid.core as core +from paddle import fluid +import paddle.nn.functional as F +from paddle.fluid.core import PassVersionChecker +from paddle.fluid.core import AnalysisConfig + + +class TRTNearestInterpTest(InferencePassTest): + def setUp(self): + self.set_params() + + with fluid.program_guard(self.main_program, self.startup_program): + if self.data_layout == 'NCHW': + shape = [ + -1, self.channels, self.origin_shape[0], + self.origin_shape[1] + ] + else: + shape = [ + -1, self.origin_shape[0], self.origin_shape[1], + self.channels + ] + data = fluid.data(name='data', shape=shape, dtype='float32') + resize_out = self.append_nearest_interp(data) + out = fluid.layers.batch_norm(resize_out, is_test=True) + + if self.data_layout == 'NCHW': + shape = [ + self.bs, self.channels, self.origin_shape[0], + self.origin_shape[1] + ] + else: + shape = [ + self.bs, self.origin_shape[0], self.origin_shape[1], + self.channels + ] + + self.feeds = {'data': np.random.random(shape).astype('float32'), } + self.enable_trt = True + self.trt_parameters = TRTNearestInterpTest.TensorRTParam( + 1 << 30, self.bs, 1, AnalysisConfig.Precision.Float32, False, False) + self.fetch_list = [out] + + def set_params(self): + self.bs = 4 + self.scale = -1 + self.channels = 3 + self.origin_shape = (32, 32) # HW + self.resize_shape = (64, 64) # HW + self.align_corners = False + self.data_layout = 'NCHW' + + def append_nearest_interp(self, data): + if self.scale > 0.: + return F.interpolate( + data, + scale_factor=self.scale, + align_corners=self.align_corners, + mode='nearest', + data_format=self.data_layout) + return F.interpolate( + data, + size=self.resize_shape, + align_corners=self.align_corners, + mode='nearest', + data_format=self.data_layout) + + def test_check_output(self): + if core.is_compiled_with_cuda(): + use_gpu = True + self.check_output_with_option(use_gpu, flatten=True) + self.assertTrue( + PassVersionChecker.IsCompatible('tensorrt_subgraph_pass')) + + +class TRTNearestInterpTest1(TRTNearestInterpTest): + def set_params(self): + self.bs = 4 + self.scale = 2. 
+ self.channels = 3 + self.origin_shape = (32, 32) # HW + self.resize_shape = (64, 64) # HW + self.align_corners = False + self.data_layout = 'NCHW' + + +class TRTNearestInterpTest2(TRTNearestInterpTest): + def set_params(self): + self.bs = 4 + self.scale = -1 + self.channels = 3 + self.origin_shape = (32, 32) # HW + self.resize_shape = (47, 48) # HW + self.align_corners = False + self.data_layout = 'NCHW' + + +class TRTNearestInterpTest3(TRTNearestInterpTest): + def set_params(self): + self.bs = 4 + self.scale = -1 + self.channels = 3 + self.origin_shape = (32, 32) # HW + self.resize_shape = (64, 64) # HW + self.align_corners = False + self.data_layout = 'NHWC' + + +class TRTNearestInterpTest4(TRTNearestInterpTest): + def set_params(self): + self.bs = 4 + self.scale = 2. + self.channels = 3 + self.origin_shape = (32, 32) # HW + self.resize_shape = (64, 64) # HW + self.align_corners = False + self.data_layout = 'NHWC' + + +class TRTNearestInterpTest5(TRTNearestInterpTest): + def set_params(self): + self.bs = 4 + self.scale = -1 + self.channels = 3 + self.origin_shape = (32, 32) # HW + self.resize_shape = (47, 48) # HW + self.align_corners = False + self.data_layout = 'NHWC' + + +if __name__ == "__main__": + unittest.main() From 6cdc5a4ba16f11a09e8a723204b02de1f16c51c3 Mon Sep 17 00:00:00 2001 From: jiangcheng Date: Tue, 19 Oct 2021 15:24:38 +0800 Subject: [PATCH 207/298] Optimize the subgraph generated by BuildCinnPass (#36503) * add feed op and new var for the generated subgraph * perfect the test script of build_cinn_pass * remove useless clear and perfect some annotation --- .../framework/paddle2cinn/build_cinn_pass.cc | 129 ++++++++++++++++-- .../paddle2cinn/build_cinn_pass_test.cc | 98 +++++++++++-- 2 files changed, 198 insertions(+), 29 deletions(-) diff --git a/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc b/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc index ffdbb46bd7c066..caddc8fbb7381d 100644 --- a/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc +++ b/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc @@ -64,10 +64,81 @@ using framework::ir::Node; using GraphNodeVec = std::vector; using GraphNodeSet = std::unordered_set; +// Deal with subgraph's feed input var node: +// create a new input var node and it's feed op node +void AddFeedOpAndVar(const std::unordered_set& feed_vars, + const GraphNodeSet& cluster, + const std::unordered_map& old_op2new_op, + Graph* graph) { + for (auto* old_var : feed_vars) { + // create feed op + OpDesc desc; + desc.SetType("feed"); + desc.SetOutput("Out", {old_var->Name()}); + auto op = graph->CreateOpNode(&desc); + + // create new feed var node (SSAGraph) + auto var = graph->CreateVarNode(old_var->Var()); + + // link feed op and feed var + op->outputs = {var}; + var->inputs = {op}; + + // link feed var to cluster op + for (auto* old_op : old_var->outputs) { + if (cluster.count(old_op)) { + var->outputs.emplace_back(old_op2new_op.at(old_op)); + old_op2new_op.at(old_op)->inputs.emplace_back(var); + } + // Do not need relink old op or old var here, they will be + // fixed in RemoveLinkFromCluster, here we just deal with + // new subgraph's node. 
+ } + } +} + +// Deal with subgraph's parameter var node: +// create a new input var node, it's data will get by scope, +// so it don't need feed op +void AddParamVar(const std::unordered_set& param_vars, + const GraphNodeSet& cluster, + const std::unordered_map& old_op2new_op, + Graph* graph) { + for (auto* old_var : param_vars) { + auto var = graph->CreateVarNode(old_var->Var()); + + for (auto* old_op : old_var->outputs) { + if (cluster.count(old_op)) { + var->outputs.emplace_back(old_op2new_op.at(old_op)); + old_op2new_op.at(old_op)->inputs.emplace_back(var); + } + } + } +} + +// Deal with subgraph's outputs var node: +// create a new output var node and it's fetch op +void AddOutputVar(const std::unordered_set& output_vars, + const GraphNodeSet& cluster, + const std::unordered_map& old_op2new_op, + Graph* graph) { + for (auto* old_var : output_vars) { + auto var = graph->CreateVarNode(old_var->Var()); + + for (auto* old_op : old_var->inputs) { + if (cluster.count(old_op)) { + var->inputs.emplace_back(old_op2new_op.at(old_op)); + old_op2new_op.at(old_op)->outputs.emplace_back(var); + } + } + } +} + // Create new subgraph with and op nodes are cluster nodes, and all // var node are from internal nodes -std::unique_ptr CreateNewSubGraph( - const GraphNodeSet& cluster, const GraphNodeSet& cluster_internals) { +std::unique_ptr CreateNewSubGraph(const GraphNodeSet& cluster, + const GraphNodeSet& cluster_internals, + const GraphNodeSet& cluster_inputs) { // Graph's constructor must has one parameter, and in our code, // the ProgramDesc is useless, so here we pass a temporary object. auto sub_graph = std::make_unique(framework::ProgramDesc()); @@ -84,6 +155,8 @@ std::unique_ptr CreateNewSubGraph( old_var2new_var[var] = sub_node; } + std::unordered_set need_feed_vars; + std::unordered_set param_vars, output_vars; // the subgraph is independently, so here we only need link // to the node in new subgraph, and discard the link to // out-graph. @@ -91,15 +164,36 @@ std::unique_ptr CreateNewSubGraph( for (auto* var : op->inputs) { if (cluster_internals.count(var)) { old_op2new_op[op]->inputs.emplace_back(old_var2new_var[var]); + } else if (cluster_inputs.count(var)) { + if (var->Var()->IsParameter()) { + // Parameters have been preserved in scope, compared to feed var, + // param just need add new var and don't need add feed op. + // The var is used for check whether we need preserve the tensor + // when transform paddle scope to CINN scope. + param_vars.insert(var); + } else { + // When the var is subgraph input and the var is not parameter, + // we need add a new feed op to feed the var. + need_feed_vars.insert(var); + } } } for (auto* var : op->outputs) { if (cluster_internals.count(var)) { old_op2new_op[op]->outputs.emplace_back(old_var2new_var[var]); + } else { + // Create new output var node to guarantee the independency of + // subgraph. In other words, the subgraph has no connection with + // other graph, even the input graph. + output_vars.insert(var); } } } + AddFeedOpAndVar(need_feed_vars, cluster, old_op2new_op, sub_graph.get()); + AddParamVar(param_vars, cluster, old_op2new_op, sub_graph.get()); + AddOutputVar(output_vars, cluster, old_op2new_op, sub_graph.get()); + for (auto* var : cluster_internals) { for (auto* op : var->inputs) { if (cluster.count(op)) { @@ -118,10 +212,12 @@ std::unique_ptr CreateNewSubGraph( // This interface is used to classify all variables involved in a cluster into // three types: inputs, outputs, and internals. 
-// Specially, the internal node is a node that only used by sub-graph, and +// The input node is some subgraph op's input but not any subgraph op's output. +// The output node is some subgraph op's output and some out-graph op's input. +// Specially, the internal node is a node that only used by subgraph, and // out-graph should not using this node at all. -// inputs & outputs & internals == NULL -// inputs | outputs | internals == all graph node +// cluster_inputs & cluster_outputs & cluster_internals == NULL +// cluster_outputs | cluster_internals == all graph op's outputs node void AnalyseClusterVariables(const GraphNodeSet& cluster, GraphNodeSet* cluster_inputs, GraphNodeSet* cluster_outputs, @@ -154,10 +250,6 @@ void AnalyseClusterVariables(const GraphNodeSet& cluster, } } - // if a output node also exists in input list, remove. - for (auto* var_node : *cluster_inputs) { - cluster_outputs->erase(var_node); - } // if a output node also exists in internal list, remove. for (auto* var_node : *cluster_internals) { cluster_outputs->erase(var_node); @@ -206,14 +298,23 @@ void RemoveLinkFromCluster(const GraphNodeSet& cluster, // removing useless link from cluster_inputs to cluster for (auto* var_node : cluster_inputs) { - auto preserved_nodes = get_preserved_ops(var_node->outputs); - var_node->outputs.assign(preserved_nodes.begin(), preserved_nodes.end()); + auto preserved_ops = get_preserved_ops(var_node->outputs); + var_node->outputs.assign(preserved_ops.begin(), preserved_ops.end()); + // According to SSA form, a var node must not be any two op's output, + // and the cluster_inputs var nodes is defined as an out-graph op's + // output, so the cluster_inputs var nodes are not any subgraph op's + // output. Do not reassign input list here. } // removing useless link from cluster to cluster_outputs for (auto* var_node : cluster_outputs) { - auto preserved_nodes = get_preserved_ops(var_node->inputs); - var_node->inputs.assign(preserved_nodes.begin(), preserved_nodes.end()); + auto preserved_ops = get_preserved_ops(var_node->inputs); + var_node->inputs.assign(preserved_ops.begin(), preserved_ops.end()); + + // Note that cluster_outputs var node maybe some subgraph op's input, + // here we need remove them. 
+ preserved_ops = get_preserved_ops(var_node->outputs); + var_node->outputs.assign(preserved_ops.begin(), preserved_ops.end()); } } @@ -272,7 +373,7 @@ void SearchAllSubgraphs(Graph* graph, &cluster_internals); cinn_subgraphs->emplace_back( - CreateNewSubGraph(cluster_set, cluster_internals)); + CreateNewSubGraph(cluster_set, cluster_internals, cluster_inputs)); // replacing subgraph to a new special op node ReplaceSubGraphWithSpecialOpNode(cluster_set, cluster_inputs, diff --git a/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc b/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc index 883d5c6fbfb391..bf68a2b554b7f1 100644 --- a/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc +++ b/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc @@ -54,6 +54,35 @@ inline Node* GetNode(const std::unordered_set& nodes, [&op_name](const Node* node) { return node->Name() == op_name; }); } +inline bool CheckGraphIndependence(const std::unordered_set& nodes) { + auto check_node_ok = [&nodes](Node* n1, Node* n2) -> bool { + if (n1->IsOp() && !n2->IsVar()) { + return false; + } + if (n1->IsVar() && !n2->IsOp()) { + return false; + } + if (nodes.count(n2) == 0) { + return false; + } + return true; + }; + + for (auto node : nodes) { + for (auto in : node->inputs) { + if (!check_node_ok(node, in)) { + return false; + } + } + for (auto out : node->outputs) { + if (!check_node_ok(node, out)) { + return false; + } + } + } + return true; +} + std::unique_ptr BuildNoCinnSubgraph() { ProgramDesc prog; auto g = std::make_unique(prog); @@ -67,6 +96,8 @@ std::unique_ptr BuildNoCinnSubgraph() { VarDesc var1("var1"); VarDesc var2("var2"); + var2.SetPersistable(true); + var2.SetIsParameter(true); VarDesc var3("var3"); VarDesc var4("var4"); @@ -109,6 +140,7 @@ TEST(BuildCinnPassTest, NoCinnSubgraph) { // After search, origin graph should no change ASSERT_EQ(previous_nodes, g->Nodes()); + ASSERT_TRUE(CheckGraphIndependence(g->Nodes())); // After search, there should one cinn subgraph ASSERT_TRUE(cinn_subgraphs.empty()); @@ -119,11 +151,8 @@ std::unique_ptr BuildAllOpSupportCinnGraph() { auto g = std::make_unique(prog); // v1 -- - // | // | --> mul --> v3 -- - // | | // v2 -- | --> add --> v5 --> relu --> v6 - // | // v4 -- OpDesc add_op; @@ -135,6 +164,8 @@ std::unique_ptr BuildAllOpSupportCinnGraph() { VarDesc var1("var1"); VarDesc var2("var2"); + var2.SetPersistable(true); + var2.SetIsParameter(true); VarDesc var3("var3"); VarDesc var4("var4"); VarDesc var5("var5"); @@ -192,6 +223,7 @@ TEST(BuildCinnPassTest, AllOpSupportCinn) { // v4 --| const auto& nodes = g->Nodes(); ASSERT_EQ(nodes.size(), static_cast(5)); + ASSERT_TRUE(CheckGraphIndependence(nodes)); // A new op named kCinnLaunchOp should be added ASSERT_TRUE(CheckNodeExisted(nodes, kCinnLaunchOp)); @@ -214,16 +246,34 @@ TEST(BuildCinnPassTest, AllOpSupportCinn) { ASSERT_FALSE(CheckNodeExisted(nodes, "relu")); // After search, there should has just one cinn subgraph - // mul --> v3 --> add --> v5 --> relu + // feed --> v1 -- + // | --> mul --> v3 -- + // v2 -- | --> add --> v5 --> relu --> v6 + // feed --> v4 -- ASSERT_EQ(cinn_subgraphs.size(), static_cast(1)); const auto& subgraph = cinn_subgraphs.back(); const auto& subnodes = subgraph->Nodes(); - ASSERT_EQ(subnodes.size(), static_cast(5)); + ASSERT_EQ(subnodes.size(), static_cast(11)); + ASSERT_TRUE(CheckGraphIndependence(subnodes)); ASSERT_TRUE(CheckNodeExisted(subnodes, "mul")); ASSERT_TRUE(CheckNodeExisted(subnodes, "add")); ASSERT_TRUE(CheckNodeExisted(subnodes, "relu")); + 
ASSERT_EQ(CountNode(subnodes, "feed"), 2); + + // No-parameter input should has feed op + auto new_v1 = GetNode(subnodes, "var1"); + ASSERT_EQ(new_v1->inputs.size(), static_cast(1)); + ASSERT_EQ(new_v1->outputs.size(), static_cast(1)); + ASSERT_EQ(new_v1->inputs[0]->Name(), "feed"); + ASSERT_EQ(new_v1->outputs[0]->Name(), "mul"); + + // Parameter input should not has feed op + auto new_v2 = GetNode(subnodes, "var2"); + ASSERT_TRUE(new_v2->inputs.empty()); + ASSERT_EQ(new_v2->outputs.size(), static_cast(1)); + ASSERT_EQ(new_v2->outputs[0]->Name(), "mul"); } std::unique_ptr BuildGraphWithOneCinnSubgraph() { @@ -231,9 +281,7 @@ std::unique_ptr BuildGraphWithOneCinnSubgraph() { auto g = std::make_unique(prog); // fake1 --> v1 -- - // | // | --> mul --> v3 --> relu --> v4 --> fake2 - // | // v2 -- OpDesc fake1_op; @@ -247,6 +295,8 @@ std::unique_ptr BuildGraphWithOneCinnSubgraph() { VarDesc var1("var1"); VarDesc var2("var2"); + var2.SetPersistable(true); + var2.SetIsParameter(true); VarDesc var3("var3"); VarDesc var4("var4"); @@ -299,6 +349,7 @@ TEST(BuildCinnPassTest, OneCinnSubgraph) { // v2 -- const auto& nodes = g->Nodes(); ASSERT_EQ(nodes.size(), static_cast(6)); + ASSERT_TRUE(CheckGraphIndependence(nodes)); // A new op named kCinnLaunchOp should be added ASSERT_TRUE(CheckNodeExisted(nodes, kCinnLaunchOp)); @@ -312,15 +363,19 @@ TEST(BuildCinnPassTest, OneCinnSubgraph) { ASSERT_TRUE(CheckNodeExisted(nodes, "fake2")); // After search, there should has just one cinn subgraph - // mul --> v3 --> relu + // feed --> v1 -- + // | --> mul --> v3 --> relu --> v4 + // v2 -- ASSERT_EQ(cinn_subgraphs.size(), static_cast(1)); const auto& subgraph = cinn_subgraphs.back(); const auto& subnodes = subgraph->Nodes(); - ASSERT_EQ(subnodes.size(), static_cast(3)); + ASSERT_EQ(subnodes.size(), static_cast(7)); + ASSERT_TRUE(CheckGraphIndependence(subnodes)); ASSERT_TRUE(CheckNodeExisted(subnodes, "mul")); ASSERT_TRUE(CheckNodeExisted(subnodes, "relu")); + ASSERT_EQ(CountNode(subnodes, "feed"), 1); } std::unique_ptr BuildGraphWithMultiCinnSubgraph() { @@ -328,9 +383,7 @@ std::unique_ptr BuildGraphWithMultiCinnSubgraph() { auto g = std::make_unique(prog); // fake1 --> v1 -- - // | // | --> mul --> v3 --> fake2 --> v4 --> relu --> v5 --> fake3 - // | // v2 -- OpDesc fake1_op; @@ -346,6 +399,8 @@ std::unique_ptr BuildGraphWithMultiCinnSubgraph() { VarDesc var1("var1"); VarDesc var2("var2"); + var2.SetPersistable(true); + var2.SetIsParameter(true); VarDesc var3("var3"); VarDesc var4("var4"); VarDesc var5("var5"); @@ -406,6 +461,7 @@ TEST(BuildCinnPassTest, MultiCinnSubgraph) { // v2 - const auto& nodes = g->Nodes(); ASSERT_EQ(nodes.size(), static_cast(10)); + ASSERT_TRUE(CheckGraphIndependence(nodes)); // A new op named kCinnLaunchOp should be added ASSERT_TRUE(CheckNodeExisted(nodes, kCinnLaunchOp)); @@ -424,15 +480,27 @@ TEST(BuildCinnPassTest, MultiCinnSubgraph) { // and each of subgraphs just has one node. 
ASSERT_EQ(cinn_subgraphs.size(), static_cast(2)); - // subgraph1: relu + // subgraph1: + // feed --> v4 --> relu --> v5 + // subgraph2: + // feed --> v1 -- + // | --> mul --> v3 + // v2 -- const auto& subgraph1 = cinn_subgraphs[0]; const auto& subnodes1 = subgraph1->Nodes(); - ASSERT_EQ(subnodes1.size(), static_cast(1)); + ASSERT_TRUE(CheckGraphIndependence(subnodes1)); - // subgraph2: mul const auto& subgraph2 = cinn_subgraphs[1]; const auto& subnodes2 = subgraph2->Nodes(); - ASSERT_EQ(subnodes2.size(), static_cast(1)); + ASSERT_TRUE(CheckGraphIndependence(subnodes2)); + + if (CheckNodeExisted(subnodes1, "relu")) { + ASSERT_EQ(subnodes1.size(), static_cast(4)); + ASSERT_EQ(subnodes2.size(), static_cast(5)); + } else { + ASSERT_EQ(subnodes2.size(), static_cast(4)); + ASSERT_EQ(subnodes1.size(), static_cast(5)); + } } } // namespace paddle2cinn From be6a83301e04389902137fee6aee41134e83f4f3 Mon Sep 17 00:00:00 2001 From: Wilber Date: Tue, 19 Oct 2021 15:49:13 +0800 Subject: [PATCH 208/298] Inference add type check in copy_from_cpu (#36429) * update * fix ut error * update ut --- .../fluid/inference/api/analysis_predictor.cc | 18 ++++++ .../api/analysis_predictor_tester.cc | 9 +++ .../inference/api/paddle_inference_api.h | 2 + paddle/fluid/inference/tensorrt/engine.cc | 13 ++++ paddle/fluid/inference/tensorrt/helper.h | 16 +++++ paddle/fluid/pybind/inference_api.cc | 11 ++-- python/paddle/fluid/inference/__init__.py | 2 +- python/paddle/fluid/inference/wrapper.py | 15 +++++ .../tests/unittests/test_inference_api.py | 59 +++++++++++++++++++ python/paddle/inference/__init__.py | 4 ++ 10 files changed, 144 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index dfa27037205f15..491ed71c4bcccf 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -36,6 +36,7 @@ #include "paddle/fluid/inference/analysis/helper.h" #include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h" #include "paddle/fluid/inference/api/helper.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_pass.h" #include "paddle/fluid/inference/utils/io_utils.h" #include "paddle/fluid/inference/utils/singleton.h" @@ -56,6 +57,7 @@ #if PADDLE_WITH_TENSORRT #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/helper.h" #include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h" #endif @@ -1471,6 +1473,22 @@ int GetNumBytesOfDataType(DataType dtype) { std::string GetVersion() { return paddle::get_version(); } +std::tuple GetTrtCompileVersion() { +#ifdef PADDLE_WITH_TENSORRT + return paddle::inference::tensorrt::GetTrtCompileVersion(); +#else + return std::tuple{0, 0, 0}; +#endif +} + +std::tuple GetTrtRuntimeVersion() { +#ifdef PADDLE_WITH_TENSORRT + return paddle::inference::tensorrt::GetTrtRuntimeVersion(); +#else + return std::tuple{0, 0, 0}; +#endif +} + std::string UpdateDllFlag(const char *name, const char *value) { return paddle::UpdateDllFlag(name, value); } diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc index 86fbde00075f09..a15a1cd84b1409 100644 --- a/paddle/fluid/inference/api/analysis_predictor_tester.cc +++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc @@ -359,6 +359,15 @@ TEST(AnalysisPredictor, set_xpu_device_id) { namespace paddle_infer { 
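// Editor's note (illustrative, not part of the patch): GetTrtCompileVersion()
// reports the NV_TENSORRT_* macros Paddle was built against, while
// GetTrtRuntimeVersion() decomposes the integer returned by
// getInferLibVersion(), e.g. 7103 -> (7, 1, 3); both fall back to (0, 0, 0)
// when TensorRT is not compiled in, as the test below simply logs.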
TEST(Predictor, Run) { + auto trt_compile_ver = GetTrtCompileVersion(); + auto trt_runtime_ver = GetTrtRuntimeVersion(); + LOG(INFO) << "trt compile version: " << std::get<0>(trt_compile_ver) << "." + << std::get<1>(trt_compile_ver) << "." + << std::get<2>(trt_compile_ver); + LOG(INFO) << "trt runtime version: " << std::get<0>(trt_runtime_ver) << "." + << std::get<1>(trt_runtime_ver) << "." + << std::get<2>(trt_runtime_ver); + Config config; config.SetModel(FLAGS_dirname); diff --git a/paddle/fluid/inference/api/paddle_inference_api.h b/paddle/fluid/inference/api/paddle_inference_api.h index a516abb1432ca8..35b90bfa54f73c 100644 --- a/paddle/fluid/inference/api/paddle_inference_api.h +++ b/paddle/fluid/inference/api/paddle_inference_api.h @@ -169,6 +169,8 @@ PD_INFER_DECL std::shared_ptr CreatePredictor( PD_INFER_DECL int GetNumBytesOfDataType(DataType dtype); PD_INFER_DECL std::string GetVersion(); +PD_INFER_DECL std::tuple GetTrtCompileVersion(); +PD_INFER_DECL std::tuple GetTrtRuntimeVersion(); PD_INFER_DECL std::string UpdateDllFlag(const char* name, const char* value); namespace services { diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc index 24644645eee49b..26182a79321993 100644 --- a/paddle/fluid/inference/tensorrt/engine.cc +++ b/paddle/fluid/inference/tensorrt/engine.cc @@ -190,6 +190,19 @@ void TensorRTEngine::FreezeNetwork() { #if IS_TRT_VERSION_GE(6000) LOG(INFO) << "Run Paddle-TRT Dynamic Shape mode."; for (auto &input : min_input_shape_) { +#if IS_TRT_VERSION_LT(7000) + // trt6 will check all_of input > 0 + if (!(std::all_of(input.second.begin(), input.second.end(), + [](int x) { return x > 0; }) && + std::all_of(max_input_shape_[input.first].begin(), + max_input_shape_[input.first].end(), + [](int x) { return x > 0; }) && + std::all_of(optim_input_shape_[input.first].begin(), + optim_input_shape_[input.first].end(), + [](int x) { return x > 0; }))) { + continue; + } +#endif VLOG(4) << "TRT dynamic_shape set " << input.first << " min: " << Vec2Str(input.second) << ", max: " << Vec2Str(max_input_shape_[input.first]) diff --git a/paddle/fluid/inference/tensorrt/helper.h b/paddle/fluid/inference/tensorrt/helper.h index 16595b8a032988..b8051d8610442f 100644 --- a/paddle/fluid/inference/tensorrt/helper.h +++ b/paddle/fluid/inference/tensorrt/helper.h @@ -73,8 +73,24 @@ static nvinfer1::IPluginRegistry* GetPluginRegistry() { static int GetInferLibVersion() { return static_cast(dy::getInferLibVersion()); } +#else +static int GetInferLibVersion() { return 0; } #endif +static std::tuple GetTrtRuntimeVersion() { + int ver = GetInferLibVersion(); + int major = ver / 1000; + ver -= major * 1000; + int minor = ver / 100; + int patch = ver - minor * 100; + return std::tuple{major, minor, patch}; +} + +static std::tuple GetTrtCompileVersion() { + return std::tuple{NV_TENSORRT_MAJOR, NV_TENSORRT_MINOR, + NV_TENSORRT_PATCH}; +} + // A logger for create TensorRT infer builder. 
class NaiveLogger : public nvinfer1::ILogger { public: diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc index 8ce7bea2d8e703..e02f25ff636a29 100644 --- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -330,6 +330,8 @@ void BindInferenceApi(py::module *m) { m->def("paddle_dtype_size", &paddle::PaddleDtypeSize); m->def("paddle_tensor_to_bytes", &SerializePDTensorToBytes); m->def("get_version", &paddle_infer::GetVersion); + m->def("get_trt_compile_version", &paddle_infer::GetTrtCompileVersion); + m->def("get_trt_runtime_version", &paddle_infer::GetTrtRuntimeVersion); m->def("get_num_bytes_of_data_type", &paddle_infer::GetNumBytesOfDataType); } @@ -739,10 +741,11 @@ void BindZeroCopyTensor(py::module *m) { void BindPaddleInferTensor(py::module *m) { py::class_(*m, "PaddleInferTensor") .def("reshape", &paddle_infer::Tensor::Reshape) - .def("copy_from_cpu", &PaddleInferTensorCreate) - .def("copy_from_cpu", &PaddleInferTensorCreate) - .def("copy_from_cpu", &PaddleInferTensorCreate) - .def("copy_from_cpu", &PaddleInferTensorCreate) + .def("copy_from_cpu_bind", &PaddleInferTensorCreate) + .def("copy_from_cpu_bind", &PaddleInferTensorCreate) + .def("copy_from_cpu_bind", &PaddleInferTensorCreate) + .def("copy_from_cpu_bind", + &PaddleInferTensorCreate) .def("copy_to_cpu", &PaddleInferTensorToNumpy) .def("shape", &paddle_infer::Tensor::shape) .def("set_lod", &paddle_infer::Tensor::SetLoD) diff --git a/python/paddle/fluid/inference/__init__.py b/python/paddle/fluid/inference/__init__.py index 3013c1f2aff87f..946b4f0c8d7b23 100644 --- a/python/paddle/fluid/inference/__init__.py +++ b/python/paddle/fluid/inference/__init__.py @@ -14,4 +14,4 @@ from .wrapper import Config, DataType, PlaceType, PrecisionType, Tensor, Predictor -from ..core import create_predictor, get_version, get_num_bytes_of_data_type, PredictorPool +from ..core import create_predictor, get_version, get_num_bytes_of_data_type, PredictorPool, get_trt_compile_version, get_trt_runtime_version diff --git a/python/paddle/fluid/inference/wrapper.py b/python/paddle/fluid/inference/wrapper.py index 96885edcc5e822..2c1b2c77504d92 100644 --- a/python/paddle/fluid/inference/wrapper.py +++ b/python/paddle/fluid/inference/wrapper.py @@ -15,9 +15,24 @@ from ..core import AnalysisConfig, PaddleDType, PaddlePlace from ..core import PaddleInferPredictor, PaddleInferTensor +import numpy as np + DataType = PaddleDType PlaceType = PaddlePlace PrecisionType = AnalysisConfig.Precision Config = AnalysisConfig Tensor = PaddleInferTensor Predictor = PaddleInferPredictor + + +def tensor_copy_from_cpu(self, data): + ''' + Support input type check based on tensor.copy_from_cpu. 
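    Illustrative usage (editor's sketch, names assumed from the unit test in
    this patch): data must be a numpy.ndarray, anything else raises TypeError.

        fake_input = np.ones((1, 6, 32, 32), dtype=np.float32)
        input_handle.copy_from_cpu(fake_input)           # ok
        input_handle.copy_from_cpu(fake_input.tolist())  # raises TypeError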
+ ''' + if not isinstance(data, np.ndarray): + raise TypeError( + "In copy_from_cpu, we only support numpy ndarray data type.") + self.copy_from_cpu_bind(data) + + +Tensor.copy_from_cpu = tensor_copy_from_cpu diff --git a/python/paddle/fluid/tests/unittests/test_inference_api.py b/python/paddle/fluid/tests/unittests/test_inference_api.py index 98ec0b3db04c49..7ed908eb33b819 100644 --- a/python/paddle/fluid/tests/unittests/test_inference_api.py +++ b/python/paddle/fluid/tests/unittests/test_inference_api.py @@ -14,10 +14,14 @@ import os, shutil import unittest +import paddle +paddle.enable_static() import numpy as np import paddle.fluid as fluid from paddle.fluid.core import PaddleTensor from paddle.fluid.core import PaddleDType +from paddle.inference import Config, Predictor, create_predictor +from paddle.inference import get_trt_compile_version, get_trt_runtime_version class TestInferenceApi(unittest.TestCase): @@ -54,5 +58,60 @@ def test_inference_api(self): tensor_float.ravel().tolist()) +def get_sample_model(): + place = fluid.CPUPlace() + exe = fluid.Executor(place) + + main_program = fluid.Program() + startup_program = fluid.Program() + with fluid.program_guard(main_program, startup_program): + data = fluid.data(name="data", shape=[-1, 6, 64, 64], dtype="float32") + conv_out = fluid.layers.conv2d( + input=data, + num_filters=3, + filter_size=3, + groups=1, + padding=0, + bias_attr=False, + act=None) + exe.run(startup_program) + serialized_program = paddle.static.serialize_program( + data, conv_out, program=main_program) + serialized_params = paddle.static.serialize_persistables( + data, conv_out, executor=exe, program=main_program) + return serialized_program, serialized_params + + +class TestInferenceBaseAPI(unittest.TestCase): + def get_config(self, model, params): + config = Config() + config.set_model_buffer(model, len(model), params, len(params)) + config.enable_use_gpu(100, 0) + return config + + def test_apis(self): + print('trt compile version:', get_trt_compile_version()) + print('trt runtime version:', get_trt_runtime_version()) + program, params = get_sample_model() + config = self.get_config(program, params) + predictor = create_predictor(config) + in_names = predictor.get_input_names() + in_handle = predictor.get_input_handle(in_names[0]) + in_data = np.ones((1, 6, 32, 32)).astype(np.float32) + in_handle.copy_from_cpu(in_data) + predictor.run() + + def test_wrong_input(self): + with self.assertRaises(TypeError): + program, params = get_sample_model() + config = self.get_config(program, params) + predictor = create_predictor(config) + in_names = predictor.get_input_names() + in_handle = predictor.get_input_handle(in_names[0]) + in_data = np.ones((1, 6, 64, 64)).astype(np.float32) + in_handle.copy_from_cpu(list(in_data)) + predictor.run() + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/inference/__init__.py b/python/paddle/inference/__init__.py index 4e172039716628..ec5295b6dfe561 100644 --- a/python/paddle/inference/__init__.py +++ b/python/paddle/inference/__init__.py @@ -20,6 +20,8 @@ from ..fluid.inference import Predictor # noqa: F401 from ..fluid.inference import create_predictor # noqa: F401 from ..fluid.inference import get_version # noqa: F401 +from ..fluid.inference import get_trt_compile_version # noqa: F401 +from ..fluid.inference import get_trt_runtime_version # noqa: F401 from ..fluid.inference import get_num_bytes_of_data_type # noqa: F401 from ..fluid.inference import PredictorPool # noqa: F401 @@ -32,6 +34,8 @@ 'Predictor', 
'create_predictor', 'get_version', + 'get_trt_compile_version', + 'get_trt_runtime_version', 'get_num_bytes_of_data_type', 'PredictorPool' ] From 9e4944725d7ad61ef2092dacdf0fecec78cac3fd Mon Sep 17 00:00:00 2001 From: danleifeng <52735331+danleifeng@users.noreply.github.com> Date: Tue, 19 Oct 2021 15:49:57 +0800 Subject: [PATCH 209/298] [heterps]edit shrink and unseenday logit for pslib (#36194) --- paddle/fluid/framework/fleet/fleet_wrapper.cc | 23 ++++++++++++ paddle/fluid/framework/fleet/fleet_wrapper.h | 2 ++ .../framework/fleet/heter_ps/hashtable_inl.h | 2 +- .../fluid/framework/fleet/ps_gpu_wrapper.cc | 13 +++++++ paddle/fluid/framework/fleet/ps_gpu_wrapper.h | 9 +++++ paddle/fluid/pybind/fleet_wrapper_py.cc | 1 + paddle/fluid/pybind/ps_gpu_wrapper_py.cc | 2 ++ .../distributed/fleet/dataset/dataset.py | 36 +++++++++++++++++++ python/paddle/fluid/dataset.py | 23 ++++++++++++ .../fleet/parameter_server/pslib/__init__.py | 9 +++++ .../unittests/test_communicator_ps_gpu.py | 2 +- 11 files changed, 120 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.cc b/paddle/fluid/framework/fleet/fleet_wrapper.cc index 4346c144fab7f2..7aeb9eaf3f1958 100644 --- a/paddle/fluid/framework/fleet/fleet_wrapper.cc +++ b/paddle/fluid/framework/fleet/fleet_wrapper.cc @@ -1334,6 +1334,29 @@ void FleetWrapper::SaveModelOneTablePrefix(const uint64_t table_id, #endif } +void FleetWrapper::SetDate(const uint64_t table_id, const std::string& date) { +#ifdef PADDLE_WITH_PSLIB + assert(date.size() == 8); + int year = std::stoi(date.substr(0, 4)); + int month = std::stoi(date.substr(4, 2)); + int day = std::stoi(date.substr(6, 2)); + struct std::tm b; + b.tm_year = year - 1900; + b.tm_mon = month - 1; + b.tm_mday = day; + b.tm_hour = b.tm_min = b.tm_sec = 0; + std::time_t seconds_from_1970 = std::mktime(&b); + int day_id = seconds_from_1970 / 86400; + auto ret = pslib_ptr_->_worker_ptr->set_day_id(table_id, day_id); + ret.wait(); + if (ret.get() != 0) { + LOG(ERROR) << "setdate : " << date << " failed"; + } +#else + VLOG(0) << "FleetWrapper::SetDate does nothing when no pslib"; +#endif +} + void FleetWrapper::PrintTableStat(const uint64_t table_id) { #ifdef PADDLE_WITH_PSLIB auto ret = pslib_ptr_->_worker_ptr->print_table_stat(table_id); diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.h b/paddle/fluid/framework/fleet/fleet_wrapper.h index d368b421ff2a05..6fddedccf02585 100644 --- a/paddle/fluid/framework/fleet/fleet_wrapper.h +++ b/paddle/fluid/framework/fleet/fleet_wrapper.h @@ -336,6 +336,8 @@ class FleetWrapper { // this performs better than rand_r, especially large data std::default_random_engine& LocalRandomEngine(); + void SetDate(const uint64_t table_id, const std::string& date); + #ifdef PADDLE_WITH_PSLIB static std::shared_ptr pslib_ptr_; #endif diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h b/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h index 9facbff1f25269..9f3d1a7adcafcc 100644 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h @@ -128,7 +128,7 @@ void HashTable::dump_to_cpu(int devid, cudaStream_t stream) { downpour_value->resize(gpu_val.mf_size + downpour_value_size); } float* cpu_val = downpour_value->data(); - cpu_val[0] = 0; + // cpu_val[0] = 0; cpu_val[1] = gpu_val.delta_score; cpu_val[2] = gpu_val.show; cpu_val[3] = gpu_val.clk; diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc index 
d1e98a711dc9dd..d3990c1f3dd769 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc @@ -181,6 +181,19 @@ void PSGPUWrapper::BuildTask(std::shared_ptr gpu_task) { VLOG(3) << "GpuPs shard: " << i << " key len: " << local_keys[i].size(); local_ptr[i].resize(local_keys[i].size()); } + +#ifdef PADDLE_WITH_PSLIB + // get day_id: day nums from 1970 + struct std::tm b; + b.tm_year = year_ - 1900; + b.tm_mon = month_ - 1; + b.tm_mday = day_; + b.tm_min = b.tm_hour = b.tm_sec = 0; + std::time_t seconds_from_1970 = std::mktime(&b); + int day_id = seconds_from_1970 / 86400; + fleet_ptr->pslib_ptr_->_worker_ptr->set_day_id(table_id_, day_id); +#endif + timeline.Start(); auto ptl_func = [this, &local_keys, &local_ptr, &fleet_ptr](int i) { size_t key_size = local_keys[i].size(); diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h index fa2ff6cbdb8c78..6f785cad33e2d2 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h @@ -240,6 +240,12 @@ class PSGPUWrapper { mf_max_bound); } } + void SetDate(int year, int month, int day) { + year_ = year; + month_ = month; + day_ = day; + } + void SetDataset(Dataset* dataset) { dataset_ = dataset; } // PSGPUWrapper singleton @@ -283,6 +289,9 @@ class PSGPUWrapper { int thread_keys_thread_num_ = 37; int thread_keys_shard_num_ = 37; uint64_t max_fea_num_per_pass_ = 5000000000; + int year_; + int month_; + int day_; std::shared_ptr< paddle::framework::ChannelObject>> diff --git a/paddle/fluid/pybind/fleet_wrapper_py.cc b/paddle/fluid/pybind/fleet_wrapper_py.cc index d8142f717baed8..af1c3da727d417 100644 --- a/paddle/fluid/pybind/fleet_wrapper_py.cc +++ b/paddle/fluid/pybind/fleet_wrapper_py.cc @@ -91,6 +91,7 @@ void BindFleetWrapper(py::module* m) { .def("save_model_one_table", &framework::FleetWrapper::SaveModelOneTable) .def("save_model_one_table_with_prefix", &framework::FleetWrapper::SaveModelOneTablePrefix) + .def("set_date", &framework::FleetWrapper::SetDate) .def("copy_table", &framework::FleetWrapper::CopyTable) .def("copy_table_by_feasign", &framework::FleetWrapper::CopyTableByFeasign); diff --git a/paddle/fluid/pybind/ps_gpu_wrapper_py.cc b/paddle/fluid/pybind/ps_gpu_wrapper_py.cc index 48365f42b11ba9..6e98a9479fa26a 100644 --- a/paddle/fluid/pybind/ps_gpu_wrapper_py.cc +++ b/paddle/fluid/pybind/ps_gpu_wrapper_py.cc @@ -41,6 +41,8 @@ void BindPSGPUWrapper(py::module* m) { py::call_guard()) .def("init_GPU_server", &framework::PSGPUWrapper::InitializeGPUServer, py::call_guard()) + .def("set_date", &framework::PSGPUWrapper::SetDate, + py::call_guard()) .def("set_dataset", &framework::PSGPUWrapper::SetDataset, py::call_guard()) .def("init_gpu_ps", &framework::PSGPUWrapper::InitializeGPU, diff --git a/python/paddle/distributed/fleet/dataset/dataset.py b/python/paddle/distributed/fleet/dataset/dataset.py index 25a1d98cb11218..e231ac55e679a2 100644 --- a/python/paddle/distributed/fleet/dataset/dataset.py +++ b/python/paddle/distributed/fleet/dataset/dataset.py @@ -748,6 +748,42 @@ def _generate_local_tables_unlock(self, table_id, fea_dim, read_thread_num, self.dataset.generate_local_tables_unlock( table_id, fea_dim, read_thread_num, consume_thread_num, shard_num) + def set_date(self, date): + """ + :api_attr: Static Graph + + Set training date for pull sparse parameters, saving and loading model. Only used in psgpu + + Args: + date(str): training date(format : YYMMDD). eg.20211111 + + Examples: + .. 
code-block:: python + + import paddle + paddle.enable_static() + + dataset = paddle.distributed.InMemoryDataset() + slots = ["slot1", "slot2", "slot3", "slot4"] + slots_vars = [] + for slot in slots: + var = paddle.static.data( + name=slot, shape=[None, 1], dtype="int64", lod_level=1) + slots_vars.append(var) + dataset.init( + batch_size=1, + thread_num=2, + input_type=1, + pipe_command="cat", + use_var=slots_vars) + dataset.set_date("20211111") + """ + year = int(date[:4]) + month = int(date[4:6]) + day = int(date[6:]) + if self.use_ps_gpu and core._is_compiled_with_heterps(): + self.psgpu.set_date(year, month, day) + def load_into_memory(self, is_shuffle=False): """ :api_attr: Static Graph diff --git a/python/paddle/fluid/dataset.py b/python/paddle/fluid/dataset.py index d683e36fbe5ab3..972f59d1e9058a 100644 --- a/python/paddle/fluid/dataset.py +++ b/python/paddle/fluid/dataset.py @@ -716,6 +716,29 @@ def generate_local_tables_unlock(self, table_id, fea_dim, read_thread_num, self.dataset.generate_local_tables_unlock( table_id, fea_dim, read_thread_num, consume_thread_num, shard_num) + def set_date(self, date): + """ + :api_attr: Static Graph + + Set training date for pull sparse parameters, saving and loading model. Only used in psgpu + + Args: + date(str): training date(format : YYMMDD). eg.20211111 + + Examples: + .. code-block:: python + + import paddle.fluid as fluid + + dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") + dataset.set_date("20211111") + """ + year = int(date[:4]) + month = int(date[4:6]) + day = int(date[6:]) + if self.use_ps_gpu and core._is_compiled_with_heterps(): + self.psgpu.set_date(year, month, day) + @deprecated( since="2.0.0", update_to="paddle.distributed.InMemoryDataset.load_into_memory") diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py index 78af7fd65dccbb..309532cafc2e16 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py @@ -799,6 +799,15 @@ def save_one_table(self, table_id, model_dir, **kwargs): self._fleet_ptr.save_model_one_table(table_id, model_dir, mode) self._role_maker._barrier_worker() + def set_date(self, table_id, date): + """ + set_date, eg, 20210918 + """ + self._role_maker._barrier_worker() + if self._role_maker.is_first_worker(): + self._fleet_ptr.set_date(table_id, str(date)) + self._role_maker._barrier_worker() + def _set_opt_info(self, opt_info): """ this function saves the result from DistributedOptimizer.minimize() diff --git a/python/paddle/fluid/tests/unittests/test_communicator_ps_gpu.py b/python/paddle/fluid/tests/unittests/test_communicator_ps_gpu.py index 6ab8a2c3a4b220..1faa084d412e42 100644 --- a/python/paddle/fluid/tests/unittests/test_communicator_ps_gpu.py +++ b/python/paddle/fluid/tests/unittests/test_communicator_ps_gpu.py @@ -74,6 +74,7 @@ def test_communicator_ps_gpu(self): batch_size=32, thread_num=1, pipe_command="cat", use_var=slots_vars) dataset.set_filelist(["test_communicator_ps_gpu.txt"]) dataset._set_use_ps_gpu(1) + dataset.set_date("20211111") dataset.load_into_memory(is_shuffle=True) os.environ["TEST_MODE"] = "1" @@ -88,7 +89,6 @@ def test_communicator_ps_gpu(self): pass except Exception as e: self.assertTrue(False) - time.sleep(10) fleet.stop_worker() os.remove("./test_communicator_ps_gpu.txt") From 49d7bd38448b7b876a08af8c8afb1062d9469f14 Mon Sep 17 00:00:00 2001 From: Qi Li Date: 
Tue, 19 Oct 2021 15:56:57 +0800 Subject: [PATCH 210/298] [NPU] update inference cmake, test=develop (#36505) * [NPU] update inference cmake, test=develop * address review comments, test=develop * fix compile error when WITH_ASCEND_CXX11 ON, test=develop --- cmake/external/ascend.cmake | 32 +++++++++++++++++++++++++++ cmake/inference_lib.cmake | 9 +++++++- cmake/miopen.cmake | 2 -- paddle/fluid/platform/resource_pool.h | 1 + 4 files changed, 41 insertions(+), 3 deletions(-) diff --git a/cmake/external/ascend.cmake b/cmake/external/ascend.cmake index 414b2a54be0342..b643923cdd3531 100644 --- a/cmake/external/ascend.cmake +++ b/cmake/external/ascend.cmake @@ -85,5 +85,37 @@ if(WITH_ASCEND_CL) ADD_LIBRARY(acl_op_compiler SHARED IMPORTED GLOBAL) SET_PROPERTY(TARGET acl_op_compiler PROPERTY IMPORTED_LOCATION ${acl_op_compiler_lib}) add_custom_target(extern_ascend_cl DEPENDS ascendcl acl_op_compiler) +endif() +if (WITH_ASCEND_CL) +macro(find_ascend_toolkit_version ascend_toolkit_version_info) + file(READ ${ascend_toolkit_version_info} ASCEND_TOOLKIT_VERSION_CONTENTS) + string(REGEX MATCH "version=([0-9]+\.[0-9]+\.[0-9]+\.[a-z]*[0-9]*)" ASCEND_TOOLKIT_VERSION "${ASCEND_TOOLKIT_VERSION_CONTENTS}") + string(REGEX REPLACE "version=([0-9]+\.[0-9]+\.[0-9]+\.[a-z]*[0-9]*)" "\\1" ASCEND_TOOLKIT_VERSION "${ASCEND_TOOLKIT_VERSION}") + if(NOT ASCEND_TOOLKIT_VERSION) + set(ASCEND_TOOLKIT_VERSION "???") + else() + message(STATUS "Current Ascend Toolkit version is ${ASCEND_TOOLKIT_VERSION}") + endif() +endmacro() + +macro(find_ascend_driver_version ascend_driver_version_info) + file(READ ${ascend_driver_version_info} ASCEND_DRIVER_VERSION_CONTENTS) + string(REGEX MATCH "Version=([0-9]+\.[0-9]+\.[0-9]+)" ASCEND_DRIVER_VERSION "${ASCEND_DRIVER_VERSION_CONTENTS}") + string(REGEX REPLACE "Version=([0-9]+\.[0-9]+\.[0-9]+)" "\\1" ASCEND_DRIVER_VERSION "${ASCEND_DRIVER_VERSION}") + if(NOT ASCEND_DRIVER_VERSION) + set(ASCEND_DRIVER_VERSION "???") + else() + message(STATUS "Current Ascend Driver version is ${ASCEND_DRIVER_VERSION}") + endif() +endmacro() + +if (WITH_ARM) + set(ASCEND_TOOLKIT_DIR ${ASCEND_DIR}/ascend-toolkit/latest/arm64-linux) +else() + set(ASCEND_TOOLKIT_DIR ${ASCEND_DIR}/ascend-toolkit/latest/x86_64-linux) endif() + +find_ascend_toolkit_version(${ASCEND_TOOLKIT_DIR}/ascend_toolkit_install.info) +find_ascend_driver_version(${ASCEND_DIR}/driver/version.info) +endif() \ No newline at end of file diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index cb2ed614d3d7ca..5ffbf15c960a32 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -353,7 +353,9 @@ function(version version_file) "WITH_MKL: ${WITH_MKL}\n" "WITH_MKLDNN: ${WITH_MKLDNN}\n" "WITH_GPU: ${WITH_GPU}\n" - "WITH_ROCM: ${WITH_ROCM}\n") + "WITH_ROCM: ${WITH_ROCM}\n" + "WITH_ASCEND_CL: ${WITH_ASCEND_CL}\n" + "WITH_ASCEND_CXX11: ${WITH_ASCEND_CXX11}\n") if(WITH_GPU) file(APPEND ${version_file} "CUDA version: ${CUDA_VERSION}\n" @@ -364,6 +366,11 @@ function(version version_file) "HIP version: ${HIP_VERSION}\n" "MIOpen version: v${MIOPEN_MAJOR_VERSION}.${MIOPEN_MINOR_VERSION}\n") endif() + if(WITH_ASCEND_CL) + file(APPEND ${version_file} + "Ascend Toolkit version: ${ASCEND_TOOLKIT_VERSION}\n" + "Ascend Driver version: ${ASCEND_DRIVER_VERSION}\n") + endif() file(APPEND ${version_file} "CXX compiler version: ${CMAKE_CXX_COMPILER_VERSION}\n") if(TENSORRT_FOUND) file(APPEND ${version_file} diff --git a/cmake/miopen.cmake b/cmake/miopen.cmake index f482f423dc5c12..493c37955f7258 100644 --- a/cmake/miopen.cmake +++ 
b/cmake/miopen.cmake @@ -15,8 +15,6 @@ find_path(MIOPEN_INCLUDE_DIR "miopen/miopen.h" NO_DEFAULT_PATH ) -get_filename_component(__libpath_hist ${CUDA_CUDART_LIBRARY} PATH) - find_library(MIOPEN_LIBRARY NAMES "libMIOpen.so" PATHS ${MIOPEN_ROOT} ${MIOPEN_ROOT}/lib ${MIOPEN_ROOT}/lib64 ${__libpath_hist} $ENV{MIOPEN_ROOT} $ENV{MIOPEN_ROOT}/lib $ENV{MIOPEN_ROOT}/lib64 diff --git a/paddle/fluid/platform/resource_pool.h b/paddle/fluid/platform/resource_pool.h index 3603c0f24f2790..f01d006d5b273b 100644 --- a/paddle/fluid/platform/resource_pool.h +++ b/paddle/fluid/platform/resource_pool.h @@ -16,6 +16,7 @@ #include #include +#include #include #include #include From f2612462bd0dcc87f406e458240155d2c9108613 Mon Sep 17 00:00:00 2001 From: Kaipeng Deng Date: Tue, 19 Oct 2021 16:54:54 +0800 Subject: [PATCH 211/298] fix op_flops not define. test=develop (#36489) --- python/paddle/hapi/static_flops.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/paddle/hapi/static_flops.py b/python/paddle/hapi/static_flops.py index 07fc19b2cb89a5..f386bbd0dd6db1 100644 --- a/python/paddle/hapi/static_flops.py +++ b/python/paddle/hapi/static_flops.py @@ -176,6 +176,7 @@ def count_element_op(op): def _graph_flops(graph, detail=False): assert isinstance(graph, GraphWrapper) flops = 0 + op_flops = 0 table = Table(["OP Type", 'Param name', "Flops"]) for op in graph.ops(): param_name = '' From 999242e35f450e2904df22a56ca8954f1811dbf8 Mon Sep 17 00:00:00 2001 From: zhulei <563755780@qq.com> Date: Tue, 19 Oct 2021 19:32:30 +0800 Subject: [PATCH 212/298] [NPU] Add iou_similarity op (#36412) * [NPU] Add iou_similarity op * [NPU] Add iou_similarity op * [NPU] Add iou_similarity op --- .../fluid/operators/detection/CMakeLists.txt | 2 + .../detection/iou_similarity_op_npu.cc | 192 ++++++++++++++++++ .../npu/test_iou_similarity_op_npu.py | 126 ++++++++++++ 3 files changed, 320 insertions(+) create mode 100644 paddle/fluid/operators/detection/iou_similarity_op_npu.cc create mode 100644 python/paddle/fluid/tests/unittests/npu/test_iou_similarity_op_npu.py diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt index 871240aa15fce0..506ae56a126427 100644 --- a/paddle/fluid/operators/detection/CMakeLists.txt +++ b/paddle/fluid/operators/detection/CMakeLists.txt @@ -64,6 +64,8 @@ endif() if(WITH_XPU) detection_library(iou_similarity_op SRCS iou_similarity_op.cc iou_similarity_op_xpu.cc) +elseif(WITH_ASCEND_CL) + detection_library(iou_similarity_op SRCS iou_similarity_op.cc iou_similarity_op_npu.cc) else() detection_library(iou_similarity_op SRCS iou_similarity_op.cc iou_similarity_op.cu) endif() diff --git a/paddle/fluid/operators/detection/iou_similarity_op_npu.cc b/paddle/fluid/operators/detection/iou_similarity_op_npu.cc new file mode 100644 index 00000000000000..9a91d4bd8fac13 --- /dev/null +++ b/paddle/fluid/operators/detection/iou_similarity_op_npu.cc @@ -0,0 +1,192 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/detection/iou_similarity_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +struct IouFunction { + public: + explicit IouFunction(const framework::ExecutionContext& ctx) : ctx(ctx) { + place = ctx.GetPlace(); + stream = ctx.template device_context() + .stream(); + } + void Transpose(const Tensor* x, Tensor* y, const std::vector& axis) { + // y should be init first + const auto& runner = + NpuOpRunner("TransposeD", {*x}, {*y}, {{"perm", axis}}); + runner.Run(stream); + } + void Add(const Tensor* x, const Tensor* y, Tensor* z) { + // y should be init first + const auto& runner = NpuOpRunner("AddV2", {*x, *y}, {*z}, {}); + runner.Run(stream); + } + void Sub(const Tensor* x, const Tensor* y, Tensor* z) { + // y should be init first + const auto& runner = NpuOpRunner("Sub", {*x, *y}, {*z}, {}); + runner.Run(stream); + } + void Mul(const Tensor* x, const Tensor* y, Tensor* z) { + // y should be init first + const auto& runner = NpuOpRunner("Mul", {*x, *y}, {*z}, {}); + runner.Run(stream); + } + void DivNoNan(const Tensor* x, const Tensor* y, Tensor* z) { + // y should be init first + const auto& runner = NpuOpRunner("DivNoNan", {*x, *y}, {*z}, {}); + runner.Run(stream); + } + void Adds(const Tensor* x, float scalar, Tensor* y) { + // y should be init first + const auto& runner = NpuOpRunner("Adds", {*x}, {*y}, {{"value", scalar}}); + runner.Run(stream); + } + void Maximum(const Tensor* x, const Tensor* y, Tensor* z) { + // z should be init first + const auto& runner = NpuOpRunner("Maximum", {*x, *y}, {*z}, {}); + runner.Run(stream); + } + void Minimum(const Tensor* x, const Tensor* y, Tensor* z) { + // z should be init first + const auto& runner = NpuOpRunner("Minimum", {*x, *y}, {*z}, {}); + runner.Run(stream); + } + + private: + platform::Place place; + aclrtStream stream; + const framework::ExecutionContext& ctx; +}; + +template +class IouSimilarityNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + bool normalized = ctx.Attr("box_normalized"); + auto* out = ctx.Output("Out"); + + auto _type = x->type(); + auto place = ctx.GetPlace(); + + IouFunction F(ctx); + + auto N = x->dims()[0]; + auto M = y->dims()[0]; + + out->mutable_data({N, M}, place); + Tensor xt(_type); + Tensor yt(_type); + xt.mutable_data({4, N}, place); + yt.mutable_data({4, M}, place); + std::vector vec_trans = {1, 0}; + F.Transpose(x, &xt, vec_trans); + F.Transpose(y, &yt, vec_trans); + Tensor xmin1 = xt.Slice(0, 1); + Tensor ymin1 = xt.Slice(1, 2); + Tensor xmax1 = xt.Slice(2, 3); + Tensor ymax1 = xt.Slice(3, 4); + Tensor xmin2 = yt.Slice(0, 1); + Tensor ymin2 = yt.Slice(1, 2); + Tensor xmax2 = yt.Slice(2, 3); + Tensor ymax2 = yt.Slice(3, 4); + xmin1.Resize({N, 1}); + ymin1.Resize({N, 1}); + xmax1.Resize({N, 1}); + ymax1.Resize({N, 1}); + xmin2.Resize({1, M}); + ymin2.Resize({1, M}); + xmax2.Resize({1, M}); + ymax2.Resize({1, M}); + + Tensor w1(_type); + Tensor h1(_type); + Tensor w2(_type); + Tensor h2(_type); + Tensor area1(_type); + Tensor area2(_type); + w1.mutable_data({N, 1}, place); + h1.mutable_data({N, 1}, place); + w2.mutable_data({1, M}, place); + h2.mutable_data({1, M}, place); + area1.mutable_data({N, 1}, place); + area2.mutable_data({1, M}, place); + F.Sub(&xmax1, &xmin1, &w1); + F.Sub(&ymax1, &ymin1, &h1); + F.Sub(&xmax2, &xmin2, &w2); + 
F.Sub(&ymax2, &ymin2, &h2); + if (!normalized) { + F.Adds(&w1, 1.0f, &w1); + F.Adds(&h1, 1.0f, &h1); + F.Adds(&w2, 1.0f, &w2); + F.Adds(&h2, 1.0f, &h2); + } + F.Mul(&w1, &h1, &area1); + F.Mul(&w2, &h2, &area2); + + Tensor inter_xmax(_type); + Tensor inter_ymax(_type); + Tensor inter_xmin(_type); + Tensor inter_ymin(_type); + inter_xmax.mutable_data({N, M}, place); + inter_ymax.mutable_data({N, M}, place); + inter_xmin.mutable_data({N, M}, place); + inter_ymin.mutable_data({N, M}, place); + F.Minimum(&xmax1, &xmax2, &inter_xmax); + F.Minimum(&ymax1, &ymax2, &inter_ymax); + F.Maximum(&xmin1, &xmin2, &inter_xmin); + F.Maximum(&ymin1, &ymin2, &inter_ymin); + + Tensor inter_w(_type); + Tensor inter_h(_type); + inter_w.mutable_data({N, M}, place); + inter_h.mutable_data({N, M}, place); + F.Sub(&inter_xmax, &inter_xmin, &inter_w); + F.Sub(&inter_ymax, &inter_ymin, &inter_h); + + if (!normalized) { + F.Adds(&inter_w, 1.0f, &inter_w); + F.Adds(&inter_h, 1.0f, &inter_h); + } + Tensor zeros(_type); + zeros.mutable_data({1}, place); + FillNpuTensorWithConstant(&zeros, static_cast(0)); + F.Maximum(&inter_w, &zeros, &inter_w); + F.Maximum(&inter_h, &zeros, &inter_h); + + F.Mul(&inter_w, &inter_h, out); + Tensor union_area(_type); + union_area.mutable_data({N, M}, place); + F.Add(&area1, &area2, &union_area); + F.Sub(&union_area, out, &union_area); + F.DivNoNan(out, &union_area, out); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(iou_similarity, ops::IouSimilarityNPUKernel, + ops::IouSimilarityNPUKernel); diff --git a/python/paddle/fluid/tests/unittests/npu/test_iou_similarity_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_iou_similarity_op_npu.py new file mode 100644 index 00000000000000..22042ce49200b3 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_iou_similarity_op_npu.py @@ -0,0 +1,126 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
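# Editor's note (illustrative, not part of the patch): the NPU kernel above
# computes the usual pairwise IoU,
#     iou = inter_area / (area1 + area2 - inter_area),
# adding 1 to widths/heights when box_normalized is False and using DivNoNan
# so an empty union yields 0 instead of NaN; _compute_iou below mirrors this
# in numpy as the reference result.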
+ +from __future__ import print_function + +import unittest +import numpy as np +import numpy.random as random +import sys +sys.path.append("..") +import math +import paddle +from op_test import OpTest + +paddle.enable_static() + +np.random.seed(2021) + + +class TestNpuIouSimilarityOp(OpTest): + def setUp(self): + self.op_type = "iou_similarity" + self.set_npu() + self.init_dtype() + self.set_init_config() + self.set_attrs() + self.set_inputs() + self.set_outputs() + + def set_npu(self): + self.__class__.use_npu = True + self.place = paddle.NPUPlace(0) + + def init_dtype(self): + self.dtype = np.float32 + + def set_init_config(self): + self.N = 2 + self.M = 3 + self.box_normalized = False + self.use_lod = False + + def set_inputs(self): + self.boxes1 = random.rand(self.N, 4).astype(self.dtype) + self.boxes2 = random.rand(self.M, 4).astype(self.dtype) + if self.use_lod: + self.boxes1_lod = [[1 for _ in range(self.N)]] + self.inputs = { + 'X': (self.boxes1, self.boxes1_lod), + 'Y': self.boxes2 + } + else: + self.inputs = {'X': self.boxes1, 'Y': self.boxes2} + + def set_attrs(self): + self.attrs = {"box_normalized": self.box_normalized} + + def set_outputs(self): + self.output = random.rand(self.N, self.M).astype(self.dtype) + self._compute_iou() + self.outputs = {'Out': self.output} + + def test_check_output(self): + self.check_output_with_place(self.place) + + def _compute_iou(self, ): + for row in range(self.boxes1.shape[0]): + for col in range(self.boxes2.shape[0]): + xmin1, ymin1, xmax1, ymax1 = self.boxes1[row] + xmin2, ymin2, xmax2, ymax2 = self.boxes2[col] + if not self.box_normalized: + area1 = (ymax1 - ymin1 + 1) * (xmax1 - xmin1 + 1) + area2 = (ymax2 - ymin2 + 1) * (xmax2 - xmin2 + 1) + else: + area1 = (ymax1 - ymin1) * (xmax1 - xmin1) + area2 = (ymax2 - ymin2) * (xmax2 - xmin2) + + inter_xmax = min(xmax1, xmax2) + inter_ymax = min(ymax1, ymax2) + inter_xmin = max(xmin1, xmin2) + inter_ymin = max(ymin1, ymin2) + inter_height = inter_ymax - inter_ymin + inter_width = inter_xmax - inter_xmin + if not self.box_normalized: + inter_height += 1 + inter_width += 1 + inter_height = max(inter_height, 0) + inter_width = max(inter_width, 0) + inter_area = inter_width * inter_height + union_area = area1 + area2 - inter_area + sim_score = inter_area / union_area + self.output[row, col] = sim_score + + +class TestNpuIouSimilarityOpWithLoD(TestNpuIouSimilarityOp): + def set_init_config(self): + super(TestNpuIouSimilarityOpWithLoD, self).set_init_config() + self.box_normalized = True + self.use_lod = True + + +class TestNpuIouSimilarityOpWithBoxNormalized(TestNpuIouSimilarityOp): + def set_init_config(self): + super(TestNpuIouSimilarityOpWithBoxNormalized, self).set_init_config() + self.box_normalized = True + self.use_lod = True + + +def TestNpuIouSimilarityOpFp16(TestNpuIouSimilarityOp): + def init_dtype(self): + self.dtype = np.float16 + + +if __name__ == '__main__': + unittest.main() From 51c97d9f14048c60fa901f397e3ba540ec353226 Mon Sep 17 00:00:00 2001 From: Weilong Wu <87417304+veyron95@users.noreply.github.com> Date: Tue, 19 Oct 2021 19:37:06 +0800 Subject: [PATCH 213/298] Support elementwise_add triple grad Kernel (#36508) * Support elementwise_add triple grad Kernel * Change code-format to follow CI std --- .../elementwise/elementwise_add_op.cc | 47 ++++++++++++-- .../elementwise/elementwise_add_op.cu | 11 ++++ .../elementwise/elementwise_add_op.h | 39 ++++++++++++ .../operators/elementwise/elementwise_op.h | 61 +++++++++++++++++++ .../fluid/tests/unittests/gradient_checker.py | 12 +++- 
.../unittests/test_elementwise_nn_grad.py | 54 ++++++++++++++++ 6 files changed, 217 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cc b/paddle/fluid/operators/elementwise/elementwise_add_op.cc index 67e2e3a1e96772..d66d6b66a05824 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cc @@ -110,6 +110,25 @@ class ElementwiseAddDoubleGradMaker : public framework::SingleGradOpMaker { } }; +template +class ElementwiseAddTripleGradMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType("elementwise_add_triple_grad"); + op->SetInput("DDX", this->Input("DDX")); + op->SetInput("DDY", this->Input("DDY")); + op->SetInput("D_DDOut", this->OutputGrad("DDOut")); + + op->SetAttrMap(this->Attrs()); + + op->SetOutput("D_DDX", this->InputGrad("DDX")); + op->SetOutput("D_DDY", this->InputGrad("DDY")); + } +}; + } // namespace operators } // namespace paddle @@ -123,10 +142,16 @@ REGISTER_OPERATOR( ops::ElementwiseAddDoubleGradMaker, ops::ElementwiseAddDoubleGradMaker); -REGISTER_OPERATOR(elementwise_add_grad_grad, - ops::ElementwiseOpDoubleGradWithoutDXDY, - ops::ElementwiseDoubleGradOpInplaceInferer, - ops::ElementwiseDoubleGradNoBufVarsInferer); +REGISTER_OPERATOR( + elementwise_add_grad_grad, ops::ElementwiseOpDoubleGradWithoutDXDY, + ops::ElementwiseDoubleGradOpInplaceInferer, + ops::ElementwiseDoubleGradNoBufVarsInferer, + ops::ElementwiseAddTripleGradMaker, + ops::ElementwiseAddTripleGradMaker); + +REGISTER_OPERATOR(elementwise_add_triple_grad, ops::ElementwiseOpTripleGrad, + ops::ElementwiseTripleGradOpInplaceInferer, + ops::ElementwiseTripleGradNoBufVarsInferer); REGISTER_OP_CPU_KERNEL( elementwise_add, @@ -162,6 +187,20 @@ REGISTER_OP_CPU_KERNEL( paddle::platform::complex>, ops::ElementwiseAddDoubleGradKernel>); +REGISTER_OP_CPU_KERNEL( + elementwise_add_triple_grad, + ops::ElementwiseAddTripleGradKernel, + ops::ElementwiseAddTripleGradKernel, + ops::ElementwiseAddTripleGradKernel, + ops::ElementwiseAddTripleGradKernel, + ops::ElementwiseAddTripleGradKernel>, + ops::ElementwiseAddTripleGradKernel>); // A specialization elementwise_add operator, used in gradient accumulation with // inplace addto. 
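Editor's note -- a minimal numpy sketch (not part of the patch, shapes assumed)
of what the elementwise_add_triple_grad op registered above computes: for
Out = X + Y, the gradient of DDOut with respect to DDX/DDY is an identity map,
reduced over any axes along which the corresponding input was broadcast,
matching the special cases in ElementwiseAddTripleGradKernel.

    import numpy as np

    d_ddout = np.random.rand(2, 3, 4, 5)   # incoming grad of DDOut
    d_ddx = d_ddout                         # DDX has the same shape: plain copy
    d_ddy = d_ddout.sum(axis=-1)            # DDY was broadcast from [2, 3, 4]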
diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cu b/paddle/fluid/operators/elementwise/elementwise_add_op.cu index 331867617bd78a..0b78aa4a01a741 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cu @@ -196,6 +196,17 @@ REGISTER_OP_CUDA_KERNEL( plat::complex>, ops::ElementwiseAddDoubleGradKernel>); +REGISTER_OP_CUDA_KERNEL( + elementwise_add_triple_grad, + ops::ElementwiseAddTripleGradKernel, + ops::ElementwiseAddTripleGradKernel, + ops::ElementwiseAddTripleGradKernel, + ops::ElementwiseAddTripleGradKernel, + ops::ElementwiseAddTripleGradKernel, + ops::ElementwiseAddTripleGradKernel>, + ops::ElementwiseAddTripleGradKernel>); REGISTER_OP_CUDA_KERNEL( grad_add, ops::ElementwiseAddKernel, diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.h b/paddle/fluid/operators/elementwise/elementwise_add_op.h index 6c61ce61eecd57..0ce4ca665dd9d1 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.h @@ -205,5 +205,44 @@ class ElementwiseAddDoubleGradKernel : public framework::OpKernel { } }; +template +class ElementwiseAddTripleGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + using Tensor = framework::Tensor; + auto *ddx = ctx.Input("DDX"); + auto *ddy = ctx.Input("DDY"); + auto *d_ddout = ctx.Input("D_DDOut"); + auto *d_ddx = ctx.Output("D_DDX"); + auto *d_ddy = ctx.Output("D_DDY"); + // skip out + auto *out = d_ddout; + + // Special case when d_ddy is not needed and d_ddx doesn't reduce + if (d_ddx != nullptr && d_ddy == nullptr && + d_ddx->dims() == d_ddout->dims()) { + VLOG(4) << "Special case when d_ddy is not needed and d_ddx doesn't " + "reduce"; + framework::TensorCopy( + *d_ddout, ctx.GetPlace(), + ctx.template device_context(), d_ddx); + } else if (d_ddx == nullptr && d_ddy != nullptr && + d_ddy->dims() == d_ddout->dims()) { + VLOG(4) << "Special case when d_ddx is not needed and d_ddy doesn't " + "reduce"; + framework::TensorCopy( + *d_ddout, ctx.GetPlace(), + ctx.template device_context(), d_ddy); + } else if (d_ddx != nullptr && d_ddy != nullptr && + (d_ddx->dims() == d_ddy->dims())) { + elementwise_add_grad(ctx, ddx, ddy, out, d_ddout, d_ddx, + d_ddy); + } else { + default_elementwise_add_grad(ctx, ddx, ddy, out, + d_ddout, d_ddx, d_ddy); + } + } +}; + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/elementwise/elementwise_op.h b/paddle/fluid/operators/elementwise/elementwise_op.h index 3614602156f4d9..5703e904c240b3 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_op.h @@ -426,6 +426,62 @@ class ElementwiseOpDoubleGradWithoutDXDY } }; +class ElementwiseOpTripleGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + using Tensor = framework::Tensor; + + void InferShape(framework::InferShapeContext *ctx) const override { + if (ctx->HasOutput("D_DDX")) { + ctx->ShareDim("DDX", "D_DDX"); + ctx->ShareLoD("DDX", "D_DDX"); + } + if (ctx->HasOutput("D_DDY")) { + ctx->ShareDim("DDY", "D_DDY"); + ctx->ShareLoD("DDY", "D_DDY"); + } + } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + framework::proto::VarType::Type input_data_type; + if (ctx.HasInput("DDX") == false) { + OP_INOUT_CHECK(ctx.HasInput("DDY"), 
"Input", "DDY", + "ElementwiseOpTripleGrad"); + input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "DDY"); + } else if (ctx.HasInput("DDY") == false) { + OP_INOUT_CHECK(ctx.HasInput("DDX"), "Input", "DDX", + "ElementwiseOpTripleGrad"); + input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "DDX"); + } else { + input_data_type = + OperatorWithKernel::IndicateOrPromoteVarDataTypes(ctx, "DDX", "DDY"); + } + +#ifdef PADDLE_WITH_MKLDNN + if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { + return framework::OpKernelType(input_data_type, ctx.GetPlace(), + framework::DataLayout::kMKLDNN, + framework::LibraryType::kMKLDNN); + } +#endif + return framework::OpKernelType(input_data_type, ctx.GetPlace()); + } + + framework::OpKernelType GetKernelTypeForVar( + const std::string &var_name, const framework::Tensor &tensor, + const framework::OpKernelType &expected_kernel_type) const { + if (framework::IsComplexType(expected_kernel_type.data_type_)) { + // only promote inputs’s types when contains complex input + return framework::OpKernelType(tensor.type(), tensor.place(), + tensor.layout()); + } else { + return framework::OpKernelType(expected_kernel_type.data_type_, + tensor.place(), tensor.layout()); + } + } +}; + template class ElemwiseGradKernel : public framework::OpKernel { public: @@ -447,9 +503,14 @@ DECLARE_INPLACE_OP_INFERER(ElementwiseGradOpInplaceInferer, DECLARE_INPLACE_OP_INFERER(ElementwiseDoubleGradOpInplaceInferer, {"DDX", "DDOut"}); +DECLARE_INPLACE_OP_INFERER(ElementwiseTripleGradOpInplaceInferer, + {"D_DDOut", "D_DDX"}); + DECLARE_NO_NEED_BUFFER_VARS_INFERER(ElementwiseGradNoBufVarsInferer, "X", "Y"); DECLARE_NO_NEED_BUFFER_VARS_INFERER(ElementwiseDoubleGradNoBufVarsInferer, "Y", "DOut"); +DECLARE_NO_NEED_BUFFER_VARS_INFERER(ElementwiseTripleGradNoBufVarsInferer, + "DDX", "DDY"); } // namespace operators } // namespace paddle diff --git a/python/paddle/fluid/tests/unittests/gradient_checker.py b/python/paddle/fluid/tests/unittests/gradient_checker.py index 01aa2fd9efa4fb..b56bbc07a7f44f 100644 --- a/python/paddle/fluid/tests/unittests/gradient_checker.py +++ b/python/paddle/fluid/tests/unittests/gradient_checker.py @@ -486,20 +486,26 @@ def triple_grad_check(x, var_to_np_array_in_scope(scope, place, v.name) for v in x_grads_grads ] - # append second order grads - target_grads_grads = fluid.gradients(target_grads, x, x_grads_grads) x += y_grads x_init = _as_list(x_init) x_init += y_grads_init + # append second order grads + target_grads_grads = fluid.gradients(target_grads, x, x_grads_grads) + + # filter None in target_grads_grads for Dy/Dx may be None in kernel + filted = [(i, dyi) for i, dyi in enumerate(target_grads_grads) + if dyi is not None] + filted_idx, filted_target_grads_grads = zip(*filted) + x += x_grads_grads x_init += x_grads_grads_init # x <=> [x, dout, ddx] grad_check( x=x, - y=target_grads_grads, + y=filted_target_grads_grads, x_init=x_init, place=place, program=program, diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_nn_grad.py b/python/paddle/fluid/tests/unittests/test_elementwise_nn_grad.py index 12b75c8bf703d2..0dba2b1924d249 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_nn_grad.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_nn_grad.py @@ -243,5 +243,59 @@ def test_grad(self): self.func(p) +class TestElementwiseAddTripleGradCheck(unittest.TestCase): + @prog_scope() + def func(self, place): + # the shape of input variable should be clearly specified, not inlcude -1. 
+ shape = [2, 3, 4, 5] + eps = 0.005 + dtype = np.float64 + + x = layers.data('x', shape, False, dtype) + y = layers.data('y', shape, False, dtype) + x.persistable = True + y.persistable = True + out = layers.elementwise_add(x, y) + x_arr = np.random.uniform(-1, 1, shape).astype(dtype) + y_arr = np.random.uniform(-1, 1, shape).astype(dtype) + + gradient_checker.triple_grad_check( + [x, y], out, x_init=[x_arr, y_arr], place=place, eps=eps) + + def test_grad(self): + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + for p in places: + self.func(p) + + +class TestElementwiseAddBroadcastTripleGradCheck(unittest.TestCase): + @prog_scope() + def func(self, place): + # the shape of input variable should be clearly specified, not inlcude -1. + shape = [2, 3, 4, 5] + eps = 0.005 + dtype = np.float64 + + x = layers.data('x', shape, False, dtype) + y = layers.data('y', shape[:-1], False, dtype) + x.persistable = True + y.persistable = True + out = layers.elementwise_add(x, y, axis=0) + x_arr = np.random.uniform(-1, 1, shape).astype(dtype) + y_arr = np.random.uniform(-1, 1, shape[:-1]).astype(dtype) + + gradient_checker.triple_grad_check( + [x, y], out, x_init=[x_arr, y_arr], place=place, eps=eps) + + def test_grad(self): + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + for p in places: + self.func(p) + + if __name__ == "__main__": unittest.main() From fe01ba6a14f9d8209fc07346c7701f953e8dba44 Mon Sep 17 00:00:00 2001 From: 0x45f <23097963+0x45f@users.noreply.github.com> Date: Wed, 20 Oct 2021 10:16:52 +0800 Subject: [PATCH 214/298] remove no_value using var.name (#36513) * remove no_value using var.name * fix unit test for CI * fix unit test * add test case * fix test case * add more test case --- .../dygraph_to_static/convert_operators.py | 42 +++++++- .../dygraph_to_static/variable_trans_func.py | 6 +- .../test_convert_operators.py | 95 +++++++++++++++++++ .../test_program_translator.py | 4 +- .../test_variable_trans_func.py | 18 ++-- 5 files changed, 151 insertions(+), 14 deletions(-) diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py b/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py index 4126e942259434..d27af5c0dd9e0c 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py @@ -20,6 +20,7 @@ from paddle.fluid.layers import assign, fill_constant, slice, reduce_all, reduce_any from paddle.fluid.layers import cast, control_flow, logical_and, logical_not, logical_or, nn from paddle.fluid.layers.control_flow import cond, while_loop, less_than, increment +from paddle.fluid.dygraph.dygraph_to_static.return_transformer import RETURN_NO_VALUE_VAR_NAME def convert_while_loop(cond, body, loop_vars): @@ -204,10 +205,45 @@ def convert_ifelse(pred, true_fn, false_fn, true_args, false_args, return_vars): """ if isinstance(pred, Variable): - return _run_paddle_cond(pred, true_fn, false_fn, true_args, false_args, - return_vars) + out = _run_paddle_cond(pred, true_fn, false_fn, true_args, false_args, + return_vars) else: - return _run_py_ifelse(pred, true_fn, false_fn, true_args, false_args) + out = _run_py_ifelse(pred, true_fn, false_fn, true_args, false_args) + + return _remove_no_value_return_var(out) + + +def _remove_no_value_return_var(out): + if out and isinstance(out, tuple): + processed_out = out + align_ret = out[0] + if isinstance(align_ret, tuple): + for 
index, item in enumerate(align_ret): + if isinstance(item, Variable) and ( + RETURN_NO_VALUE_VAR_NAME in item.name): + # return None + if index == 0: + processed_out = (None, ) + out[1:] + elif index == 1: + processed_out = align_ret[:1] + out[1:] + else: + processed_out = (align_ret[:index], ) + out[1:] + break + + for index, item in enumerate(processed_out): + if isinstance(item, Variable) and ( + RETURN_NO_VALUE_VAR_NAME in item.name): + processed_out = processed_out[:index] + + if not processed_out: + return None + elif len(processed_out) == 1: + return processed_out[0] + else: + return processed_out + + else: + return out def _run_paddle_cond(pred, true_fn, false_fn, true_args, false_args, diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/variable_trans_func.py b/python/paddle/fluid/dygraph/dygraph_to_static/variable_trans_func.py index b118eeadf7e7e5..2cd6c5e43f7e12 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/variable_trans_func.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/variable_trans_func.py @@ -93,14 +93,14 @@ def create_fill_constant_node(name, value): func_code = "{} = paddle.fluid.layers.fill_constant(shape=[1], ".format( name) if isinstance(value, bool): - func_code += "dtype='bool', value={})".format(value) + func_code += "dtype='bool', value={}, name='{}')".format(value, name) return gast.parse(func_code).body[0] if isinstance(value, float): - func_code += "dtype='float64', value={})".format(value) + func_code += "dtype='float64', value={}, name='{}')".format(value, name) return gast.parse(func_code).body[0] if isinstance(value, int): - func_code += "dtype='int64', value={})".format(value) + func_code += "dtype='int64', value={}, name='{}')".format(value, name) return gast.parse(func_code).body[0] diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_convert_operators.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_convert_operators.py index 54dcc152fd6b28..bb1942692fd9d2 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_convert_operators.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_convert_operators.py @@ -261,5 +261,100 @@ def test_tensor_shape(self): self.assertTrue(np.array_equal(out.numpy(), x.numpy())) +class TestIfElseNoValue(unittest.TestCase): + def test_else_ret_none(self): + input_x = paddle.to_tensor([[1, 2, 3], [4, 5, 6]]) + + @paddle.jit.to_static + def with_common_value(x, use_cache=False): + if use_cache: + y = x + 1 + z = x + 2 + return y, z + else: + c = x + 1 + z = x - 1 + return None + + @paddle.jit.to_static + def without_common_value(x, use_cache=False): + if use_cache: + y = x + 1 + z = x + 2 + return y, z + else: + c = x + 1 + return None + + out = with_common_value(input_x, False) + self.assertIsNone(out) + out = without_common_value(input_x, False) + self.assertIsNone(out) + + def test_else_ret_c(self): + input_x = paddle.to_tensor([[1, 2, 3], [4, 5, 6]]) + + @paddle.jit.to_static + def with_common_value(x, use_cache=False): + if use_cache: + y = x + 1 + z = x + 2 + return y, z + else: + c = x + 1 + z = x - 1 + return c + + @paddle.jit.to_static + def without_common_value(x, use_cache=False): + if use_cache: + y = x + 1 + z = x + 2 + return y, z + else: + c = x + 1 + return c + + out = with_common_value(input_x, False) + self.assertListEqual(paddle.tolist(out), paddle.tolist(input_x + 1)) + out = without_common_value(input_x, False) + self.assertListEqual(paddle.tolist(out), paddle.tolist(input_x + 1)) + y, z = 
with_common_value(input_x, True) + self.assertListEqual(paddle.tolist(y), paddle.tolist(input_x + 1)) + self.assertListEqual(paddle.tolist(z), paddle.tolist(input_x + 2)) + + def test_else_ret_cz(self): + input_x = paddle.to_tensor([[1, 2, 3], [4, 5, 6]]) + + @paddle.jit.to_static + def with_common_value(x, use_cache=False): + if use_cache: + y = x + 1 + z = x + 2 + return y, z, 1 + else: + c = x + 1 + z = x - 1 + return c, z + + @paddle.jit.to_static + def without_common_value(x, use_cache=False): + if use_cache: + y = x + 1 + z = x + 2 + return y, z, 1 + else: + c = x + 1 + d = x - 1 + return c, d + + c, z = with_common_value(input_x, False) + self.assertListEqual(paddle.tolist(c), paddle.tolist(input_x + 1)) + self.assertListEqual(paddle.tolist(z), paddle.tolist(input_x - 1)) + c, d = without_common_value(input_x, False) + self.assertListEqual(paddle.tolist(c), paddle.tolist(input_x + 1)) + self.assertListEqual(paddle.tolist(d), paddle.tolist(input_x - 1)) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py index 9e12b6fa208505..6fef356326b81d 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py @@ -64,7 +64,7 @@ def get_source_code(func): class StaticCode1(): def dyfunc_with_if_else(x_v, label=None): __return_value_init_0 = paddle.fluid.layers.fill_constant( - shape=[1], dtype='float64', value=0.0) + shape=[1], dtype='float64', value=0.0, name='__return_value_init_0') __return_value_0 = __return_value_init_0 def true_fn_0(x_v): @@ -116,7 +116,7 @@ class StaticCode2(): # TODO: Transform return statement def dyfunc_with_if_else(x_v, label=None): __return_value_init_1 = paddle.fluid.layers.fill_constant( - shape=[1], dtype='float64', value=0.0) + shape=[1], dtype='float64', value=0.0, name='__return_value_init_1') __return_value_1 = __return_value_init_1 def true_fn_3(x_v): diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_variable_trans_func.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_variable_trans_func.py index 3431c6aac4cbef..8500f46d974d8f 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_variable_trans_func.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_variable_trans_func.py @@ -50,16 +50,22 @@ def test_feed_mismatch_shape(self): class TestVariableTransFunc(unittest.TestCase): def test_create_fill_constant_node(self): node = create_fill_constant_node("a", 1.0) - source = "a = paddle.fluid.layers.fill_constant(shape=[1], dtype='float64', value=1.0)" - self.assertEqual(ast_to_source_code(node).strip(), source) + source = "a = paddle.fluid.layers.fill_constant(shape=[1], dtype='float64', value=1.0, name='a')" + self.assertEqual( + ast_to_source_code(node).replace('\n', '').replace(' ', ''), + source.replace(' ', '')) node = create_fill_constant_node("b", True) - source = "b = paddle.fluid.layers.fill_constant(shape=[1], dtype='bool', value=True)" - self.assertEqual(ast_to_source_code(node).strip(), source) + source = "b = paddle.fluid.layers.fill_constant(shape=[1], dtype='bool', value=True, name='b')" + self.assertEqual( + ast_to_source_code(node).replace('\n', '').replace(' ', ''), + source.replace(' ', '')) node = create_fill_constant_node("c", 4293) - source = "c = 
paddle.fluid.layers.fill_constant(shape=[1], dtype='int64', value=4293)" - self.assertEqual(ast_to_source_code(node).strip(), source) + source = "c = paddle.fluid.layers.fill_constant(shape=[1], dtype='int64', value=4293, name='c')" + self.assertEqual( + ast_to_source_code(node).replace('\n', '').replace(' ', ''), + source.replace(' ', '')) self.assertIsNone(create_fill_constant_node("e", None)) self.assertIsNone(create_fill_constant_node("e", [])) From 127488ba91fb5a9ead32cce93a23ec3750fcc90e Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Wed, 20 Oct 2021 10:19:24 +0800 Subject: [PATCH 215/298] Add kQueueSync.synchronize_run_ logic (#36546) --- .../fluid/framework/new_executor/interpretercore.cc | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index 083d989cb52672..f6157367cd4e2e 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -410,13 +410,14 @@ void InterpreterCore::RunNextInstruction(const Instruction& instr) { [&, next_id] { RunInstructionAsync(next_id); }); } } - - for (size_t i = 0; i < next_instr.direct_run_.size(); ++i) { - auto next_id = next_instr.direct_run_[i]; + auto direct_run_ops = interpretercore::merge_vector( + next_instr.synchronize_run_, next_instr.direct_run_); + size_t first_op = 0; + for (auto next_id : direct_run_ops) { if (IsReady(next_id)) { // only keep one op running in current thread - if (i == 0) { - RunInstructionAsync(next_id); + if (first_op == 0) { + first_op = next_id; continue; } // move rest ops into other threads @@ -425,6 +426,7 @@ void InterpreterCore::RunNextInstruction(const Instruction& instr) { [&, next_id] { RunInstructionAsync(next_id); }); } } + if (first_op != 0) RunInstructionAsync(first_op); } } From 797bd40d093189ce3c9f24fcd0f59bbe2878b2ca Mon Sep 17 00:00:00 2001 From: JZ-LIANG <38102074+JZ-LIANG@users.noreply.github.com> Date: Wed, 20 Oct 2021 10:23:35 +0800 Subject: [PATCH 216/298] [Auto Parallel] Generalization for Partition and Completion (#35735) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * default dist op * add dist_attr for dist op * add unitest * update inputname * update function name * add unitest * update CMakeLists.txt for CI * fix dis_matmul * fix compile error * update matmul to matmul_v2 * unify api * unify api * todo * update distop forward func * update distop forward func * auto parallel backward * update dist op * autoparallel backward * add backward for embedding * temp1 * temp2 * temp3 * temp4 * backward done1 * backward done2 * backward done3 * dist embedding remove mp mode * dist matmul remove mp mode * update dist embedding 『 * dist op init1 * dist op init 2 * update unitest * context remove parallel mode * partitioner remove parallel mode * update unitest * a more general method to support varying mesh in pipeline parallel * support varying mesh in pipeline parallel * embedding support varying mesh in pipeline parallel * matmul support varying mesh in pipeline parallel * default dist op support varying mesh in pipeline parallel * dist attribute for startup program * default dist op support varying mesh in pipeline parallel 2 * partitoner support varying mesh in pipeline parallel * revise logic for auto compeletion * revise framework.py * revise reshard unitest * revise unitest for parallelize * chmod * fixed bug for dist embedding name mapping 
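The "varying mesh" generalization described above leans on two small mesh-coordinate helpers, `_get_idx_in_axis` and `_get_comm_group`, added in `auto_parallel/utils.py` (that hunk is not reproduced here). A minimal sketch of their assumed semantics, written as a hypothetical NumPy reimplementation rather than the actual utils code:

    import numpy as np

    def idx_in_axis_sketch(process_group, topology, axis, rank_id):
        # this rank's index along one mesh axis, e.g. which embedding shard it owns
        mesh = np.array(process_group).reshape(topology)
        return int(np.argwhere(mesh == rank_id)[0][axis])

    def comm_group_sketch(process_group, topology, axis, rank_id):
        # ranks sharing this rank's coordinates on every mesh axis except `axis`
        mesh = np.array(process_group).reshape(topology)
        coord = list(np.argwhere(mesh == rank_id)[0])
        coord[axis] = slice(None)
        return [int(r) for r in mesh[tuple(coord)]]

    # a 2x4 mesh [[0, 1, 2, 3], [4, 5, 6, 7]]: rank 6 sits at index 2 of axis 1;
    # its axis-0 group is [2, 6] and its axis-1 group is [4, 5, 6, 7]
    assert idx_in_axis_sketch(list(range(8)), [2, 4], 1, 6) == 2
    assert comm_group_sketch(list(range(8)), [2, 4], 0, 6) == [2, 6]
    assert comm_group_sketch(list(range(8)), [2, 4], 1, 6) == [4, 5, 6, 7]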
Co-authored-by: zhaoyingli --- .../distributed/auto_parallel/completion.py | 269 +++--- .../distributed/auto_parallel/context.py | 125 ++- .../auto_parallel/operators/__init__.py | 1 + .../auto_parallel/operators/common.py | 6 +- .../auto_parallel/operators/dist_default.py | 247 +++++ .../auto_parallel/operators/dist_embedding.py | 331 ++++--- .../auto_parallel/operators/dist_matmul.py | 911 +++++++++++------- .../auto_parallel/operators/dist_reshape.py | 288 +++--- .../auto_parallel/operators/dist_softmax.py | 6 + .../auto_parallel/operators/dist_transpose.py | 6 + .../distributed/auto_parallel/parallelizer.py | 4 +- .../distributed/auto_parallel/partitioner.py | 414 ++++---- .../paddle/distributed/auto_parallel/utils.py | 45 +- python/paddle/fluid/backward.py | 13 +- .../fluid/tests/unittests/CMakeLists.txt | 3 + .../unittests/auto_parallel_parallelizer.py | 140 +++ .../test_auto_parallel_parallelizer.py | 126 +-- .../test_auto_parallel_partitioner.py | 100 +- .../test_auto_parallel_partitioner_gpt.py | 30 +- .../unittests/test_auto_parallel_reshard.py | 7 +- .../test_auto_parallel_reshard_dpmppp.py | 2 - .../test_auto_parallel_reshard_mppp.py | 2 - 22 files changed, 1896 insertions(+), 1180 deletions(-) create mode 100755 python/paddle/distributed/auto_parallel/operators/dist_default.py mode change 100644 => 100755 python/paddle/distributed/auto_parallel/operators/dist_embedding.py create mode 100755 python/paddle/fluid/tests/unittests/auto_parallel_parallelizer.py diff --git a/python/paddle/distributed/auto_parallel/completion.py b/python/paddle/distributed/auto_parallel/completion.py index 3fdbad6950db51..855eb656bd90e3 100644 --- a/python/paddle/distributed/auto_parallel/completion.py +++ b/python/paddle/distributed/auto_parallel/completion.py @@ -24,6 +24,7 @@ from .context import get_default_distributed_context from .operators import find_best_compatible_distributed_operator_impl from .attribute import OperatorDistributedAttribute, TensorDistributedAttribute +from paddle.distributed.fleet.meta_optimizers.common import OpRole ELEMENTWISE_LIKE_OP_LIST = ["elementwise_add", "gelu", "dropout", "cast"] @@ -600,7 +601,7 @@ def sort_key_fun(node): return program -def complete_backward_annotation(auto_parallel_main_prog, dist_context): +def complete_backward_annotation(auto_parallel_main_prog, dist_context=None): """Complete the annotation of vars and ops in the backward phase for parallel program.""" def _is_grad_var_name(name): @@ -608,24 +609,44 @@ def _is_grad_var_name(name): return True return False - grad_start_idx = None + def _get_forward_varname_from_grad_varname(grad_var_name): + assert _is_grad_var_name( + grad_var_name), "[{}] is not a grad varnme.".format(grad_var_name) + return grad_var_name[:grad_var_name.find("@GRAD")] + + def _get_op_by_id(ops, id): + for op in ops: + if op.desc.id() == id: + return op + return None + + if dist_context is None: + dist_context = get_default_distributed_context() + + grad_start_idx = -1 for idx, op in enumerate(auto_parallel_main_prog.global_block().ops): - for var_name in op.output_arg_names: - # TODO: use _is_loss_op to judge - if "@GRAD" in var_name and op.type == "fill_constant": - grad_start_idx = idx - break - assert grad_start_idx is not None, "No backward procedure found in this program." 
+ if int(op.attr('op_role')) == int( + int(core.op_proto_and_checker_maker.OpRole.Backward) | int( + core.op_proto_and_checker_maker.OpRole.Loss)): + assert op.type == "fill_constant" + grad_start_idx = idx + break + + assert grad_start_idx >= 0, "No backward procedure found in this program." ops = list(auto_parallel_main_prog.global_block().ops) vars = auto_parallel_main_prog.global_block().vars + for idx in range(grad_start_idx, len(ops)): - # complete the loss op + + # complete the initial grad loss op if idx == grad_start_idx: grad_var = vars[ops[idx].output_arg_names[0]] - grad_var_name = grad_var.name - forward_var_name = grad_var_name[:grad_var_name.find("@GRAD")] + forward_var_name = _get_forward_varname_from_grad_varname( + grad_var.name) forward_var = vars[forward_var_name] + + # TODO complete other attribte for grad var tensor_attr = TensorDistributedAttribute(grad_var, dist_context) process_mesh = dist_context.get_tensor_distributed_attr_for_program( forward_var).get_process_mesh() @@ -635,39 +656,31 @@ def _is_grad_var_name(name): tensor_attr.set_process_mesh(process_mesh) dist_context.set_tensor_distributed_attr_for_program(grad_var, tensor_attr) + op_attr = OperatorDistributedAttribute(ops[idx], dist_context) op_attr.set_process_mesh(process_mesh) dist_context.set_op_distributed_attr_for_program(ops[idx], op_attr) - - # in the data parallel mode, the loss op followed by scale op. - if ops[idx + 1].type == "scale" and grad_var_name in ops[idx + 1].input_arg_names \ - and grad_var_name in ops[idx + 1].output_arg_names: - op_attr = OperatorDistributedAttribute(ops[idx + 1], - dist_context) - op_attr.set_process_mesh(process_mesh) - dist_context.set_op_distributed_attr_for_program(ops[idx + 1], - op_attr) continue - # complete the annotation of the optimizer op. - # TODO: use _is_optimizer_op to judge - if "Grad" in ops[idx].input_names and "Param" in ops[idx].input_names: - assert len(ops[idx].input( - "Param")) == 1, "Only support one-to-one now." - assert len(ops[idx].input( - "Grad")) == 1, "Only support one-to-one now." - var = vars[ops[idx].input("Param")[0]] - grad_var = vars[ops[idx].input("Grad")[0]] + # TODO remove this when dist op handle its own grad scale + # in the data parallel mode, the loss op followed by scale op. + if ops[idx].type == "scale" and idx == grad_start_idx + 1: + assert grad_var.name in ops[ + idx].input_arg_names and grad_var.name in ops[ + idx].output_arg_names + grad_var = vars[ops[idx].output_arg_names[0]] + forward_var_name = _get_forward_varname_from_grad_varname( + grad_var.name) + forward_var = vars[forward_var_name] process_mesh = dist_context.get_tensor_distributed_attr_for_program( - var).get_process_mesh() - dims_mapping = dist_context.get_tensor_distributed_attr_for_program( - var).get_dims_mapping() + forward_var).get_process_mesh() op_attr = OperatorDistributedAttribute(ops[idx], dist_context) op_attr.set_process_mesh(process_mesh) - op_attr.set_input_dims_mapping(grad_var.name, dims_mapping) dist_context.set_op_distributed_attr_for_program(ops[idx], op_attr) continue + # TODO remove this when dist op handle its own communication + # TODO should distinguish the dp allreduce and mp allreduce # complete the c_allreduce_sum op for gradient in the data parallel mode. 
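Throughout this completion pass a tensor's layout is described by a `dims_mapping` list: entry `i` is `-1` when tensor axis `i` is replicated, otherwise it names the process-mesh axis that shards it. A small sketch of that convention with made-up sizes (the shard-shape arithmetic is an assumption inferred from the checks in this file, not code from the patch):

    def local_shape_sketch(global_shape, dims_mapping, mesh_topology):
        # shard each mapped tensor axis across the corresponding mesh axis
        shape = list(global_shape)
        for i, mesh_axis in enumerate(dims_mapping):
            if mesh_axis != -1:
                shape[i] //= mesh_topology[mesh_axis]
        return shape

    # a [1024, 768] parameter with dims_mapping [0, -1] on a mesh of topology [2, 4]
    # is split row-wise into [512, 768] shards; [-1, 1] would split it column-wise
    assert local_shape_sketch([1024, 768], [0, -1], [2, 4]) == [512, 768]
    assert local_shape_sketch([1024, 768], [-1, 1], [2, 4]) == [1024, 192]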
if ops[idx].type == "c_allreduce_sum" and ops[ idx].input_arg_names == ops[idx].output_arg_names: @@ -679,91 +692,123 @@ def _is_grad_var_name(name): dist_context.set_op_distributed_attr_for_program(ops[idx], op_attr) continue - # complete the annotation of grad op + # complete the annotation of grad op (xxx_grad op or sum op) grad_op = ops[idx] - for i, op in enumerate(ops[:grad_start_idx]): - match_op = None - grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc(op.desc, - set(), - []) - grad_op_input = [] - for input_arg_name in grad_op.desc.input_arg_names(): - if "@GRAD" in input_arg_name: - name = input_arg_name[:input_arg_name.find("@GRAD") + 5] - grad_op_input.append(name) - else: - grad_op_input.append(input_arg_name) - - # like sum op: the count of grad op will larger than 1 - if len(grad_op_desc_list) > 1: - for grad_op_desc in grad_op_desc_list: - if grad_op_input == grad_op_desc.input_arg_names() \ - and grad_op.desc.type() == grad_op_desc.type(): - match_op = op - break - elif len(grad_op_desc_list) == 1: - if grad_op_input == grad_op_desc_list[0].input_arg_names() \ - and grad_op.desc.type() == grad_op_desc_list[0].type(): - match_op = op - - if match_op is not None: - op_attr = dist_context.get_op_distributed_attr_for_program(op) - grad_op_attr = OperatorDistributedAttribute(grad_op, - dist_context) - grad_op_attr.set_process_mesh(op_attr.get_process_mesh()) - for var_name in grad_op.input_arg_names: - if "@GRAD" in var_name: - dims_mapping = dist_context.get_tensor_distributed_attr_for_program( - vars[var_name]).get_dims_mapping() - grad_op_attr.set_input_dims_mapping(var_name, - dims_mapping) - else: - dims_mapping = op_attr.get_input_dims_mapping(var_name) - grad_op_attr.set_input_dims_mapping(var_name, - dims_mapping) - dist_context.set_op_distributed_attr_for_program(grad_op, - grad_op_attr) - - for var_name in grad_op.output_arg_names: - if "@GRAD" in var_name: - forward_var = vars[var_name[:var_name.find("@GRAD")]] - tensor_attr = TensorDistributedAttribute(vars[var_name], - dist_context) - process_mesh = grad_op_attr.get_process_mesh() - dims_mapping = grad_op_attr.get_input_dims_mapping( - forward_var.name) - tensor_attr.set_process_mesh(process_mesh) - tensor_attr.set_dims_mapping(dims_mapping) - dist_context.set_tensor_distributed_attr_for_program( - vars[var_name], tensor_attr) - break - - # complete the annotation of sum op for multiple renamed grad var - if grad_op.type == "sum" and all( - map(_is_grad_var_name, grad_op.input_arg_names)): - assert len(grad_op.output_arg_names - ) == 1, "The output count of sum op should be one." 
+ + # xxx_grad op will have a corresponding forward op in gradopidx2opidx + dist_op_helper = dist_context.get_dist_op_helper() + if grad_op.desc.id() in dist_op_helper.gradopidx2opidx: + # TODO support the case where one forward op corresponding to multiple xxx_grad op + forward_op = _get_op_by_id( + ops[:grad_start_idx], + dist_op_helper.gradopidx2opidx[grad_op.desc.id()]) + assert forward_op is not None + + # op dist attr + forward_op_attr = dist_context.get_op_distributed_attr_for_program( + forward_op) grad_op_attr = OperatorDistributedAttribute(grad_op, dist_context) + grad_op_attr.set_process_mesh(forward_op_attr.get_process_mesh()) + for var_name in grad_op.input_arg_names: if "@GRAD" in var_name: - forward_var = vars[var_name[:var_name.find("@GRAD")]] dims_mapping = dist_context.get_tensor_distributed_attr_for_program( - forward_var).get_dims_mapping() + vars[var_name]).get_dims_mapping() + grad_op_attr.set_input_dims_mapping(var_name, dims_mapping) + else: + dims_mapping = forward_op_attr.get_input_dims_mapping( + var_name) + # TODO fixed here + if dims_mapping == None: + dims_mapping = forward_op_attr.get_output_dims_mapping( + var_name) + assert dims_mapping is not None, "[{}]'s dims_mapping is None".format( + var_name) grad_op_attr.set_input_dims_mapping(var_name, dims_mapping) + dist_context.set_op_distributed_attr_for_program(grad_op, + grad_op_attr) + # var dist attr for var_name in grad_op.output_arg_names: - forward_var = vars[var_name[:var_name.find("@GRAD")]] - tensor_attr = TensorDistributedAttribute(vars[var_name], - dist_context) - process_mesh = dist_context.get_tensor_distributed_attr_for_program( - forward_var).get_process_mesh() - dims_mapping = dist_context.get_tensor_distributed_attr_for_program( - forward_var).get_dims_mapping() - tensor_attr.set_dims_mapping(dims_mapping) - tensor_attr.set_process_mesh(process_mesh) - dist_context.set_tensor_distributed_attr_for_program( - vars[var_name], tensor_attr) - grad_op_attr.set_process_mesh( - dist_context.get_tensor_distributed_attr_for_program( - forward_var).get_process_mesh()) + if _is_grad_var_name(var_name): + + forward_var_name = _get_forward_varname_from_grad_varname( + var_name) + forward_var = vars[forward_var_name] + tensor_attr = TensorDistributedAttribute(vars[var_name], + dist_context) + process_mesh = grad_op_attr.get_process_mesh() + dims_mapping = grad_op_attr.get_input_dims_mapping( + forward_var_name) + tensor_attr.set_process_mesh(process_mesh) + tensor_attr.set_dims_mapping(dims_mapping) + dist_context.set_tensor_distributed_attr_for_program( + vars[var_name], tensor_attr) + + # only sum op for merge mutiple version grad has no a corresponding mapping in gradopidx2opidx + else: + assert grad_op.type == "sum", "got unexpect op [{}]".format( + str(grad_op.type)) + assert all(map(_is_grad_var_name, grad_op.input_arg_names)) + assert len(grad_op.output_arg_names) == 1 + + ref_forward_var_name = _get_forward_varname_from_grad_varname( + grad_op.output_arg_names[0]) + forward_var = vars[ref_forward_var_name] + ref_forward_var_dims_mapping = dist_context.get_tensor_distributed_attr_for_program( + forward_var).get_dims_mapping() + ref_forward_var_process_mesh = dist_context.get_tensor_distributed_attr_for_program( + forward_var).get_process_mesh() + + # output + tensor_attr = TensorDistributedAttribute( + vars[grad_op.output_arg_names[0]], dist_context) + tensor_attr.set_dims_mapping(ref_forward_var_dims_mapping) + tensor_attr.set_process_mesh(ref_forward_var_process_mesh) + 
dist_context.set_tensor_distributed_attr_for_program( + vars[grad_op.output_arg_names[0]], tensor_attr) + + # op + grad_op_attr = OperatorDistributedAttribute(grad_op, dist_context) + grad_op_attr.set_process_mesh(ref_forward_var_process_mesh) + for var_name in grad_op.input_arg_names: + assert _get_forward_varname_from_grad_varname( + var_name) == ref_forward_var_name + grad_op_attr.set_input_dims_mapping( + var_name, ref_forward_var_dims_mapping) dist_context.set_op_distributed_attr_for_program(grad_op, grad_op_attr) + + +def complete_update_annotation(auto_parallel_main_prog, dist_context): + """Complete the annotation of vars and ops in the update phase for parallel program.""" + + if dist_context is None: + dist_context = get_default_distributed_context() + + ops = list(auto_parallel_main_prog.global_block().ops) + vars = auto_parallel_main_prog.global_block().vars + + for idx in range(len(ops)): + + # complete the annotation of the optimizer op. + # TODO to add attribute for moment var + if int(ops[idx].attr('op_role')) == int(OpRole.Optimize): + if "Grad" in ops[idx].input_names and "Param" in ops[ + idx].input_names: + assert len(ops[idx].input( + "Param")) == 1, "Only support one-to-one now." + assert len(ops[idx].input( + "Grad")) == 1, "Only support one-to-one now." + param = vars[ops[idx].input("Param")[0]] + grad_var = vars[ops[idx].input("Grad")[0]] + process_mesh = dist_context.get_tensor_distributed_attr_for_program( + param).get_process_mesh() + dims_mapping = dist_context.get_tensor_distributed_attr_for_program( + param).get_dims_mapping() + op_attr = OperatorDistributedAttribute(ops[idx], dist_context) + op_attr.set_process_mesh(process_mesh) + op_attr.set_input_dims_mapping(grad_var.name, dims_mapping) + op_attr.set_input_dims_mapping(param.name, dims_mapping) + dist_context.set_op_distributed_attr_for_program(ops[idx], + op_attr) + continue diff --git a/python/paddle/distributed/auto_parallel/context.py b/python/paddle/distributed/auto_parallel/context.py index 5e6565aa3d84cb..6785f21351aa4a 100644 --- a/python/paddle/distributed/auto_parallel/context.py +++ b/python/paddle/distributed/auto_parallel/context.py @@ -51,23 +51,8 @@ def __init__(self): self._op_distributed_attr_map_for_program = {} self._tensor_distributed_attr_map_for_graph = {} self._op_distributed_attr_map_for_graph = {} - # The following is a hard code and will be removed in the future - self._data_parallel_axis = None - self._model_parallel_axis = None + self._get_dist_op_helper = DistOpHelper() self._process_mesh = _g_process_mesh_map.get(0, None) - if self._process_mesh is not None: - if self._process_mesh.ndim == 1: - self._data_parallel_axis = 0 - self._model_parallel_axis = 0 - elif self._process_mesh.ndim == 3: - self._data_parallel_axis = 1 - self._model_parallel_axis = 2 - else: - self._data_parallel_axis = 0 - self._model_parallel_axis = 1 - else: - self._data_parallel_axis = -1 - self._model_parallel_axis = -1 def is_initialized_for_program(self): return self._is_initialized_for_program @@ -120,16 +105,9 @@ def set_op_distributed_attr_for_graph(self, op_node, op_dist_attr): def set_process_mesh(self, process_mesh): self._process_mesh = process_mesh - if self._process_mesh is not None: - if self._process_mesh.ndim == 1: - self._data_parallel_axis = 0 - self._model_parallel_axis = 0 - else: - self._data_parallel_axis = 0 - self._model_parallel_axis = 1 - else: - self._data_parallel_axis = -1 - self._model_parallel_axis = -1 + + def get_dist_op_helper(self): + return self._get_dist_op_helper 
def initialize_distributed_attr_for_program(self, program): if self._is_initialized_for_program: @@ -425,10 +403,93 @@ def amend_distributed_attr_for_program(self): and process_mesh_shape[dims_mapping[i]] > tensor_shape[i]: dims_mapping[i] = -1 - def _get_data_parallel_info(self): - # This function is a hard code, and will be obsoleted in the future - return self._data_parallel_axis, self._process_mesh - def _get_model_parallel_info(self): - # This function is a hard code, and will be obsoleted in the future - return self._model_parallel_axis, self._process_mesh +class DistOpHelper: + """ + DistOpHelper is used to create a dist op desc in Program. + Every time to create a new dist op, the context should be updated for it accordingly. + """ + + def __init__(self): + self._dst_main_program = None + self._dst_startup_program = None + self._varname_mapping = None + self._rank_id = None + self._cur_src_op = None + self._cur_dist_attr = None + self.gradopidx2opidx = {} + self.already_init_sync_vars = set() + + def set_dst_main_program(self, prog): + self._dst_main_program = prog + + def get_dst_main_program(self): + return self._dst_main_program + + def set_dst_startup_program(self, prog): + self._dst_startup_program = prog + + def get_dst_startup_program(self): + return self._dst_startup_program + + def set_varname_mapping(self, mapping): + self._varname_mapping = mapping + + def get_varname_mapping(self): + return self._varname_mapping + + def set_rank_id(self, rank_id): + self._rank_id = rank_id + + def get_rank_id(self): + return self._rank_id + + def set_cur_src_op(self, cur_src_op): + self._cur_src_op = cur_src_op + + def get_cur_src_op(self): + return self._cur_src_op + + def prepare_forward_context(self, src_op): + + self.set_cur_src_op(src_op) + + # build input varname mapping + kinputs = {} + for input_name in src_op.desc.input_names(): + varnames = [] + for varname in src_op.desc.input(input_name): + varnames.append(self._varname_mapping[varname]) + kinputs[input_name] = varnames + + # build output varname mapping + koutputs = {} + for output_name in src_op.desc.output_names(): + varnames = [] + for varname in src_op.desc.output(output_name): + varnames.append(self._varname_mapping[varname]) + koutputs[output_name] = varnames + + return kinputs, koutputs + + def prepare_backward_context(self, backward_op): + + self.set_cur_src_op(backward_op) + + # build input varname mapping + kinputs = {} + for input_name in backward_op.desc.input_names(): + varnames = [] + for varname in backward_op.desc.input(input_name): + varnames.append(varname) + kinputs[input_name] = varnames + + # build output varname mapping + koutputs = {} + for output_name in backward_op.desc.output_names(): + varnames = [] + for varname in backward_op.desc.output(output_name): + varnames.append(varname) + koutputs[output_name] = varnames + + return kinputs, koutputs diff --git a/python/paddle/distributed/auto_parallel/operators/__init__.py b/python/paddle/distributed/auto_parallel/operators/__init__.py index 14ded477cb7092..3b3359b4ebf1cf 100644 --- a/python/paddle/distributed/auto_parallel/operators/__init__.py +++ b/python/paddle/distributed/auto_parallel/operators/__init__.py @@ -22,3 +22,4 @@ from . import dist_reshape from . import dist_softmax from . import dist_transpose +from . 
import dist_default diff --git a/python/paddle/distributed/auto_parallel/operators/common.py b/python/paddle/distributed/auto_parallel/operators/common.py index 1b0b05d39547ac..5685c40a3227b6 100644 --- a/python/paddle/distributed/auto_parallel/operators/common.py +++ b/python/paddle/distributed/auto_parallel/operators/common.py @@ -36,10 +36,12 @@ def __init__(self): self._forward_implemented = False self._backward_implemented = False - def forward(self, dist_ctx, *args, **kwargs): + @staticmethod + def forward(dist_ctx, *args, **kwargs): raise NotImplementedError("Please Implement this method in Subclass.") - def backward(self, dist_ctx, *grad_outputs): + @staticmethod + def backward(dist_ctx, *grad_outputs, **kwargs): raise NotImplementedError("Please Implement this method in Subclass.") def get_name(self): diff --git a/python/paddle/distributed/auto_parallel/operators/dist_default.py b/python/paddle/distributed/auto_parallel/operators/dist_default.py new file mode 100755 index 00000000000000..cf17b7afb0f397 --- /dev/null +++ b/python/paddle/distributed/auto_parallel/operators/dist_default.py @@ -0,0 +1,247 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License + +from .common import DistributedOperator +from .common import DistributedOperatorImpl +from .common import register_distributed_operator +from .common import register_distributed_operator_impl +from ..utils import is_dim_shard +from ..utils import is_dim_replicate +from ..utils import is_valid_list_index +from ..utils import compute_compatible_dim_mapping +from ..utils import compute_compatible_dims_mapping +from ..utils import compute_compatible_and_update_dim_mapping +from ..attribute import OperatorDistributedAttribute +from paddle.fluid import core, unique_name +from paddle.fluid.framework import in_dygraph_mode +from paddle.fluid.framework import Program, Parameter, Variable, program_guard +from paddle.fluid.data_feeder import check_variable_and_dtype, check_dtype +from paddle.distributed.fleet.meta_optimizers.common import OpRole, OP_ROLE_KEY, OP_ROLE_VAR_KEY +from ..process import new_process_group +from ..utils import _get_comm_group, _get_corresponding_rank + + +class DistributedDefault(DistributedOperator): + def __init__(self, name): + super(DistributedDefault, self).__init__() + self._name = name + + +register_distributed_operator("default", DistributedDefault("default")) + + +# Replicated Default +class DistributedDefaultImpl0(DistributedOperatorImpl): + def __init__(self, name): + super(DistributedDefaultImpl0, self).__init__() + self._name = name + self._forward_implemented = True + self._backward_implemented = True + + def is_process_mesh_compatible(self, op_dist_attr): + raise NotImplementedError("Please Implement this method.") + + def is_input_compatible(self, op_dist_attr): + raise NotImplementedError("Please Implement this method.") + + def is_output_compatible(self, op_dist_attr): + raise NotImplementedError("Please Implement this method.") + + def 
update_dims_mapping(self, op_dist_attr): + raise NotImplementedError("Please Implement this method.") + + @staticmethod + def forward(ctx, *args, **kwargs): + + dist_op_helper = ctx.get_dist_op_helper() + main_block = dist_op_helper.get_dst_main_program().global_block() + startup_block = dist_op_helper.get_dst_startup_program().global_block() + src_op = dist_op_helper.get_cur_src_op() + varname_mapping = dist_op_helper.get_varname_mapping() + rank_id = dist_op_helper.get_rank_id() + + # check validation of inputs / outputs + for input_name in src_op.desc.input_names(): + assert input_name in kwargs, "input [{}] is not given".format( + input_name) + assert len(kwargs[input_name]) == len( + src_op.desc.input(input_name) + ), "number of tensor for input [{}] is not match".format(input_name) + for output_name in src_op.desc.output_names(): + assert output_name in kwargs, "input [{}] is not given".format( + output_name) + assert len(kwargs[output_name]) == len( + src_op.desc.output(output_name) + ), "number of tensor for input [{}] is not match".format( + output_name) + + # replicate op in dist program + dist_op_desc = main_block.desc.append_op() + dist_op_desc.copy_from(src_op.desc) + for input_name in src_op.desc.input_names(): + dist_op_desc.set_input(input_name, kwargs[input_name]) + for output_name in src_op.desc.output_names(): + dist_op_desc.set_output(output_name, kwargs[output_name]) + + main_block._sync_with_cpp() + + # param initialization sync + for varname in dist_op_desc.input_arg_names(): + if startup_block.has_var(varname) and startup_block.var( + varname + ).is_parameter and varname not in dist_op_helper.already_init_sync_vars: + dist_op_helper.already_init_sync_vars.add(varname) + param = startup_block.var(varname) + param_dist_attr = ctx.get_tensor_distributed_attr_for_program( + param) + process_mesh = param_dist_attr.get_process_mesh() + dims_mapping = param_dist_attr.get_dims_mapping() + + # FIXME (JZ-LIANG) Remove this hack to support any op mesh group for Pipeline Parallelism + if rank_id not in process_mesh.process_group: + rank_id = _get_corresponding_rank(process_mesh, rank_id) + + # NOTE all not splited axis should be presented in mesh + for axis, size in enumerate(process_mesh.topology): + if size <= 1 or axis in dims_mapping: + pass + else: + group_ranks = _get_comm_group( + process_mesh.process_group, process_mesh.topology, + axis, rank_id) + sync_group = new_process_group(group_ranks) + + new_op = startup_block.append_op( + type='c_broadcast', + inputs={'X': param}, + outputs={'Out': param}, + attrs={ + 'ring_id': sync_group.id, + 'root': 0, + 'use_calc_stream': True, + OP_ROLE_KEY: OpRole.Forward + }) + + # set distributed attribute + op_attr = OperatorDistributedAttribute(new_op, ctx) + op_attr.set_process_mesh(process_mesh) + op_attr.set_output_dims_mapping(param.name, + dims_mapping) + op_attr.set_input_dims_mapping(param.name, dims_mapping) + ctx.set_op_distributed_attr_for_program(new_op, op_attr) + + startup_block._sync_with_cpp() + + @staticmethod + def backward(ctx, *args, **kwargs): + + # by now the backward function only insert the gradient allreduce for dist op itself + dist_op_helper = ctx.get_dist_op_helper() + main_block = dist_op_helper.get_dst_main_program().global_block() + backward_op = dist_op_helper.get_cur_src_op() + dist_attr = ctx.get_op_distributed_attr_for_program(backward_op) + assert dist_attr is not None, "backward op [{}] don't have dist attribute !".format( + str(backward_op)) + rank_id = dist_op_helper.get_rank_id() + + # check if 
need gradient allreduce + # if there is a non-gradient & non-parameter input and its batch dimension is splited, + # we need insert gradient allreduce for the gradient of parameter in its output + need_gradient_allreduce = False + for input_name in backward_op.desc.input_names(): + for varname in backward_op.desc.input(input_name): + if "@GRAD" not in varname and not main_block.var( + varname).is_parameter: + + # NOTE input var's dim_mapping of backward op should be the same with input var instead of corresponding varname of forward op + process_mesh = dist_attr.get_process_mesh() + var_dim_mapping = dist_attr.get_input_dims_mapping(varname) + + # FIXME (JZ-LIANG) Remove this hack to support any op mesh group for Pipeline Parallelism + if rank_id not in process_mesh.process_group: + rank_id = _get_corresponding_rank(process_mesh, rank_id) + + mesh_shape = process_mesh.topology + batch_size_axis = var_dim_mapping[0] + if batch_size_axis > -1 and mesh_shape[batch_size_axis] > 1: + need_gradient_allreduce = True + group_ranks = _get_comm_group( + process_mesh.process_group, process_mesh.topology, + batch_size_axis, rank_id) + dp_degree = len(group_ranks) + dp_group = new_process_group(group_ranks) + break + + if need_gradient_allreduce: + allreduce_vars = [] + for input_name in backward_op.desc.input_names(): + for varname in backward_op.desc.input(input_name): + if "@GRAD" not in varname and main_block.var( + varname).is_parameter: + assert len( + backward_op.desc.input(input_name) + ) == 1, "parameter input to grad op should be length 1, but got [{}]".format( + backward_op.desc.input(input_name)) + + assert varname + "@GRAD" in backward_op.desc.output_arg_names( + ), "parameter's grad [{}] not found in the grad op's output".format( + varname + "@GRAD") + assert len( + backward_op.desc.output(input_name + "@GRAD") + ) == 1, "parameter grad of grad op should be length 1, but got [{}]".format( + backward_op.desc.output(input_name + "@GRAD")) + allreduce_vars.append( + backward_op.desc.output(input_name + "@GRAD")[0]) + + if len(allreduce_vars) > 0: + + for varname in allreduce_vars: + + grad_var = main_block.var(varname) + allreduce_op = main_block.append_op( + type='c_allreduce_sum', + inputs={'X': [grad_var]}, + outputs={'Out': [grad_var]}, + attrs={ + 'ring_id': dp_group.id, + 'use_calc_stream': True, + OP_ROLE_KEY: OpRole.Backward + }) + + scale_op = main_block.append_op( + type='scale', + inputs={'X': grad_var}, + outputs={'Out': grad_var}, + attrs={ + 'scale': 1.0 / dp_degree, + OP_ROLE_KEY: OpRole.Backward + }) + + dims_mapping = ctx.get_tensor_distributed_attr_for_program( + grad_var).get_dims_mapping() + process_mesh = dist_attr.get_process_mesh() + for op in [allreduce_op, scale_op]: + op_attr = OperatorDistributedAttribute(op, ctx) + op_attr.set_process_mesh(process_mesh) + op_attr.set_output_dims_mapping(grad_var.name, + dims_mapping) + op_attr.set_input_dims_mapping(grad_var.name, + dims_mapping) + ctx.set_op_distributed_attr_for_program(op, op_attr) + + main_block._sync_with_cpp() + + +register_distributed_operator_impl( + "default", DistributedDefaultImpl0("replicate_parallel")) diff --git a/python/paddle/distributed/auto_parallel/operators/dist_embedding.py b/python/paddle/distributed/auto_parallel/operators/dist_embedding.py old mode 100644 new mode 100755 index 3f8fbf9cc3a7af..cd6d2255c81f13 --- a/python/paddle/distributed/auto_parallel/operators/dist_embedding.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_embedding.py @@ -24,12 +24,14 @@ from ..utils import 
compute_compatible_dim_mapping from ..utils import compute_compatible_dims_mapping from ..utils import compute_compatible_and_update_dim_mapping +from ..attribute import OperatorDistributedAttribute from paddle.fluid import core, unique_name from paddle.fluid.framework import in_dygraph_mode from paddle.fluid.framework import Program, Parameter, Variable, program_guard from paddle.fluid.data_feeder import check_variable_and_dtype, check_dtype +from paddle.distributed.fleet.meta_optimizers.common import OpRole, OP_ROLE_KEY, OP_ROLE_VAR_KEY from ..process import new_process_group -from ..utils import _get_comm_group +from ..utils import _get_comm_group, _get_idx_in_axis, _get_corresponding_rank class DistributedEmbedding(DistributedOperator): @@ -40,6 +42,7 @@ def __init__(self, name): register_distributed_operator("lookup_table_v2", DistributedEmbedding("embedding")) +register_distributed_operator("c_embedding", DistributedEmbedding("embedding")) # RowParallel @@ -48,7 +51,7 @@ def __init__(self, name): super(DistributedEmbeddingImpl, self).__init__() self._name = name self._forward_implemented = True - self._backward_implemented = False + self._backward_implemented = True def is_process_mesh_compatible(self, op_dist_attr): """ No restriction for now. """ @@ -102,127 +105,231 @@ def update_dims_mapping(self, op_dist_attr): return changed - def forward(self, serial_op): - def static_handle(dst_block, - src_op, - op_dist_attr, - input_name_mapping, - output_name_mapping, - rank_id=0): - assert len( - input_name_mapping - ) == 2, "row_parallel_embedding take 2 inputs variable but got {}".format( - input_name_mapping) - assert len( - output_name_mapping - ) == 1, "row_parallel_embedding take 2 inputs variable but got {}".format( - output_name_mapping) - assert len( - input_name_mapping['Ids'] - ) == 1, "row_parallel_embedding input Ids take 1 variable but got {}".format( - input_name_mapping['Ids']) - assert len( - input_name_mapping['W'] - ) == 1, "row_parallel_embedding input W take 1 variable but got {}".format( - input_name_mapping['W']) - assert len( - output_name_mapping['Out'] - ) == 1, "row_parallel_embedding input Out take 1 variable but got {}".format( - input_name_mapping['Out']) - - Ids_var = dst_block.var(input_name_mapping['Ids'][0]) - Weight_var = dst_block.var(input_name_mapping['W'][0]) - Out_var = dst_block.var(output_name_mapping['Out'][0]) - - # got dist attribute info - embedding_row_dim_mapping = op_dist_attr.get_input_dims_mapping( - Weight_var.name)[0] - process_mesh_shape = op_dist_attr.get_process_mesh().topology - process_mesh_group = op_dist_attr.get_process_mesh().process_group - - # caculate embedding offset - # TODO generalize here, using cartisian product to allow any dimensional mesh shape - mesh_shape = len(process_mesh_shape) - assert mesh_shape <= 2, "row_parallel_embedding only support 1 or 2 dimensional process mesh, but got {}".format( - process_mesh_shape) - num_partition = process_mesh_shape[embedding_row_dim_mapping] - # TODO generalize here, support any mesh group - model_parallel_axis, process_mesh = op_dist_attr.get_owner_context( - )._get_model_parallel_info() - if mesh_shape == 1: - if rank_id not in process_mesh_group: - assert len( - process_mesh.topology - ) == 2, " row_parallel_embedding process mapping only support 2 dimensional process mesh, \ - but got {}".format(len(process_mesh.topology)) - rank_id = process_mesh_group[ - process_mesh.process_group.index(rank_id) % - process_mesh_shape[0]] - relative_idx = process_mesh_group.index(rank_id) 
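The replacement `forward` that follows derives each rank's slice of the embedding table from the sharded row dimension. A worked sketch of that offset arithmetic with hypothetical sizes, assuming `c_embedding` returns zero rows for ids outside the local slice so the partial lookups can be merged by the trailing `c_allreduce_sum`:

    vocab_size, num_parts = 32, 4              # hypothetical sizes
    per_part_size = vocab_size // num_parts    # rows held per rank (Weight_var.shape[0])
    relative_idx = 2                           # this rank's index along the sharded mesh axis
    start_index = relative_idx * per_part_size # global row id of the local row 0
    # token id 19 falls in [16, 24), so only this rank produces a non-zero row for it
    assert start_index == 16 and start_index <= 19 < start_index + per_part_size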
+ @staticmethod + def forward(ctx, *args, **kwargs): + """ + kwargs: inputname_mapping & outputname_mapping + """ + + dist_op_helper = ctx.get_dist_op_helper() + main_block = dist_op_helper.get_dst_main_program().global_block() + startup_block = dist_op_helper.get_dst_startup_program().global_block() + src_op = dist_op_helper.get_cur_src_op() + rank_id = dist_op_helper.get_rank_id() + op_dist_attr = ctx.get_op_distributed_attr_for_program(src_op) + assert op_dist_attr is not None, "backward op [{}] don't have dist attribute !".format( + str(src_op)) + + # check validation of inputs / outputs + assert 'Ids' in kwargs, "input [{}] is not given".format('Ids') + assert 'W' in kwargs, "input [{}] is not given".format('W') + assert 'Out' in kwargs, "output [{}] is not given".format('Out') + + assert len( + kwargs['Ids'] + ) == 1, "row_parallel_embedding input Ids take 1 variable but got {}".format( + kwargs['Ids']) + assert len( + kwargs['W'] + ) == 1, "row_parallel_embedding input W take 1 variable but got {}".format( + kwargs['W']) + assert len( + kwargs['Out'] + ) == 1, "row_parallel_embedding output Out take 1 variable but got {}".format( + kwargs['Out']) + + Ids_var = main_block.var(kwargs['Ids'][0]) + Weight_var = main_block.var(kwargs['W'][0]) + Out_var = main_block.var(kwargs['Out'][0]) + + # got dist attribute info + embedding_row_dim_mapping = op_dist_attr.get_input_dims_mapping( + Weight_var.name)[0] + assert embedding_row_dim_mapping >= 0, "row_parallel_embedding's row should be divided by a specific mesh axis, but got [{}]".format( + embedding_row_dim_mapping) + process_mesh_shape = op_dist_attr.get_process_mesh().topology + process_mesh_group = op_dist_attr.get_process_mesh().process_group + + # FIXME (JZ-LIANG) Remove this hack to support any op mesh group for Pipeline Parallelism + if rank_id not in process_mesh_group: + rank_id = _get_corresponding_rank(op_dist_attr.get_process_mesh(), + rank_id) + + # A generalized method to caculate embedding offset using cartisian product + relative_idx = _get_idx_in_axis(process_mesh_group, process_mesh_shape, + embedding_row_dim_mapping, rank_id) + + per_part_size = Weight_var.shape[0] + relative_idx = relative_idx * per_part_size + + # TODO caculate ring id + parallel_axis = embedding_row_dim_mapping + group_ranks = _get_comm_group(process_mesh_group, process_mesh_shape, + parallel_axis, rank_id) + group = new_process_group(group_ranks) + + # append op + check_variable_and_dtype(Ids_var, 'input', ['int32', 'int64'], + 'c_embedding') + + intermediate_var_0 = main_block.create_var( + name=unique_name.generate_with_ignorable_key(".".join( + ["c_embedding", 'tmp'])), + dtype=Weight_var.dtype, + shape=Out_var.shape, + type=core.VarDesc.VarType.LOD_TENSOR, + persistable=False, + stop_gradient=Out_var.stop_gradient) + + # copy Out_var's dist_attr to intermediate_var_0's dist_attr + copy_distributed_attr_for_var(op_dist_attr, intermediate_var_0, Out_var) + + check_variable_and_dtype( + Out_var, 'tensor', + ['float16', 'float32', 'float64', 'int32', 'int64'], + 'c_allreduce_sum') + + c_embedding_op = main_block.append_op( + type='c_embedding', + inputs={'Ids': [Ids_var], + 'W': [Weight_var]}, + outputs={'Out': [intermediate_var_0]}, + attrs={"start_index": relative_idx}) + + # use_model_parallel + c_allreduce_sum_op = main_block.append_op( + type='c_allreduce_sum', + inputs={'X': [intermediate_var_0]}, + outputs={'Out': [Out_var]}, + attrs={ + 'ring_id': group.id, + 'use_calc_stream': True, + 'use_model_parallel': True, + }) + + # copy serial op's 
dist_attr to dist op's dist_attr + copy_distributed_attr_for_dist_op(c_embedding_op, main_block, + op_dist_attr) + copy_distributed_attr_for_dist_op(c_allreduce_sum_op, main_block, + op_dist_attr) + + # param initialization sync + assert Weight_var.name not in dist_op_helper.already_init_sync_vars + dist_op_helper.already_init_sync_vars.add(Weight_var.name) + param = startup_block.var(Weight_var.name) + param_dist_attr = ctx.get_tensor_distributed_attr_for_program(param) + process_mesh = param_dist_attr.get_process_mesh() + dim_mapping = param_dist_attr.get_dims_mapping() + + # NOTE all not splited axis should be presented in mesh + for axis, size in enumerate(process_mesh.topology): + if size <= 1 or axis in dim_mapping: + pass else: - relative_idx = rank_id % num_partition + group_ranks = _get_comm_group(process_mesh.process_group, + process_mesh.topology, axis, + rank_id) + sync_group = new_process_group(group_ranks) + + startup_block.append_op( + type='c_broadcast', + inputs={'X': param}, + outputs={'Out': param}, + attrs={ + 'ring_id': sync_group.id, + 'root': 0, + 'use_calc_stream': True, + OP_ROLE_KEY: OpRole.Forward + }) + startup_block._sync_with_cpp() + + @staticmethod + def backward(ctx, *args, **kwargs): + + # by now the backward function only insert the gradient allreduce for dist op itself + dist_op_helper = ctx.get_dist_op_helper() + main_block = dist_op_helper.get_dst_main_program().global_block() + backward_op = dist_op_helper.get_cur_src_op() + rank_id = dist_op_helper.get_rank_id() + dist_attr = ctx.get_op_distributed_attr_for_program(backward_op) + assert dist_attr is not None, "backward op [{}] don't have dist attribute !".format( + str(backward_op)) - per_part_size = Weight_var.shape[0] - relative_idx = relative_idx * per_part_size + # FIXME (JZ-LIANG) Remove this hack to support any op mesh group for Pipeline Parallelism + if rank_id not in dist_attr.get_process_mesh().process_group: + rank_id = _get_corresponding_rank(dist_attr.get_process_mesh(), + rank_id) + + # check if need gradient allreduce + need_gradient_allreduce = False + + assert 'Ids' in kwargs, "input [{}] is not given".format('Ids') + assert 'W' in kwargs, "input [{}] is not given".format('W') + assert 'Out@GRAD' in kwargs, "input [{}] is not given".format('Out') + assert 'W@GRAD' in kwargs, "output [{}] is not given".format('W@GRAD') + + assert len( + kwargs['Ids'] + ) == 1, "row_parallel_embedding input Ids take 1 variable but got {}".format( + kwargs['Ids']) + assert len( + kwargs['W'] + ) == 1, "row_parallel_embedding input Ids take 1 variable but got {}".format( + kwargs['W']) + assert len( + kwargs['Out@GRAD'] + ) == 1, "row_parallel_embedding input Ids take 1 variable but got {}".format( + kwargs['Out']) + assert len( + kwargs['W@GRAD'] + ) == 1, "row_parallel_embedding output Ids take 1 variable but got {}".format( + kwargs['W@GRAD']) + + Ids_var = main_block.var(kwargs['Ids'][0]) + process_mesh = dist_attr.get_process_mesh() + var_dim_mapping = dist_attr.get_input_dims_mapping(Ids_var.name) + mesh_shape = process_mesh.topology + batch_size_axis = var_dim_mapping[0] + if batch_size_axis > -1 and mesh_shape[batch_size_axis] > 1: + need_gradient_allreduce = True - # TODO caculate ring id group_ranks = _get_comm_group(process_mesh.process_group, process_mesh.topology, - model_parallel_axis, rank_id) - group = new_process_group(group_ranks) - - # append op - check_variable_and_dtype(Ids_var, 'input', ['int32', 'int64'], - 'c_embedding') - - intermediate_var_0 = dst_block.create_var( - 
name=unique_name.generate_with_ignorable_key(".".join( - ["c_embedding", 'tmp'])), - dtype=Weight_var.dtype, - shape=Out_var.shape, - type=core.VarDesc.VarType.LOD_TENSOR, - persistable=False, - stop_gradient=Out_var.stop_gradient) - # copy Out_var's dist_attr to intermediate_var_0's dist_attr - copy_distributed_attr_for_var(op_dist_attr, intermediate_var_0, - Out_var) - - check_variable_and_dtype( - Out_var, 'tensor', - ['float16', 'float32', 'float64', 'int32', 'int64'], - 'c_allreduce_sum') - - c_embedding_op = dst_block.append_op( - type='c_embedding', - inputs={'Ids': [Ids_var], - 'W': [Weight_var]}, - outputs={'Out': [intermediate_var_0]}, - attrs={"start_index": relative_idx}) - - # use_model_parallel - c_allreduce_sum_op = dst_block.append_op( + batch_size_axis, rank_id) + dp_degree = len(group_ranks) + dp_group = new_process_group(group_ranks) + + if need_gradient_allreduce: + W_Grad_var = main_block.var(kwargs['W@GRAD'][0]) + allreduce_op = main_block.append_op( type='c_allreduce_sum', - inputs={'X': [intermediate_var_0]}, - outputs={'Out': [Out_var]}, + inputs={'X': [W_Grad_var]}, + outputs={'Out': [W_Grad_var]}, attrs={ - 'ring_id': group.id, + 'ring_id': dp_group.id, 'use_calc_stream': True, - 'use_model_parallel': True, + OP_ROLE_KEY: OpRole.Backward }) + scale_op = main_block.append_op( + type='scale', + inputs={'X': W_Grad_var}, + outputs={'Out': W_Grad_var}, + attrs={'scale': 1.0 / dp_degree, + OP_ROLE_KEY: OpRole.Backward}) + main_block._sync_with_cpp() - # copy serial op's dist_attr to dist op's dist_attr - copy_distributed_attr_for_dist_op(c_embedding_op, dst_block, - op_dist_attr) - copy_distributed_attr_for_dist_op(c_allreduce_sum_op, dst_block, - op_dist_attr) - - if in_dygraph_mode(): - raise NotImplementedError( - "Dist op for [{}] with idx [{}] is NOT implemented yet.".format( - "matmul", 0)) - else: - return static_handle + dims_mapping = ctx.get_tensor_distributed_attr_for_program( + W_Grad_var).get_dims_mapping() + process_mesh = dist_attr.get_process_mesh() + for op in [allreduce_op, scale_op]: + op_attr = OperatorDistributedAttribute(op, ctx) + op_attr.set_process_mesh(process_mesh) + op_attr.set_output_dims_mapping(W_Grad_var.name, dims_mapping) + op_attr.set_input_dims_mapping(W_Grad_var.name, dims_mapping) + ctx.set_op_distributed_attr_for_program(op, op_attr) register_distributed_operator_impl("lookup_table_v2", DistributedEmbeddingImpl("row_parallel")) +register_distributed_operator_impl("c_embedding", + DistributedEmbeddingImpl("row_parallel")) diff --git a/python/paddle/distributed/auto_parallel/operators/dist_matmul.py b/python/paddle/distributed/auto_parallel/operators/dist_matmul.py index 10a01dc57ed2b9..2edbcd2318cdf7 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_matmul.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_matmul.py @@ -24,12 +24,14 @@ from ..utils import compute_compatible_dim_mapping from ..utils import compute_compatible_dims_mapping from ..utils import compute_compatible_and_update_dim_mapping +from ..attribute import OperatorDistributedAttribute from paddle.fluid import core, unique_name from paddle.fluid.framework import in_dygraph_mode from paddle.fluid.framework import Program, Parameter, Variable, program_guard from paddle.fluid.data_feeder import check_variable_and_dtype, check_dtype +from paddle.distributed.fleet.meta_optimizers.common import OpRole, OP_ROLE_KEY, OP_ROLE_VAR_KEY from ..process import new_process_group -from ..utils import _get_comm_group +from ..utils import 
_get_comm_group, _get_corresponding_rank def _update_dims_mapping_for_matmul(op_dist_attr): @@ -123,6 +125,130 @@ def _update_dims_mapping_for_matmul(op_dist_attr): return changed +def _right_operand_parameter_matmul_backward(ctx, *args, **kwargs): + + # by now the backward function only insert the gradient allreduce for dist op itself + + dist_op_helper = ctx.get_dist_op_helper() + main_block = dist_op_helper.get_dst_main_program().global_block() + backward_op = dist_op_helper.get_cur_src_op() + rank_id = dist_op_helper.get_rank_id() + dist_attr = ctx.get_op_distributed_attr_for_program(backward_op) + assert dist_attr is not None, "backward op [{}] don't have dist attribute !".format( + str(backward_op)) + + # FIXME (JZ-LIANG) Remove this hack to support any op mesh group for Pipeline Parallelism + if rank_id not in dist_attr.get_process_mesh().process_group: + rank_id = _get_corresponding_rank(dist_attr.get_process_mesh(), rank_id) + + # check if need gradient allreduce + need_gradient_allreduce = False + + assert 'Y' in kwargs, "input [{}] is not given".format('Y') + assert 'X' in kwargs, "input [{}] is not given".format('X') + assert 'Out@GRAD' in kwargs, "input [{}] is not given".format('Out@GRAD') + assert 'Y@GRAD' in kwargs, "output [{}] is not given".format('Y@GRAD') + assert 'X@GRAD' in kwargs, "output [{}] is not given".format('X@GRAD') + + assert len( + kwargs['Y'] + ) == 1, "row_parallel_embedding input Ids take 1 variable but got {}".format( + kwargs['Y']) + assert len( + kwargs['X'] + ) == 1, "row_parallel_embedding input Ids take 1 variable but got {}".format( + kwargs['X']) + assert len( + kwargs['Out@GRAD'] + ) == 1, "row_parallel_embedding input Ids take 1 variable but got {}".format( + kwargs['Out']) + assert len( + kwargs['Y@GRAD'] + ) == 1, "row_parallel_embedding output Ids take 1 variable but got {}".format( + kwargs['Y@GRAD']) + assert len( + kwargs['X@GRAD'] + ) == 1, "row_parallel_embedding output Ids take 1 variable but got {}".format( + kwargs['X@GRAD']) + + X_var = main_block.var(kwargs['X'][0]) + assert not X_var.is_parameter, "left operand(X) [{}] of dist matmul should not be parameter".format( + X_var.name) + + process_mesh = dist_attr.get_process_mesh() + var_dim_mapping = dist_attr.get_input_dims_mapping(X_var.name) + mesh_shape = process_mesh.topology + batch_size_axis = var_dim_mapping[0] + if batch_size_axis > -1 and mesh_shape[batch_size_axis] > 1: + need_gradient_allreduce = True + group_ranks = _get_comm_group(process_mesh.process_group, + process_mesh.topology, batch_size_axis, + rank_id) + dp_degree = len(group_ranks) + dp_group = new_process_group(group_ranks) + + Y_var = main_block.var(kwargs['Y'][0]) + if need_gradient_allreduce and Y_var.is_parameter: + Y_Grad_var = main_block.var(kwargs['Y@GRAD'][0]) + allreduce_op = main_block.append_op( + type='c_allreduce_sum', + inputs={'X': [Y_Grad_var]}, + outputs={'Out': [Y_Grad_var]}, + attrs={ + 'ring_id': dp_group.id, + 'use_calc_stream': True, + OP_ROLE_KEY: OpRole.Backward + }) + scale_op = main_block.append_op( + type='scale', + inputs={'X': Y_Grad_var}, + outputs={'Out': Y_Grad_var}, + attrs={'scale': 1.0 / dp_degree, + OP_ROLE_KEY: OpRole.Backward}) + main_block._sync_with_cpp() + + dims_mapping = ctx.get_tensor_distributed_attr_for_program( + Y_Grad_var).get_dims_mapping() + process_mesh = dist_attr.get_process_mesh() + for op in [allreduce_op, scale_op]: + op_attr = OperatorDistributedAttribute(op, ctx) + op_attr.set_process_mesh(process_mesh) + op_attr.set_output_dims_mapping(Y_Grad_var.name, 
dims_mapping) + op_attr.set_input_dims_mapping(Y_Grad_var.name, dims_mapping) + ctx.set_op_distributed_attr_for_program(op, op_attr) + + +def _init_param_sync(Weight_var, dist_op_helper, startup_block, ctx, rank_id): + + assert Weight_var.name not in dist_op_helper.already_init_sync_vars + assert startup_block.has_var(Weight_var.name) + dist_op_helper.already_init_sync_vars.add(Weight_var.name) + param = startup_block.var(Weight_var.name) + param_dist_attr = ctx.get_tensor_distributed_attr_for_program(param) + process_mesh = param_dist_attr.get_process_mesh() + dim_mapping = param_dist_attr.get_dims_mapping() + + for axis, size in enumerate(process_mesh.topology): + if size <= 1 or axis in dim_mapping: + pass + else: + group_ranks = _get_comm_group(process_mesh.process_group, + process_mesh.topology, axis, rank_id) + sync_group = new_process_group(group_ranks) + + startup_block.append_op( + type='c_broadcast', + inputs={'X': param}, + outputs={'Out': param}, + attrs={ + 'ring_id': sync_group.id, + 'root': 0, + 'use_calc_stream': True, + OP_ROLE_KEY: OpRole.Forward + }) + startup_block._sync_with_cpp() + + class DistributedMatmul(DistributedOperator): def __init__(self, name): super(DistributedMatmul, self).__init__() @@ -138,7 +264,7 @@ def __init__(self, name): super(DistributedMatmulImpl0, self).__init__() self._name = name self._forward_implemented = True - self._backward_implemented = False + self._backward_implemented = True def is_process_mesh_compatible(self, op_dist_attr): """ No restriction for now. """ @@ -178,101 +304,109 @@ def update_dims_mapping(self, op_dist_attr): changed = True return changed - def forward(self, serial_op): - def static_handle(dst_block, - src_op, - op_dist_attr, - input_name_mapping, - output_name_mapping, - rank_id=0): - assert len( - input_name_mapping - ) == 2, "col_parallel_linear take 2 inputs variable but got {}".format( - input_name_mapping) - assert len( - output_name_mapping - ) == 1, "col_parallel_linear take 2 inputs variable but got {}".format( - output_name_mapping) - assert len( - input_name_mapping['X'] - ) == 1, "col_parallel_linear input X take 1 variable but got {}".format( - input_name_mapping['X']) - assert len( - input_name_mapping['Y'] - ) == 1, "col_parallel_linear input Y take 1 variable but got {}".format( - input_name_mapping['Y']) - assert len( - output_name_mapping['Out'] - ) == 1, "col_parallel_linear input Out take 1 variable but got {}".format( - input_name_mapping['Out']) - X_var = dst_block.var(input_name_mapping['X'][0]) - Weight_var = dst_block.var(input_name_mapping['Y'][0]) - Out_var = dst_block.var(output_name_mapping['Out'][0]) - - # TODO infer logic comm presentation - model_parallel_axis, process_mesh = op_dist_attr.get_owner_context( - )._get_model_parallel_info() - group_ranks = _get_comm_group(process_mesh.process_group, - process_mesh.topology, - model_parallel_axis, rank_id) - group = new_process_group(group_ranks) - - intermediate_var_0 = dst_block.create_var( - name=unique_name.generate_with_ignorable_key(".".join( - ["c_identity", 'tmp'])), - dtype=X_var.dtype, - shape=X_var.shape, - type=core.VarDesc.VarType.LOD_TENSOR, - persistable=False, - stop_gradient=X_var.stop_gradient) - # copy X_var's dist_attr to intermediate_var_0's dist_attr - copy_distributed_attr_for_var(op_dist_attr, intermediate_var_0, - X_var) - - check_variable_and_dtype( - X_var, 'tensor', - ['float16', 'float32', 'float64', 'int32', 'int64'], - '_c_identity') - - c_identity_op = dst_block.append_op( - type='c_identity', - inputs={'X': 
[X_var]}, - outputs={'Out': intermediate_var_0}, - attrs={ - 'ring_id': group.id, - 'use_calc_stream': True, - 'use_model_parallel': True, - }) - - check_variable_and_dtype(intermediate_var_0, 'x', - ['float16', 'float32', 'float64'], - 'linear') - check_dtype(intermediate_var_0.dtype, 'dtype', - ['float16', 'float32', 'float64'], 'linear') - attrs = { - 'transpose_X': False, - 'transpose_Y': False, - 'alpha': 1, - } - inputs = {'X': [intermediate_var_0], 'Y': [Weight_var]} - matmul_op = dst_block.append_op( - type='matmul', - inputs=inputs, - outputs={'Out': Out_var}, - attrs=attrs) - - # copy serial op's dist_attr to dist op's dist_attr - copy_distributed_attr_for_dist_op(c_identity_op, dst_block, - op_dist_attr) - copy_distributed_attr_for_dist_op(matmul_op, dst_block, - op_dist_attr) - - if in_dygraph_mode(): - raise NotImplementedError( - "Dist op for [{}] with idx [{}] is NOT implemented yet.".format( - "matmul", 0)) - else: - return static_handle + @staticmethod + def forward(ctx, *args, **kwargs): + """ + kwargs: inputname_mapping & outputname_mapping + """ + + dist_op_helper = ctx.get_dist_op_helper() + main_block = dist_op_helper.get_dst_main_program().global_block() + startup_block = dist_op_helper.get_dst_startup_program().global_block() + src_op = dist_op_helper.get_cur_src_op() + rank_id = dist_op_helper.get_rank_id() + op_dist_attr = ctx.get_op_distributed_attr_for_program(src_op) + assert op_dist_attr is not None, "backward op [{}] don't have dist attribute !".format( + str(src_op)) + + # FIXME (JZ-LIANG) Remove this hack to support any op mesh group for Pipeline Parallelism + if rank_id not in op_dist_attr.get_process_mesh().process_group: + rank_id = _get_corresponding_rank(op_dist_attr.get_process_mesh(), + rank_id) + + # check validation of inputs / outputs + for input_name in src_op.desc.input_names(): + assert input_name in kwargs, "input [{}] is not given".format( + input_name) + assert len(kwargs[input_name]) == len( + src_op.desc.input(input_name) + ), "number of tensor for input [{}] is not match".format(input_name) + for output_name in src_op.desc.output_names(): + assert output_name in kwargs, "input [{}] is not given".format( + output_name) + assert len(kwargs[output_name]) == len( + src_op.desc.output(output_name) + ), "number of tensor for input [{}] is not match".format( + output_name) + + X_var = main_block.var(kwargs['X'][0]) + Weight_var = main_block.var(kwargs['Y'][0]) + Out_var = main_block.var(kwargs['Out'][0]) + + # TODO infer logic comm presentation + matmul_col_dim_mapping = op_dist_attr.get_input_dims_mapping( + Weight_var.name)[1] + assert matmul_col_dim_mapping >= 0, "col_parallel_matmul's row should be divided by a specific mesh axis, but got [{}]".format( + matmul_col_dim_mapping) + process_mesh_shape = op_dist_attr.get_process_mesh().topology + process_mesh_group = op_dist_attr.get_process_mesh().process_group + + parallel_axis = matmul_col_dim_mapping + group_ranks = _get_comm_group(process_mesh_group, process_mesh_shape, + parallel_axis, rank_id) + group = new_process_group(group_ranks) + + intermediate_var_0 = main_block.create_var( + name=unique_name.generate_with_ignorable_key(".".join( + ["c_identity", 'tmp'])), + dtype=X_var.dtype, + shape=X_var.shape, + type=core.VarDesc.VarType.LOD_TENSOR, + persistable=False, + stop_gradient=X_var.stop_gradient) + # copy X_var's dist_attr to intermediate_var_0's dist_attr + copy_distributed_attr_for_var(op_dist_attr, intermediate_var_0, X_var) + + check_variable_and_dtype( + X_var, 'tensor', + 
['float16', 'float32', 'float64', 'int32', 'int64'], '_c_identity') + + c_identity_op = main_block.append_op( + type='c_identity', + inputs={'X': [X_var]}, + outputs={'Out': intermediate_var_0}, + attrs={ + 'ring_id': group.id, + 'use_calc_stream': True, + 'use_model_parallel': True, + }) + + check_variable_and_dtype(intermediate_var_0, 'x', + ['float16', 'float32', 'float64'], 'linear') + check_dtype(intermediate_var_0.dtype, 'dtype', + ['float16', 'float32', 'float64'], 'linear') + attrs = { + 'transpose_X': False, + 'transpose_Y': False, + 'alpha': 1, + } + inputs = {'X': [intermediate_var_0], 'Y': [Weight_var]} + matmul_op = main_block.append_op( + type='matmul', inputs=inputs, outputs={'Out': Out_var}, attrs=attrs) + + # copy serial op's dist_attr to dist op's dist_attr + copy_distributed_attr_for_dist_op(c_identity_op, main_block, + op_dist_attr) + copy_distributed_attr_for_dist_op(matmul_op, main_block, op_dist_attr) + + # init param sync + if Weight_var.is_parameter: + _init_param_sync(Weight_var, dist_op_helper, startup_block, ctx, + rank_id) + + @staticmethod + def backward(ctx, *args, **kwargs): + _right_operand_parameter_matmul_backward(ctx, *args, **kwargs) # RowParallel @@ -281,7 +415,7 @@ def __init__(self, name): super(DistributedMatmulImpl1, self).__init__() self._name = name self._forward_implemented = True - self._backward_implemented = False + self._backward_implemented = True def is_process_mesh_compatible(self, op_dist_attr): """ No restriction for now. """ @@ -323,95 +457,108 @@ def update_dims_mapping(self, op_dist_attr): changed = True return changed - def forward(self, serial_op): - def static_handle(dst_block, - src_op, - op_dist_attr, - input_name_mapping, - output_name_mapping, - rank_id=0): - assert len( - input_name_mapping - ) == 2, "col_parallel_linear take 2 inputs variable but got {}".format( - input_name_mapping) - assert len( - output_name_mapping - ) == 1, "col_parallel_linear take 2 inputs variable but got {}".format( - output_name_mapping) - assert len( - input_name_mapping['X'] - ) == 1, "col_parallel_linear input X take 1 variable but got {}".format( - input_name_mapping['X']) - assert len( - input_name_mapping['Y'] - ) == 1, "col_parallel_linear input Y take 1 variable but got {}".format( - input_name_mapping['Y']) - assert len( - output_name_mapping['Out'] - ) == 1, "col_parallel_linear input Out take 1 variable but got {}".format( - input_name_mapping['Out']) - X_var = dst_block.var(input_name_mapping['X'][0]) - Weight_var = dst_block.var(input_name_mapping['Y'][0]) - Out_var = dst_block.var(output_name_mapping['Out'][0]) - - # TODO infer logic comm presentation - model_parallel_axis, process_mesh = op_dist_attr.get_owner_context( - )._get_model_parallel_info() - group_ranks = _get_comm_group(process_mesh.process_group, - process_mesh.topology, - model_parallel_axis, rank_id) - group = new_process_group(group_ranks) - - check_variable_and_dtype( - X_var, 'x', ['float16', 'float32', 'float64'], 'linear') - check_dtype(X_var.dtype, 'dtype', - ['float16', 'float32', 'float64'], 'linear') - attrs = { - 'transpose_X': False, - 'transpose_Y': False, - 'alpha': 1, - } - inputs = {'X': X_var, 'Y': Weight_var} - intermediate_var_0 = dst_block.create_var( - shape=Out_var.shape, - dtype=Out_var.dtype, - type=Out_var.type, - lod_level=Out_var.lod_level, - persistable=False, - is_data=False, - need_check_feed=Out_var.desc.need_check_feed()) - # copy Out_var's dist_attr to intermediate_var_0's dist_attr - copy_distributed_attr_for_var(op_dist_attr, 
intermediate_var_0, - Out_var) - - matmul_op = dst_block.append_op( - type='matmul', - inputs=inputs, - outputs={'Out': intermediate_var_0}, - attrs=attrs) - - c_allreduce_sum_op = dst_block.append_op( - type='c_allreduce_sum', - inputs={'X': intermediate_var_0}, - outputs={'Out': Out_var}, - attrs={ - 'ring_id': group.id, - 'use_calc_stream': True, - 'use_model_parallel': True - }) - - # copy serial op's dist_attr to dist op's dist_attr - copy_distributed_attr_for_dist_op(matmul_op, dst_block, - op_dist_attr) - copy_distributed_attr_for_dist_op(c_allreduce_sum_op, dst_block, - op_dist_attr) - - if in_dygraph_mode(): - raise NotImplementedError( - "Dist op for [{}] with idx [{}] is NOT implemented yet.".format( - "matmul", 0)) - else: - return static_handle + @staticmethod + def forward(ctx, *args, **kwargs): + """ + kwargs: inputname_mapping & outputname_mapping + """ + + dist_op_helper = ctx.get_dist_op_helper() + main_block = dist_op_helper.get_dst_main_program().global_block() + startup_block = dist_op_helper.get_dst_startup_program().global_block() + src_op = dist_op_helper.get_cur_src_op() + rank_id = dist_op_helper.get_rank_id() + op_dist_attr = ctx.get_op_distributed_attr_for_program(src_op) + assert op_dist_attr is not None, "backward op [{}] don't have dist attribute !".format( + str(src_op)) + + # FIXME (JZ-LIANG) Remove this hack to support any op mesh group for Pipeline Parallelism + if rank_id not in op_dist_attr.get_process_mesh().process_group: + rank_id = _get_corresponding_rank(op_dist_attr.get_process_mesh(), + rank_id) + + # check validation of inputs / outputs + for input_name in src_op.desc.input_names(): + assert input_name in kwargs, "input [{}] is not given".format( + input_name) + assert len(kwargs[input_name]) == len( + src_op.desc.input(input_name) + ), "number of tensor for input [{}] is not match".format(input_name) + for output_name in src_op.desc.output_names(): + assert output_name in kwargs, "input [{}] is not given".format( + output_name) + assert len(kwargs[output_name]) == len( + src_op.desc.output(output_name) + ), "number of tensor for input [{}] is not match".format( + output_name) + + X_var = main_block.var(kwargs['X'][0]) + Weight_var = main_block.var(kwargs['Y'][0]) + Out_var = main_block.var(kwargs['Out'][0]) + + # TODO infer logic comm presentation + matmul_row_dim_mapping = op_dist_attr.get_input_dims_mapping( + Weight_var.name)[0] + assert matmul_row_dim_mapping >= 0, "row_parallel_matmul's row should be divided by a specific mesh axis, but got [{}]".format( + matmul_row_dim_mapping) + process_mesh_shape = op_dist_attr.get_process_mesh().topology + process_mesh_group = op_dist_attr.get_process_mesh().process_group + + parallel_axis = matmul_row_dim_mapping + group_ranks = _get_comm_group(process_mesh_group, process_mesh_shape, + parallel_axis, rank_id) + group = new_process_group(group_ranks) + + check_variable_and_dtype(X_var, 'x', ['float16', 'float32', 'float64'], + 'linear') + check_dtype(X_var.dtype, 'dtype', ['float16', 'float32', 'float64'], + 'linear') + attrs = { + 'transpose_X': False, + 'transpose_Y': False, + 'alpha': 1, + } + inputs = {'X': X_var, 'Y': Weight_var} + intermediate_var_0 = main_block.create_var( + shape=Out_var.shape, + dtype=Out_var.dtype, + type=Out_var.type, + lod_level=Out_var.lod_level, + persistable=False, + is_data=False, + need_check_feed=Out_var.desc.need_check_feed()) + # copy Out_var's dist_attr to intermediate_var_0's dist_attr + copy_distributed_attr_for_var(op_dist_attr, intermediate_var_0, Out_var) + + 
matmul_op = main_block.append_op( + type='matmul', + inputs=inputs, + outputs={'Out': intermediate_var_0}, + attrs=attrs) + + c_allreduce_sum_op = main_block.append_op( + type='c_allreduce_sum', + inputs={'X': intermediate_var_0}, + outputs={'Out': Out_var}, + attrs={ + 'ring_id': group.id, + 'use_calc_stream': True, + 'use_model_parallel': True + }) + + # copy serial op's dist_attr to dist op's dist_attr + copy_distributed_attr_for_dist_op(matmul_op, main_block, op_dist_attr) + copy_distributed_attr_for_dist_op(c_allreduce_sum_op, main_block, + op_dist_attr) + + # init param sync + if Weight_var.is_parameter: + _init_param_sync(Weight_var, dist_op_helper, startup_block, ctx, + rank_id) + + @staticmethod + def backward(ctx, *args, **kwargs): + _right_operand_parameter_matmul_backward(ctx, *args, **kwargs) # ReplicateParallel @@ -465,6 +612,10 @@ def update_dims_mapping(self, op_dist_attr): changed = True return changed + @staticmethod + def backward(ctx, *args, **kwargs): + _right_operand_parameter_matmul_backward(ctx, *args, **kwargs) + register_distributed_operator_impl("matmul", DistributedMatmulImpl0("column_parallel")) @@ -489,7 +640,7 @@ def __init__(self, name): super(DistributedMatmulV2Impl0, self).__init__() self._name = name self._forward_implemented = True - self._backward_implemented = False + self._backward_implemented = True def is_process_mesh_compatible(self, op_dist_attr): """ No restriction for now. """ @@ -529,97 +680,109 @@ def update_dims_mapping(self, op_dist_attr): changed = True return changed - def forward(self, serial_op): - def static_handle(dst_block, - src_op, - op_dist_attr, - input_name_mapping, - output_name_mapping, - rank_id=0): - assert len( - input_name_mapping - ) == 2, "col_parallel_linear take 2 inputs variable but got {}".format( - input_name_mapping) - assert len( - output_name_mapping - ) == 1, "col_parallel_linear take 2 inputs variable but got {}".format( - output_name_mapping) - assert len( - input_name_mapping['X'] - ) == 1, "col_parallel_linear input X take 1 variable but got {}".format( - input_name_mapping['X']) - assert len( - input_name_mapping['Y'] - ) == 1, "col_parallel_linear input Y take 1 variable but got {}".format( - input_name_mapping['Y']) - assert len( - output_name_mapping['Out'] - ) == 1, "col_parallel_linear input Out take 1 variable but got {}".format( - input_name_mapping['Out']) - X_var = dst_block.var(input_name_mapping['X'][0]) - Weight_var = dst_block.var(input_name_mapping['Y'][0]) - Out_var = dst_block.var(output_name_mapping['Out'][0]) - - # TODO infer logic comm presentation - model_parallel_axis, process_mesh = op_dist_attr.get_owner_context( - )._get_model_parallel_info() - group_ranks = _get_comm_group(process_mesh.process_group, - process_mesh.topology, - model_parallel_axis, rank_id) - group = new_process_group(group_ranks) - - intermediate_var_0 = dst_block.create_var( - name=unique_name.generate_with_ignorable_key(".".join( - ["c_identity", 'tmp'])), - dtype=X_var.dtype, - shape=X_var.shape, - type=core.VarDesc.VarType.LOD_TENSOR, - persistable=False, - stop_gradient=X_var.stop_gradient) - # copy X_var's dist_attr to intermediate_var_0's dist_attr - copy_distributed_attr_for_var(op_dist_attr, intermediate_var_0, - X_var) - - check_variable_and_dtype( - X_var, 'tensor', - ['float16', 'float32', 'float64', 'int32', 'int64'], - '_c_identity') - - c_identity_op = dst_block.append_op( - type='c_identity', - inputs={'X': [X_var]}, - outputs={'Out': intermediate_var_0}, - attrs={ - 'ring_id': group.id, - 
'use_calc_stream': True, - 'use_model_parallel': True, - }) - - check_variable_and_dtype(intermediate_var_0, 'x', - ['float16', 'float32', 'float64'], - 'linear') - check_dtype(intermediate_var_0.dtype, 'dtype', - ['float16', 'float32', 'float64'], 'linear') - attrs = {'trans_x': False, 'trans_y': False} - inputs = {'X': [intermediate_var_0], 'Y': [Weight_var]} - matmul_v2_op = dst_block.append_op( - type='matmul_v2', - inputs=inputs, - outputs={'Out': Out_var}, - attrs=attrs) - - # copy serial op's dist_attr to dist op's dist_attr - copy_distributed_attr_for_dist_op(c_identity_op, dst_block, - op_dist_attr) - copy_distributed_attr_for_dist_op(matmul_v2_op, dst_block, - op_dist_attr) - - if in_dygraph_mode(): - raise NotImplementedError( - "Dist op for [{}] with idx [{}] is NOT implemented yet.".format( - "matmul", 0)) - else: - return static_handle + @staticmethod + def forward(ctx, *args, **kwargs): + """ + kwargs: inputname_mapping & outputname_mapping + """ + + dist_op_helper = ctx.get_dist_op_helper() + main_block = dist_op_helper.get_dst_main_program().global_block() + startup_block = dist_op_helper.get_dst_startup_program().global_block() + src_op = dist_op_helper.get_cur_src_op() + rank_id = dist_op_helper.get_rank_id() + op_dist_attr = ctx.get_op_distributed_attr_for_program(src_op) + assert op_dist_attr is not None, "backward op [{}] don't have dist attribute !".format( + str(src_op)) + + # FIXME (JZ-LIANG) Remove this hack to support any op mesh group for Pipeline Parallelism + if rank_id not in op_dist_attr.get_process_mesh().process_group: + rank_id = _get_corresponding_rank(op_dist_attr.get_process_mesh(), + rank_id) + + # check validation of inputs / outputs + for input_name in src_op.desc.input_names(): + assert input_name in kwargs, "input [{}] is not given".format( + input_name) + assert len(kwargs[input_name]) == len( + src_op.desc.input(input_name) + ), "number of tensor for input [{}] is not match".format(input_name) + for output_name in src_op.desc.output_names(): + assert output_name in kwargs, "input [{}] is not given".format( + output_name) + assert len(kwargs[output_name]) == len( + src_op.desc.output(output_name) + ), "number of tensor for input [{}] is not match".format( + output_name) + + X_var = main_block.var(kwargs['X'][0]) + Weight_var = main_block.var(kwargs['Y'][0]) + Out_var = main_block.var(kwargs['Out'][0]) + + # TODO infer logic comm presentation + matmul_col_dim_mapping = op_dist_attr.get_input_dims_mapping( + Weight_var.name)[1] + assert matmul_col_dim_mapping >= 0, "col_parallel_matmul's row should be divided by a specific mesh axis, but got [{}]".format( + matmul_col_dim_mapping) + process_mesh_shape = op_dist_attr.get_process_mesh().topology + process_mesh_group = op_dist_attr.get_process_mesh().process_group + + parallel_axis = matmul_col_dim_mapping + group_ranks = _get_comm_group(process_mesh_group, process_mesh_shape, + parallel_axis, rank_id) + group = new_process_group(group_ranks) + + intermediate_var_0 = main_block.create_var( + name=unique_name.generate_with_ignorable_key(".".join( + ["c_identity", 'tmp'])), + dtype=X_var.dtype, + shape=X_var.shape, + type=core.VarDesc.VarType.LOD_TENSOR, + persistable=False, + stop_gradient=X_var.stop_gradient) + # copy X_var's dist_attr to intermediate_var_0's dist_attr + copy_distributed_attr_for_var(op_dist_attr, intermediate_var_0, X_var) + + check_variable_and_dtype( + X_var, 'tensor', + ['float16', 'float32', 'float64', 'int32', 'int64'], '_c_identity') + + c_identity_op = main_block.append_op( + 
type='c_identity', + inputs={'X': [X_var]}, + outputs={'Out': intermediate_var_0}, + attrs={ + 'ring_id': group.id, + 'use_calc_stream': True, + 'use_model_parallel': True, + }) + + check_variable_and_dtype(intermediate_var_0, 'x', + ['float16', 'float32', 'float64'], 'linear') + check_dtype(intermediate_var_0.dtype, 'dtype', + ['float16', 'float32', 'float64'], 'linear') + attrs = {'trans_x': False, 'trans_y': False} + inputs = {'X': [intermediate_var_0], 'Y': [Weight_var]} + matmul_v2_op = main_block.append_op( + type='matmul_v2', + inputs=inputs, + outputs={'Out': Out_var}, + attrs=attrs) + + # copy serial op's dist_attr to dist op's dist_attr + copy_distributed_attr_for_dist_op(c_identity_op, main_block, + op_dist_attr) + copy_distributed_attr_for_dist_op(matmul_v2_op, main_block, + op_dist_attr) + + # init param sync + if Weight_var.is_parameter: + _init_param_sync(Weight_var, dist_op_helper, startup_block, ctx, + rank_id) + + @staticmethod + def backward(ctx, *args, **kwargs): + _right_operand_parameter_matmul_backward(ctx, *args, **kwargs) # RowParallel @@ -628,7 +791,7 @@ def __init__(self, name): super(DistributedMatmulV2Impl1, self).__init__() self._name = name self._forward_implemented = True - self._backward_implemented = False + self._backward_implemented = True def is_process_mesh_compatible(self, op_dist_attr): """ No restriction for now. """ @@ -670,91 +833,105 @@ def update_dims_mapping(self, op_dist_attr): changed = True return changed - def forward(self, serial_op): - def static_handle(dst_block, - src_op, - op_dist_attr, - input_name_mapping, - output_name_mapping, - rank_id=0): - assert len( - input_name_mapping - ) == 2, "col_parallel_linear take 2 inputs variable but got {}".format( - input_name_mapping) - assert len( - output_name_mapping - ) == 1, "col_parallel_linear take 2 inputs variable but got {}".format( - output_name_mapping) - assert len( - input_name_mapping['X'] - ) == 1, "col_parallel_linear input X take 1 variable but got {}".format( - input_name_mapping['X']) - assert len( - input_name_mapping['Y'] - ) == 1, "col_parallel_linear input Y take 1 variable but got {}".format( - input_name_mapping['Y']) - assert len( - output_name_mapping['Out'] - ) == 1, "col_parallel_linear input Out take 1 variable but got {}".format( - input_name_mapping['Out']) - X_var = dst_block.var(input_name_mapping['X'][0]) - Weight_var = dst_block.var(input_name_mapping['Y'][0]) - Out_var = dst_block.var(output_name_mapping['Out'][0]) - - # TODO infer logic comm presentation - model_parallel_axis, process_mesh = op_dist_attr.get_owner_context( - )._get_model_parallel_info() - group_ranks = _get_comm_group(process_mesh.process_group, - process_mesh.topology, - model_parallel_axis, rank_id) - group = new_process_group(group_ranks) - - check_variable_and_dtype( - X_var, 'x', ['float16', 'float32', 'float64'], 'linear') - check_dtype(X_var.dtype, 'dtype', - ['float16', 'float32', 'float64'], 'linear') - attrs = {'trans_x': False, 'trans_y': False} - inputs = {'X': X_var, 'Y': Weight_var} - intermediate_var_0 = dst_block.create_var( - shape=Out_var.shape, - dtype=Out_var.dtype, - type=Out_var.type, - lod_level=Out_var.lod_level, - persistable=False, - is_data=False, - need_check_feed=Out_var.desc.need_check_feed()) - # copy Out_var's dist_attr to intermediate_var_0's dist_attr - copy_distributed_attr_for_var(op_dist_attr, intermediate_var_0, - Out_var) - - matmul_v2_op = dst_block.append_op( - type='matmul_v2', - inputs=inputs, - outputs={'Out': intermediate_var_0}, - attrs=attrs) - - 
c_allreduce_sum_op = dst_block.append_op( - type='c_allreduce_sum', - inputs={'X': intermediate_var_0}, - outputs={'Out': Out_var}, - attrs={ - 'ring_id': group.id, - 'use_calc_stream': True, - 'use_model_parallel': True - }) - - # copy serial op's dist_attr to dist op's dist_attr - copy_distributed_attr_for_dist_op(matmul_v2_op, dst_block, - op_dist_attr) - copy_distributed_attr_for_dist_op(c_allreduce_sum_op, dst_block, - op_dist_attr) - - if in_dygraph_mode(): - raise NotImplementedError( - "Dist op for [{}] with idx [{}] is NOT implemented yet.".format( - "matmul", 0)) - else: - return static_handle + @staticmethod + def forward(ctx, *args, **kwargs): + """ + kwargs: inputname_mapping & outputname_mapping + """ + + dist_op_helper = ctx.get_dist_op_helper() + main_block = dist_op_helper.get_dst_main_program().global_block() + startup_block = dist_op_helper.get_dst_startup_program().global_block() + src_op = dist_op_helper.get_cur_src_op() + rank_id = dist_op_helper.get_rank_id() + op_dist_attr = ctx.get_op_distributed_attr_for_program(src_op) + assert op_dist_attr is not None, "backward op [{}] don't have dist attribute !".format( + str(src_op)) + + # FIXME (JZ-LIANG) Remove this hack to support any op mesh group for Pipeline Parallelism + if rank_id not in op_dist_attr.get_process_mesh().process_group: + rank_id = _get_corresponding_rank(op_dist_attr.get_process_mesh(), + rank_id) + + # check validation of inputs / outputs + for input_name in src_op.desc.input_names(): + assert input_name in kwargs, "input [{}] is not given".format( + input_name) + assert len(kwargs[input_name]) == len( + src_op.desc.input(input_name) + ), "number of tensor for input [{}] is not match".format(input_name) + for output_name in src_op.desc.output_names(): + assert output_name in kwargs, "input [{}] is not given".format( + output_name) + assert len(kwargs[output_name]) == len( + src_op.desc.output(output_name) + ), "number of tensor for input [{}] is not match".format( + output_name) + + X_var = main_block.var(kwargs['X'][0]) + Weight_var = main_block.var(kwargs['Y'][0]) + Out_var = main_block.var(kwargs['Out'][0]) + + # TODO infer logic comm presentation + matmul_row_dim_mapping = op_dist_attr.get_input_dims_mapping( + Weight_var.name)[0] + assert matmul_row_dim_mapping >= 0, "row_parallel_matmul's row should be divided by a specific mesh axis, but got [{}]".format( + matmul_row_dim_mapping) + process_mesh_shape = op_dist_attr.get_process_mesh().topology + process_mesh_group = op_dist_attr.get_process_mesh().process_group + + parallel_axis = matmul_row_dim_mapping + group_ranks = _get_comm_group(process_mesh_group, process_mesh_shape, + parallel_axis, rank_id) + group = new_process_group(group_ranks) + + check_variable_and_dtype(X_var, 'x', ['float16', 'float32', 'float64'], + 'linear') + check_dtype(X_var.dtype, 'dtype', ['float16', 'float32', 'float64'], + 'linear') + attrs = {'trans_x': False, 'trans_y': False} + inputs = {'X': X_var, 'Y': Weight_var} + intermediate_var_0 = main_block.create_var( + shape=Out_var.shape, + dtype=Out_var.dtype, + type=Out_var.type, + lod_level=Out_var.lod_level, + persistable=False, + is_data=False, + need_check_feed=Out_var.desc.need_check_feed()) + # copy Out_var's dist_attr to intermediate_var_0's dist_attr + copy_distributed_attr_for_var(op_dist_attr, intermediate_var_0, Out_var) + + matmul_v2_op = main_block.append_op( + type='matmul_v2', + inputs=inputs, + outputs={'Out': intermediate_var_0}, + attrs=attrs) + + c_allreduce_sum_op = main_block.append_op( + 
type='c_allreduce_sum', + inputs={'X': intermediate_var_0}, + outputs={'Out': Out_var}, + attrs={ + 'ring_id': group.id, + 'use_calc_stream': True, + 'use_model_parallel': True + }) + + # copy serial op's dist_attr to dist op's dist_attr + copy_distributed_attr_for_dist_op(matmul_v2_op, main_block, + op_dist_attr) + copy_distributed_attr_for_dist_op(c_allreduce_sum_op, main_block, + op_dist_attr) + + # init param sync + if Weight_var.is_parameter: + _init_param_sync(Weight_var, dist_op_helper, startup_block, ctx, + rank_id) + + @staticmethod + def backward(ctx, *args, **kwargs): + _right_operand_parameter_matmul_backward(ctx, *args, **kwargs) # ReplicateParallel @@ -808,6 +985,10 @@ def update_dims_mapping(self, op_dist_attr): changed = True return changed + @staticmethod + def backward(ctx, *args, **kwargs): + _right_operand_parameter_matmul_backward(ctx, *args, **kwargs) + register_distributed_operator_impl("matmul_v2", DistributedMatmulV2Impl0("column_parallel")) diff --git a/python/paddle/distributed/auto_parallel/operators/dist_reshape.py b/python/paddle/distributed/auto_parallel/operators/dist_reshape.py index e7fbe9cfebad83..39e97850b8656b 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_reshape.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_reshape.py @@ -42,7 +42,7 @@ def __init__(self, name): super(DistributedReshapeImpl0, self).__init__() self._name = name self._forward_implemented = True - self._backward_implemented = False + self._backward_implemented = True def is_process_mesh_compatible(self, op_dist_attr): """ No restriction for now. """ @@ -97,82 +97,72 @@ def update_dims_mapping(self, op_dist_attr): return changed - def forward(self, serial_op): - def static_handle(dst_block, - src_op, - op_dist_attr, - input_name_mapping, - output_name_mapping, - rank_id=0): - assert len( - input_name_mapping - ) == 3, "Dist op of Reshape take 3 inputs variable but got {}".format( - input_name_mapping) - assert len( - output_name_mapping - ) == 2, "Dist op of Reshape take 2 inputs variable but got {}".format( - output_name_mapping) - assert len( - input_name_mapping['X'] - ) == 1, "Dist op of Reshape input X take 1 variable but got {}".format( - input_name_mapping['X']) - assert len( - input_name_mapping['ShapeTensor'] - ) <= 1, "Dist op of Reshape input ShapeTensor take 0 or 1 variable but got {}".format( - input_name_mapping['ShapeTensor']) - assert len( - input_name_mapping['Shape'] - ) <= 1, "Dist op of Reshape input Shape take 0 or 1 variable but got {}".format( - input_name_mapping['Shape']) - assert len( - output_name_mapping['Out'] - ) == 1, "Dist op of Reshape input Out take 1 variable but got {}".format( - input_name_mapping['Out']) - assert len( - output_name_mapping['XShape'] - ) == 1, "Dist op of Reshape input XShape take 1 variable but got {}".format( - input_name_mapping['XShape']) - - X_var = dst_block.var(input_name_mapping['X'][0]) - Out_var = dst_block.var(output_name_mapping['Out'][0]) - XShape_var = dst_block.var(output_name_mapping['XShape'][0]) - shape_list = src_op.desc.attr("shape") - ShapeTensor_var_list = [] - for name in input_name_mapping['ShapeTensor']: - ShapeTensor_var_list.append(name) - Shape_var_list = [] - for name in input_name_mapping['Shape']: - Shape_var_list.append(name) - - # got dist attribute info - dim_mapping = op_dist_attr.get_output_dims_mapping(Out_var.name) - process_mesh_shape = op_dist_attr.get_process_mesh().topology - - # modify target shape - for idx, axis in enumerate(dim_mapping): - if axis >= 0: 
- if len(shape_list) > idx: - shape_list[idx] = shape_list[idx] // process_mesh_shape[ - axis] - - # create op - new_op_desc = dst_block.desc.append_op() - new_op_desc.copy_from(src_op.desc) - new_op_desc.set_input('ShapeTensor', ShapeTensor_var_list) - new_op_desc.set_input('Shape', Shape_var_list) - new_op_desc.set_input('X', [X_var.name]) - new_op_desc.set_output('XShape', [XShape_var.name]) - new_op_desc.set_output('Out', [Out_var.name]) - new_op_desc._set_attr('shape', shape_list) - - dst_block._sync_with_cpp() - - if in_dygraph_mode(): - raise NotImplementedError( - "Dist op for [{}] with idx [{}] is NOT implemented yet.".format( - "matmul", 0)) - else: - return static_handle + @staticmethod + def forward(ctx, *args, **kwargs): + """ + kwargs: inputname_mapping & outputname_mapping + """ + + dist_op_helper = ctx.get_dist_op_helper() + main_block = dist_op_helper.get_dst_main_program().global_block() + src_op = dist_op_helper.get_cur_src_op() + rank_id = dist_op_helper.get_rank_id() + op_dist_attr = ctx.get_op_distributed_attr_for_program(src_op) + assert op_dist_attr is not None, "backward op [{}] don't have dist attribute !".format( + str(src_op)) + + # check validation of inputs / outputs + for input_name in src_op.desc.input_names(): + assert input_name in kwargs, "input [{}] is not given".format( + input_name) + assert len(kwargs[input_name]) == len( + src_op.desc.input(input_name) + ), "number of tensor for input [{}] is not match".format(input_name) + for output_name in src_op.desc.output_names(): + assert output_name in kwargs, "input [{}] is not given".format( + output_name) + assert len(kwargs[output_name]) == len( + src_op.desc.output(output_name) + ), "number of tensor for input [{}] is not match".format( + output_name) + + X_var = main_block.var(kwargs['X'][0]) + Out_var = main_block.var(kwargs['Out'][0]) + XShape_var = main_block.var(kwargs['XShape'][0]) + shape_list = src_op.desc.attr("shape") + ShapeTensor_var_list = [] + for name in kwargs['ShapeTensor']: + ShapeTensor_var_list.append(name) + Shape_var_list = [] + for name in kwargs['Shape']: + Shape_var_list.append(name) + + # got dist attribute info + dim_mapping = op_dist_attr.get_output_dims_mapping(Out_var.name) + process_mesh_shape = op_dist_attr.get_process_mesh().topology + + # modify target shape + for idx, axis in enumerate(dim_mapping): + if axis >= 0: + if len(shape_list) > idx: + shape_list[idx] = shape_list[idx] // process_mesh_shape[ + axis] + + # create op + new_op_desc = main_block.desc.append_op() + new_op_desc.copy_from(src_op.desc) + new_op_desc.set_input('ShapeTensor', ShapeTensor_var_list) + new_op_desc.set_input('Shape', Shape_var_list) + new_op_desc.set_input('X', [X_var.name]) + new_op_desc.set_output('XShape', [XShape_var.name]) + new_op_desc.set_output('Out', [Out_var.name]) + new_op_desc._set_attr('shape', shape_list) + + main_block._sync_with_cpp() + + @staticmethod + def backward(ctx, *args, **kwargs): + pass class DistributedReshapeImpl1(DistributedOperatorImpl): @@ -180,7 +170,7 @@ def __init__(self, name): super(DistributedReshapeImpl1, self).__init__() self._name = name self._forward_implemented = True - self._backward_implemented = False + self._backward_implemented = True def is_process_mesh_compatible(self, op_dist_attr): """ No restriction for now. 
""" @@ -235,82 +225,72 @@ def update_dims_mapping(self, op_dist_attr): return changed - def forward(self, serial_op): - def static_handle(dst_block, - src_op, - op_dist_attr, - input_name_mapping, - output_name_mapping, - rank_id=0): - assert len( - input_name_mapping - ) == 3, "Dist op of Reshape take 3 inputs variable but got {}".format( - input_name_mapping) - assert len( - output_name_mapping - ) == 2, "Dist op of Reshape take 2 inputs variable but got {}".format( - output_name_mapping) - assert len( - input_name_mapping['X'] - ) == 1, "Dist op of Reshape input X take 1 variable but got {}".format( - input_name_mapping['X']) - assert len( - input_name_mapping['ShapeTensor'] - ) <= 1, "Dist op of Reshape input ShapeTensor take 0 or 1 variable but got {}".format( - input_name_mapping['ShapeTensor']) - assert len( - input_name_mapping['Shape'] - ) <= 1, "Dist op of Reshape input Shape take 0 or 1 variable but got {}".format( - input_name_mapping['Shape']) - assert len( - output_name_mapping['Out'] - ) == 1, "Dist op of Reshape input Out take 1 variable but got {}".format( - input_name_mapping['Out']) - assert len( - output_name_mapping['XShape'] - ) == 1, "Dist op of Reshape input XShape take 1 variable but got {}".format( - input_name_mapping['XShape']) - - X_var = dst_block.var(input_name_mapping['X'][0]) - Out_var = dst_block.var(output_name_mapping['Out'][0]) - XShape_var = dst_block.var(output_name_mapping['XShape'][0]) - shape_list = src_op.desc.attr("shape") - ShapeTensor_var_list = [] - for name in input_name_mapping['ShapeTensor']: - ShapeTensor_var_list.append(name) - Shape_var_list = [] - for name in input_name_mapping['Shape']: - Shape_var_list.append(name) - - # got dist attribute info - dim_mapping = op_dist_attr.get_output_dims_mapping(Out_var.name) - process_mesh_shape = op_dist_attr.get_process_mesh().topology - - # modify target shape - for idx, axis in enumerate(dim_mapping): - if axis >= 0: - if len(shape_list) > idx: - shape_list[idx] = shape_list[idx] // process_mesh_shape[ - axis] - - # create op - new_op_desc = dst_block.desc.append_op() - new_op_desc.copy_from(src_op.desc) - new_op_desc.set_input('ShapeTensor', ShapeTensor_var_list) - new_op_desc.set_input('Shape', Shape_var_list) - new_op_desc.set_input('X', [X_var.name]) - new_op_desc.set_output('XShape', [XShape_var.name]) - new_op_desc.set_output('Out', [Out_var.name]) - new_op_desc._set_attr('shape', shape_list) - - dst_block._sync_with_cpp() - - if in_dygraph_mode(): - raise NotImplementedError( - "Dist op for [{}] with idx [{}] is NOT implemented yet.".format( - "matmul", 0)) - else: - return static_handle + @staticmethod + def forward(ctx, *args, **kwargs): + """ + kwargs: inputname_mapping & outputname_mapping + """ + + dist_op_helper = ctx.get_dist_op_helper() + main_block = dist_op_helper.get_dst_main_program().global_block() + src_op = dist_op_helper.get_cur_src_op() + rank_id = dist_op_helper.get_rank_id() + op_dist_attr = ctx.get_op_distributed_attr_for_program(src_op) + assert op_dist_attr is not None, "backward op [{}] don't have dist attribute !".format( + str(src_op)) + + # check validation of inputs / outputs + for input_name in src_op.desc.input_names(): + assert input_name in kwargs, "input [{}] is not given".format( + input_name) + assert len(kwargs[input_name]) == len( + src_op.desc.input(input_name) + ), "number of tensor for input [{}] is not match".format(input_name) + for output_name in src_op.desc.output_names(): + assert output_name in kwargs, "input [{}] is not given".format( + 
output_name) + assert len(kwargs[output_name]) == len( + src_op.desc.output(output_name) + ), "number of tensor for input [{}] is not match".format( + output_name) + + X_var = main_block.var(kwargs['X'][0]) + Out_var = main_block.var(kwargs['Out'][0]) + XShape_var = main_block.var(kwargs['XShape'][0]) + shape_list = src_op.desc.attr("shape") + ShapeTensor_var_list = [] + for name in kwargs['ShapeTensor']: + ShapeTensor_var_list.append(name) + Shape_var_list = [] + for name in kwargs['Shape']: + Shape_var_list.append(name) + + # got dist attribute info + dim_mapping = op_dist_attr.get_output_dims_mapping(Out_var.name) + process_mesh_shape = op_dist_attr.get_process_mesh().topology + + # modify target shape + for idx, axis in enumerate(dim_mapping): + if axis >= 0: + if len(shape_list) > idx: + shape_list[idx] = shape_list[idx] // process_mesh_shape[ + axis] + + # create op + new_op_desc = main_block.desc.append_op() + new_op_desc.copy_from(src_op.desc) + new_op_desc.set_input('ShapeTensor', ShapeTensor_var_list) + new_op_desc.set_input('Shape', Shape_var_list) + new_op_desc.set_input('X', [X_var.name]) + new_op_desc.set_output('XShape', [XShape_var.name]) + new_op_desc.set_output('Out', [Out_var.name]) + new_op_desc._set_attr('shape', shape_list) + + main_block._sync_with_cpp() + + @staticmethod + def backward(ctx, *args, **kwargs): + pass register_distributed_operator_impl("reshape2", diff --git a/python/paddle/distributed/auto_parallel/operators/dist_softmax.py b/python/paddle/distributed/auto_parallel/operators/dist_softmax.py index dc78bdee1fb149..56be75b3beaf2c 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_softmax.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_softmax.py @@ -37,6 +37,8 @@ class DistributedSoftmaxImpl(DistributedOperatorImpl): def __init__(self, name): super(DistributedSoftmaxImpl, self).__init__() self._name = name + self._forward_implemented = False + self._backward_implemented = True def is_process_mesh_compatible(self, op_dist_attr): """ No restriction for now. """ @@ -86,6 +88,10 @@ def update_dims_mapping(self, op_dist_attr): return changed + @staticmethod + def backward(ctx, *args, **kwargs): + pass + register_distributed_operator_impl( "softmax", DistributedSoftmaxImpl("replicate_last_axis")) diff --git a/python/paddle/distributed/auto_parallel/operators/dist_transpose.py b/python/paddle/distributed/auto_parallel/operators/dist_transpose.py index c2ca4d85fdf106..10b8bf2666f4ba 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_transpose.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_transpose.py @@ -37,6 +37,8 @@ class DistributedTranspose2Impl(DistributedOperatorImpl): def __init__(self, name): super(DistributedTranspose2Impl, self).__init__() self._name = name + self._forward_implemented = False + self._backward_implemented = True def is_process_mesh_compatible(self, op_dist_attr): """ No restriction for now. 
""" @@ -82,6 +84,10 @@ def update_dims_mapping(self, op_dist_attr): return changed + @staticmethod + def backward(ctx, *args, **kwargs): + pass + register_distributed_operator_impl( "transpose2", DistributedTranspose2Impl("same_mapping_transpose")) diff --git a/python/paddle/distributed/auto_parallel/parallelizer.py b/python/paddle/distributed/auto_parallel/parallelizer.py index 1437dbb2f9049f..8f4a4866eb8db9 100644 --- a/python/paddle/distributed/auto_parallel/parallelizer.py +++ b/python/paddle/distributed/auto_parallel/parallelizer.py @@ -94,10 +94,8 @@ def parallelize(self, # The last step: remove all distributed attributes to be compatiable # with inference. self._remove_distributed_attrs(partitioned_main_prog) - - complete_backward_annotation(partitioned_main_prog, self._dist_context) - make_data_unshard(partitioned_main_prog, partitioned_startup_prog) + reshard(partitioned_main_prog, partitioned_startup_prog, rank, self._dist_context) diff --git a/python/paddle/distributed/auto_parallel/partitioner.py b/python/paddle/distributed/auto_parallel/partitioner.py index b67f1e1ab97f21..c0a91f4b53a0d6 100755 --- a/python/paddle/distributed/auto_parallel/partitioner.py +++ b/python/paddle/distributed/auto_parallel/partitioner.py @@ -23,15 +23,15 @@ from paddle.fluid.data_feeder import check_variable_and_dtype, check_dtype from paddle.fluid.backward import append_backward, _some_in_set_, _append_grad_suffix_ from paddle.distributed.auto_parallel.operators.common import get_distributed_operator -from paddle.distributed.auto_parallel.operators.common import find_best_compatible_distributed_operator_impl from paddle.fluid.clip import GradientClipBase, GradientClipByNorm, error_clip_callback, append_gradient_clip_ops, ClipGradByGlobalNorm from paddle.distributed.fleet.base.distributed_strategy import DistributedStrategy -from paddle.distributed.auto_parallel.context import DistributedContext +from paddle.distributed.auto_parallel.context import DistributedContext, DistOpHelper from paddle.distributed.fleet.meta_optimizers.common import is_loss_grad_op, is_backward_op, is_optimizer_op from paddle.distributed.fleet.meta_optimizers.common import OpRole, OP_ROLE_KEY, OP_ROLE_VAR_KEY from .process import new_process_group from .interface import _g_process_mesh_map -from .utils import _get_comm_group +from .attribute import OperatorDistributedAttribute +from paddle.distributed.auto_parallel.completion import complete_backward_annotation, complete_update_annotation __varname_not_in_block__ = ["lod_tensor_blocking_queue_0"] @@ -122,16 +122,6 @@ def __init__(self, dist_strategy, auto_parallel_context, rank_id=0): # should be set to False self._compatible_with_auto_backward = True - # data parallelism - self._enable_data_parallel = False - self._dp_degree = 0 - self._dp_group = None - - # tensor parallelism - self._enable_tensor_parallel = False - self._tp_degree = 0 - self._tp_group = None - def transpile_forward(self, serial_main_program, serial_startup_program): """ take serial forward programs with shard annotation, create a new distributed forward programs based on the serial ones. 
@@ -236,9 +226,6 @@ def transpile_forward_impl(self, main_program, startup_program): raise RuntimeError( "Not all vars or ops are annotated in main program !") - # determine parallelism mode - self._determine_parallel_mode(main_program) - # dist op & partition vars new_main_prog, new_startup_program = self._dist_var_op_forward_transpile( main_program, startup_program) @@ -270,11 +257,6 @@ def apply_backward_impl(self, self._sharding_backward_transpile(new_main_prog, new_startup_program) - # Data Parallel pass - if self._enable_data_parallel: - self._gradient_sync_transpile(dist_main_program, - dist_startup_program) - return params_grads def apply_optimize_impl(self, user_define_optimizer, params_grads, @@ -311,9 +293,78 @@ def _dist_var_op_forward_transpile(self, partitioned_main_prog = fluid.Program() partitioned_global_block = partitioned_main_prog.global_block() - serial_global_block = serial_main_program.global_block() + serial_main_block = serial_main_program.global_block() serial_ops = serial_main_program.global_block().ops + # transpile startup program + if serial_startup_program == None: + partitioned_startup_prog = None + else: + partitioned_startup_prog = fluid.Program() + # create parameter + partitioned_startup_global_block = partitioned_startup_prog.global_block( + ) + param2shape = {} + temp_varname_map = {} + for var in serial_startup_program.list_vars(): + if isinstance(var, Parameter): + # TODO if var not belong to this rank, should be filtered + serial_main_var = serial_main_block.var(var.name) + dist_attr = self._auto_parallel_context.get_tensor_distributed_attr_for_program( + serial_main_var) + target_shape = _get_dist_shape(serial_main_var, dist_attr) + new_name = var.name + self._dist_varname_suffix + temp_varname_map[var.name] = new_name + _partition_parameter(self._auto_parallel_context, + serial_main_var, + partitioned_startup_global_block, + new_name, target_shape) + param2shape[new_name] = target_shape + + # copy initializer + for op in serial_startup_program.global_block().ops: + # TODO if var not belong to this rank, should be filtered + output_vars = op.desc.output_arg_names() + assert len( + output_vars + ) == 1, "initializer should output only ONE variable, but got [{}]".format( + str(op.desc)) + assert temp_varname_map[output_vars[ + 0]] in param2shape, "try to initialize [{}] which is not a Parameter".format( + output_vars[0]) + new_op_desc = partitioned_startup_global_block.desc.append_op() + new_op_desc.copy_from(op.desc) + new_op_desc._rename_output(output_vars[0], + temp_varname_map[output_vars[0]]) + new_op_desc._set_attr( + "shape", param2shape[temp_varname_map[output_vars[0]]]) + partitioned_startup_global_block._sync_with_cpp() + + # set distribute atrribute + new_op = partitioned_startup_global_block.ops[-1] + assert new_op.type == new_op_desc.type() + assert new_op.desc == new_op_desc + output_var = partitioned_startup_global_block.var(output_vars[ + 0]) + output_var_attr = self._auto_parallel_context.get_tensor_distributed_attr_for_program( + output_var) + op_attr = OperatorDistributedAttribute( + new_op, self._auto_parallel_context) + op_attr.set_process_mesh(output_var_attr.get_process_mesh()) + op_attr.set_output_dims_mapping( + output_var.name, output_var_attr.get_dims_mapping()) + op_attr.set_input_dims_mapping( + output_var.name, output_var_attr.get_dims_mapping()) + self._auto_parallel_context.set_op_distributed_attr_for_program( + new_op, op_attr) + + # TODO move helper init to a comm place + dist_op_helper = 
self._auto_parallel_context.get_dist_op_helper() + dist_op_helper.set_dst_main_program(partitioned_main_prog) + dist_op_helper.set_dst_startup_program(partitioned_startup_prog) + dist_op_helper.set_varname_mapping(self._serial2dist_varname_mapping) + dist_op_helper.set_rank_id(self._rank_id) + # transpile main program for op in serial_ops: @@ -321,9 +372,9 @@ def _dist_var_op_forward_transpile(self, for serial_input_varname in op.desc.input_arg_names(): if serial_input_varname not in self._serial2dist_varname_mapping: new_varname = serial_input_varname + self._dist_varname_suffix - if serial_global_block.has_var(serial_input_varname): + if serial_main_block.has_var(serial_input_varname): _partition_var(self._auto_parallel_context, - serial_global_block, + serial_main_block, partitioned_global_block, serial_input_varname, new_varname) else: @@ -337,118 +388,27 @@ def _dist_var_op_forward_transpile(self, if serial_output_varname not in self._serial2dist_varname_mapping: new_varname = serial_output_varname + self._dist_varname_suffix _partition_var(self._auto_parallel_context, - serial_global_block, - partitioned_global_block, + serial_main_block, partitioned_global_block, serial_output_varname, new_varname) self._serial2dist_varname_mapping[ serial_output_varname] = new_varname # partition op - if _found_match_dist_op(self._auto_parallel_context, op): - # replace with corresponding dist op - _insert_dist_op(op, partitioned_global_block, - self._serial2dist_varname_mapping, - self._auto_parallel_context, self._rank_id) + kinputs, koutputs = dist_op_helper.prepare_forward_context(op) + dist_attr = self._auto_parallel_context.get_op_distributed_attr_for_program( + op) + if _is_dist_op_forward_implement(self._auto_parallel_context, op): + dist_ops = get_distributed_operator(op.type) + dist_op_impl = dist_ops.get_impl(dist_attr.get_impl_idx()) + dist_op_impl.forward(self._auto_parallel_context, **kinputs, + **koutputs) + else: # replicate op - _insert_src_op(op, partitioned_global_block, - self._serial2dist_varname_mapping) - - # transpile startup program - if serial_startup_program == None: - partitioned_startup_prog = None - else: - partitioned_startup_prog = fluid.Program() - # create parameter - partitioned_startup_global_block = partitioned_startup_prog.global_block( - ) - param2shape = {} - for var in partitioned_main_prog.list_vars(): - if isinstance(var, Parameter): - _partition_parameter(self._auto_parallel_context, var, - partitioned_startup_global_block, - var.name, var.shape) - param2shape[var.name] = var.shape - - # copy initializer - for op in serial_startup_program.global_block().ops: - output_vars = op.desc.output_arg_names() - assert len( - output_vars - ) == 1, "initializer should output only ONE variable, but got [{}]".format( - str(op.desc)) - assert self._serial2dist_varname_mapping[output_vars[ - 0]] in param2shape, "try to initialize [{}] which is not a Parameter".format( - output_vars[0]) - new_op_desc = partitioned_startup_global_block.desc.append_op() - new_op_desc.copy_from(op.desc) - new_op_desc._rename_output( - output_vars[0], - self._serial2dist_varname_mapping[output_vars[0]]) - new_op_desc._set_attr("shape", param2shape[ - self._serial2dist_varname_mapping[output_vars[0]]]) - partitioned_startup_global_block._sync_with_cpp() - - # MP broadcast not split parameter - # NOTE Theoretically, the MP param init broadcast should be handled by - # each dist op itself. 
but if we insert the broadcast op at that moment, the broadcast - # will before the initializer, which lead to a undertermined case. - if self._enable_tensor_parallel: - param_to_sync = [] - for param in partitioned_startup_prog.all_parameters(): - if not self._is_var_distributed(param): - param_to_sync.append(param) - # FIXME the ring id should be set by autoparallel.mapping module - # it should be determined by dp groups butfixed it here for hacking - partitioned_startup_global_block.append_op( - type='c_broadcast', - inputs={'X': param}, - outputs={'Out': param}, - attrs={ - 'ring_id': self._tp_group.id, - 'root': 0, - 'use_calc_stream': True, - OP_ROLE_KEY: OpRole.Forward - }) - partitioned_startup_global_block.append_op( - type='c_sync_comm_stream', - inputs={'X': param_to_sync}, - outputs={'Out': param_to_sync}, - attrs={ - 'ring_id': self._tp_group.id, - OP_ROLE_KEY: OpRole.Forward - }) - partitioned_startup_global_block._sync_with_cpp() - - # DP init param broadcast - if self._enable_data_parallel: - # parameters initialization synchronization - param_to_sync = [] - - for param in partitioned_startup_global_block.all_parameters(): - param_to_sync.append(param) - - # FIXME the ring id should be set by autoparallel.mapping module - # it should be determined by dp groups butfixed it here for hacking - partitioned_startup_global_block.append_op( - type='c_broadcast', - inputs={'X': param}, - outputs={'Out': param}, - attrs={ - 'ring_id': self._dp_group.id, - 'root': 0, - 'use_calc_stream': True, - OP_ROLE_KEY: OpRole.Forward - }) - partitioned_startup_global_block.append_op( - type='c_sync_comm_stream', - inputs={'X': param_to_sync}, - outputs={'Out': param_to_sync}, - attrs={ - 'ring_id': self._dp_group.id, - OP_ROLE_KEY: OpRole.Forward - }) - partitioned_startup_global_block._sync_with_cpp() + dist_ops = get_distributed_operator("default") + dist_op_impl = dist_ops.get_impl(0) + dist_op_impl.forward(self._auto_parallel_context, **kinputs, + **koutputs) return partitioned_main_prog, partitioned_startup_prog @@ -493,12 +453,65 @@ def _dist_var_op_backward_transpile(self, for param in no_grad_set ] - return _auto_backward( + dist_op_helper = self._auto_parallel_context.get_dist_op_helper() + params_and_grads = _auto_backward( dist_loss, dist_startup_program, parameter_list=parameter_list, no_grad_set=no_grad_set, - callbacks=callbacks) + callbacks=callbacks, + distop_context=dist_op_helper) + + # backward completion + complete_backward_annotation( + dist_main_program, dist_context=self._auto_parallel_context) + + # transpiler backward for dist op + # get backward ops + ops = dist_main_program.global_block().ops + first_backward_op_idx = -1 + forward_op_id2forward_op = {} + for idx in range(len(ops)): + if is_forward_op(ops[idx]): + forward_op_id2forward_op[ops[idx].desc.id()] = ops[idx] + + if int(ops[idx].attr('op_role')) == int(OpRole.Backward): + first_backward_op_idx = idx + break + assert first_backward_op_idx >= 0, "not found backward ops in program" + assert len(forward_op_id2forward_op + ) > 0, "not found forward ops in program" + + backward_ops = ops[first_backward_op_idx:] + for backward_op in backward_ops: + # if the backward op has a corresponding forward op + if backward_op.desc.id() in dist_op_helper.gradopidx2opidx: + forward_op_id = dist_op_helper.gradopidx2opidx[ + backward_op.desc.id()] + forward_op = forward_op_id2forward_op[forward_op_id] + # TODO backward attr should has _impl_idx + forward_op_dist_attr = 
self._auto_parallel_context.get_op_distributed_attr_for_program( + forward_op) + # TODO use the backward op itself to find the dist op + dist_ops = get_distributed_operator(forward_op.type) + kinputs, koutputs = dist_op_helper.prepare_backward_context( + backward_op) + + # TODO use backward op itself to determine impl idx + if _is_dist_op_backward_implement( + self._auto_parallel_context, forward_op): + dist_op_impl = dist_ops.get_impl( + forward_op_dist_attr.get_impl_idx()) + dist_op_impl.backward(self._auto_parallel_context, + **kinputs, **koutputs) + else: + # replicate op + dist_ops = get_distributed_operator("default") + dist_op_impl = dist_ops.get_impl(0) + dist_op_impl.backward(self._auto_parallel_context, + **kinputs, **koutputs) + + return params_and_grads # replace dist grad ops else: raise RuntimeError("transpile NOT implemented !") @@ -509,6 +522,10 @@ def _optimize_transpile(self, user_define_optimizer, params_grads, with program_guard(main_program, startup_program): optimize_ops = user_define_optimizer.apply_gradients(params_grads) + # update completion + complete_update_annotation( + main_program, dist_context=self._auto_parallel_context) + return optimize_ops def _is_valid_annotated_program(self, program): @@ -544,47 +561,6 @@ def _serial_varname2dist_var(self, serial_varname, dist_program): return dist_var - def _determine_parallel_mode(self, program): - """ - determine the parallelism that is enabled - NOTE a hard rule and should be updated in future - """ - - for param in program.all_parameters(): - if self._is_var_distributed(param): - self._enable_tensor_parallel = True - break - - for var in program.list_vars(): - var_dist_attr = self._auto_parallel_context.get_tensor_distributed_attr_for_program( - var) - if not var_dist_attr.is_parameter(): - mapping = var_dist_attr.get_dims_mapping() - mesh = var_dist_attr.get_process_mesh().topology - if mapping and mapping[0] >= 0 and mesh[mapping[0]] > 1: - self._enable_data_parallel = True - break - - # tensor parallelism - if self._enable_tensor_parallel: - model_parallel_axis, process_mesh = self._auto_parallel_context._get_model_parallel_info( - ) - group_ranks = _get_comm_group(process_mesh.process_group, - process_mesh.topology, - model_parallel_axis, self._rank_id) - self._tp_degree = len(group_ranks) - self._tp_group = new_process_group(group_ranks) - - # data parallelism - data_parallel_axis, process_mesh = self._auto_parallel_context._get_data_parallel_info( - ) - if self._enable_data_parallel: - group_ranks = _get_comm_group(process_mesh.process_group, - process_mesh.topology, - data_parallel_axis, self._rank_id) - self._dp_degree = len(group_ranks) - self._dp_group = new_process_group(group_ranks) - def _is_var_distributed(self, var): dist_attr = self._auto_parallel_context.get_tensor_distributed_attr_for_program( @@ -629,68 +605,6 @@ def _sharding_optimize_transpile(self, params_grads, dist_main_program, """ raise RuntimeError("sharding transpile is NOT implemented !") - def _gradient_sync_transpile(self, main_program, startup_program): - """ - append the gradient allreduce ops for all parameters' grad in case of Data Parallel - """ - - # scale loss by dp degree - main_global_block = main_program.global_block() - for idx, op in reversed(list(enumerate(main_global_block.ops))): - if is_loss_grad_op(op): - loss_grad_var = main_global_block.vars[op.output_arg_names[0]] - main_global_block._insert_op_without_sync( - idx + 1, - type='scale', - inputs={'X': loss_grad_var}, - outputs={'Out': loss_grad_var}, - attrs={ - 
'scale': 1.0 / self._dp_degree, - OP_ROLE_KEY: OpRole.Backward - }) - break - main_global_block._sync_with_cpp() - - # gradient synchronization - # NOTE naive gradient sync without overlapping - # so there is not need to sync between calc and comm - # collecting grad var - grad_to_sync = [] - for idx, op in reversed(list(enumerate(main_global_block.ops))): - if is_backward_op(op) and \ - OP_ROLE_VAR_KEY in op.attr_names: - op_role_var = op.all_attrs()[OP_ROLE_VAR_KEY] - if len(op_role_var) != 0: - assert len(op_role_var) % 2 == 0 - for i in range(0, len(op_role_var), 2): - param, reduced_grad = op_role_var[i], op_role_var[i + 1] - assert (reduced_grad not in grad_to_sync) - grad_to_sync.append(reduced_grad) - if is_optimizer_op(op): - first_optimize_op_idx = idx - - # insert allreduce - for grad in grad_to_sync: - # FIXME the ring id should be set by autoparallel.mapping module - # it should be determined by dp groups butfixed it here for hacking - main_global_block.append_op( - type='c_allreduce_sum', - inputs={'X': grad}, - outputs={'Out': grad}, - attrs={ - 'ring_id': self._dp_group.id, - 'root': 0, - 'use_calc_stream': True, - OP_ROLE_KEY: OpRole.Backward - }) - main_global_block.append_op( - type='c_sync_comm_stream', - inputs={'X': grad_to_sync}, - outputs={'Out': grad_to_sync}, - attrs={'ring_id': self._dp_group.id, - OP_ROLE_KEY: OpRole.Backward}) - main_global_block._sync_with_cpp() - def _get_no_grad_set_name(no_grad_set): no_grad_set_name = set() @@ -723,7 +637,7 @@ def _get_no_grad_set(loss, no_grad_set=None): return no_grad_set -def _found_match_dist_op(auto_paralle_context, op): +def _is_dist_op_forward_implement(auto_paralle_context, op): dist_attr = auto_paralle_context.get_op_distributed_attr_for_program(op) dist_ops = get_distributed_operator(op.type) @@ -731,11 +645,20 @@ def _found_match_dist_op(auto_paralle_context, op): dist_attr.get_impl_idx())._forward_implemented +def _is_dist_op_backward_implement(auto_paralle_context, op): + dist_attr = auto_paralle_context.get_op_distributed_attr_for_program(op) + dist_ops = get_distributed_operator(op.type) + + return dist_ops and dist_attr.get_impl_idx() >= 0 and dist_ops.get_impl( \ + dist_attr.get_impl_idx())._backward_implemented + + def _auto_backward(loss, startup_program=None, parameter_list=None, no_grad_set=None, - callbacks=None): + callbacks=None, + distop_context=None): """ modification is inplaced """ @@ -753,9 +676,14 @@ def _auto_backward(loss, loss.shape) program = loss.block.program + with program_guard(program, startup_program): - params_grads = append_backward(loss, parameter_list, act_no_grad_set, - callbacks) + params_grads = append_backward( + loss, + parameter_list, + act_no_grad_set, + callbacks, + distop_context=distop_context) return params_grads @@ -822,6 +750,7 @@ def _partition_parameter(auto_paralle_context, src_var, dst_block, dst_varname, # param.desc.set_distributed_attr_uid(distributed_attr_uid) dist_attr = copy.deepcopy( auto_paralle_context.get_tensor_distributed_attr_for_program(src_var)) + assert dist_attr is not None dist_attr._owner_tensor = param dist_attr._owner_context = auto_paralle_context.get_tensor_distributed_attr_for_program( src_var)._owner_context @@ -848,6 +777,7 @@ def _partition_intermediate_var(auto_paralle_context, src_var, dst_block, # var.desc.set_distributed_attr_uid(distributed_attr_uid) dist_attr = copy.deepcopy( auto_paralle_context.get_tensor_distributed_attr_for_program(src_var)) + assert dist_attr is not None dist_attr._owner_tensor = var dist_attr._owner_context 
= auto_paralle_context.get_tensor_distributed_attr_for_program( src_var)._owner_context @@ -923,3 +853,11 @@ def _insert_dist_op(src_op, dst_block, varname_mapping, auto_paralle_context, input_mapping, output_mapping, rank_id=rank_id) + + +def is_forward_op(op): + role1 = int(core.op_proto_and_checker_maker.OpRole.Forward) | int( + core.op_proto_and_checker_maker.OpRole.Loss) + role2 = int(core.op_proto_and_checker_maker.OpRole.Forward) + op_role = int(op.attr('op_role')) + return op_role == role2 or op_role == role1 diff --git a/python/paddle/distributed/auto_parallel/utils.py b/python/paddle/distributed/auto_parallel/utils.py index a81ff69918905c..813bd481d92869 100755 --- a/python/paddle/distributed/auto_parallel/utils.py +++ b/python/paddle/distributed/auto_parallel/utils.py @@ -15,6 +15,7 @@ import threading import paddle.fluid.core as core import numpy as np +from .interface import _g_process_mesh_map def is_valid_list_index(list, index): @@ -171,7 +172,9 @@ def _get_comm_group(processes, shape, axis, rank): """ # NOTE _linear_idx2coordinate assume processes mesh start with 0 and continuous - # tricks to support processes mesh when it is not start with 0 or continuous + # tricks to support processes mesh when it is not start with 0 or continuous + assert rank in processes, "rank [{}] is NOT in processes group {}".format( + rank, processes) rank_relatvie = processes.index(rank) coordinate = _linear_idx2coordinate(shape, rank_relatvie) coordinates_in_group = [coordinate[:] for i in range(shape[axis])] @@ -189,6 +192,25 @@ def _get_comm_group(processes, shape, axis, rank): return sorted(ranks_in_group) +def _get_idx_in_axis(processes, shape, axis, rank): + """ + Given a rank and the processes mesh the rank belongs to, + compute the index of the rank in given axis. + + Example: 27 processes managed in a 3-Dimensinal mesh with shape of [3, 3, 3]. + the index of rank 22 are: + in axis 0: 1 + in axis 1: 1 + in axis 2: 2 + """ + + # NOTE _linear_idx2coordinate assume processes mesh start with 0 and continuous + # tricks to support processes mesh when it is not start with 0 or continuous + rank_relatvie = processes.index(rank) + coordinate = _linear_idx2coordinate(shape, rank_relatvie) + return coordinate[axis] + + def _coordinate2linear_idx(mesh_shape, coordinate): """ convert a coordinate in multidimensional mesh space into a scala idx in linear space. @@ -279,6 +301,27 @@ def _linear_idx2coordinate(mesh_shape, linear_idx): return coordinate +def _get_corresponding_rank(target_mesh, rank): + + # TODO(JZ-LIANG) a hack method to support varying mesh in Pipeline parallelism case. + # we assume that all mesh are evenly divide from a parent mesh and should have same size. + # to revise this in future. 
+ + coordinate = None + for key, mesh in _g_process_mesh_map.items(): + if key == 0: + continue + if rank in mesh.process_group and mesh.topology == target_mesh.topology: + coordinate = _linear_idx2coordinate(mesh.topology, + mesh.process_group.index(rank)) + break + + assert coordinate is not None, "could NOT found rank [{}] in any registered mesh".format( + rank) + return target_mesh.process_group[_coordinate2linear_idx(mesh.topology, + coordinate)] + + def _get_unshard_dist_shape(var, dist_attr): var_shape = var.shape mapping = dist_attr.get_dims_mapping() diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py index d62f7b5941126b..9ea407c760f07d 100755 --- a/python/paddle/fluid/backward.py +++ b/python/paddle/fluid/backward.py @@ -1051,7 +1051,8 @@ def _append_backward_ops_(block, grad_to_var, callbacks=None, input_grad_names_set=None, - op_path_dict=None): + op_path_dict=None, + distop_context=None): """ Create all grad ops, and insert them into given block @@ -1108,6 +1109,10 @@ def _append_backward_ops_(block, # Getting op's corresponding grad_op grad_op_desc, op_grad_to_var = core.get_grad_op_desc( op.desc, cpt.to_text(no_grad_dict[block.idx]), grad_sub_block_list) + if distop_context is not None: + for op_desc in grad_op_desc: + assert op_desc.id() not in distop_context.gradopidx2opidx + distop_context.gradopidx2opidx[op_desc.id()] = op.desc.id() # Set device for grad_op according to forward Op device_attr_name = core.op_proto_and_checker_maker.kOpDeviceAttrName() @@ -1402,7 +1407,8 @@ def append_backward(loss, parameter_list=None, no_grad_set=None, callbacks=None, - checkpoints=None): + checkpoints=None, + distop_context=None): """ :api_attr: Static Graph @@ -1617,7 +1623,8 @@ def append_backward(loss, grad_to_var, callbacks, input_grad_names_set=input_grad_names_set, - op_path_dict=op_path_dict) + op_path_dict=op_path_dict, + distop_context=distop_context, ) grad_info_map = dict() diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 90f59758a2faf9..745e7118522722 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -32,6 +32,7 @@ list(APPEND DIST_TEST_OPS test_parallel_dygraph_dataparallel) list(APPEND DIST_TEST_OPS test_parallel_dygraph_pipeline_parallel) list(APPEND DIST_TEST_OPS test_parallel_dygraph_tensor_parallel) list(APPEND DIST_TEST_OPS test_parallel_dygraph_sharding_parallel) +list(APPEND DIST_TEST_OPS test_auto_parallel_parallelizer) list(APPEND DIST_TEST_OPS test_parallel_dygraph_mp_layers) list(APPEND DIST_TEST_OPS test_hybrid_parallel_inference_helper) list(APPEND DIST_TEST_OPS test_parallel_class_center_sample) @@ -221,6 +222,7 @@ if ((NOT WITH_GPU) AND (NOT WITH_ROCM)) list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_pipeline_parallel) list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_tensor_parallel) list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_sharding_parallel) + list(REMOVE_ITEM TEST_OPS test_auto_parallel_parallelizer) list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_mp_layers) LIST(REMOVE_ITEM TEST_OPS test_imperative_auto_mixed_precision) LIST(REMOVE_ITEM TEST_OPS test_mixed_precision) @@ -1002,6 +1004,7 @@ if(WITH_DISTRIBUTE AND WITH_GPU AND WITH_NCCL) set_tests_properties(test_parallel_dygraph_pipeline_parallel PROPERTIES TIMEOUT 120) set_tests_properties(test_parallel_dygraph_tensor_parallel PROPERTIES TIMEOUT 200) set_tests_properties(test_parallel_dygraph_sharding_parallel PROPERTIES 
TIMEOUT 120) + set_tests_properties(test_auto_parallel_parallelizer PROPERTIES TIMEOUT 120) set_tests_properties(test_parallel_dygraph_mp_layers PROPERTIES TIMEOUT 120) set_tests_properties(test_hybrid_parallel_inference_helper PROPERTIES TIMEOUT 120) set_tests_properties(test_parallel_class_center_sample PROPERTIES TIMEOUT 120) diff --git a/python/paddle/fluid/tests/unittests/auto_parallel_parallelizer.py b/python/paddle/fluid/tests/unittests/auto_parallel_parallelizer.py new file mode 100755 index 00000000000000..89880f8c2f49d5 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel_parallelizer.py @@ -0,0 +1,140 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest + +import paddle +import paddle.nn as nn +import paddle.static as static +import paddle.nn.functional as F +import paddle.utils as utils +from paddle.fluid import layers +from paddle.distributed import fleet +import paddle.distributed.auto_parallel as auto +from paddle.distributed.auto_parallel.utils import print_program_with_distributed_attr +import paddle.fluid.core as core + +paddle.enable_static() +_global_parallel_strategy = None +_global_process_mesh = None +ROOT_MESH = auto.ProcessMesh([0, 1]) + + +class MLPLayer(nn.Layer): + def __init__(self, + hidden_size=1024, + intermediate_size=4 * 1024, + dropout_ratio=0.1, + initializer_range=0.02): + super(MLPLayer, self).__init__() + d_model = hidden_size + dim_feedforward = intermediate_size + weight_attr = paddle.ParamAttr(initializer=nn.initializer.Normal( + mean=0.0, std=initializer_range)) + bias_attr = None + + self.linear0 = nn.Linear( + d_model, dim_feedforward, weight_attr, bias_attr=bias_attr) + self.linear1 = nn.Linear( + dim_feedforward, d_model, weight_attr, bias_attr=bias_attr) + self.linear2 = nn.Linear(d_model, 1, weight_attr, bias_attr=bias_attr) + self.norm = nn.LayerNorm(d_model, epsilon=1e-5) + self.dropout = nn.Dropout(dropout_ratio, mode="upscale_in_train") + + def forward(self, input): + out = self.norm(input) + out = self.linear0(out) + out = F.gelu(out, approximate=True) + out = self.linear1(out) + out = self.dropout(out) + out = self.linear2(out) + + return out + + +def mlp_pretrain_forward(train_program, start_program): + with static.program_guard(train_program, + start_program), utils.unique_name.guard(): + batch_size = 4 + hidden_size = 1024 + sequence_len = 512 + input = static.data( + name="input", + shape=[batch_size, sequence_len, hidden_size], + dtype='float32') + label = static.data( + name="label", shape=[batch_size, sequence_len, 1], dtype='float32') + + auto.shard_tensor(input, _global_process_mesh, dim_mapping=[-1, -1, -1]) + auto.set_pipeline_stage(1) + + mlp = MLPLayer( + hidden_size=hidden_size, + intermediate_size=4 * hidden_size, + dropout_ratio=0.1, + initializer_range=0.02) + + predict = mlp(input) + + cost = layers.cross_entropy(input=predict, label=label) + avg_cost = layers.mean(x=cost) + + return 
avg_cost, train_program, start_program + + +class TestMLPAutoParallelizer(unittest.TestCase): + def test_mlp_serial(self): + + global _global_process_mesh + _global_process_mesh = auto.ProcessMesh(mesh=[0, 1], parent=ROOT_MESH) + + dist_strategy = fleet.DistributedStrategy() + dist_strategy.amp = False + dist_strategy.pipeline = False + dist_strategy.recompute = False + + # init parallel optimizer + dist_strategy.semi_auto = True + + fleet.init(is_collective=True, strategy=dist_strategy) + + train_program = static.Program() + start_program = static.Program() + loss, train_program, start_program = mlp_pretrain_forward(train_program, + start_program) + + optimizer = paddle.fluid.optimizer.AdamOptimizer( + learning_rate=0.00001, + beta1=0.9, + beta2=0.999, + epsilon=1e-08, + grad_clip=None) + + optimizer = fleet.distributed_optimizer(optimizer) + _, _, distributed_startup_program, distributed_main_program = optimizer.minimize( + loss, start_program) + suffix = core.kAutoParallelSuffix() + for block in distributed_main_program.blocks: + for op in block.ops: + for attr_name in op.attr_names: + self.assertTrue(suffix not in attr_name) + # print_program_with_distributed_attr(distributed_main_program) + self.assertIsNotNone(distributed_startup_program) + self.assertIsNotNone(distributed_main_program) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_parallelizer.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_parallelizer.py index a92e1e2f338b10..7147716c74ccdc 100755 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_parallelizer.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_parallelizer.py @@ -15,130 +15,16 @@ from __future__ import print_function import unittest +import paddle.fluid as fluid -# The following statements are used to satisfy fleet initialization -import os -if os.getenv("CUDA_VISIBLE_DEVICES", None) is None: - os.environ["CUDA_VISIBLE_DEVICES"] = '0' +from test_parallel_dygraph_dataparallel import TestMultipleGpus -import paddle -import paddle.nn as nn -import paddle.static as static -import paddle.nn.functional as F -import paddle.utils as utils -from paddle.fluid import layers -from paddle.distributed import fleet -import paddle.distributed.auto_parallel as auto -from paddle.distributed.auto_parallel.utils import print_program_with_distributed_attr -import paddle.fluid.core as core -paddle.enable_static() -_global_parallel_strategy = None -_global_process_mesh = None -ROOT_MESH = auto.ProcessMesh([0, 1]) +class TestParallelizer(TestMultipleGpus): - -class MLPLayer(nn.Layer): - def __init__(self, - hidden_size=1024, - intermediate_size=4 * 1024, - dropout_ratio=0.1, - initializer_range=0.02): - super(MLPLayer, self).__init__() - d_model = hidden_size - dim_feedforward = intermediate_size - weight_attr = paddle.ParamAttr(initializer=nn.initializer.Normal( - mean=0.0, std=initializer_range)) - bias_attr = None - - self.linear0 = nn.Linear( - d_model, dim_feedforward, weight_attr, bias_attr=bias_attr) - self.linear1 = nn.Linear( - dim_feedforward, d_model, weight_attr, bias_attr=bias_attr) - self.linear2 = nn.Linear(d_model, 1, weight_attr, bias_attr=bias_attr) - self.norm = nn.LayerNorm(d_model, epsilon=1e-5) - self.dropout = nn.Dropout(dropout_ratio, mode="upscale_in_train") - - def forward(self, input): - out = self.norm(input) - out = self.linear0(out) - out = F.gelu(out, approximate=True) - out = self.linear1(out) - out = self.dropout(out) - out = self.linear2(out) - - 
return out - - -def mlp_pretrain_forward(train_program, start_program): - with static.program_guard(train_program, - start_program), utils.unique_name.guard(): - batch_size = 4 - hidden_size = 1024 - sequence_len = 512 - input = static.data( - name="input", - shape=[batch_size, sequence_len, hidden_size], - dtype='float32') - label = static.data( - name="label", shape=[batch_size, sequence_len, 1], dtype='float32') - - auto.shard_tensor(input, _global_process_mesh, dim_mapping=[-1, -1, -1]) - auto.set_pipeline_stage(1) - - mlp = MLPLayer( - hidden_size=hidden_size, - intermediate_size=4 * hidden_size, - dropout_ratio=0.1, - initializer_range=0.02) - - predict = mlp(input) - - cost = layers.cross_entropy(input=predict, label=label) - avg_cost = layers.mean(x=cost) - - return avg_cost, train_program, start_program - - -class TestMLPAutoParallelizer(unittest.TestCase): - def test_mlp_serial(self): - - global _global_process_mesh - _global_process_mesh = auto.ProcessMesh(mesh=[0, 1], parent=ROOT_MESH) - - dist_strategy = fleet.DistributedStrategy() - dist_strategy.amp = False - dist_strategy.pipeline = False - dist_strategy.recompute = False - - # init parallel optimizer - dist_strategy.semi_auto = True - - fleet.init(is_collective=True, strategy=dist_strategy) - - train_program = static.Program() - start_program = static.Program() - loss, train_program, start_program = mlp_pretrain_forward(train_program, - start_program) - - optimizer = paddle.fluid.optimizer.AdamOptimizer( - learning_rate=0.00001, - beta1=0.9, - beta2=0.999, - epsilon=1e-08, - grad_clip=None) - - optimizer = fleet.distributed_optimizer(optimizer) - _, _, distributed_startup_program, distributed_main_program = optimizer.minimize( - loss, start_program) - suffix = core.kAutoParallelSuffix() - for block in distributed_main_program.blocks: - for op in block.ops: - for attr_name in op.attr_names: - self.assertTrue(suffix not in attr_name) - # print_program_with_distributed_attr(distributed_main_program) - self.assertIsNotNone(distributed_startup_program) - self.assertIsNotNone(distributed_main_program) + # check sharding logic as well as the accuracy with single mode + def test_parallelizer_logic(self): + self.run_mnist_2gpu('auto_parallel_parallelizer.py') if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner.py index 29ba863c96226e..44a525244015b4 100755 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner.py @@ -92,9 +92,9 @@ def check_tensor_split(prog1, varnames1, prog2, varnames2, axis, nsplit): def initialization_check(mode, dist_context, dist_startup_prog, - serial_startup_prog, var_need_broadcast): + serial_startup_prog, var_need_broadcast, process_mesh, + mp_parallel_axis, dp_parallel_axis): if 'mp' in mode: - mp_parallel_axis, process_mesh = dist_context._get_model_parallel_info() group_ranks = _get_comm_group(process_mesh.process_group, process_mesh.topology, mp_parallel_axis, 3) @@ -110,7 +110,6 @@ def initialization_check(mode, dist_context, dist_startup_prog, return False if 'dp' in mode: - dp_parallel_axis, process_mesh = dist_context._get_data_parallel_info() group_ranks = _get_comm_group(process_mesh.process_group, process_mesh.topology, dp_parallel_axis, 3) @@ -359,9 +358,15 @@ def test_mlp_dp(self): # parameter initialization var_need_broadcast = [] self.assertTrue( - 
initialization_check(_global_parallel_strategy, dist_context, - dist_startup_prog, serial_startup_prog, - var_need_broadcast)) + initialization_check( + _global_parallel_strategy, + dist_context, + dist_startup_prog, + serial_startup_prog, + var_need_broadcast, + _global_process_mesh, + mp_parallel_axis=None, + dp_parallel_axis=0)) def test_mlp_mp(self): global _global_parallel_strategy @@ -406,9 +411,15 @@ def test_mlp_mp(self): var_need_broadcast = sorted( ['layer_norm_0.b_0', 'layer_norm_0.w_0', 'linear_1.b_0']) self.assertTrue( - initialization_check(_global_parallel_strategy, dist_context, - dist_startup_prog, serial_startup_prog, - var_need_broadcast)) + initialization_check( + _global_parallel_strategy, + dist_context, + dist_startup_prog, + serial_startup_prog, + var_need_broadcast, + _global_process_mesh, + mp_parallel_axis=0, + dp_parallel_axis=None)) # check var and op all have dist_attr in dist_main_program self.assertTrue( @@ -464,9 +475,15 @@ def test_mlp_dp_mp(self): var_need_broadcast = sorted( ['layer_norm_0.b_0', 'layer_norm_0.w_0', 'linear_1.b_0']) self.assertTrue( - initialization_check(_global_parallel_strategy, dist_context, - dist_startup_prog, serial_startup_prog, - var_need_broadcast)) + initialization_check( + _global_parallel_strategy, + dist_context, + dist_startup_prog, + serial_startup_prog, + var_need_broadcast, + _global_process_mesh, + mp_parallel_axis=1, + dp_parallel_axis=0)) # check var and op all have dist_attr in dist_main_program self.assertTrue( @@ -635,9 +652,15 @@ def test_attn_dp(self): # parameter initialization var_need_broadcast = [] self.assertTrue( - initialization_check(_global_parallel_strategy, dist_context, - dist_startup_prog, serial_startup_prog, - var_need_broadcast)) + initialization_check( + _global_parallel_strategy, + dist_context, + dist_startup_prog, + serial_startup_prog, + var_need_broadcast, + _global_process_mesh, + mp_parallel_axis=None, + dp_parallel_axis=0)) def test_attn_mp(self): global _global_parallel_strategy @@ -686,9 +709,15 @@ def test_attn_mp(self): # parameter initialization var_need_broadcast = ['linear_3.b_0'] self.assertTrue( - initialization_check(_global_parallel_strategy, dist_context, - dist_startup_prog, serial_startup_prog, - var_need_broadcast)) + initialization_check( + _global_parallel_strategy, + dist_context, + dist_startup_prog, + serial_startup_prog, + var_need_broadcast, + _global_process_mesh, + mp_parallel_axis=0, + dp_parallel_axis=None)) # check var and op all have dist_attr in dist_main_program self.assertTrue( @@ -748,9 +777,15 @@ def test_attn_dp_mp(self): # parameter initialization var_need_broadcast = ['linear_3.b_0'] self.assertTrue( - initialization_check(_global_parallel_strategy, dist_context, - dist_startup_prog, serial_startup_prog, - var_need_broadcast)) + initialization_check( + _global_parallel_strategy, + dist_context, + dist_startup_prog, + serial_startup_prog, + var_need_broadcast, + _global_process_mesh, + mp_parallel_axis=1, + dp_parallel_axis=0)) # check var and op all have dist_attr in dist_main_program self.assertTrue( @@ -1043,9 +1078,15 @@ def test_decoder_dp_mp(self): 'layer_norm_0.w_0', 'linear_5.b_0' ]) self.assertTrue( - initialization_check(_global_parallel_strategy, dist_context, - dist_startup_prog, serial_startup_prog, - var_need_broadcast)) + initialization_check( + _global_parallel_strategy, + dist_context, + dist_startup_prog, + serial_startup_prog, + var_need_broadcast, + _global_process_mesh, + mp_parallel_axis=1, + dp_parallel_axis=0)) # check var and op 
all have dist_attr in dist_main_program self.assertTrue( @@ -1117,7 +1158,16 @@ def test_decoder_noparallel(self): 'fill_constant', 'gaussian_random', 'fill_constant', 'gaussian_random', 'fill_constant', 'gaussian_random', 'fill_constant', 'gaussian_random', 'fill_constant', - 'gaussian_random', 'fill_constant', 'fill_constant', 'fill_constant' + 'gaussian_random', 'fill_constant', 'fill_constant', + 'fill_constant', 'c_broadcast', 'c_broadcast', 'c_broadcast', + 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', + 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', + 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', + 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', + 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', + 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', + 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', + 'c_broadcast' ] self.assertTrue(dist_ops == ref_ops) diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner_gpt.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner_gpt.py index 16cbad3ef6f8b6..11b3338bc675cf 100755 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner_gpt.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner_gpt.py @@ -521,7 +521,7 @@ class GPTModel(nn.Layer): def __init__(self, vocab_size, hidden_size=768, - num_hidden_layers=12, + num_hidden_layers=4, num_attention_heads=12, intermediate_size=3072, hidden_act="gelu", @@ -787,6 +787,14 @@ def test_gpt_dp_mp(self): dist_params_grads = partitioner.apply_backward( loss, complete_train_program, start_program, auto_parallel_main_prog, auto_parallel_startup_prog) + + with open("./test_auto_parallel_partitioner_serial_main_new.txt", + "w") as fw: + fw.write(str(train_program)) + with open("./test_auto_parallel_partitioner_serial_startup_new.txt", + "w") as fw: + fw.write(str(start_program)) + optimizer = paddle.fluid.optimizer.AdamOptimizer( learning_rate=0.00001, beta1=0.9, @@ -796,7 +804,17 @@ def test_gpt_dp_mp(self): opt_ops = partitioner.apply_optimize(optimizer, dist_params_grads, auto_parallel_main_prog, auto_parallel_startup_prog) - + from paddle.distributed.auto_parallel.context import set_default_distributed_context + set_default_distributed_context(dist_context) + with open("./test_auto_parallel_partitioner_main_new.txt1", "w") as fw: + fw.write(str(auto_parallel_main_prog)) + with open("./test_auto_parallel_partitioner_startup_new.txt1", + "w") as fw: + fw.write(str(auto_parallel_startup_prog)) + # with open("./test_auto_parallel_partitioner_main_completed.txt", "w") as fw: + # from paddle.distributed.auto_parallel.completion import complete_backward_annotation + # complete_backward_annotation(auto_parallel_main_prog) + # fw.write(str(auto_parallel_main_prog)) nrank = 4 # col parallel weights = [ @@ -826,16 +844,20 @@ def test_gpt_dp_mp(self): 'layer_norm_6.tmp_2', 'layer_norm_7.tmp_2', 'layer_norm_7.tmp_2', 'layer_norm_7.tmp_2', 'layer_norm_8.tmp_2' ] - mp_parallel_axis, process_mesh = dist_context._get_model_parallel_info() + process_mesh = _global_process_mesh + mp_parallel_axis = 1 + dp_parallel_axis = 0 + group_ranks = _get_comm_group(process_mesh.process_group, process_mesh.topology, mp_parallel_axis, 3) mp_ring_id = new_process_group(group_ranks).id - dp_parallel_axis, process_mesh = dist_context._get_data_parallel_info() + group_ranks = _get_comm_group(process_mesh.process_group, process_mesh.topology, dp_parallel_axis, 3) dp_ring_id = 
new_process_group(group_ranks).id + tensor_parallel_allreduce_vars = sorted([ op.desc.output_arg_names()[0].split("@")[0] for op in auto_parallel_main_prog.global_block().ops diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard.py index da82e56d4a1518..fe9b965ed8733c 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard.py @@ -25,7 +25,6 @@ from paddle.distributed.auto_parallel.context import DistributedContext from paddle.distributed import fleet from paddle.distributed.auto_parallel.partitioner import Partitioner -from paddle.distributed.auto_parallel.completion import complete_backward_annotation from paddle.distributed.auto_parallel.reshard import reshard from paddle.distributed.auto_parallel.process import PROCESS_GROUP_MAP @@ -211,7 +210,8 @@ def check_initialization_for_dp(dist_startup_prog): if op.type == "c_broadcast": broadcast_varnames.append(op.output_arg_names[0]) - return params == need_check_params == broadcast_varnames + return sorted(params) == sorted(need_check_params) == sorted( + broadcast_varnames) class TestMLPReshard(unittest.TestCase): @@ -225,7 +225,6 @@ def test_complete_backward_annotation(self): rank_id = 0 dist_main_prog, dist_startup_prog = get_dist_prog( train_program, startup_program, dist_context, 0) - complete_backward_annotation(dist_main_prog, dist_context) op_need_check = None for op in dist_main_prog.global_block().ops: @@ -254,7 +253,6 @@ def test_mlp_pp(self): rank_id = 1 dist_main_prog, dist_startup_prog = get_dist_prog( train_program, startup_program, dist_context, rank_id) - complete_backward_annotation(dist_main_prog, dist_context) for key in list(PROCESS_GROUP_MAP.keys()): del PROCESS_GROUP_MAP[key] reshard(dist_main_prog, dist_startup_prog, rank_id, dist_context) @@ -277,7 +275,6 @@ def test_mlp_dp(self): rank_id = 0 dist_main_prog, dist_startup_prog = get_dist_prog( train_program, startup_program, dist_context, rank_id) - complete_backward_annotation(dist_main_prog, dist_context) reshard(dist_main_prog, dist_startup_prog, rank_id, dist_context) # send and recv should not exist in dp scene. 
self.assertFalse(check_send_recv_result(dist_main_prog, rank_id)) diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_dpmppp.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_dpmppp.py index 1e134eebfd23bb..babc622393c404 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_dpmppp.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_dpmppp.py @@ -25,7 +25,6 @@ from paddle.distributed.auto_parallel.context import DistributedContext from paddle.distributed import fleet from paddle.distributed.auto_parallel.partitioner import Partitioner -from paddle.distributed.auto_parallel.completion import complete_backward_annotation from paddle.distributed.auto_parallel.reshard import reshard paddle.enable_static() @@ -158,7 +157,6 @@ def test_mlp_dpmppp(self): dist_main_prog, dist_startup_prog = get_dist_prog( train_program, startup_program, dist_context, rank_id) print(dist_main_prog) - complete_backward_annotation(dist_main_prog, dist_context) reshard(dist_main_prog, dist_startup_prog, rank_id, dist_context) print(dist_main_prog) print(dist_startup_prog) diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_mppp.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_mppp.py index 5a10a218345705..96a8b2a8d7cdbe 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_mppp.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_mppp.py @@ -25,7 +25,6 @@ from paddle.distributed.auto_parallel.context import DistributedContext from paddle.distributed import fleet from paddle.distributed.auto_parallel.partitioner import Partitioner -from paddle.distributed.auto_parallel.completion import complete_backward_annotation from paddle.distributed.auto_parallel.reshard import reshard paddle.enable_static() @@ -187,7 +186,6 @@ def test_mlp_mppp(self): rank_id = 2 dist_main_prog, dist_startup_prog = get_dist_prog( train_program, startup_program, dist_context, rank_id) - complete_backward_annotation(dist_main_prog, dist_context) reshard(dist_main_prog, dist_startup_prog, rank_id, dist_context) # check send and recv result From c285c71916035e433b45e7642c17d31092b45199 Mon Sep 17 00:00:00 2001 From: Jiabin Yang Date: Wed, 20 Oct 2021 10:25:40 +0800 Subject: [PATCH 217/298] [FIX] Extend time for test_activation_nn_grad to avoid its timeout issue (#36527) * native commit for triple grad of sigmod * Updated unittests files * init functional jacobian api * Updated trible_test func * Updated gradient_checker & test_script * finish test with dtype float32 * add float64 test case * polish code * use atol=1e-5 with dtype float64 * fix for ci * set timeout for test_jacobian * fix dygraph grad to support high differential * polish API docstring * Updated gradient checker and some related files * fix double grad strip error for high differential * fix double grad strip error for high differential * Add Sigmoid triple grad tests * fix dygraph double grad dtype error when calling for high differential senario * Updated triple grad teses func * Use np.random to initialize ddx * Updated triple_grad_check func * add todo for gradient checker and refine some comments * remove additional code * add test for warnging in backward.py * add tanh triple grad * format python code * refine code * make test_activation_nn_grad test time to 150s Co-authored-by: veyron95 Co-authored-by: levi131 --- python/paddle/fluid/tests/unittests/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 745e7118522722..ac7471f8edfa4f 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -846,7 +846,7 @@ set_tests_properties(test_inplace_softmax_with_cross_entropy PROPERTIES TIMEOUT set_tests_properties(test_cross_entropy2_op PROPERTIES TIMEOUT 120) set_tests_properties(test_fetch_unmerged PROPERTIES TIMEOUT 120) set_tests_properties(test_gru_unit_op PROPERTIES TIMEOUT 120) -set_tests_properties(test_activation_nn_grad PROPERTIES TIMEOUT 120) +set_tests_properties(test_activation_nn_grad PROPERTIES TIMEOUT 150) set_tests_properties(test_empty_op PROPERTIES TIMEOUT 120) set_tests_properties(test_parallel_executor_transformer PROPERTIES TIMEOUT 120) set_tests_properties(test_elementwise_div_op PROPERTIES TIMEOUT 120) From 4bd19770d9dc485a559f3ac698ba3a4d2c117943 Mon Sep 17 00:00:00 2001 From: wenbin Date: Wed, 20 Oct 2021 10:44:22 +0800 Subject: [PATCH 218/298] fix (#36557) * fix * remove const --- .../inference/tensorrt/convert/pool2d_op.cc | 35 +++++++++++++------ 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc index e03842db2b8274..05cd7bad5cbacc 100644 --- a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc @@ -115,17 +115,17 @@ class Pool2dOpConverter : public OpConverter { nvinfer1::DimsHW nv_paddings(paddings[0], paddings[1]); nvinfer1::ILayer *layer = nullptr; - nvinfer1::DimsHW pre_pad(0, 0); - nvinfer1::DimsHW post_pad(0, 0); + nvinfer1::DimsHW g_pre_pad(0, 0); + nvinfer1::DimsHW g_post_pad(0, 0); // paddle Non ceil_mode : Output size = (input size - filter size + 2 * // padding) / stride (stride size) + 1 // tensorrt EXPLICIT_ROUND_DOWN: O = floor((M - DK) / S) + 1 // so if M - DK < 0 we need extra padding if (input_shape.d[input_dims - 2] - ksize[0] + 2 * paddings[0] < 0) { - post_pad.h() = strides[0] - 1; + g_post_pad.h() = strides[0] - 1; } if (input_shape.d[input_dims - 1] - ksize[1] + 2 * paddings[1] < 0) { - post_pad.w() = strides[1] - 1; + g_post_pad.w() = strides[1] - 1; } if (op_desc.HasAttr("enable_int8")) { @@ -138,10 +138,10 @@ class Pool2dOpConverter : public OpConverter { if (engine_->with_dynamic_shape()) { if (!adaptive && !global_pooling && !ceil_mode) { - if ((post_pad.w() > 0 || post_pad.h() > 0) && + if ((g_post_pad.w() > 0 || g_post_pad.h() > 0) && (padding_algorithm != "SAME")) { auto *pad_layer = TRT_ENGINE_ADD_LAYER(engine_, Padding, *input1, - pre_pad, post_pad); + g_pre_pad, g_post_pad); PADDLE_ENFORCE_NOT_NULL( pad_layer, platform::errors::Fatal( "Pad layer in poolOp converter could not be " @@ -230,22 +230,35 @@ class Pool2dOpConverter : public OpConverter { if (!adaptive) { if (ceil_mode) { + nvinfer1::DimsHW pre_pad(0, 0); + nvinfer1::DimsHW post_pad(0, 0); // If ceil mode is true, we will pad the appropriate size to the input. DealCeilMode(input_shape, ksize, strides, paddings, &pre_pad, &post_pad, input_dims); - } - - if ((post_pad.w() > 0 || post_pad.h() > 0) && - (padding_algorithm != "SAME")) { auto *pad_layer = TRT_ENGINE_ADD_LAYER(engine_, Padding, *input1, pre_pad, post_pad); + PADDLE_ENFORCE_NOT_NULL( pad_layer, platform::errors::Fatal( "Pad layer in poolOp converter could not be " "created. 
The pointer to pad layer is `NULL`.")); input1 = pad_layer->getOutput(0); } - +#if IS_TRT_VERSION_GE(8000) + // Exclude padding pixels from the average mean is not supported well by + // TRT + // so enable padding for trt8.0 above. + if ((g_post_pad.w() > 0 || g_post_pad.h() > 0) && + (padding_algorithm != "SAME") && !ceil_mode) { + auto *pad_layer = TRT_ENGINE_ADD_LAYER(engine_, Padding, *input1, + g_pre_pad, g_post_pad); + PADDLE_ENFORCE_NOT_NULL( + pad_layer, platform::errors::Fatal( + "Pad layer in poolOp converter could not be " + "created. The pointer to pad layer is `NULL`.")); + input1 = pad_layer->getOutput(0); + } +#endif auto *pool_layer = TRT_ENGINE_ADD_LAYER(engine_, Pooling, *input1, nv_pool_type, nv_ksize); PADDLE_ENFORCE_NOT_NULL( From 6524fa8d335725d6d86e43c0fc809538650f6645 Mon Sep 17 00:00:00 2001 From: Huihuang Zheng Date: Wed, 20 Oct 2021 11:08:58 +0800 Subject: [PATCH 219/298] Add CINN Compile Option (#36292) Add CINN compile option in CMake. Now you can use CINN in Paddle by `-DWITH_CINN=ON` when `cmake` To test it, you can run `make cinn_lib_test -j` and `ctest -R cinn_lib_test`. Note: 1. You should set ``` export runtime_include_dir=${CINN_SOURCE_DIR}/cinn/runtime/cuda ``` When run test, the `${CINN_SOURCE_DIR}` should be set based on your CINN directory. 2. CINN is under developing now, you may have to change `CINN_GIT_TAG` to the git commit you need. --- CMakeLists.txt | 5 + cmake/cinn.cmake | 112 +++++++++++++++ paddle/fluid/framework/ir/CMakeLists.txt | 3 + paddle/fluid/framework/ir/cinn_lib_test.cc | 151 +++++++++++++++++++++ 4 files changed, 271 insertions(+) create mode 100644 cmake/cinn.cmake create mode 100644 paddle/fluid/framework/ir/cinn_lib_test.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index 98772e96781531..d4a0eb067b4f17 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -214,6 +214,7 @@ option(PY_VERSION "Compile PaddlePaddle with python3 support" ${PY_VER option(WITH_DGC "Use DGC(Deep Gradient Compression) or not" ${WITH_DISTRIBUTE}) option(SANITIZER_TYPE "Choose the type of sanitizer, options are: Address, Leak, Memory, Thread, Undefined" OFF) option(WITH_LITE "Compile Paddle Fluid with Lite Engine" OFF) +option(WITH_CINN "Compile PaddlePaddle with CINN" OFF) option(WITH_NCCL "Compile PaddlePaddle with NCCL support" ON) option(WITH_RCCL "Compile PaddlePaddle with RCCL support" ON) option(WITH_XPU_BKCL "Compile PaddlePaddle with BAIDU KUNLUN XPU BKCL" OFF) @@ -299,6 +300,10 @@ if(WITH_GPU) endif() endif() +if(WITH_CINN) + include(cinn) +endif() + if(WITH_ROCM) include(hip) include(miopen) # set miopen libraries, must before configure diff --git a/cmake/cinn.cmake b/cmake/cinn.cmake new file mode 100644 index 00000000000000..dd5f809e9581a2 --- /dev/null +++ b/cmake/cinn.cmake @@ -0,0 +1,112 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +if (NOT WITH_CINN) + return() +endif() + +# TODO(zhhsplendid): CINN has lots of warnings during early development. 
+# They will be treated as errors under paddle. We set no-error now and we will +# clean the code in the future. +add_definitions(-w) + +###################################### +# Build CINN from Git External Project +###################################### +include(ExternalProject) +set(CINN_SOURCE_DIR ${THIRD_PARTY_PATH}/CINN) +# TODO(zhhsplendid): Modify git tag after we have release tag +set(CINN_GIT_TAG 3f004bfa3ed273ecf1de8e7b946433038c79b84f) +set(CINN_OPTIONAL_ARGS -DWITH_CUDA=${WITH_GPU} -DWITH_CUDNN=${WITH_GPU} -DPUBLISH_LIBS=ON) +set(CINN_BUILD_COMMAND $(MAKE) cinncore -j && $(MAKE) cinnapi -j) +ExternalProject_Add( + external_cinn + ${EXTERNAL_PROJECT_LOG_ARGS} + GIT_REPOSITORY "${GIT_URL}/PaddlePaddle/CINN.git" + GIT_TAG ${CINN_GIT_TAG} + PREFIX ${CINN_SOURCE_DIR} + UPDATE_COMMAND "" + BUILD_COMMAND ${CINN_BUILD_COMMAND} + INSTALL_COMMAND "" + CMAKE_ARGS ${CINN_OPTIONAL_ARGS}) + + + +ExternalProject_Get_property(external_cinn BINARY_DIR) +ExternalProject_Get_property(external_cinn SOURCE_DIR) +set(CINN_BINARY_DIR ${BINARY_DIR}) +set(CINN_SOURCE_DIR ${SOURCE_DIR}) + +message(STATUS "CINN BINARY_DIR: ${CINN_BINARY_DIR}") +message(STATUS "CINN SOURCE_DIR: ${CINN_SOURCE_DIR}") + + +######################### +# Add CINN's dependencies +######################### + +# Add absl +set(ABSL_LIB_NAMES + hash + wyhash + city + strings + throw_delegate + bad_any_cast_impl + bad_optional_access + bad_variant_access + raw_hash_set + ) +set(ABSL_LIB_DIR "${CINN_BINARY_DIR}/dist/third_party/absl/lib") +set(ABSL_INCLUDE_DIR "${CINN_BINARY_DIR}/dist/third_party/absl/include") +add_library(absl STATIC IMPORTED GLOBAL) +set_target_properties(absl PROPERTIES IMPORTED_LOCATION ${ABSL_LIB_DIR}/libabsl_base.a) +foreach(lib_name ${ABSL_LIB_NAMES}) + target_link_libraries(absl INTERFACE ${ABSL_LIB_DIR}/libabsl_${lib_name}.a) +endforeach() +include_directories(${ABSL_INCLUDE_DIR}) + +# Add isl +set(ISL_LIB_DIR "${CINN_BINARY_DIR}/dist/third_party/isl/lib") +set(ISL_INCLUDE_DIR "${CINN_BINARY_DIR}/dist/third_party/isl/include") +add_library(isl STATIC IMPORTED GLOBAL) +set_target_properties(isl PROPERTIES IMPORTED_LOCATION ${ISL_LIB_DIR}/libisl.a) +include_directories(${ISL_INCLUDE_DIR}) + +# Add LLVM +set(LLVM_LIB_NAMES + ExecutionEngine + ) +set(LLVM_LIB_DIR "${CINN_BINARY_DIR}/dist/third_party/llvm/lib") +set(LLVM_INCLUDE_DIR "${CINN_BINARY_DIR}/dist/third_party/llvm/include") +add_library(llvm STATIC IMPORTED GLOBAL) +set_target_properties(llvm PROPERTIES IMPORTED_LOCATION ${LLVM_LIB_DIR}/libLLVMCore.a) +foreach(lib_name ${LLVM_LIB_NAMES}) + target_link_libraries(llvm INTERFACE ${LLVM_LIB_DIR}/libLLVM${lib_name}.a) +endforeach() +include_directories(${LLVM_INCLUDE_DIR}) + +###################################################### +# Put external_cinn and dependencies together as a lib +###################################################### + +set(CINN_LIB_NAME "libcinnapi.so") +set(CINN_LIB_LOCATION "${CINN_BINARY_DIR}/dist/cinn/lib") +set(CINN_INCLUDE_DIR "${CINN_BINARY_DIR}/dist/cinn/include") + +add_library(cinn SHARED IMPORTED GLOBAL) +set_target_properties(cinn PROPERTIES IMPORTED_LOCATION "${CINN_LIB_LOCATION}/${CINN_LIB_NAME}") +include_directories(${CINN_INCLUDE_DIR}) +add_dependencies(cinn external_cinn absl isl llvm glog gflag) + diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 904450b5b251ee..7b80d331ff7077 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -143,6 +143,9 @@ cc_test(pass_test 
SRCS pass_test.cc DEPS graph pass graph_helper) cc_test(graph_test SRCS graph_test.cc DEPS graph graph_helper op_registry) cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_registry) cc_test(graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph_to_program_pass) +if (WITH_CINN) + cc_test(cinn_lib_test SRCS cinn_lib_test.cc DEPS cinn) +endif() cc_test(cost_model_test SRCS cost_model_test.cc DEPS cost_model op_registry) cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS graph_pattern_detector) cc_test(test_op_compat_sensible_pass SRCS op_compat_sensible_pass_tester.cc DEPS op_compat_sensible_pass) diff --git a/paddle/fluid/framework/ir/cinn_lib_test.cc b/paddle/fluid/framework/ir/cinn_lib_test.cc new file mode 100644 index 00000000000000..cdee45a06c71af --- /dev/null +++ b/paddle/fluid/framework/ir/cinn_lib_test.cc @@ -0,0 +1,151 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include + +#include +#include +#include +#include + +#ifdef PADDLE_WITH_CUDA +#include +#endif + +#include "cinn/common/target.h" +#include "cinn/frontend/net_builder.h" +#include "cinn/frontend/syntax.h" +#include "cinn/hlir/framework/graph.h" +#include "cinn/hlir/framework/graph_compiler.h" +#include "cinn/hlir/framework/pass.h" +#include "cinn/hlir/framework/tensor.h" +#include "cinn/hlir/op/use_ops.h" +#include "cinn/hlir/pass/use_pass.h" + +namespace cinn { +namespace frontend { + +Program CreateAddProgram() { + constexpr int M = 32; + constexpr int N = 24; + + NetBuilder builder("net_builder"); + auto a = builder.CreateInput(Float(32), {M, N}); + auto b = builder.CreateInput(Float(32), {M, N}); + auto c = builder.add(a, b); + auto d = builder.add(a, c); + auto program = builder.Build(); + + return program; +} + +void SetRandData(hlir::framework::Tensor tensor, Target target) { + auto* data = tensor->mutable_data(target); + std::random_device seed; + std::default_random_engine engine(seed()); + std::uniform_real_distribution dist(0.f, 1.f); + size_t num_ele = tensor->shape().numel(); + std::vector random_data(num_ele); + for (size_t i = 0; i < num_ele; i++) { + random_data[i] = dist(engine); // All random data + } + +#ifdef PADDLE_WITH_CUDA + cudaMemcpy(data, random_data.data(), num_ele * sizeof(float), + cudaMemcpyHostToDevice); +#else + std::copy(random_data.begin(), random_data.end(), data); +#endif +} + +TEST(net_build, basic) { + auto program = CreateAddProgram(); + // output program + for (size_t i = 0; i < program.size(); i++) { + LOG(INFO) << "instruction: " << program[i]; + } +} + +TEST(net_build, program_execute_multi_elementwise_add) { + auto program = CreateAddProgram(); +#ifdef PADDLE_WITH_CUDA + Target target = common::DefaultNVGPUTarget(); +#else + Target target = common::DefaultHostTarget(); +#endif + + auto graph = std::make_shared(program, target); + std::cout << "graph:\n" << graph->Visualize() << std::endl; + + auto scope = BuildScope(target, graph); + 
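+  // Descriptive note: lower the CINN graph into an executable runtime
+  // program, create the input variables "A" and "B" in the scope, fill them
+  // with random host/device data, then run the program end to end.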
hlir::framework::GraphCompiler gc(target, scope, graph); + auto runtime_program = gc.Build(); + + scope->Var("A"); + scope->Var("B"); + + auto A = scope->GetTensor("A"); + auto B = scope->GetTensor("B"); + SetRandData(A, target); + SetRandData(B, target); + + runtime_program->Execute(); +} + +TEST(net_build, program_execute_fc) { + constexpr int B = 10; // batch size + constexpr int M = 32; + constexpr int K = 18; + constexpr int N = 24; + + NetBuilder builder("net_builder"); + auto a = builder.CreateInput(Float(32), {B, M, K}, "A"); + auto w = builder.CreateInput(Float(32), {N, K}, "W"); // weight + auto b = builder.CreateInput(Float(32), {N}, "B"); // bias + + auto mul_out = builder.mul(a, w, 2, 1); + auto add_out = builder.add(mul_out, b); + auto program = builder.Build(); + +#ifdef PADDLE_WITH_CUDA + Target target = common::DefaultNVGPUTarget(); +#else + Target target = common::DefaultHostTarget(); +#endif + + auto graph = std::make_shared(program, target); + auto scope = BuildScope(target, graph); + hlir::framework::GraphCompiler gc(target, scope, graph); + auto runtime_program = gc.Build(); + + scope->Var(std::string(a.id())); + scope->Var(std::string(w.id())); + scope->Var(std::string(b.id())); + scope->Var(std::string(mul_out->id)); + + auto a_ten = scope->GetTensor(std::string(a.id())); + auto w_ten = scope->GetTensor(std::string(w.id())); + auto b_ten = scope->GetTensor(std::string(b.id())); + auto fake_out_ten = scope->GetTensor(std::string(mul_out->id)); + auto add_out_ten = scope->GetTensor(std::string(add_out->id)); + SetRandData(a_ten, target); + SetRandData(w_ten, target); + SetRandData(b_ten, target); + + runtime_program->Execute(); +} + +} // namespace frontend +} // namespace cinn From 8ca5206bab9ab6e13bf9367e431a3211b70a900b Mon Sep 17 00:00:00 2001 From: zmx Date: Wed, 20 Oct 2021 11:15:59 +0800 Subject: [PATCH 220/298] fix SerializeSelectedRows (#36543) * bug fix for DeserializeSelectedRows. test=develop * fix bug for SerializeSelectedRows. test=develop * update. test=develop --- paddle/fluid/distributed/service/brpc_utils.cc | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/paddle/fluid/distributed/service/brpc_utils.cc b/paddle/fluid/distributed/service/brpc_utils.cc index 376e820cb7a741..92dcde99cccb0b 100644 --- a/paddle/fluid/distributed/service/brpc_utils.cc +++ b/paddle/fluid/distributed/service/brpc_utils.cc @@ -138,23 +138,11 @@ void SerializeSelectedRows(framework::Variable* var, var_data->clear(); var_data->resize(rows->size() * sizeof(int64_t)); char* data_ptr = const_cast(var_data->data()); - - if (platform::is_cpu_place(tensor->place())) { - memcpy(data_ptr, &(*rows)[0], rows->size() * sizeof(int64_t)); - } else { -#ifdef PADDLE_WITH_CUDA - auto stream = - reinterpret_cast(ctx).stream(); - memory::Copy(platform::CPUPlace(), data_ptr, - BOOST_GET_CONST(platform::CUDAPlace, tensor->place()), - &(*rows)[0], rows->size() * sizeof(int64_t), stream); -#endif - } + memcpy(data_ptr, &((*rows)[0]), rows->size() * sizeof(int64_t)); var_msg->set_data_type(static_cast(tensor->type())); for (auto& dim : framework::vectorize(tensor->dims())) { var_msg->add_dims(dim); } - // IO Buffer if (platform::is_cpu_place(tensor->place())) { auto data_len = tensor->numel() * framework::SizeOfType(tensor->type()); From 06bd348d3c62874511f6f36af760063b50e054ca Mon Sep 17 00:00:00 2001 From: Wilber Date: Wed, 20 Oct 2021 11:26:10 +0800 Subject: [PATCH 221/298] update for trt convert ut. 
(#36549) --- paddle/fluid/inference/tensorrt/op_teller.cc | 8 +++ .../inference/test_trt_convert_activation.py | 1 + .../test_trt_convert_affine_channel.py | 1 + .../inference/test_trt_convert_elementwise.py | 1 + .../test_trt_convert_emb_eltwise_layernorm.py | 1 + .../ir/inference/test_trt_convert_flatten.py | 65 +++++++++++++++---- .../ir/inference/test_trt_convert_gather.py | 1 + .../inference/test_trt_convert_gather_nd.py | 1 + .../ir/inference/test_trt_convert_gelu.py | 1 + .../inference/test_trt_convert_group_norm.py | 1 + .../ir/inference/test_trt_convert_prelu.py | 14 ++++ .../ir/inference/test_trt_convert_reshape.py | 1 + .../ir/inference/test_trt_convert_scale.py | 1 + .../test_trt_convert_shuffle_channel.py | 1 + .../ir/inference/test_trt_convert_swish.py | 1 + .../inference/test_trt_convert_transpose.py | 1 + 16 files changed, 88 insertions(+), 12 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index e7318d07611ea0..0d0a656c5b6074 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -1104,6 +1104,14 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, return false; } } + +#if IS_TRT_VERSION_LT(7000) + if (!with_dynamic_shape) { + // TODO(inference): fix trt6 static plugin error. + VLOG(3) << "prelu static plugin in trt6 has bug."; + return false; + } +#endif } if (op_type == "mish") { diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_activation.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_activation.py index 9dc89bb9836d07..a87cab3430cd30 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_activation.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_activation.py @@ -14,6 +14,7 @@ from trt_layer_auto_scan_test import TrtLayerAutoScanTest, SkipReasons from program_config import TensorConfig, ProgramConfig +import unittest import numpy as np import paddle.inference as paddle_infer from functools import partial diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_affine_channel.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_affine_channel.py index 1e6c94f145497c..33eb90b9f91230 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_affine_channel.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_affine_channel.py @@ -14,6 +14,7 @@ from trt_layer_auto_scan_test import TrtLayerAutoScanTest, SkipReasons from program_config import TensorConfig, ProgramConfig +import unittest import numpy as np import paddle.inference as paddle_infer from functools import partial diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_elementwise.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_elementwise.py index c8cba0f3723807..992e0353837bc2 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_elementwise.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_elementwise.py @@ -14,6 +14,7 @@ from trt_layer_auto_scan_test import TrtLayerAutoScanTest, SkipReasons from program_config import TensorConfig, ProgramConfig +import unittest import numpy as np import paddle.inference as paddle_infer from functools import partial diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_emb_eltwise_layernorm.py 
b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_emb_eltwise_layernorm.py index d7b0bcd908085c..356a2c942df0d8 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_emb_eltwise_layernorm.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_emb_eltwise_layernorm.py @@ -18,6 +18,7 @@ import paddle.inference as paddle_infer from functools import partial from typing import Optional, List, Callable, Dict, Any, Set +import unittest class TrtConvertEmbEltwiseLayernormTest1(TrtLayerAutoScanTest): diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_flatten.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_flatten.py index 4b461c75f0b28d..7b0089ab9ab7f7 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_flatten.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_flatten.py @@ -14,6 +14,7 @@ from trt_layer_auto_scan_test import TrtLayerAutoScanTest, SkipReasons from program_config import TensorConfig, ProgramConfig +import unittest import numpy as np import paddle.inference as paddle_infer from functools import partial @@ -73,10 +74,20 @@ def clear_dynamic_shape(): self.dynamic_shape.opt_input_shape = {} def generate_trt_nodes_num(attrs, dynamic_shape): - if attrs[0]['axis'] == 1: - return 1, 2 + ver = paddle_infer.get_trt_compile_version() + if ver[0] * 1000 + ver[1] * 100 + ver[0] * 10 >= 7130: + if attrs[0]['axis'] == 1: + return 1, 2 + else: + return 0, 3 else: - return 0, 3 + if dynamic_shape: + return 0, 3 + + if attrs[0]['axis'] == 1: + return 1, 2 + else: + return 0, 3 attrs = [ program_config.ops[i].attrs @@ -157,10 +168,20 @@ def clear_dynamic_shape(): self.dynamic_shape.opt_input_shape = {} def generate_trt_nodes_num(attrs, dynamic_shape): - if attrs[0]['axis'] == 1: - return 1, 2 + ver = paddle_infer.get_trt_compile_version() + if ver[0] * 1000 + ver[1] * 100 + ver[0] * 10 >= 7130: + if attrs[0]['axis'] == 1: + return 1, 2 + else: + return 0, 3 else: - return 0, 3 + if dynamic_shape: + return 0, 3 + + if attrs[0]['axis'] == 1: + return 1, 2 + else: + return 0, 3 attrs = [ program_config.ops[i].attrs @@ -241,10 +262,20 @@ def clear_dynamic_shape(): self.dynamic_shape.opt_input_shape = {} def generate_trt_nodes_num(attrs, dynamic_shape): - if attrs[0]['axis'] == 1: - return 1, 2 + ver = paddle_infer.get_trt_compile_version() + if ver[0] * 1000 + ver[1] * 100 + ver[0] * 10 >= 7130: + if attrs[0]['axis'] == 1: + return 1, 2 + else: + return 0, 3 else: - return 0, 3 + if dynamic_shape: + return 0, 3 + + if attrs[0]['axis'] == 1: + return 1, 2 + else: + return 0, 3 attrs = [ program_config.ops[i].attrs @@ -325,10 +356,20 @@ def clear_dynamic_shape(): self.dynamic_shape.opt_input_shape = {} def generate_trt_nodes_num(attrs, dynamic_shape): - if attrs[0]['axis'] == 1: - return 1, 2 + ver = paddle_infer.get_trt_compile_version() + if ver[0] * 1000 + ver[1] * 100 + ver[0] * 10 >= 7130: + if attrs[0]['axis'] == 1: + return 1, 2 + else: + return 0, 3 else: - return 0, 3 + if dynamic_shape: + return 0, 3 + + if attrs[0]['axis'] == 1: + return 1, 2 + else: + return 0, 3 attrs = [ program_config.ops[i].attrs diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_gather.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_gather.py index 9a3c9aff61b987..37d23cb18d843a 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_gather.py +++ 
b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_gather.py @@ -19,6 +19,7 @@ from functools import partial from typing import Optional, List, Callable, Dict, Any, Set import logging +import unittest class TrtConvertGatherTest(TrtLayerAutoScanTest): diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_gather_nd.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_gather_nd.py index a109abdc298a65..0c7eae5f85f955 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_gather_nd.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_gather_nd.py @@ -18,6 +18,7 @@ import paddle.inference as paddle_infer from functools import partial from typing import Optional, List, Callable, Dict, Any, Set +import unittest class TrtConvertGatherNdTest_dim_4_1(TrtLayerAutoScanTest): diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_gelu.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_gelu.py index f9c3d09ef446f5..2f75e4e723e281 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_gelu.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_gelu.py @@ -18,6 +18,7 @@ import paddle.inference as paddle_infer from functools import partial from typing import Optional, List, Callable, Dict, Any, Set +import unittest class TrtConvertGeluTest(TrtLayerAutoScanTest): diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_group_norm.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_group_norm.py index b6b5aa9dbfe95c..203e86c4b25de1 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_group_norm.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_group_norm.py @@ -18,6 +18,7 @@ import paddle.inference as paddle_infer from functools import partial from typing import Optional, List, Callable, Dict, Any, Set +import unittest class TrtConvertGroupNormTest(TrtLayerAutoScanTest): diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_prelu.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_prelu.py index 4122e2623cb5a7..fbb78fceb3e84a 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_prelu.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_prelu.py @@ -18,6 +18,7 @@ import paddle.inference as paddle_infer from functools import partial from typing import Optional, List, Callable, Dict, Any, Set +import unittest class TrtConvertPreluTest(TrtLayerAutoScanTest): @@ -186,6 +187,19 @@ def teller2(program_config, predictor_config): "Need to repair the case: the output of GPU and tensorrt has diff when the input dimension is 2 in static shape mode." ) + ver = paddle_infer.get_trt_compile_version() + if ver[0] * 1000 + ver[1] * 100 + ver[0] * 10 < 7000: + + def teller(program_config, predictor_config): + if not predictor_config.tensorrt_dynamic_shape_enabled(): + return True + return False + + self.add_skip_case( + teller, SkipReasons.TRT_NOT_IMPLEMENTED, + "Need to repair the case: the output of GPU and tensorrt has diff in trt6, the prelu static plugin has bug." 
+ ) + def test(self): self.add_skip_trt_case() self.run_test() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_reshape.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_reshape.py index cf7ab11c35de74..4355b83557fc6d 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_reshape.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_reshape.py @@ -18,6 +18,7 @@ import paddle.inference as paddle_infer from functools import partial from typing import Optional, List, Callable, Dict, Any, Set +import unittest class TrtConvertReshapeTest(TrtLayerAutoScanTest): diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_scale.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_scale.py index 8a44617dc8dc3c..51bcee080376ea 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_scale.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_scale.py @@ -18,6 +18,7 @@ import paddle.inference as paddle_infer from functools import partial from typing import Optional, List, Callable, Dict, Any, Set +import unittest class TrtConvertScaleTest(TrtLayerAutoScanTest): diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_shuffle_channel.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_shuffle_channel.py index 264ba31ad2716a..c6a81472360447 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_shuffle_channel.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_shuffle_channel.py @@ -18,6 +18,7 @@ import paddle.inference as paddle_infer from functools import partial from typing import Optional, List, Callable, Dict, Any, Set +import unittest class TrtConvertShuffleChannelTest(TrtLayerAutoScanTest): diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_swish.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_swish.py index e162988bbb1b39..5eb4e8505ff228 100755 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_swish.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_swish.py @@ -18,6 +18,7 @@ import paddle.inference as paddle_infer from functools import partial from typing import Optional, List, Callable, Dict, Any, Set +import unittest class TrtConvertSwishTest(TrtLayerAutoScanTest): diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_transpose.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_transpose.py index ad325bb0ab3b0c..31b4d027f1780b 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_transpose.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_transpose.py @@ -18,6 +18,7 @@ import paddle.inference as paddle_infer from functools import partial from typing import Optional, List, Callable, Dict, Any, Set +import unittest class TrtConvertTransposeTest(TrtLayerAutoScanTest): From 7325c9fb44e9ae600bc299ff1badfa87873ed5eb Mon Sep 17 00:00:00 2001 From: Wilber Date: Wed, 20 Oct 2021 11:26:22 +0800 Subject: [PATCH 222/298] add unittest (#36371) --- paddle/fluid/inference/tensorrt/op_teller.cc | 109 ++++++++++++++++-- .../tensorrt/plugin/hard_swish_op_plugin.h | 2 +- .../test_trt_convert_anchor_generator.py | 6 +- .../inference/test_trt_convert_batch_norm.py | 13 +++ .../ir/inference/test_trt_convert_clip.py | 18 ++- 
.../ir/inference/test_trt_convert_concat.py | 13 +++ .../ir/inference/test_trt_convert_dropout.py | 9 +- .../test_trt_convert_hard_sigmoid.py | 1 + .../test_trt_convert_multihead_matmul.py | 7 +- .../inference/test_trt_convert_reduce_sum.py | 10 +- .../inference/test_trt_convert_roi_align.py | 2 + .../test_trt_convert_skip_layernorm.py | 1 + .../ir/inference/test_trt_convert_slice.py | 6 +- .../ir/inference/test_trt_convert_softmax.py | 13 ++- .../ir/inference/test_trt_convert_split.py | 13 +++ .../ir/inference/test_trt_convert_stack.py | 1 + .../ir/inference/test_trt_convert_tile.py | 10 +- .../ir/inference/test_trt_convert_yolo_box.py | 1 + .../ir/inference/trt_layer_auto_scan_test.py | 8 +- 19 files changed, 208 insertions(+), 35 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 0d0a656c5b6074..91515f1fa58116 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -174,6 +174,12 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, << " op does not support input's dim is 1 in tensorrt."; return false; } + // TODO(inference): fix + if (x_shape.size() == 2 && !with_dynamic_shape) { + VLOG(3) << "activation op does not support input's dim is 2 in " + "tensorrt static shape, the output shape has diff."; + return false; + } } if (op_type == "pool2d") { @@ -346,6 +352,24 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, } } } + if (op_type == "softmax") { + auto* block = desc.Block(); + if (block == nullptr) { + VLOG(3) << "The block desc is nullptr, we can't continue to analyze. " + "Developers need to check whether block_desc is passed in " + "the pass."; + return false; + } + auto x_var_name = desc.Input("X")[0]; + auto* x_var_desc = block->FindVar(x_var_name); + const auto x_shape = x_var_desc->GetShape(); + // TODO(inference): fix + if (x_shape.size() == 2 && !with_dynamic_shape) { + VLOG(3) << "softmax op does not support input's dim is 2 in tensorrt " + "static shape, the output shape has diff."; + return false; + } + } if (op_type == "group_norm") { if (!with_dynamic_shape) return false; bool has_attrs = (desc.HasAttr("epsilon") && desc.HasAttr("groups")); @@ -357,20 +381,35 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, if (op_type == "concat") { if (!desc.HasAttr("axis")) { return false; + } + int axis = BOOST_GET_CONST(int, desc.GetAttr("axis")); + if (with_dynamic_shape) { + if (axis < 0) return false; } else { - int axis = BOOST_GET_CONST(int, desc.GetAttr("axis")); - if (with_dynamic_shape) { - if (axis < 0) return false; - } else { - if (axis <= 0) return false; - } - auto concat_inputs = desc.Inputs(); - if (concat_inputs.find("AxisTensor") != concat_inputs.end()) { - if (desc.Input("AxisTensor").size() >= 1) { - return false; - } + if (axis <= 0) return false; + } + auto concat_inputs = desc.Inputs(); + if (concat_inputs.find("AxisTensor") != concat_inputs.end()) { + if (desc.Input("AxisTensor").size() >= 1) { + return false; } } + auto* block = desc.Block(); + if (block == nullptr) { + VLOG(3) << "The block desc is nullptr, we can't continue to analyze. 
" + "Developers need to check whether block_desc is passed in " + "the pass."; + return false; + } + auto x_var_name = desc.Input("X")[0]; + auto* x_var_desc = block->FindVar(x_var_name); + const auto x_shape = x_var_desc->GetShape(); + // TODO(inference): fix + if (x_shape.size() == 2 && !with_dynamic_shape) { + VLOG(3) << "concat op does not support input's dim is 2 in tensorrt " + "static shape, the output shape has diff."; + return false; + } } if (op_type == "transpose2" || op_type == "transpose") { if (!desc.HasAttr("axis")) { @@ -687,6 +726,22 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, << desc.Output("Y").size() << "."; return false; } + auto* block = desc.Block(); + if (block == nullptr) { + VLOG(3) << "The block desc is nullptr, we can't continue to analyze. " + "Developers need to check whether block_desc is passed in " + "the pass."; + return false; + } + auto x_var_name = desc.Input("X")[0]; + auto* x_var_desc = block->FindVar(x_var_name); + const auto x_shape = x_var_desc->GetShape(); + // TODO(inference): fix + if (x_shape.size() == 2 && !with_dynamic_shape) { + VLOG(3) << "batch_norm op does not support input's dim is 2 in " + "tensorrt static shape, the output shape has diff."; + return false; + } } if (op_type == "split") { @@ -774,6 +829,12 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, VLOG(3) << "The output_length should be equal to the output size."; return false; } + // TODO(inference): fix + if (x_shape.size() == 2 && !with_dynamic_shape) { + VLOG(3) << "split op does not support input's dim is 2 in tensorrt " + "static shape. The output shape has diff."; + return false; + } } if (op_type == "scale") { auto scale_inputs = desc.Inputs(); @@ -926,6 +987,12 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, VLOG(3) << "gelu op does not support input's dim is 1 in tensorrt."; return false; } + // TODO(inference): fix + if (x_shape.size() == 2 && !with_dynamic_shape) { + VLOG(3) << "gelu op does not support input's dim is 2 in tensorrt " + "static shape, the output shape has diff."; + return false; + } } if (op_type == "layer_norm") { @@ -1041,7 +1108,13 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, auto* x_var_desc = block->FindVar(x_var_name); const auto x_shape = x_var_desc->GetShape(); if (x_shape.size() == 1) { - VLOG(3) << "dropout op does not support input's dim is 1 in tensorrt."; + VLOG(3) << "scale op does not support input's dim is 1 in tensorrt."; + return false; + } + // TODO(inference): fix + if (x_shape.size() == 2 && !with_dynamic_shape) { + VLOG(3) << "scale op does not support input's dim is 2 in tensorrt " + "static shape, the output shape has diff."; return false; } } @@ -1061,6 +1134,12 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, VLOG(3) << "swish op does not support input's dim is 1 in tensorrt."; return false; } + // TODO(inference): fix + if (x_shape.size() == 2 && !with_dynamic_shape) { + VLOG(3) << "swish op does not support input's dim is 2 in tensorrt " + "static shape, the output shape has diff."; + return false; + } } if (op_type == "prelu") { @@ -1314,6 +1393,12 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, VLOG(3) << "clip op does not support input's dim is 1 in tensorrt."; return false; } + // TODO(inference): fix + if (x_shape.size() == 2 && !with_dynamic_shape) { + VLOG(3) << "clip op does not support input's dim is 2 in tensorrt " + 
"static shape, the output shape has diff."; + return false; + } } if (op_type == "reduce_sum" || op_type == "reduce_mean") { diff --git a/paddle/fluid/inference/tensorrt/plugin/hard_swish_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/hard_swish_op_plugin.h index c0ee608c39dabc..475c908c13bbf2 100644 --- a/paddle/fluid/inference/tensorrt/plugin/hard_swish_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/hard_swish_op_plugin.h @@ -161,7 +161,7 @@ class HardSwishPluginDynamicCreator : public nvinfer1::IPluginCreator { public: HardSwishPluginDynamicCreator() {} const char* getPluginName() const TRT_NOEXCEPT override { - return "hardswish_plugin_dynamic"; + return "hard_swish_plugin_dynamic"; } const char* getPluginVersion() const TRT_NOEXCEPT override { return "1"; } diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_anchor_generator.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_anchor_generator.py index bf457a9da40a8a..2dd380c53af443 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_anchor_generator.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_anchor_generator.py @@ -14,6 +14,7 @@ from trt_layer_auto_scan_test import TrtLayerAutoScanTest, SkipReasons from program_config import TensorConfig, ProgramConfig +import unittest import numpy as np import paddle.inference as paddle_infer from functools import partial @@ -83,7 +84,10 @@ def clear_dynamic_shape(): self.dynamic_shape.opt_input_shape = {} def generate_trt_nodes_num(attrs, dynamic_shape): - return 1, 3 + if dynamic_shape: + return 1, 3 + else: + return 0, 4 attrs = [ program_config.ops[i].attrs diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_batch_norm.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_batch_norm.py index ceda10d5d94aa0..fc96f297918dda 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_batch_norm.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_batch_norm.py @@ -14,6 +14,7 @@ from trt_layer_auto_scan_test import TrtLayerAutoScanTest, SkipReasons from program_config import TensorConfig, ProgramConfig +import unittest import numpy as np import paddle.inference as paddle_infer from functools import partial @@ -211,6 +212,18 @@ def teller1(program_config, predictor_config): self.add_skip_case(teller1, SkipReasons.TRT_NOT_SUPPORT, "INPUT MomentumTensor NOT SUPPORT") + def teller2(program_config, predictor_config): + if len( + program_config.inputs['batch_norm_input'].shape + ) == 2 and not predictor_config.tensorrt_dynamic_shape_enabled(): + return True + return False + + self.add_skip_case( + teller2, SkipReasons.TRT_NOT_IMPLEMENTED, + "The output shape has diff, but we can add shuffle layer to resolve it." 
+ ) + def test(self): self.add_skip_trt_case() self.run_test() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_clip.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_clip.py index 95b4fb83d5bfde..081df87d103308 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_clip.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_clip.py @@ -18,6 +18,7 @@ import paddle.inference as paddle_infer from functools import partial from typing import Optional, List, Callable, Dict, Any, Set +import unittest class TrtConvertClipTest(TrtLayerAutoScanTest): @@ -84,8 +85,7 @@ def generate_weight2(attrs: List[Dict[str, Any]]): yield program_config - def sample_predictor_configs( - self, program_config) -> (paddle_infer.Config, List[int], float): + def sample_predictor_configs(self, program_config): def generate_dynamic_shape(attrs): if self.dims == 1: self.dynamic_shape.min_input_shape = {"input_data": [1]} @@ -146,7 +146,21 @@ def generate_trt_nodes_num(attrs, dynamic_shape): yield self.create_inference_config(), generate_trt_nodes_num(attrs, True), 1e-5 + def add_skip_trt_case(self): + def teller1(program_config, predictor_config): + if len( + program_config.inputs['input_data'].shape + ) == 2 and not predictor_config.tensorrt_dynamic_shape_enabled(): + return True + return False + + self.add_skip_case( + teller1, SkipReasons.TRT_NOT_IMPLEMENTED, + "The output shape has diff, but we can add shuffle layer to resolve it." + ) + def test(self): + self.add_skip_trt_case() self.run_test() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_concat.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_concat.py index 25e96787dd1329..78ac06a323b1dd 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_concat.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_concat.py @@ -14,6 +14,7 @@ from trt_layer_auto_scan_test import TrtLayerAutoScanTest, SkipReasons from program_config import TensorConfig, ProgramConfig +import unittest import numpy as np import paddle.inference as paddle_infer from functools import partial @@ -317,6 +318,18 @@ def teller1(program_config, predictor_config): self.add_skip_case(teller1, SkipReasons.TRT_NOT_SUPPORT, "INPUT AxisTensor NOT SUPPORT") + def teller2(program_config, predictor_config): + if len( + program_config.inputs['concat_input1'].shape + ) == 2 and not predictor_config.tensorrt_dynamic_shape_enabled(): + return True + return False + + self.add_skip_case( + teller2, SkipReasons.TRT_NOT_IMPLEMENTED, + "The output shape has diff, but we can add shuffle layer to resolve it." 
+ ) + def test(self): self.add_skip_trt_case() self.run_test() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_dropout.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_dropout.py index 28a85ce96c64ff..57f5b5a0bb245c 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_dropout.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_dropout.py @@ -14,6 +14,7 @@ from trt_layer_auto_scan_test import TrtLayerAutoScanTest, SkipReasons from program_config import TensorConfig, ProgramConfig +import unittest import numpy as np import paddle.inference as paddle_infer from functools import partial @@ -141,15 +142,19 @@ def generate_trt_nodes_num(attrs, dynamic_shape): def add_skip_trt_case(self): def teller1(program_config, predictor_config): - if self.dims == 2: + if len( + program_config.inputs['input_data'].shape + ) == 2 and not predictor_config.tensorrt_dynamic_shape_enabled(): return True return False self.add_skip_case( teller1, SkipReasons.TRT_NOT_IMPLEMENTED, - "When input dims is 2, pulgin will product a 4 dims output.") + "The output shape has diff, but we can add shuffle layer to resolve it." + ) def test(self): + self.add_skip_trt_case() self.run_test() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_hard_sigmoid.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_hard_sigmoid.py index d803d9e4616139..c09c7f0bc9c2f0 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_hard_sigmoid.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_hard_sigmoid.py @@ -18,6 +18,7 @@ import paddle.inference as paddle_infer from functools import partial from typing import Optional, List, Callable, Dict, Any, Set +import unittest class TrtConvertHardSigmoidTest_dim_2(TrtLayerAutoScanTest): diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_multihead_matmul.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_multihead_matmul.py index 0b98ab53fcc297..0754eede6d3706 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_multihead_matmul.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_multihead_matmul.py @@ -14,6 +14,7 @@ from trt_layer_auto_scan_test import TrtLayerAutoScanTest, SkipReasons from program_config import TensorConfig, ProgramConfig +import unittest import numpy as np import paddle.inference as paddle_infer from functools import partial @@ -26,16 +27,16 @@ def is_program_valid(self, program_config: ProgramConfig) -> bool: def sample_program_configs(self): def generate_input1(batch, dim1): - return np.random.randn(batch, dim1, 768).astype(np.float32) + return np.random.random((batch, dim1, 768)).astype(np.float32) def generate_input2(shape): return np.random.random(shape).astype(np.float32) def generate_weight1(): - return np.random.randn(768, 768).astype(np.float32) + return np.random.random((768, 768)).astype(np.float32) def generate_weight2(): - return np.random.randn(768).astype(np.float32) + return np.random.random(768).astype(np.float32) for batch in [1, 2, 4]: self.batch = batch diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_reduce_sum.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_reduce_sum.py index 91e1c0677ac481..1cc9defa1010be 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_reduce_sum.py +++ 
b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_reduce_sum.py @@ -14,6 +14,7 @@ from trt_layer_auto_scan_test import TrtLayerAutoScanTest, SkipReasons from program_config import TensorConfig, ProgramConfig +import unittest import numpy as np import paddle.inference as paddle_infer from functools import partial @@ -84,8 +85,7 @@ def generate_input1(attrs: List[Dict[str, Any]]): yield program_config - def sample_predictor_configs( - self, program_config) -> (paddle_infer.Config, List[int], float): + def sample_predictor_configs(self, program_config): def generate_dynamic_shape(attrs): self.dynamic_shape.min_input_shape = {"input_data": [1, 3, 32, 32]} self.dynamic_shape.max_input_shape = {"input_data": [4, 3, 64, 64]} @@ -117,7 +117,7 @@ def generate_trt_nodes_num(attrs, dynamic_shape): clear_dynamic_shape() self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), 1e-5 + attrs, False), (1e-5, 1e-5) self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( attrs, False), (1e-5, 1e-5) @@ -125,8 +125,8 @@ def generate_trt_nodes_num(attrs, dynamic_shape): # for dynamic_shape generate_dynamic_shape(attrs) self.trt_param.precision = paddle_infer.PrecisionType.Float32 - yield self.create_inference_config(), generate_trt_nodes_num(attrs, - True), 1e-5 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True), (1e-5, 1e-5) self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( attrs, True), (1e-5, 1e-5) diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_roi_align.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_roi_align.py index 265065c7b357eb..56efdb91959ce4 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_roi_align.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_roi_align.py @@ -14,6 +14,7 @@ from trt_layer_auto_scan_test import TrtLayerAutoScanTest, SkipReasons from program_config import TensorConfig, ProgramConfig +import unittest import numpy as np import paddle.inference as paddle_infer from functools import partial @@ -141,6 +142,7 @@ def generate_trt_nodes_num(attrs, dynamic_shape): return 1, 3 else: return 0, 4 + return 0, 4 attrs = [ program_config.ops[i].attrs diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_skip_layernorm.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_skip_layernorm.py index 11d060847a4186..9f3e7a81777c29 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_skip_layernorm.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_skip_layernorm.py @@ -18,6 +18,7 @@ import paddle.inference as paddle_infer from functools import partial from typing import Optional, List, Callable, Dict, Any, Set +import unittest class TrtConvertSkipLayernormTest(TrtLayerAutoScanTest): diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_slice.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_slice.py index 725a3085550de9..17a2c9cd74c079 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_slice.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_slice.py @@ -143,7 +143,11 @@ def generate_trt_nodes_num(attrs, dynamic_shape): True), 1e-4 def 
test(self): - self.run_test() + # TODO(inference): fix. + # trt6 and trt7.1 has bug. + # trt7.2 deserialize has bug. + # self.run_test() + pass if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_softmax.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_softmax.py index e539bd9a563004..4a15a09b0f77ee 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_softmax.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_softmax.py @@ -14,6 +14,7 @@ from trt_layer_auto_scan_test import TrtLayerAutoScanTest, SkipReasons from program_config import TensorConfig, ProgramConfig +import unittest import numpy as np import paddle.inference as paddle_infer from functools import partial @@ -135,7 +136,17 @@ def generate_trt_nodes_num(attrs, dynamic_shape): True), 1e-5 def add_skip_trt_case(self): - pass + def teller1(program_config, predictor_config): + if len( + program_config.inputs['softmax_input'].shape + ) == 2 and not predictor_config.tensorrt_dynamic_shape_enabled(): + return True + return False + + self.add_skip_case( + teller1, SkipReasons.TRT_NOT_IMPLEMENTED, + "The output shape has diff, but we can add shuffle layer to resolve it." + ) def test(self): self.add_skip_trt_case() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_split.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_split.py index 2db60ccc61b950..f03ed0a335eeba 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_split.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_split.py @@ -14,6 +14,7 @@ from trt_layer_auto_scan_test import TrtLayerAutoScanTest, SkipReasons from program_config import TensorConfig, ProgramConfig +import unittest import numpy as np import paddle.inference as paddle_infer from functools import partial @@ -226,6 +227,18 @@ def teller1(program_config, predictor_config): teller1, SkipReasons.TRT_NOT_SUPPORT, "INPUT AxisTensor AND SectionsTensorList NOT SUPPORT.") + def teller2(program_config, predictor_config): + if len( + program_config.inputs['split_input'].shape + ) == 2 and not predictor_config.tensorrt_dynamic_shape_enabled(): + return True + return False + + self.add_skip_case( + teller2, SkipReasons.TRT_NOT_IMPLEMENTED, + "The output shape has diff, but we can add shuffle layer to resolve it." 
+ ) + def test(self): self.add_skip_trt_case() self.run_test() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_stack.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_stack.py index df7914689beaf4..93ba5da9d66d9a 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_stack.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_stack.py @@ -18,6 +18,7 @@ import paddle.inference as paddle_infer from functools import partial from typing import Optional, List, Callable, Dict, Any, Set +import unittest class TrtConvertStackTest(TrtLayerAutoScanTest): diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_tile.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_tile.py index 59ab1a6c5a376e..c1a5493fd328a0 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_tile.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_tile.py @@ -77,10 +77,14 @@ def clear_dynamic_shape(): self.dynamic_shape.opt_input_shape = {} def generate_trt_nodes_num(attrs, dynamic_shape): - if dynamic_shape == True: - return 0, 3 + ver = paddle_infer.get_trt_compile_version() + if ver[0] * 1000 + ver[1] * 100 + ver[0] * 10 >= 7000: + if dynamic_shape == True: + return 0, 3 + else: + return 1, 2 else: - return 1, 2 + return 0, 3 attrs = [ program_config.ops[i].attrs diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_yolo_box.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_yolo_box.py index d6a0aac75c966c..17955c6e007d9b 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_yolo_box.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_yolo_box.py @@ -18,6 +18,7 @@ import paddle.inference as paddle_infer from functools import partial from typing import Optional, List, Callable, Dict, Any, Set +import unittest class TrtConvertYoloBoxTest(TrtLayerAutoScanTest): diff --git a/python/paddle/fluid/tests/unittests/ir/inference/trt_layer_auto_scan_test.py b/python/paddle/fluid/tests/unittests/ir/inference/trt_layer_auto_scan_test.py index 3ac185fbb04aca..edd033f28c0ed4 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/trt_layer_auto_scan_test.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/trt_layer_auto_scan_test.py @@ -81,7 +81,7 @@ def __init__(self, methodName='runTest'): def create_inference_config(self, use_trt=True) -> paddle_infer.Config: config = paddle_infer.Config() - # config.disable_glog_info() + config.disable_glog_info() config.enable_use_gpu(100, 0) config.set_optim_cache_dir(self.trt_cache_dir) if use_trt: @@ -276,11 +276,11 @@ def run_test(self, quant=False): str(prog_config) + ' vs ' + self.inference_config_str( pred_config) + '\033[1;31m \nERROR INFO: {}\033[0m'.format(str(e))) - status = False + if not skip_flag: + status = False continue self.success_log('RUN ' + str(prog_config) + ' vs ' + self.inference_config_str(pred_config)) - # In the first step, we found the problem, and after the subsequent repairs, the assert assertion will be enabled - # self.assertTrue(status) + # self.assertTrue(status) From 605e7f0849eab68deac0c1972441e24824ba1b63 Mon Sep 17 00:00:00 2001 From: Zeng Jinle <32832641+sneaxiy@users.noreply.github.com> Date: Wed, 20 Oct 2021 13:30:11 +0800 Subject: [PATCH 223/298] fix pow2 decay (#36559) --- .../pow2_decay_with_linear_warmup_op.cc | 4 +-- .../pow2_decay_with_linear_warmup_op.h | 28 
++++++++----------- python/paddle/fluid/contrib/layers/nn.py | 7 ++--- .../test_pow2_decay_with_linear_warmup_op.py | 18 ++++++------ 4 files changed, 24 insertions(+), 33 deletions(-) diff --git a/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.cc b/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.cc index 12362b1bc6401c..4d919c94f616b1 100644 --- a/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.cc +++ b/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.cc @@ -54,8 +54,6 @@ class Pow2DecayWithLinearWarmupOpMaker AddAttr( "total_steps", "(int64_t) The total steps for changing the learning rate."); - AddAttr("start_lr", - "(float) The initial value of the learning rate."); AddAttr("base_lr", "(float) The final learning rate value after warmup."); AddAttr("end_lr", @@ -63,7 +61,7 @@ class Pow2DecayWithLinearWarmupOpMaker AddComment(R"DOC( The Pow2DecayWithLinearWarmup learning rate scheduler. -When step_num < warmup_steps, lr = (base_lr - start_lr) * step_num / warmup_steps + start_lr +When step_num < warmup_steps, lr = base_lr * step_num / warmup_steps When warmup_steps <= step_num <= total_steps, factor = 1 - (step_num - warmup_steps) / (total_steps - warmup_steps) diff --git a/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.h b/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.h index 41e07b0343e728..74cf7627450773 100644 --- a/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.h +++ b/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.h @@ -28,31 +28,30 @@ struct Pow2DecayWithLinearWarmupFunctor { using RestrictPtr = U *PADDLE_RESTRICT; public: - HOSTDEVICE Pow2DecayWithLinearWarmupFunctor( - RestrictPtr lr, RestrictPtr step, size_t warmup_steps, - size_t total_steps, AttrT start_lr, AttrT base_lr, AttrT end_lr) + HOSTDEVICE Pow2DecayWithLinearWarmupFunctor(RestrictPtr lr, + RestrictPtr step, + size_t warmup_steps, + size_t total_steps, AttrT base_lr, + AttrT end_lr) : lr_(lr), step_(step), warmup_steps_(warmup_steps), total_steps_(total_steps), - start_lr_(start_lr), base_lr_(base_lr), end_lr_(end_lr) {} HOSTDEVICE void operator()(size_t) const { - size_t step = static_cast(*step_); - *step_ = static_cast(step + 1); - if (step < warmup_steps_) { - auto new_lr = - static_cast(base_lr_ - start_lr_) * step / warmup_steps_ + - start_lr_; + size_t step = static_cast(*step_) + 1; + *step_ = static_cast(step); + if (step <= warmup_steps_) { + auto new_lr = static_cast(step) / warmup_steps_ * base_lr_; *lr_ = static_cast(new_lr); } else if (step < total_steps_) { auto factor = 1 - static_cast(step - warmup_steps_) / (total_steps_ - warmup_steps_); auto new_lr = - static_cast(base_lr_ - end_lr_) * factor * factor + end_lr_; + static_cast(base_lr_ - end_lr_) * (factor * factor) + end_lr_; *lr_ = static_cast(new_lr); } else { *lr_ = static_cast(end_lr_); @@ -64,7 +63,6 @@ struct Pow2DecayWithLinearWarmupFunctor { RestrictPtr step_; size_t warmup_steps_; size_t total_steps_; - AttrT start_lr_; AttrT base_lr_; AttrT end_lr_; }; @@ -98,7 +96,6 @@ class Pow2DecayWithLinearWarmupOpKernel : public framework::OpKernel { PADDLE_ENFORCE_LE(warmup_steps, total_steps, platform::errors::InvalidArgument( "warmup_steps must not be larger than total_steps.")); - auto start_lr = ctx.Attr("start_lr"); auto base_lr = ctx.Attr("base_lr"); auto end_lr = ctx.Attr("end_lr"); @@ -106,11 +103,10 @@ class Pow2DecayWithLinearWarmupOpKernel : public framework::OpKernel { auto 
*step_data = step_out->data(); auto &dev_ctx = ctx.template device_context(); platform::ForRange for_range(dev_ctx, 1); - using AttrT = float; + using AttrT = double; Pow2DecayWithLinearWarmupFunctor functor( lr_data, step_data, warmup_steps, total_steps, - static_cast(start_lr), static_cast(base_lr), - static_cast(end_lr)); + static_cast(base_lr), static_cast(end_lr)); for_range(functor); } }; diff --git a/python/paddle/fluid/contrib/layers/nn.py b/python/paddle/fluid/contrib/layers/nn.py index 0d0addb17e9ae6..cb26f05b549849 100644 --- a/python/paddle/fluid/contrib/layers/nn.py +++ b/python/paddle/fluid/contrib/layers/nn.py @@ -1936,18 +1936,18 @@ def build_program(main_program, startup_program): def pow2_decay_with_linear_warmup(warmup_steps, total_steps, - start_lr, base_lr, end_lr, dtype='float32', name=None): if paddle.fluid.in_dygraph_mode(): raise NotImplementedError( - "pow2_warmup does not support dygraph mode yet.") + "pow2_decay_with_linear_warmup does not support dygraph mode yet.") helper = LayerHelper("pow2_decay_with_linear_warmup", **locals()) lr = helper.create_global_variable(persistable=True, dtype=dtype, shape=[1]) - helper.set_variable_initializer(lr, Constant(value=start_lr)) + helper.set_variable_initializer( + lr, Constant(value=float(base_lr) / warmup_steps)) step = helper.create_global_variable( persistable=True, dtype='int64', shape=[1]) @@ -1963,7 +1963,6 @@ def pow2_decay_with_linear_warmup(warmup_steps, attrs={ "warmup_steps": warmup_steps, "total_steps": total_steps, - "start_lr": start_lr, "base_lr": base_lr, "end_lr": end_lr, }) diff --git a/python/paddle/fluid/tests/unittests/test_pow2_decay_with_linear_warmup_op.py b/python/paddle/fluid/tests/unittests/test_pow2_decay_with_linear_warmup_op.py index 641ea3eccf8d2b..056db5b8590ab5 100644 --- a/python/paddle/fluid/tests/unittests/test_pow2_decay_with_linear_warmup_op.py +++ b/python/paddle/fluid/tests/unittests/test_pow2_decay_with_linear_warmup_op.py @@ -19,13 +19,12 @@ import unittest -def gen_pow2_warmup_op_lr(warmup_steps, total_steps, start_lr, base_lr, end_lr, - place): +def gen_pow2_warmup_op_lr(warmup_steps, total_steps, base_lr, end_lr, place): main = paddle.static.Program() startup = paddle.static.Program() with paddle.static.program_guard(main, startup): - lr = pow2_decay_with_linear_warmup(warmup_steps, total_steps, start_lr, - base_lr, end_lr) + lr = pow2_decay_with_linear_warmup(warmup_steps, total_steps, base_lr, + end_lr) exe = paddle.static.Executor(place) with paddle.static.scope_guard(paddle.static.Scope()): exe.run(startup) @@ -35,7 +34,7 @@ def gen_pow2_warmup_op_lr(warmup_steps, total_steps, start_lr, base_lr, end_lr, class Pow2Warmup(LinearWarmup): - def __init__(self, warmup_steps, total_steps, start_lr, base_lr, end_lr): + def __init__(self, warmup_steps, total_steps, base_lr, end_lr): assert total_steps > warmup_steps lr_sch = PolynomialDecay( learning_rate=base_lr, @@ -46,13 +45,13 @@ def __init__(self, warmup_steps, total_steps, start_lr, base_lr, end_lr): super(Pow2Warmup, self).__init__( learning_rate=lr_sch, warmup_steps=warmup_steps, - start_lr=start_lr, + start_lr=0.0, end_lr=base_lr) -def gen_pow2_warmup_py_lr(warmup_steps, total_steps, start_lr, base_lr, end_lr, - place): - lr_sch = Pow2Warmup(warmup_steps, total_steps, start_lr, base_lr, end_lr) +def gen_pow2_warmup_py_lr(warmup_steps, total_steps, base_lr, end_lr, place): + lr_sch = Pow2Warmup(warmup_steps, total_steps, base_lr, end_lr) + lr_sch.step() while True: yield lr_sch() lr_sch.step() @@ -64,7 +63,6 @@ def 
setUp(self): self.params = { 'warmup_steps': 30, 'total_steps': 100, - 'start_lr': 0.01, 'base_lr': 0.02, 'end_lr': 0.001, } From 873ee4e3802bfdf10eb86b1c8ee46aa2523e18dd Mon Sep 17 00:00:00 2001 From: wuhuachaocoding <77733235+wuhuachaocoding@users.noreply.github.com> Date: Wed, 20 Oct 2021 14:28:47 +0800 Subject: [PATCH 224/298] adapt to cann5.0.3_alpha3. (#36106) --- cmake/external/ascend.cmake | 4 +++- .../operators/collective/c_embedding_op_npu.cc | 14 ++++++++++++++ paddle/fluid/operators/fill_constant_op_npu.cc | 10 ++++++++++ paddle/fluid/operators/lookup_table_v2_op_npu.cc | 3 +++ 4 files changed, 30 insertions(+), 1 deletion(-) diff --git a/cmake/external/ascend.cmake b/cmake/external/ascend.cmake index b643923cdd3531..03bc7784e9288d 100644 --- a/cmake/external/ascend.cmake +++ b/cmake/external/ascend.cmake @@ -92,6 +92,8 @@ macro(find_ascend_toolkit_version ascend_toolkit_version_info) file(READ ${ascend_toolkit_version_info} ASCEND_TOOLKIT_VERSION_CONTENTS) string(REGEX MATCH "version=([0-9]+\.[0-9]+\.[0-9]+\.[a-z]*[0-9]*)" ASCEND_TOOLKIT_VERSION "${ASCEND_TOOLKIT_VERSION_CONTENTS}") string(REGEX REPLACE "version=([0-9]+\.[0-9]+\.[0-9]+\.[a-z]*[0-9]*)" "\\1" ASCEND_TOOLKIT_VERSION "${ASCEND_TOOLKIT_VERSION}") + string(REGEX REPLACE "[a-z|\.]" "" CANN_VERSION ${ASCEND_TOOLKIT_VERSION}) + add_definitions("-DCANN_VERSION_CODE=${CANN_VERSION}") if(NOT ASCEND_TOOLKIT_VERSION) set(ASCEND_TOOLKIT_VERSION "???") else() @@ -118,4 +120,4 @@ endif() find_ascend_toolkit_version(${ASCEND_TOOLKIT_DIR}/ascend_toolkit_install.info) find_ascend_driver_version(${ASCEND_DIR}/driver/version.info) -endif() \ No newline at end of file +endif() diff --git a/paddle/fluid/operators/collective/c_embedding_op_npu.cc b/paddle/fluid/operators/collective/c_embedding_op_npu.cc index c2d607223868a2..021e5790afe579 100644 --- a/paddle/fluid/operators/collective/c_embedding_op_npu.cc +++ b/paddle/fluid/operators/collective/c_embedding_op_npu.cc @@ -68,10 +68,21 @@ void shard_index(const Tensor &table_t, const Tensor &ids_t, int64_t start_idx, ignore_tensor.Resize(ids_t.dims()); NpuOpRunner sub_runner; +#if (CANN_VERSION_CODE >= 503003) + Tensor factor_tensor(ids_t.type()); + factor_tensor.mutable_data({1}, context.GetPlace()); + TensorFromVector(std::vector{static_cast(start_idx)}, + context.device_context(), &factor_tensor); + sub_runner.SetType("Sub") + .AddInput(ids_t) + .AddInput(factor_tensor) + .AddOutput(id_t); +#else sub_runner.SetType("Sub") .AddInput(ids_t) .AddInput(std::vector{static_cast(start_idx)}) .AddOutput(id_t); +#endif sub_runner.Run(); NpuOpRunner lessequal1_runner; @@ -137,6 +148,9 @@ void NPUGetIdsEmbedding(const framework::ExecutionContext &context) { .AddInput(table_t_pad) .AddInput(ids_t_local) .AddInput(std::vector{0}) +#if (CANN_VERSION_CODE >= 503003) + .AddAttrs({{"batch_dims", 0}}) +#endif .AddOutput(*output_t); runner.Run(); } diff --git a/paddle/fluid/operators/fill_constant_op_npu.cc b/paddle/fluid/operators/fill_constant_op_npu.cc index ae0148a9bf5132..16a2433f5cad6f 100644 --- a/paddle/fluid/operators/fill_constant_op_npu.cc +++ b/paddle/fluid/operators/fill_constant_op_npu.cc @@ -66,11 +66,21 @@ class FillConstantNPUKernel : public framework::OpKernel { out_var->mutable_data(shape, ctx.GetPlace()); NpuOpRunner runner; +#if (CANN_VERSION_CODE >= 503003) + runner.SetType("FillD") + .AddInput(tensor_value) + .AddOutput(*out_var) + .AddAttrs( + {{ "dims", + framework::vectorize(shape) }}) + .Run(stream); +#else runner.SetType("Fill") .AddInput(framework::vectorize(shape)) 
.AddInput(tensor_value) .AddOutput(*out_var) .Run(stream); +#endif } }; } // namespace operators diff --git a/paddle/fluid/operators/lookup_table_v2_op_npu.cc b/paddle/fluid/operators/lookup_table_v2_op_npu.cc index 387cd92b69f923..b75ae8a65881a5 100644 --- a/paddle/fluid/operators/lookup_table_v2_op_npu.cc +++ b/paddle/fluid/operators/lookup_table_v2_op_npu.cc @@ -40,6 +40,9 @@ class LookupTableV2NPUKernel : public framework::OpKernel { .AddInput(*table_t) .AddInput(*ids_t) .AddInput(std::vector{0}) +#if (CANN_VERSION_CODE >= 503003) + .AddAttrs({{"batch_dims", 0}}) +#endif .AddOutput(*output_t); runner.Run(); } From 3f2d6a3f21fee7a95c580d22ffcd708200fd8306 Mon Sep 17 00:00:00 2001 From: Steffy-zxf <48793257+Steffy-zxf@users.noreply.github.com> Date: Wed, 20 Oct 2021 14:55:14 +0800 Subject: [PATCH 225/298] Add FasterTokenizer Operator (#34491) Add Tokenizer related functionalities for Transformer model in order that the process of training and predicting is consistent. * support the text string as an input Tensor * support the "VOCAB"unordered_map as an input Tensor to lookup tokens * Tokenizer used for BERT. This tokenizer applies an end-to-end, text string to wordpiece tokenization. * It first applies basic tokenization, followed by wordpiece tokenization. --- cmake/external/utf8proc.cmake | 51 + cmake/inference_lib.cmake | 5 + cmake/third_party.cmake | 4 + paddle/fluid/framework/CMakeLists.txt | 2 + paddle/fluid/framework/executor.cc | 8 +- paddle/fluid/framework/executor_gc_helper.cc | 1 + paddle/fluid/framework/feed_fetch_method.cc | 20 +- paddle/fluid/framework/feed_fetch_method.h | 4 + paddle/fluid/framework/feed_fetch_type.h | 12 +- paddle/fluid/framework/framework.proto | 9 + paddle/fluid/framework/operator.cc | 4 + paddle/fluid/framework/string_array.cc | 104 ++ paddle/fluid/framework/string_array.h | 48 + paddle/fluid/framework/tensor_util.cc | 3 +- paddle/fluid/framework/tensor_util.h | 14 + paddle/fluid/framework/var_desc.cc | 8 + paddle/fluid/framework/var_type_traits.h | 13 +- paddle/fluid/framework/variable_helper.cc | 5 + paddle/fluid/imperative/variable_wrapper.h | 10 + paddle/fluid/inference/api/CMakeLists.txt | 2 +- .../inference/api/demo_ci/CMakeLists.txt | 7 +- .../inference/api/details/zero_copy_tensor.cc | 57 +- .../api/details/zero_copy_tensor_dummy.cc | 5 +- .../api/details/zero_copy_tensor_test.cc | 3 +- paddle/fluid/inference/api/paddle_api.h | 8 + paddle/fluid/inference/api/paddle_tensor.h | 22 + paddle/fluid/inference/io.cc | 10 +- paddle/fluid/operators/CMakeLists.txt | 7 +- paddle/fluid/operators/controlflow/feed_op.cc | 54 +- .../fluid/operators/controlflow/fetch_op.cc | 12 +- paddle/fluid/operators/load_combine_op.h | 73 +- paddle/fluid/operators/save_combine_op.h | 60 +- paddle/fluid/operators/string/CMakeLists.txt | 6 + .../operators/string/faster_tokenizer_op.cc | 524 +++++++ .../operators/string/faster_tokenizer_op.h | 196 +++ .../operators/string/unity_build_rule.cmake | 8 + paddle/fluid/pybind/imperative.cc | 6 + paddle/fluid/pybind/inference_api.cc | 37 +- paddle/fluid/pybind/op_function_generator.cc | 1 + paddle/fluid/pybind/protobuf.cc | 5 +- paddle/fluid/pybind/pybind.cc | 47 +- python/paddle/fluid/dygraph/jit.py | 17 +- python/paddle/fluid/dygraph/layers.py | 23 +- python/paddle/fluid/dygraph/math_op_patch.py | 7 +- .../fluid/dygraph/varbase_patch_methods.py | 40 +- python/paddle/fluid/executor.py | 8 +- python/paddle/fluid/framework.py | 4 + python/paddle/fluid/inference/wrapper.py | 10 +- .../unittests/test_faster_tokenizer_op.py | 393 ++++++ 
.../tests/unittests/tokenizer/__init__.py | 13 + .../unittests/tokenizer/bert_tokenizer.py | 517 +++++++ .../unittests/tokenizer/tokenizer_utils.py | 1244 +++++++++++++++++ python/paddle/framework/io.py | 10 +- 53 files changed, 3604 insertions(+), 157 deletions(-) create mode 100644 cmake/external/utf8proc.cmake create mode 100755 paddle/fluid/framework/string_array.cc create mode 100755 paddle/fluid/framework/string_array.h create mode 100644 paddle/fluid/operators/string/CMakeLists.txt create mode 100644 paddle/fluid/operators/string/faster_tokenizer_op.cc create mode 100755 paddle/fluid/operators/string/faster_tokenizer_op.h create mode 100644 paddle/fluid/operators/string/unity_build_rule.cmake create mode 100755 python/paddle/fluid/tests/unittests/test_faster_tokenizer_op.py create mode 100644 python/paddle/fluid/tests/unittests/tokenizer/__init__.py create mode 100755 python/paddle/fluid/tests/unittests/tokenizer/bert_tokenizer.py create mode 100644 python/paddle/fluid/tests/unittests/tokenizer/tokenizer_utils.py diff --git a/cmake/external/utf8proc.cmake b/cmake/external/utf8proc.cmake new file mode 100644 index 00000000000000..a5de5c15c3b510 --- /dev/null +++ b/cmake/external/utf8proc.cmake @@ -0,0 +1,51 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
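+#
+# utf8proc is pulled in because the string_array helpers added later in this
+# patch call utf8proc_NFD for unicode normalization of the input text.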
+ +INCLUDE(ExternalProject) + +SET(UTF8PROC_PREFIX_DIR ${THIRD_PARTY_PATH}/utf8proc) +SET(UTF8PROC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/utf8proc) +# As we add extra features for utf8proc, we use the non-official repo +SET(UTF8PROC_REPOSITORY ${GIT_URL}/JuliaStrings/utf8proc.git) +SET(UTF8PROC_TAG v2.6.1) + +IF(WIN32) + SET(UTF8PROC_LIBRARIES "${UTF8PROC_INSTALL_DIR}/lib/utf8proc_static.lib") + add_definitions(-DUTF8PROC_STATIC) +ELSE(WIN32) + SET(UTF8PROC_LIBRARIES "${UTF8PROC_INSTALL_DIR}/lib/libutf8proc.a") +ENDIF(WIN32) + +INCLUDE_DIRECTORIES(${UTF8PROC_INSTALL_DIR}/include) + +ExternalProject_Add( + extern_utf8proc + ${EXTERNAL_PROJECT_LOG_ARGS} + ${SHALLOW_CLONE} + GIT_REPOSITORY ${UTF8PROC_REPOSITORY} + GIT_TAG ${UTF8PROC_TAG} + PREFIX ${UTF8PROC_PREFIX_DIR} + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DBUILD_SHARED=ON + -DBUILD_STATIC=ON + -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + -DCMAKE_INSTALL_PREFIX:PATH=${UTF8PROC_INSTALL_DIR} + -DCMAKE_BUILD_TYPE:STRING=${CMAKE_BUILD_TYPE} + BUILD_BYPRODUCTS ${UTF8PROC_LIBRARIES} +) + +ADD_LIBRARY(utf8proc STATIC IMPORTED GLOBAL) +SET_PROPERTY(TARGET utf8proc PROPERTY IMPORTED_LOCATION ${UTF8PROC_LIBRARIES}) +ADD_DEPENDENCIES(utf8proc extern_utf8proc) diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index 5ffbf15c960a32..dfd93f49e73404 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -124,6 +124,11 @@ function(copy_part_of_thrid_party TARGET DST) SRCS ${GLOG_INCLUDE_DIR} ${GLOG_LIBRARIES} DSTS ${dst_dir} ${dst_dir}/lib) + set(dst_dir "${DST}/third_party/install/utf8proc") + copy(${TARGET} + SRCS ${UTF8PROC_INSTALL_DIR}/include ${UTF8PROC_LIBRARIES} + DSTS ${dst_dir} ${dst_dir}/lib) + if (WITH_CRYPTO) set(dst_dir "${DST}/third_party/install/cryptopp") copy(${TARGET} diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake index b3260ba27b0729..d45b5e07bb8f37 100644 --- a/cmake/third_party.cmake +++ b/cmake/third_party.cmake @@ -210,6 +210,10 @@ include(external/threadpool)# download threadpool include(external/dlpack) # download dlpack include(external/xxhash) # download, build, install xxhash include(external/warpctc) # download, build, install warpctc +include(external/utf8proc) # download, build, install utf8proc + +list(APPEND third_party_deps extern_eigen3 extern_gflags extern_glog extern_boost extern_xxhash) +list(APPEND third_party_deps extern_zlib extern_dlpack extern_warpctc extern_threadpool extern_utf8proc) include(external/lapack) # download, build, install lapack list(APPEND third_party_deps extern_eigen3 extern_gflags extern_glog extern_boost extern_xxhash) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 6e57b829ade4ed..4dfcf0985b85e1 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -51,6 +51,8 @@ proto_library(data_feed_proto SRCS data_feed.proto) proto_library(trainer_desc_proto SRCS trainer_desc.proto DEPS framework_proto data_feed_proto) +cc_library(string_array SRCS string_array.cc DEPS utf8proc) + cc_library(ddim SRCS ddim.cc DEPS eigen3 boost enforce) cc_test(ddim_test SRCS ddim_test.cc DEPS ddim) if(WITH_GPU) diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index de007c128d7543..5f681ec7ea241f 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -102,14 +102,18 @@ void Executor::CreateVariables(const ProgramDesc& pdesc, Scope* scope, if (var->Persistable()) { auto* ptr = 
const_cast(ancestor_scope)->Var(var->Name()); + + VLOG(3) << "Initialize Variable " << var->Name(); InitializeVariable(ptr, var->GetType()); VLOG(3) << "Create Variable " << var->Name() - << " global, which pointer is " << ptr; + << " global, which pointer is " << ptr << " type is " + << static_cast(var->GetType()); } else { auto* ptr = scope->Var(var->Name()); InitializeVariable(ptr, var->GetType()); VLOG(3) << "Create Variable " << var->Name() - << " locally, which pointer is " << ptr; + << " locally, which pointer is " << ptr << "Variable Type " + << static_cast(var->GetType()); } } } else { diff --git a/paddle/fluid/framework/executor_gc_helper.cc b/paddle/fluid/framework/executor_gc_helper.cc index 43eb1ce8c77f89..8c64d65ff4be66 100644 --- a/paddle/fluid/framework/executor_gc_helper.cc +++ b/paddle/fluid/framework/executor_gc_helper.cc @@ -125,6 +125,7 @@ void DeleteUnusedTensors(const Scope &scope, for (auto &t : *lod_tensor_arr) { garbages.emplace_back(t.MoveMemoryHolder()); } + } else if (var->IsType()) { } else { PADDLE_THROW(platform::errors::Unimplemented( "Type %s of variable %s is not supported eager deletion.", diff --git a/paddle/fluid/framework/feed_fetch_method.cc b/paddle/fluid/framework/feed_fetch_method.cc index 3bd85b2b24b97b..2eac65c90c02fa 100644 --- a/paddle/fluid/framework/feed_fetch_method.cc +++ b/paddle/fluid/framework/feed_fetch_method.cc @@ -16,6 +16,7 @@ limitations under the License. */ #include +#include #include "glog/logging.h" namespace paddle { @@ -35,9 +36,24 @@ void SetFeedVariable(Scope* scope, const LoDTensor& input, feed_inputs.resize(index + 1); } // shared data with input tensor - feed_inputs[index].ShareDataWith(input); + auto& val = BOOST_GET(LoDTensor, feed_inputs[index]); + val.ShareDataWith(input); // set lod - feed_inputs[index].set_lod(input.lod()); + val.set_lod(input.lod()); +} + +void SetFeedVariable(Scope* scope, const Strings& input, + const std::string& var_name, size_t index) { + // If var_name Variable is not found in GlobalScope, a new variable will + // be created. + VLOG(3) << "SetFeedStringVariable name=" << var_name << " index=" << index; + Variable* g_feed_value = scope->Var(var_name); + auto& feed_inputs = *(g_feed_value->GetMutable()); + if (index >= feed_inputs.size()) { + feed_inputs.resize(index + 1); + } + // shared data with input tensor + feed_inputs[index] = input; } FetchType& GetFetchVariable(const Scope& scope, const std::string& var_name, diff --git a/paddle/fluid/framework/feed_fetch_method.h b/paddle/fluid/framework/feed_fetch_method.h index a52ef517c8b734..4c2f5b9796a223 100644 --- a/paddle/fluid/framework/feed_fetch_method.h +++ b/paddle/fluid/framework/feed_fetch_method.h @@ -18,6 +18,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/feed_fetch_type.h" #include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/string_array.h" namespace paddle { namespace framework { @@ -28,6 +29,9 @@ class Scope; void SetFeedVariable(Scope* scope, const LoDTensor& input, const std::string& var_name, size_t index); +void SetFeedVariable(Scope* scope, const Strings& input, + const std::string& var_name, size_t index); + FetchType& GetFetchVariable(const Scope& scope, const std::string& var_name, size_t index); diff --git a/paddle/fluid/framework/feed_fetch_type.h b/paddle/fluid/framework/feed_fetch_type.h index 1996327fe82bc0..12c111e58f58a0 100644 --- a/paddle/fluid/framework/feed_fetch_type.h +++ b/paddle/fluid/framework/feed_fetch_type.h @@ -13,14 +13,17 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once + #include + #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/string_array.h" #include "paddle/fluid/platform/variant.h" namespace paddle { namespace framework { -using FeedType = LoDTensor; +using FeedType = boost::variant; using FeedList = std::vector; using FetchType = boost::variant; @@ -43,6 +46,13 @@ inline bool data_is_lod_tensor_array(const FetchType &data) { return false; } +inline bool data_is_string_tensor(const FeedType &data) { + if (data.type() == typeid(Strings)) { + return true; + } + return false; +} + static const char kFeedOpType[] = "feed"; static const char kFetchOpType[] = "fetch"; diff --git a/paddle/fluid/framework/framework.proto b/paddle/fluid/framework/framework.proto index eb72d9e1420dce..300d5f6e8fad10 100644 --- a/paddle/fluid/framework/framework.proto +++ b/paddle/fluid/framework/framework.proto @@ -147,6 +147,11 @@ message VarType { // in operators like nccl_op RAW = 17; TUPLE = 18; + + STRING = 25; + STRINGS = 26; + VOCAB = 27; + FEED_LIST = 28; } required Type type = 1; @@ -175,6 +180,10 @@ message VarType { message Tuple { repeated Type element_type = 1; } optional Tuple tuple = 7; + + optional TensorDesc string = 8; + optional TensorDesc strings = 9; + optional TensorDesc vocab = 10; } message VarDesc { diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 2a543d48791a3d..0cd17cdb10d55c 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -76,6 +76,8 @@ static DDim GetDimsDebug(const Scope& scope, const std::string& name, } else { return var->Get().GetCompleteDims(); } + } else if (var->IsType()) { + return DDim({static_cast(var->Get().size())}); } else { return DDim({-1}); } @@ -106,6 +108,8 @@ static std::string GetDtype(const Scope& scope, const std::string& name) { } else { return DataTypeToString(tensor.type()); } + } else if (var->IsType()) { + return "strings"; } else { return ""; } diff --git a/paddle/fluid/framework/string_array.cc b/paddle/fluid/framework/string_array.cc new file mode 100755 index 00000000000000..3071e6bf4cff33 --- /dev/null +++ b/paddle/fluid/framework/string_array.cc @@ -0,0 +1,104 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +#include + +#include "glog/logging.h" +#include "paddle/fluid/framework/string_array.h" + +namespace paddle { +namespace framework { + +std::wstring_convert> kConverter; + +// Convert the std::string type to the std::wstring type. +bool ConvertStrToWstr(const std::string& src, std::wstring* res) { + try { + *res = kConverter.from_bytes(src); + } catch (std::range_error& e) { + VLOG(3) << "The string " << src << " was converted to unicode failedly! "; + return false; + } + return true; +} + +// Convert the std::wstring type to the std::string type. +void ConvertWstrToStr(const std::wstring& src, std::string* res) { + *res = kConverter.to_bytes(src); +} + +// Normalization Form Canonical Decomposition. +void NFD(const std::string& s, std::string* ret) { + *ret = ""; + char* result = reinterpret_cast( + utf8proc_NFD(reinterpret_cast(s.c_str()))); + if (result) { + *ret = std::move(std::string(result)); + free(result); + } +} + +// Write the data which is type of +// std::unordered_map to ostream. +void StringMapToStream(std::ostream& os, + const std::unordered_map& data) { + { + // firstly write the data size. + size_t t = data.size(); + os.write(reinterpret_cast(&t), sizeof(t)); + } + { + // then write the data + for (auto it = data.begin(); it != data.end(); ++it) { + std::string token = it->first; + int32_t token_id = it->second; + // write the token + size_t length = token.size(); + os.write(reinterpret_cast(&length), sizeof(length)); + os.write(token.c_str(), length); + // write the token_id + os.write(reinterpret_cast(&token_id), sizeof(token_id)); + } + } +} + +// Read the data which is type of +// std::unordered_map from istream. +void StringMapFromStream(std::istream& is, + std::unordered_map* data) { + // first read the map size + size_t map_size; + is.read(reinterpret_cast(&map_size), sizeof(map_size)); + data->reserve(map_size); + // then read the data + for (size_t i = 0; i < map_size; ++i) { + // read the token + size_t token_length; + is.read(reinterpret_cast(&token_length), sizeof(token_length)); + char* tmp = new char[token_length]; + is.read(tmp, token_length); + std::string token(tmp, tmp + token_length); + delete[] tmp; + // read the token_id + int32_t token_id; + is.read(reinterpret_cast(&token_id), sizeof(token_id)); + + data->emplace(token, token_id); + } +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/string_array.h b/paddle/fluid/framework/string_array.h new file mode 100755 index 00000000000000..b874fbac4c9e7c --- /dev/null +++ b/paddle/fluid/framework/string_array.h @@ -0,0 +1,48 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace paddle { +namespace framework { + +using String = std::string; +using Strings = std::vector; +using Vocab = std::unordered_map; + +// Convert the std::string type to the std::string type. +bool ConvertStrToWstr(const std::string& src, std::wstring* res); +// Convert the std::wstring type to the std::string type. +void ConvertWstrToStr(const std::wstring& src, std::string* res); +// Normalization Form Canonical Decomposition. +void NFD(const std::string& s, std::string* ret); + +// Write the data which is type of +// std::unordered_map to ostream. +void StringMapToStream(std::ostream& os, + const std::unordered_map& data); + +// Read the data which is type of +// std::unordered_map from istream. +void StringMapFromStream(std::istream& is, + std::unordered_map* data); +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index ee30a82aff6ef0..1c43219330bfe7 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/framework/tensor_util.h" - #include #include #include @@ -22,6 +20,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/profiler.h" #ifdef PADDLE_WITH_MKLDNN diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h index f4bbbaa2e70cf5..73829898be961d 100644 --- a/paddle/fluid/framework/tensor_util.h +++ b/paddle/fluid/framework/tensor_util.h @@ -13,11 +13,17 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include +#include +#include +#include +#include #include #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/dlpack_tensor.h" #include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/string_array.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/memory/allocation/allocator_facade.h" #ifdef PADDLE_WITH_ASCEND_CL @@ -48,6 +54,14 @@ class PrintOptions { PrintOptions() {} }; +void TensorToStream(std::ostream& os, const Tensor& tensor, + const platform::DeviceContext& dev_ctx); +void TensorFromStream(std::istream& is, Tensor* tensor, + const platform::DeviceContext& dev_ctx); +void TensorFromStream(std::istream& is, Tensor* tensor, + const platform::DeviceContext& dev_ctx, + const size_t& seek, const std::vector& shape); + // NOTE(zcd): Because TensorCopy is an async operation, when the src_place // and dst_place are two different GPU, to ensure that the operation can // be carried out correctly, there is a src_ctx wait operation in TensorCopy. 
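A note on the new operator's flow: the FasterTokenizer kernel described in this patch first runs basic tokenization and then wordpiece tokenization against the vocab. A minimal Python sketch of that two-stage flow is given below; the helper names, the simplified whitespace handling and the toy vocab are assumptions made for illustration only, not the operator's actual kernel code or a real BERT vocabulary.

    # Illustrative sketch of basic tokenization followed by greedy
    # longest-match wordpiece lookup; not the kernel implementation.
    def basic_tokenize(text):
        # The real kernel also splits punctuation and applies NFD
        # normalization (see the NFD helper in string_array); this
        # sketch only lowercases and splits on whitespace.
        return text.lower().split()

    def wordpiece_tokenize(word, vocab, unk_token="[UNK]", max_chars=100):
        # Greedy longest-match-first lookup; continuation pieces are
        # prefixed with "##", and unmatched words map to [UNK].
        if len(word) > max_chars:
            return [unk_token]
        tokens, start = [], 0
        while start < len(word):
            end, cur_piece = len(word), None
            while start < end:
                piece = word[start:end]
                if start > 0:
                    piece = "##" + piece
                if piece in vocab:
                    cur_piece = piece
                    break
                end -= 1
            if cur_piece is None:
                return [unk_token]
            tokens.append(cur_piece)
            start = end
        return tokens

    # Toy vocabulary, for illustration only.
    vocab = {"[UNK]": 0, "un": 1, "##aff": 2, "##able": 3, "hello": 4}
    pieces = []
    for word in basic_tokenize("Hello unaffable"):
        pieces.extend(wordpiece_tokenize(word, vocab))
    print(pieces)                      # ['hello', 'un', '##aff', '##able']
    print([vocab[p] for p in pieces])  # [4, 1, 2, 3]

The kernel itself consumes the Strings input and the Vocab map registered elsewhere in this patch and relies on the utf8proc-backed NFD helper for normalization; the sketch above only illustrates the matching strategy.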
diff --git a/paddle/fluid/framework/var_desc.cc b/paddle/fluid/framework/var_desc.cc index c3bdd6ae7f135c..41fe9fbbc0396e 100644 --- a/paddle/fluid/framework/var_desc.cc +++ b/paddle/fluid/framework/var_desc.cc @@ -209,6 +209,10 @@ const proto::VarType::TensorDesc &VarDesc::tensor_desc() const { return desc_.type().lod_tensor().tensor(); case proto::VarType::LOD_TENSOR_ARRAY: return desc_.type().tensor_array().tensor(); + case proto::VarType::STRINGS: + return desc_.type().strings(); + case proto::VarType::VOCAB: + return desc_.type().vocab(); default: PADDLE_THROW(platform::errors::Unavailable( "Getting 'tensor_desc' is not supported by the %s type variable.", @@ -249,6 +253,10 @@ proto::VarType::TensorDesc *VarDesc::mutable_tensor_desc() { return desc_.mutable_type()->mutable_lod_tensor()->mutable_tensor(); case proto::VarType::LOD_TENSOR_ARRAY: return desc_.mutable_type()->mutable_tensor_array()->mutable_tensor(); + case proto::VarType::STRINGS: + return desc_.mutable_type()->mutable_strings(); + case proto::VarType::VOCAB: + return desc_.mutable_type()->mutable_vocab(); default: PADDLE_THROW( platform::errors::Unavailable("Getting 'mutable_tensor_desc' is not " diff --git a/paddle/fluid/framework/var_type_traits.h b/paddle/fluid/framework/var_type_traits.h index 473df85aa0421e..c8c3cf364e0fc0 100644 --- a/paddle/fluid/framework/var_type_traits.h +++ b/paddle/fluid/framework/var_type_traits.h @@ -18,10 +18,12 @@ #include #include #include +#include #include #include "paddle/fluid/framework/feed_fetch_type.h" #include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/string_array.h" #include "paddle/fluid/platform/place.h" #ifdef PADDLE_WITH_CUDA #include @@ -162,8 +164,8 @@ struct VarTypeRegistryImpl { // Paddle would generate unique Ids for each registered variable types. 
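// Added illustration (an assumption, not from the original patch): once Strings and
// Vocab are registered in the list below and mapped to proto types via
// REG_PROTO_VAR_TYPE_TRAIT, the compile-time traits resolve to the new enum values:
//   static_assert(VarTypeTrait<Vocab>::kId == proto::VarType::VOCAB, "");
//   static_assert(VarTypeTrait<Strings>::kId == proto::VarType::STRINGS, "");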
using VarTypeRegistry = detail::VarTypeRegistryImpl< Tensor, LoDTensor, SelectedRows, std::vector, LoDRankTable, - LoDTensorArray, platform::PlaceList, ReaderHolder, std::string, Scope *, - operators::reader::LoDTensorBlockingQueueHolder, FetchList, + Strings, LoDTensorArray, platform::PlaceList, ReaderHolder, String, Scope *, + operators::reader::LoDTensorBlockingQueueHolder, FetchList, FeedList, operators::reader::OrderedMultiDeviceLoDTensorBlockingQueueHolder, #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) @@ -177,8 +179,7 @@ using VarTypeRegistry = detail::VarTypeRegistryImpl< #if defined(PADDLE_WITH_XPU_BKCL) BKCLUniqueId, platform::BKCLCommunicator, #endif - int, float>; - + int, float, Vocab>; template struct VarTypeTrait { static_assert(VarTypeRegistry::IsRegistered(), "Must be registered type"); @@ -208,9 +209,13 @@ REG_PROTO_VAR_TYPE_TRAIT(LoDRankTable, proto::VarType::LOD_RANK_TABLE); REG_PROTO_VAR_TYPE_TRAIT(LoDTensorArray, proto::VarType::LOD_TENSOR_ARRAY); REG_PROTO_VAR_TYPE_TRAIT(platform::PlaceList, proto::VarType::PLACE_LIST); REG_PROTO_VAR_TYPE_TRAIT(ReaderHolder, proto::VarType::READER); +REG_PROTO_VAR_TYPE_TRAIT(FeedList, proto::VarType::FEED_LIST); REG_PROTO_VAR_TYPE_TRAIT(FetchList, proto::VarType::FETCH_LIST); REG_PROTO_VAR_TYPE_TRAIT(int, proto::VarType::INT32); REG_PROTO_VAR_TYPE_TRAIT(float, proto::VarType::FP32); +REG_PROTO_VAR_TYPE_TRAIT(Vocab, proto::VarType::VOCAB); +REG_PROTO_VAR_TYPE_TRAIT(String, proto::VarType::STRING); +REG_PROTO_VAR_TYPE_TRAIT(Strings, proto::VarType::STRINGS); /** End of variable type registration */ diff --git a/paddle/fluid/framework/variable_helper.cc b/paddle/fluid/framework/variable_helper.cc index bdcdd4e64e3314..37ec5d7bc83bda 100644 --- a/paddle/fluid/framework/variable_helper.cc +++ b/paddle/fluid/framework/variable_helper.cc @@ -21,6 +21,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/string_array.h" #include "paddle/fluid/platform/place.h" namespace paddle { @@ -41,6 +42,10 @@ void InitializeVariable(Variable *var, proto::VarType::Type var_type) { var->GetMutable(); } else if (var_type == proto::VarType::LOD_TENSOR_ARRAY) { var->GetMutable(); + } else if (var_type == proto::VarType::STRINGS) { + var->GetMutable(); + } else if (var_type == proto::VarType::VOCAB) { + var->GetMutable(); } else if (var_type == proto::VarType::PLACE_LIST) { var->GetMutable(); } else if (var_type == proto::VarType::READER) { diff --git a/paddle/fluid/imperative/variable_wrapper.h b/paddle/fluid/imperative/variable_wrapper.h index 758e8e62718e7a..9fbbe7d06f8ad8 100644 --- a/paddle/fluid/imperative/variable_wrapper.h +++ b/paddle/fluid/imperative/variable_wrapper.h @@ -20,6 +20,7 @@ #include #include "paddle/fluid/framework/op_kernel_type.h" +#include "paddle/fluid/framework/string_array.h" #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/imperative/hooks.h" #include "paddle/fluid/imperative/op_base.h" @@ -153,6 +154,15 @@ class VariableWrapper { tensor = &(var_.Get()); } else if (type_ == framework::proto::VarType::SELECTED_ROWS) { tensor = &(var_.Get().value()); + } else if (type_ == framework::proto::VarType::VOCAB) { + const framework::Vocab* data = nullptr; + data = &(var_.Get()); + if (data && data->size() != 0) { + VLOG(6) << "The tensor of variable " << name_ + << " is not initialized"; + return data_type_; + } + return framework::proto::VarType::VOCAB; } else { VLOG(6) << "Variable " << name_ << " is not initialized"; return data_type_; diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index bbec3eab1cadff..53b92c13363020 100755 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -26,7 +26,7 @@ if(WITH_MKLDNN) set(mkldnn_quantizer_cfg ${mkldnn_quantizer_cfg} PARENT_SCOPE) endif() -cc_library(analysis_config SRCS analysis_config.cc DEPS ${mkldnn_quantizer_cfg} lod_tensor paddle_pass_builder table_printer) +cc_library(analysis_config SRCS analysis_config.cc DEPS ${mkldnn_quantizer_cfg} lod_tensor paddle_pass_builder table_printer utf8proc) cc_library(paddle_infer_contrib SRCS paddle_infer_contrib.cc DEPS zero_copy_tensor) cc_library(paddle_pass_builder SRCS paddle_pass_builder.cc) diff --git a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt index 47abe3298aa7c4..1fdc5cd730e53a 100644 --- a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt +++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt @@ -34,12 +34,14 @@ include_directories("${PADDLE_LIB}/") set(PADDLE_LIB_THIRD_PARTY_PATH "${PADDLE_LIB}/third_party/install/") include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}protobuf/include") include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}glog/include") +include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}utf8proc/include") include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}gflags/include") include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}xxhash/include") include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}cryptopp/include") link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}protobuf/lib") link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}glog/lib") +link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}utf8proc/lib") 
link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}gflags/lib") link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}xxhash/lib") link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}cryptopp/lib") @@ -151,12 +153,13 @@ if (NOT WIN32) set(EXTERNAL_LIB "-lrt -ldl -lpthread") set(DEPS ${DEPS} ${MATH_LIB} ${MKLDNN_LIB} - glog gflags protobuf xxhash cryptopp + glog gflags protobuf xxhash cryptopp utf8proc ${EXTERNAL_LIB}) else() set(DEPS ${DEPS} ${MATH_LIB} ${MKLDNN_LIB} - glog gflags_static libprotobuf xxhash cryptopp-static ${EXTERNAL_LIB}) + glog gflags_static libprotobuf xxhash cryptopp-static utf8proc_static + ${EXTERNAL_LIB}) set(DEPS ${DEPS} shlwapi.lib) endif(NOT WIN32) diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc index a9c6ef13177c20..bb537f0c652857 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc @@ -43,15 +43,33 @@ void Tensor::Reshape(const std::vector &shape) { tensor->Resize(paddle::framework::make_ddim(shape)); } -#define EAGER_GET_TENSOR \ - if (!tensor_) { \ - tensor_ = FindTensor(); \ - } \ - auto *tensor = static_cast(tensor_); +void Tensor::ReshapeStrings(const size_t &shape) { + PADDLE_ENFORCE_EQ( + name_.empty(), false, + paddle::platform::errors::PreconditionNotMet( + "Need to SetName first, so that the corresponding tensor can " + "be retrieved.")); + PADDLE_ENFORCE_EQ(input_or_output_, true, + paddle::platform::errors::PermissionDenied( + "Can't reshape the output tensor, it is readonly")); + auto *scope = static_cast(scope_); + auto *var = scope->FindVar(name_); + PADDLE_ENFORCE_NOT_NULL( + var, paddle::platform::errors::PreconditionNotMet( + "No tensor called [%s] in the runtime scope", name_)); + paddle_infer::Strings *tensor = var->GetMutable(); + tensor->resize(shape); +} + +#define EAGER_GET_TENSOR(tensor_type) \ + if (!tensor_) { \ + tensor_ = FindTensor(); \ + } \ + auto *tensor = static_cast(tensor_); template T *Tensor::mutable_data(PlaceType place) { - EAGER_GET_TENSOR; + EAGER_GET_TENSOR(paddle::framework::LoDTensor); PADDLE_ENFORCE_GT( tensor->numel(), 0, paddle::platform::errors::PreconditionNotMet( @@ -83,7 +101,7 @@ T *Tensor::mutable_data(PlaceType place) { template T *Tensor::data(PlaceType *place, int *size) const { - EAGER_GET_TENSOR; + EAGER_GET_TENSOR(paddle::framework::LoDTensor); auto *res = tensor->data(); if (paddle::platform::is_cpu_place(tensor->place())) { @@ -103,7 +121,7 @@ T *Tensor::data(PlaceType *place, int *size) const { } DataType Tensor::type() const { - EAGER_GET_TENSOR; + EAGER_GET_TENSOR(paddle::framework::LoDTensor); auto type = tensor->type(); if (type == paddle::framework::proto::VarType::FP32) { return DataType::FLOAT32; @@ -125,7 +143,7 @@ PlaceType Tensor::place() const { return place_; } template void Tensor::CopyFromCpu(const T *data) { - EAGER_GET_TENSOR; + EAGER_GET_TENSOR(paddle::framework::LoDTensor); PADDLE_ENFORCE_GE(tensor->numel(), 0, paddle::platform::errors::PreconditionNotMet( "You should call Tensor::Reshape(const " @@ -186,10 +204,20 @@ void Tensor::CopyFromCpu(const T *data) { } } +void Tensor::CopyStringsFromCpu(const paddle_infer::Strings *data) { + EAGER_GET_TENSOR(paddle_infer::Strings); + PADDLE_ENFORCE_GE(tensor->size(), 0, + paddle::platform::errors::PreconditionNotMet( + "You should call Tensor::Reshape(const " + "std::size_t &shape)function before copying" + "the string data from cpu.")); + *tensor = *data; +} + template void Tensor::CopyToCpuImpl(T 
*data, void *exec_stream, CallbackFunc cb, void *cb_params) const { - EAGER_GET_TENSOR; + EAGER_GET_TENSOR(paddle::framework::LoDTensor); auto ele_num = tensor->numel(); auto *t_data = tensor->data(); auto t_place = tensor->place(); @@ -371,6 +399,7 @@ Tensor::Tensor(void *scope) : scope_{scope} { "set to the pointer of scope.")); } +template void *Tensor::FindTensor() const { PADDLE_ENFORCE_EQ( name_.empty(), false, @@ -382,12 +411,12 @@ void *Tensor::FindTensor() const { PADDLE_ENFORCE_NOT_NULL( var, paddle::platform::errors::PreconditionNotMet( "No tensor called [%s] in the runtime scope", name_)); - auto *tensor = var->GetMutable(); + auto *tensor = var->GetMutable(); return tensor; } std::vector Tensor::shape() const { - EAGER_GET_TENSOR; + EAGER_GET_TENSOR(paddle::framework::LoDTensor); PADDLE_ENFORCE_NOT_NULL( tensor_, paddle::platform::errors::PreconditionNotMet( "Not found tensor called %s in the scope", name_)); @@ -395,7 +424,7 @@ std::vector Tensor::shape() const { } void Tensor::SetLoD(const std::vector> &x) { - EAGER_GET_TENSOR; + EAGER_GET_TENSOR(paddle::framework::LoDTensor); paddle::framework::LoD lod; for (auto &level : x) { lod.emplace_back(level); @@ -404,7 +433,7 @@ void Tensor::SetLoD(const std::vector> &x) { } std::vector> Tensor::lod() const { - EAGER_GET_TENSOR; + EAGER_GET_TENSOR(paddle::framework::LoDTensor); std::vector> res; for (auto &level : tensor->lod()) { res.emplace_back(level); diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor_dummy.cc b/paddle/fluid/inference/api/details/zero_copy_tensor_dummy.cc index 1f1be136103791..eb134874c3aa8a 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor_dummy.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor_dummy.cc @@ -36,7 +36,10 @@ template PD_INFER_DECL int64_t *Tensor::data(PlaceType *place, template float *Tensor::mutable_data(PlaceType place); template int64_t *Tensor::mutable_data(PlaceType place); -void *Tensor::FindTensor() const { return nullptr; } +template +void *Tensor::FindTensor() const { + return nullptr; +} std::vector Tensor::shape() const { return {}; } diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor_test.cc b/paddle/fluid/inference/api/details/zero_copy_tensor_test.cc index 0c092a8684d1ad..4b6f90f3f0652e 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor_test.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor_test.cc @@ -88,7 +88,8 @@ bool SetPlaceAndCheck(PlaceType place, size_t length) { const std::vector> lod{{0, length}}; scope.Var(name); auto tensor = CreateTensor(place, &scope, name); - tensor->Reshape({static_cast(length)}); + std::vector shape{static_cast(length)}; + tensor->Reshape(shape); tensor->mutable_data(place); tensor->SetLoD(lod); diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h index de6b28de27557c..b137b7ba6f97e2 100644 --- a/paddle/fluid/inference/api/paddle_api.h +++ b/paddle/fluid/inference/api/paddle_api.h @@ -174,6 +174,14 @@ class PD_INFER_DECL ZeroCopyTensor : public paddle_infer::Tensor { void copy_from_cpu(const T* data) { return CopyFromCpu(data); } + + /// \brief Experimental interface. + /// It's usually used to set the input tensor data with Strings data type. + /// \param data The pointer of the data, from which the tensor will copy. + void copy_strings_from_cpu(const paddle_infer::Strings* data) { + return CopyStringsFromCpu(data); + } + /// \brief Copy the tensor data to the host memory. /// It's usually used to get the output tensor data. 
/// \param[out] data The tensor will copy the data to the address. diff --git a/paddle/fluid/inference/api/paddle_tensor.h b/paddle/fluid/inference/api/paddle_tensor.h index f6dce74c30ded1..24a72a0b9dadbd 100644 --- a/paddle/fluid/inference/api/paddle_tensor.h +++ b/paddle/fluid/inference/api/paddle_tensor.h @@ -14,10 +14,16 @@ #pragma once +#include + #include "paddle_infer_declare.h" // NOLINT namespace paddle_infer { +/// \brief Experimental. +/// Strings for text data. +using Strings = std::vector; + typedef void (*CallbackFunc)(void*); #if defined(PADDLE_WITH_TESTING) && defined(PADDLE_WITH_INFERENCE_API_TEST) @@ -57,6 +63,14 @@ class PD_INFER_DECL Tensor { /// \param shape The shape to set. void Reshape(const std::vector& shape); + /// \brief Experimental interface. + /// Reset the shape of the Strings tensor. + /// Generally it's only used for the input tensor. + /// Reshape must be called before calling + /// ZeroCopyStringTensorCreate() or PaddleInferTensorCreate() + /// \param shape The shape to set. + void ReshapeStrings(const std::size_t& shape); + /// \brief Get the memory pointer in CPU or GPU with specific data type. /// Please Reshape the tensor first before call this. /// It's usually used to get input data pointer. @@ -78,6 +92,11 @@ class PD_INFER_DECL Tensor { template void CopyFromCpu(const T* data); + /// \brief Experimental interface. + /// It's usually used to set the input tensor data with Strings data type. + /// \param data The pointer of the data, from which the tensor will copy. + void CopyStringsFromCpu(const paddle_infer::Strings* data); + /// \brief Copy the tensor data to the host memory. /// It's usually used to get the output tensor data. /// \param[out] data The tensor will copy the data to the address. @@ -122,7 +141,10 @@ class PD_INFER_DECL Tensor { protected: explicit Tensor(void* scope); + + template void* FindTensor() const; + void SetPlace(PlaceType place, int device = -1); void SetName(const std::string& name); diff --git a/paddle/fluid/inference/io.cc b/paddle/fluid/inference/io.cc index d2bc95e7c3eb3d..f976e217bab1a0 100644 --- a/paddle/fluid/inference/io.cc +++ b/paddle/fluid/inference/io.cc @@ -17,11 +17,13 @@ limitations under the License. 
*/ #include #include #include + #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/feed_fetch_type.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/version.h" #include "paddle/fluid/platform/cpu_helper.h" +#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/pybind/pybind.h" DEFINE_string(devices, "", "The devices to be used which is joined by comma."); @@ -85,10 +87,12 @@ void LoadPersistables(framework::Executor* executor, framework::Scope* scope, framework::VarDesc* new_var = load_block->Var(var->Name()); new_var->SetShape(var->GetShape()); new_var->SetDataType(var->GetDataType()); - new_var->SetType(var->GetType()); + auto var_type = var->GetType(); + new_var->SetType(var_type); - if (var->GetType() != - framework::proto::VarType::Type::VarType_Type_SELECTED_ROWS) { + if ((var_type != + framework::proto::VarType::Type::VarType_Type_SELECTED_ROWS) && + (var_type != framework::proto::VarType::VOCAB)) { new_var->SetLoDLevel(var->GetLoDLevel()); } diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 78cbc7e8a583b8..937bfea3a59efe 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -17,6 +17,7 @@ add_subdirectory(metrics) add_subdirectory(optimizers) add_subdirectory(reduce_ops) add_subdirectory(sequence_ops) +add_subdirectory(string) add_subdirectory(jit) if(WITH_MKLDNN) add_subdirectory(mkldnn) @@ -78,10 +79,12 @@ if(WITH_UNITY_BUILD) include(unity_build_rule.cmake) endif() -register_operators(EXCLUDES py_layer_op py_func_op warpctc_op dgc_op sparse_attention_op lstm_op run_program_op eye_op recurrent_op - sync_batch_norm_op spectral_op ${OP_MKL_DEPS} DEPS ${OP_HEADER_DEPS}) +register_operators(EXCLUDES py_layer_op py_func_op warpctc_op dgc_op load_combine_op lstm_op run_program_op eye_op + recurrent_op save_combine_op sparse_attention_op sync_batch_norm_op spectral_op ${OP_MKL_DEPS} DEPS ${OP_HEADER_DEPS}) op_library(run_program_op SRCS run_program_op.cc run_program_op.cu.cc DEPS executor_cache ${OP_HEADER_DEPS}) +op_library(save_combine_op DEPS string_array) +op_library(load_combine_op DEPS string_array) if (WITH_GPU OR WITH_ROCM) if(WITH_ROCM) diff --git a/paddle/fluid/operators/controlflow/feed_op.cc b/paddle/fluid/operators/controlflow/feed_op.cc index 9597dd25ec530f..bc29c92b094262 100644 --- a/paddle/fluid/operators/controlflow/feed_op.cc +++ b/paddle/fluid/operators/controlflow/feed_op.cc @@ -1,11 +1,8 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -29,6 +26,39 @@ class OpBase; namespace paddle { namespace operators { + +// FeedVariableVisitor is to feed the variable data +// according to data type (LoDTensor or Strings). 
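+// The feed value visited below is a boost variant holding either a LoDTensor or
+// Strings: boost::apply_visitor dispatches to the matching operator() overload,
+// so string inputs are copied as-is while tensors keep the existing
+// share-or-copy-by-place behaviour.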
+class FeedVariableVisitor : public boost::static_visitor { + public: + explicit FeedVariableVisitor(framework::Variable *out_var, + const platform::Place &place) + : out_var_(out_var), place_(place) {} + + void operator()(const framework::LoDTensor &in_tensor) const { + framework::LoDTensor *out_tensor = + out_var_->GetMutable(); + if (platform::is_same_place(in_tensor.place(), place_)) { + out_tensor->ShareDataWith(in_tensor); + } else { + platform::DeviceContext *context = + platform::DeviceContextPool::Instance().Get(place_); + framework::TensorCopy(in_tensor, place_, *context, out_tensor); + } + out_tensor->set_lod(in_tensor.lod()); + } + + void operator()(const framework::Strings &in_str) const { + framework::Strings *out_str = out_var_->GetMutable(); + out_str->resize(in_str.size()); + *out_str = in_str; + } + + private: + framework::Variable *out_var_; + const platform::Place &place_; +}; + class FeedOp : public framework::OperatorBase { public: FeedOp(const std::string &type, const framework::VariableNameMap &inputs, @@ -79,15 +109,9 @@ class FeedOp : public framework::OperatorBase { col, feed_list.size())); auto &feed_item = feed_list.at(static_cast(col)); - auto *out_item = out_var->GetMutable(); - if (platform::is_same_place(feed_item.place(), place)) { - out_item->ShareDataWith(feed_item); - } else { - auto *dev_ctx = platform::DeviceContextPool::Instance().Get(place); - framework::TensorCopy(feed_item, place, *dev_ctx, out_item); - } - out_item->set_lod(feed_item.lod()); + FeedVariableVisitor visitor(out_var, place); + boost::apply_visitor(visitor, feed_item); } }; @@ -95,17 +119,17 @@ class FeedOpInfoMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { AddInput("X", - "(vector) A feeding list of LoDTensor, which may have " + "(vector) " + "A feeding list of LoDTensor, which may have " "different dimension and data type."); AddOutput("Out", - "(LoDTensor) The LoDTensor which is a copy of the col-th feeding " + "(LoDTensor) The LoDTensor which is a copy " + "of the col-th feeding " "object."); AddAttr("col", "(int) The column index of current feeding object."); AddComment(R"DOC( Feed Operator. - It should not be configured by users directly. - )DOC"); } }; diff --git a/paddle/fluid/operators/controlflow/fetch_op.cc b/paddle/fluid/operators/controlflow/fetch_op.cc index d86b6b48422d94..99b16d9b692538 100644 --- a/paddle/fluid/operators/controlflow/fetch_op.cc +++ b/paddle/fluid/operators/controlflow/fetch_op.cc @@ -109,6 +109,10 @@ class FetchOp : public framework::OperatorBase { auto &src_item = fetch_var->Get(); auto *dst_item = &(BOOST_GET(framework::LoDTensor, fetch_list->at(col))); DataCopy(src_item, fetch_var_name, dst_item); + } else if (fetch_var->IsType()) { + auto &src_item = fetch_var->Get(); + auto *dst_item = &(BOOST_GET(framework::Vocab, fetch_list->at(col))); + *dst_item = src_item; } else { auto &src_item = fetch_var->Get(); framework::LoDTensorArray tmp(src_item.size()); @@ -128,9 +132,11 @@ class FetchOpInfoMaker : public framework::OpProtoAndCheckerMaker { AddInput("X", "(LoDTensor) The resulted LoDTensor which is expected to return " "to users."); - AddOutput("Out", - "(vector) A fetching list of LoDTensor which may have " - "different dimension, shape and data type."); + AddOutput( + "Out", + "(vector|unordered_map) A fetching list" + " of LoDTensor|unordered_map which may have " + "different dimension, shape and data type."); AddAttr("col", "(int) The column index of fetching object."); AddComment(R"DOC( Fetch Operator. 
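
The LoadCombine changes in the next file run every stored token through NFD and ConvertStrToWstr before inserting it into the Vocab. A standalone sketch of that pair of helpers (the input literal is illustrative):

#include <iostream>
#include <string>

#include "paddle/fluid/framework/string_array.h"

int main() {
  std::string normalized;
  paddle::framework::NFD("Café", &normalized);  // canonical decomposition of UTF-8 input
  std::wstring unicode;
  if (paddle::framework::ConvertStrToWstr(normalized, &unicode)) {
    std::cout << "code points: " << unicode.size() << std::endl;
  }
  return 0;
}
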
diff --git a/paddle/fluid/operators/load_combine_op.h b/paddle/fluid/operators/load_combine_op.h index 589df8821b3e7f..a02b0e61d9278e 100644 --- a/paddle/fluid/operators/load_combine_op.h +++ b/paddle/fluid/operators/load_combine_op.h @@ -21,6 +21,8 @@ limitations under the License. */ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/data_type_transform.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/string_array.h" +#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/platform/device_context.h" namespace paddle { @@ -75,38 +77,57 @@ class LoadCombineOpKernel : public framework::OpKernel { out_vars[i], platform::errors::InvalidArgument( "The variable %s to be loaded cannot be found.", out_var_names[i])); - - auto *tensor = out_vars[i]->GetMutable(); - // Error checking PADDLE_ENFORCE_EQ( static_cast(*buffer), true, platform::errors::Unavailable( "An error occurred while loading model parameters. " "Please check whether the model file is complete or damaged.")); - - // Get data from fin to tensor - DeserializeFromStream(*buffer, tensor, dev_ctx); - - auto in_dtype = tensor->type(); - auto out_dtype = - load_as_fp16 ? framework::proto::VarType::FP16 : in_dtype; - - if (in_dtype != out_dtype) { - // convert to float16 tensor - auto in_kernel_type = framework::OpKernelType(in_dtype, place); - auto out_kernel_type = framework::OpKernelType(out_dtype, place); - framework::LoDTensor fp16_tensor; - // copy LoD info to the new tensor - fp16_tensor.set_lod(tensor->lod()); - framework::TransDataType(in_kernel_type, out_kernel_type, *tensor, - &fp16_tensor); - - // reset output tensor - out_vars[i]->Clear(); - tensor = out_vars[i]->GetMutable(); - tensor->set_lod(fp16_tensor.lod()); - tensor->ShareDataWith(fp16_tensor); + if (out_vars[i]->IsType()) { + auto *tensor = out_vars[i]->GetMutable(); + tensor->clear(); + std::unordered_map data; + framework::StringMapFromStream(*buffer, &data); + for (auto it = data.begin(); it != data.end(); ++it) { + std::string tmp; + framework::NFD(it->first, &tmp); + if (tmp.empty()) { + VLOG(0) << "The string " << it->first + << " was converted to unicode failedly! " + << "Then dropped to load it."; + continue; + } + std::wstring token; + bool status = framework::ConvertStrToWstr(tmp, &token); + if (!status) continue; + tensor->emplace(token, it->second); + } + } else { + auto *tensor = out_vars[i]->GetMutable(); + + // Get data from fin to tensor + DeserializeFromStream(*buffer, tensor, dev_ctx); + + auto in_dtype = tensor->type(); + auto out_dtype = + load_as_fp16 ? framework::proto::VarType::FP16 : in_dtype; + + if (in_dtype != out_dtype) { + // convert to float16 tensor + auto in_kernel_type = framework::OpKernelType(in_dtype, place); + auto out_kernel_type = framework::OpKernelType(out_dtype, place); + framework::LoDTensor fp16_tensor; + // copy LoD info to the new tensor + fp16_tensor.set_lod(tensor->lod()); + framework::TransDataType(in_kernel_type, out_kernel_type, *tensor, + &fp16_tensor); + + // reset output tensor + out_vars[i]->Clear(); + tensor = out_vars[i]->GetMutable(); + tensor->set_lod(fp16_tensor.lod()); + tensor->ShareDataWith(fp16_tensor); + } } } buffer->peek(); diff --git a/paddle/fluid/operators/save_combine_op.h b/paddle/fluid/operators/save_combine_op.h index 939768693a2431..6e6c826a22892d 100644 --- a/paddle/fluid/operators/save_combine_op.h +++ b/paddle/fluid/operators/save_combine_op.h @@ -19,11 +19,13 @@ limitations under the License. 
*/ #include #include #include +#include #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/data_type_transform.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/string_array.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/port.h" @@ -66,34 +68,48 @@ class SaveCombineOpKernel : public framework::OpKernel { inp_vars[i], platform::errors::InvalidArgument("Cannot find variable %s to save.", inp_var_names[i])); - PADDLE_ENFORCE_EQ(inp_vars[i]->IsType(), true, + PADDLE_ENFORCE_EQ(inp_vars[i]->IsType() || + inp_vars[i]->IsType(), + true, platform::errors::InvalidArgument( "SaveCombine operator only supports saving " - "LoDTensor variable, %s has wrong type.", + "LoDTensor or Vocab variable, %s has wrong type.", inp_var_names[i])); - auto &tensor = inp_vars[i]->Get(); - PADDLE_ENFORCE_EQ( - tensor.IsInitialized(), true, - platform::errors::InvalidArgument( - "The Tensor of Variable(%s) to be saved is not initialized.", - inp_var_names[i])); - // Serialize tensors one by one - // Check types to see if a fp16 transformation is required - auto in_dtype = tensor.type(); - auto out_dtype = - save_as_fp16 ? framework::proto::VarType::FP16 : in_dtype; + if (inp_vars[i]->IsType()) { + auto &tensor = inp_vars[i]->Get(); + PADDLE_ENFORCE_EQ( + tensor.IsInitialized(), true, + platform::errors::InvalidArgument( + "The Tensor of Variable(%s) to be saved is not initialized.", + inp_var_names[i])); + // Serialize tensors one by one + // Check types to see if a fp16 transformation is required + auto in_dtype = tensor.type(); + auto out_dtype = + save_as_fp16 ? framework::proto::VarType::FP16 : in_dtype; - if (in_dtype != out_dtype) { - auto in_kernel_type = framework::OpKernelType(in_dtype, place); - auto out_kernel_type = framework::OpKernelType(out_dtype, place); - framework::LoDTensor out; - // copy LoD info to the new tensor - out.set_lod(tensor.lod()); - framework::TransDataType(in_kernel_type, out_kernel_type, tensor, &out); - framework::SerializeToStream(ss, out, dev_ctx); + if (in_dtype != out_dtype) { + auto in_kernel_type = framework::OpKernelType(in_dtype, place); + auto out_kernel_type = framework::OpKernelType(out_dtype, place); + framework::LoDTensor out; + // copy LoD info to the new tensor + out.set_lod(tensor.lod()); + framework::TransDataType(in_kernel_type, out_kernel_type, tensor, + &out); + framework::SerializeToStream(ss, out, dev_ctx); + } else { + framework::SerializeToStream(ss, tensor, dev_ctx); + } } else { - framework::SerializeToStream(ss, tensor, dev_ctx); + auto &tensor = inp_vars[i]->Get(); + std::unordered_map data; + for (auto it = tensor.begin(); it != tensor.end(); ++it) { + std::string t; + framework::ConvertWstrToStr(it->first, &t); + data.emplace(t, it->second); + } + framework::StringMapToStream(ss, data); } } if (save_to_memory) { diff --git a/paddle/fluid/operators/string/CMakeLists.txt b/paddle/fluid/operators/string/CMakeLists.txt new file mode 100644 index 00000000000000..1da2e8e455da0c --- /dev/null +++ b/paddle/fluid/operators/string/CMakeLists.txt @@ -0,0 +1,6 @@ +include(operators) +if(WITH_UNITY_BUILD) + # Load Unity Build rules for operators in paddle/fluid/operators/sequence_ops. 
+ include(unity_build_rule.cmake) +endif() +register_operators(DEPS op_version_registry utf8proc string_array) diff --git a/paddle/fluid/operators/string/faster_tokenizer_op.cc b/paddle/fluid/operators/string/faster_tokenizer_op.cc new file mode 100644 index 00000000000000..49457af8f00c80 --- /dev/null +++ b/paddle/fluid/operators/string/faster_tokenizer_op.cc @@ -0,0 +1,524 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "paddle/fluid/framework/string_array.h" +#include "paddle/fluid/operators/string/faster_tokenizer_op.h" + +namespace paddle { +namespace operators { + +using std::bad_cast; +using std::codecvt_utf8; +using std::endl; +using std::exception; +using std::ifstream; +using std::int64_t; +using std::min; +using std::runtime_error; +using std::unordered_map; +using std::unordered_set; +using std::shared_ptr; +using std::size_t; +using std::int64_t; +using std::string; +using std::vector; +using std::wstring; + +const wstring kStripChars = L" \t\n\r\v\f"; + +inline bool IsControl(const wchar_t& ch) { + if (ch == L'\t' || ch == L'\n' || ch == L'\r') return false; + auto cat = utf8proc_category(ch); + if (cat == UTF8PROC_CATEGORY_CC || cat == UTF8PROC_CATEGORY_CF) return true; + return false; +} + +inline bool IsChineseChar(const wchar_t& ch) { + if ((ch >= 0x4E00 && ch <= 0x9FFF) || (ch >= 0x3400 && ch <= 0x4DBF) || + (ch >= 0x20000 && ch <= 0x2A6DF) || (ch >= 0x2A700 && ch <= 0x2B73F) || + (ch >= 0x2B740 && ch <= 0x2B81F) || (ch >= 0x2B820 && ch <= 0x2CEAF) || + (ch >= 0xF900 && ch <= 0xFAFF) || (ch >= 0x2F800 && ch <= 0x2FA1F)) + return true; + return false; +} + +inline bool IsWhiteSpace(const wchar_t& ch) { + if (ch == L' ' || ch == L'\t' || ch == L'\n' || ch == L'\r') return true; + auto cat = utf8proc_category(ch); + if (cat == UTF8PROC_CATEGORY_ZS) return true; + return false; +} + +inline bool IsPunctuation(const wchar_t& ch) { + if ((ch >= 33 && ch <= 47) || (ch >= 58 && ch <= 64) || + (ch >= 91 && ch <= 96) || (ch >= 123 && ch <= 126)) + return true; + auto cat = utf8proc_category(ch); + if (cat == UTF8PROC_CATEGORY_PD || cat == UTF8PROC_CATEGORY_PS || + cat == UTF8PROC_CATEGORY_PE || cat == UTF8PROC_CATEGORY_PC || + cat == UTF8PROC_CATEGORY_PO // sometimes ¶ belong SO + || cat == UTF8PROC_CATEGORY_PI || cat == UTF8PROC_CATEGORY_PF) + return true; + return false; +} + +BasicTokenizer::BasicTokenizer(bool do_lower_case /* = true */) + : do_lower_case_(do_lower_case) {} + +wchar_t BasicTokenizer::do_lower_case(wchar_t ch) const { + wchar_t new_ch = utf8proc_tolower(ch); + return new_ch; +} + +void BasicTokenizer::Tokenize(const string& text, vector* res) const { + std::wstring unicode_text; + bool status = framework::ConvertStrToWstr(text, &unicode_text); + if (!status) { + // String is converted into wstring failedly. 
+ return; + } + + std::wstring dest_text; + for (auto ch : unicode_text) { + if (ch == 0 || ch == 0xfffd || IsControl(ch)) { + continue; + } + if (do_lower_case_) { + ch = do_lower_case(ch); + } + if (IsChineseChar(ch) || IsPunctuation(ch)) { + dest_text += ' '; + dest_text += ch; + dest_text += ' '; + } else if (IsWhiteSpace(ch)) { + dest_text += ' '; + } else { + dest_text += ch; + } + } + boost::split(*res, dest_text, boost::is_any_of(kStripChars)); +} + +WordPieceTokenizer::WordPieceTokenizer( + framework::Vocab* vocab, const wstring& unk_token /* = L"[UNK]"*/, + const size_t max_input_chars_per_word /* = 100 */) + : vocab_(vocab), + unk_token_(unk_token), + max_input_chars_per_word_(max_input_chars_per_word) { + unk_token_id_ = (*vocab_)[unk_token_]; +} + +void WordPieceTokenizer::Tokenize(const wstring& text, + vector* token_ids) const { + size_t len = text.size(); + if (len > max_input_chars_per_word_) { + token_ids->emplace_back(std::move(unk_token_id_)); + return; + } + + auto it = vocab_->find(text); + if (it != vocab_->end()) { + token_ids->emplace_back(std::move(it->second)); + return; + } + + size_t start = 0; + vector wordpiece_ids; + while (start < len) { + size_t end = len; + std::wstring cur_substr; + int64_t cur_substr_id; + while (start < end) { + std::wstring sub = text.substr(start, end - start); + if (start > 0) { + sub = L"##" + sub; + } + auto it = vocab_->find(sub); + if (it != vocab_->end()) { + cur_substr = sub; + cur_substr_id = it->second; + break; + } + end -= 1; + } + + if (cur_substr.empty()) { + token_ids->emplace_back(std::move(unk_token_id_)); + return; + } else { + start = end; + wordpiece_ids.emplace_back(std::move(cur_substr_id)); + } + } + for (auto& token_id : wordpiece_ids) { + token_ids->emplace_back(std::move(token_id)); + } +} + +BertTokenizer::BertTokenizer(framework::Vocab* vocab, + bool do_lower_case /* = false */, + const wstring& unk_token /* = L"[UNK]" */, + const wstring& pad_token /* = L"[PAD]" */, + const wstring& cls_token /* = L"[CLS]" */, + const wstring& mask_token /* = L"[MASK]" */, + const wstring& sep_token /* = L"[SEP]" */, + const string& padding_site /* = "right" */) + : do_lower_case_(do_lower_case), + unk_token_(unk_token), + pad_token_(pad_token), + cls_token_(cls_token), + mask_token_(mask_token), + sep_token_(sep_token), + padding_site_(padding_site), + vocab_(vocab), + basic_tokenizer_(do_lower_case_), + word_piece_tokenizer_(vocab_, unk_token) { + unk_token_id_ = (*vocab_)[unk_token_]; + pad_token_id_ = (*vocab_)[pad_token_]; + cls_token_id_ = (*vocab_)[cls_token_]; + mask_token_id_ = (*vocab_)[mask_token_]; + sep_token_id_ = (*vocab_)[sep_token_]; + + all_special_tokens_ = vector( + {unk_token_, pad_token_, cls_token_, mask_token_, sep_token_}); + all_special_token_ids_ = + unordered_set({unk_token_id_, pad_token_id_, cls_token_id_, + mask_token_id_, sep_token_id_}); +} + +void BertTokenizer::Tokenize(const string& text, + vector* split_token_ids) const { + std::vector tmp_tokens; + basic_tokenizer_.Tokenize(text, &tmp_tokens); + if (tmp_tokens.empty()) return; + split_token_ids->reserve(tmp_tokens.size()); + for (auto& w_token : tmp_tokens) { + const auto& vec_size = w_token.size(); + if (vec_size == 1) { + if (IsChineseChar(w_token[0])) { + auto vocab_it = vocab_->find(w_token); + if (vocab_it != vocab_->end()) { + split_token_ids->emplace_back(std::move(vocab_it->second)); + } else { + split_token_ids->emplace_back(std::move(unk_token_id_)); + } + } else { + word_piece_tokenizer_.Tokenize(w_token, split_token_ids); + 
} + } else if (vec_size > 1) { + word_piece_tokenizer_.Tokenize(w_token, split_token_ids); + } else { + continue; + } + } +} + +void BertTokenizer::BuildInputsWithSpecialTokens( + vector* inputs, const vector& token_ids_0, + const vector& token_ids_1 /* = vector() */) const { + if (token_ids_1.size() == 0) { + inputs->clear(); + inputs->resize(token_ids_0.size() + 2); + inputs->at(0) = std::move(cls_token_id_); + size_t i = 1; + for (auto& token_id : token_ids_0) { + inputs->at(i) = std::move(token_id); + ++i; + } + inputs->at(i) = std::move(sep_token_id_); + } else { + inputs->clear(); + inputs->resize(token_ids_0.size() + token_ids_1.size() + 3); + inputs->at(0) = std::move(cls_token_id_); + size_t i = 1; + for (auto& token_id : token_ids_0) { + inputs->at(i) = std::move(token_id); + ++i; + } + inputs->at(i) = std::move(sep_token_id_); + ++i; + for (auto& token_id : token_ids_1) { + inputs->at(i) = std::move(token_id); + ++i; + } + inputs->at(i) = std::move(sep_token_id_); + } +} + +int64_t BertTokenizer::GetNumSpecialTokensToAdd(const bool pair) const { + if (pair) { + return 3; + } else { + return 2; + } +} + +void BertTokenizer::CreateTokenTypeIdsFromSequences( + vector* token_type_ids, const vector& token_ids_0, + const vector& token_ids_1 /* = vector() */) const { + if (token_ids_1.size() == 0) { + vector tmp(token_ids_0.size() + 2, 0); + token_type_ids->swap(tmp); + } else { + vector tmp(token_ids_0.size() + token_ids_1.size() + 3, 0); + for (size_t i = token_ids_0.size() + 2; i < tmp.size(); i++) { + tmp[i] = 1; + } + token_type_ids->swap(tmp); + } +} + +void BertTokenizer::TruncateSequence( + vector* ids, vector* pair_ids, + const size_t num_tokens_to_remove /* = 0 */, + const size_t stride /* = 0 */) const { + for (size_t i = 0; i < num_tokens_to_remove; i++) { + if ((pair_ids->size() == 0) || (ids->size() > pair_ids->size())) { + ids->pop_back(); + } else { + pair_ids->pop_back(); + } + } +} + +int64_t BertTokenizer::GetPadTokenID() const { return pad_token_id_; } + +int BertTokenizer::Encode( + unordered_map>* encoded_inputs, const string& text, + const string& text_pair /* = "" */, bool is_split_into_words /* = false */, + const size_t max_seq_len /* = 0 */, + bool pad_to_max_seq_len /* = false */) const { + vector ids; + vector pair_ids; + if (!is_split_into_words) { + Tokenize(text, &ids); + if (ids.empty()) return 0; + if (text_pair != "") { + Tokenize(text_pair, &pair_ids); + if (pair_ids.empty()) return 0; + } + } else { + std::wstring unicode_text; + bool status_a = framework::ConvertStrToWstr(text, &unicode_text); + if (!status_a) { + return 0; + } + for (size_t i = 0; i < unicode_text.size(); i++) { + wstring token = unicode_text.substr(i, 1); + auto it = vocab_->find(token); + if (it != vocab_->end()) { + ids.emplace_back(std::move(it->second)); + } else { + ids.emplace_back(std::move(unk_token_id_)); + } + } + } + + bool pair = false; + if (pair_ids.size() != 0) { + pair = true; + } + + size_t len_ids = ids.size(); + size_t len_pair_ids = pair_ids.size(); + + // Truncation: Handle max sequence length + // If max_seq_len == 0, then do nothing and keep the real length. + // If max_seq_len > 0 and + // all the input sequence len is over the max_seq_len, + // then we truncate it. 
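+  // Worked example: for a sequence pair with len_ids = 10, len_pair_ids = 8 and
+  // max_seq_len = 16, total_len = 10 + 8 + 3 = 21, so TruncateSequence removes
+  // 5 ids before the special tokens are inserted below.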
+ size_t total_len = len_ids + len_pair_ids + GetNumSpecialTokensToAdd(pair); + if (max_seq_len > 0 && total_len > max_seq_len) { + TruncateSequence(&ids, &pair_ids, total_len - max_seq_len); + } + + // Add special tokens + vector sequence; + BuildInputsWithSpecialTokens(&sequence, ids, pair_ids); + size_t seq_len = sequence.size(); + vector token_type_ids; + CreateTokenTypeIdsFromSequences(&token_type_ids, ids, pair_ids); + + // Build output dictionnary + encoded_inputs->emplace("input_ids", sequence); + encoded_inputs->emplace("token_type_ids", token_type_ids); + // Check lengths + if (max_seq_len > 0 && seq_len > max_seq_len) { + VLOG(3) << "There is something wrong with the input sequence length." + " Please check it."; + // Failed. + return 0; + } + + // Padding + bool needs_to_be_padded = false; + if (pad_to_max_seq_len && max_seq_len > 0 && (seq_len < max_seq_len)) { + needs_to_be_padded = true; + } + + if (needs_to_be_padded) { + int64_t difference = max_seq_len - seq_len; + size_t pad_start = max_seq_len - 1 - difference; + encoded_inputs->at("token_type_ids").resize(max_seq_len); + for (size_t i = max_seq_len - 1; i > pad_start; i--) { + encoded_inputs->at("token_type_ids")[i] = pad_token_id_; + } + + encoded_inputs->at("input_ids").resize(max_seq_len); + for (size_t i = max_seq_len - 1; i > pad_start; i--) { + encoded_inputs->at("input_ids")[i] = pad_token_id_; + } + } + return 1; +} + +void BertTokenizer::BatchEncode( + vector>>* batch_encode_inputs, + const vector& batch_text, + const vector& batch_text_pair /* = vector() */, + bool is_split_into_words /* = false */, const size_t max_seq_len /* = 0 */, + bool pad_to_max_seq_len /* = false */) const { + bool has_text_pair = false; + if (batch_text_pair.size() != 0) { + has_text_pair = true; + } + + size_t batch_size = batch_text.size(); +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (size_t i = 0; i < batch_size; i++) { + unordered_map> res; + if (has_text_pair) { + auto status = + Encode(&res, batch_text[i], batch_text_pair[i], is_split_into_words, + max_seq_len, pad_to_max_seq_len); + if (!status) { + res["input_ids"] = + std::vector{cls_token_id_, sep_token_id_, cls_token_id_}; + res["token_type_ids"] = std::vector{0, 0, 1}; + } + } else { + auto status = Encode(&res, batch_text[i], {}, is_split_into_words, + max_seq_len, pad_to_max_seq_len); + + if (!status) { + res["input_ids"] = std::vector{cls_token_id_, sep_token_id_}; + res["token_type_ids"] = std::vector{0, 0}; + } + } + batch_encode_inputs->at(i) = std::move(res); + } +} + +class FasterTokenizerOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("Text"), "Input", "Text", "Tokenizer"); + OP_INOUT_CHECK(ctx->HasInput("Vocab"), "Input", "Vocab", "Tokenizer"); + OP_INOUT_CHECK(ctx->HasOutput("InputIds"), "Output", "InputIds", + "Tokenizer"); + OP_INOUT_CHECK(ctx->HasOutput("SegmentIds"), "Output", "SegmentIds", + "Tokenizer"); + + ctx->SetOutputDim("InputIds", {-1, -1}); + ctx->SetOutputDim("SegmentIds", {-1, -1}); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(framework::proto::VarType::INT64, + paddle::platform::CPUPlace()); + } + + framework::OpKernelType GetKernelTypeForVar( + const std::string& var_name, const framework::Tensor& tensor, + const framework::OpKernelType& 
expected_kernel_type) const override { + return framework::OpKernelType(expected_kernel_type.data_type_, + expected_kernel_type.place_, + tensor.layout()); + } +}; + +class FasterTokenizerOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("Vocab", + "(std::map), The vocab to map " + "token string to token id."); + AddInput("Text", + "(std::vector), The sequence to be processed. " + "One sequence is a string, a list of strings, " + "or a list of integers depending on whether it " + "has been pretokenized and converted to ids. "); + AddInput("TextPair", + "(std::vector), Same as `text` argument, " + "while it represents for the latter sequence of the " + "sequence pair.") + .AsDispensable(); + AddOutput("InputIds", "(Tensor), The token ids of the input text."); + AddOutput("SegmentIds", "(Tensor), The segments ids of the input text."); + AddAttr( + "do_lower_case", + "(bool), Whether or not to lowercase the input when tokenizing.") + .SetDefault(false); + AddAttr( + "is_split_into_words", + "(bool), Whether or not the input is already pre-tokenized " + "(e.g., split into words). If set to True, the tokenizer " + "assumes the input is already split into words (for instance, " + "by splitting it on whitespace) which it will tokenize. This " + "is useful for NER or token classification.") + .SetDefault(false); + AddAttr("max_seq_len", + "(int), If set to a positive number, will limit the " + "total sequence returned so that it has a maximum length." + " If there are overflowing tokens, those overflowing " + "tokens will be added to the returned dictionary when " + "`return_overflowing_tokens` is `True`.") + .SetDefault(0); + AddAttr("pad_to_max_seq_len", + "(bool), If set to `True`, the returned sequences would be" + " padded up to `max_seq_len` specified length according to" + " padding side and padding token id.") + .SetDefault(false); + AddComment(R"DOC(Performs tokenization and uses the tokenized tokens to " + "prepare model inputs. It supports sequence or sequence pair as input, " + "and batch input is not allowed.)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(faster_tokenizer, ops::FasterTokenizerOp, + ops::FasterTokenizerOpMaker); + +REGISTER_OP_CPU_KERNEL(faster_tokenizer, ops::FasterTokenizerKernel); diff --git a/paddle/fluid/operators/string/faster_tokenizer_op.h b/paddle/fluid/operators/string/faster_tokenizer_op.h new file mode 100755 index 00000000000000..d9b7fa26a6704b --- /dev/null +++ b/paddle/fluid/operators/string/faster_tokenizer_op.h @@ -0,0 +1,196 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include + +#include +#include +#include +#include + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/string_array.h" + +namespace paddle { +namespace operators { + +using std::endl; +using std::int64_t; +using std::size_t; +using std::string; +using std::shared_ptr; +using std::vector; +using std::unordered_map; +using std::unordered_set; +using std::vector; +using std::wstring; +using std::wcout; + +inline bool IsControl(const wchar_t& ch); +inline bool IsChineseChar(const wchar_t& ch); +inline bool IsWhiteSpace(const wchar_t& ch); + +using Vocab = unordered_map; +using InvVocab = unordered_map; + +class BasicTokenizer { + public: + explicit BasicTokenizer(bool do_lower_case = true); + void Tokenize(const string& text, vector* res) const; + + private: + wchar_t do_lower_case(wchar_t ch) const; + + bool do_lower_case_; +}; + +class WordPieceTokenizer { + public: + explicit WordPieceTokenizer(framework::Vocab* vocab, + const wstring& unk_token = L"[UNK]", + const size_t max_input_chars_per_word = 100); + void Tokenize(const wstring& text, vector* output) const; + + private: + framework::Vocab* vocab_; + wstring unk_token_{L"[UNK]"}; + int64_t unk_token_id_; + size_t max_input_chars_per_word_; +}; + +class BertTokenizer { + public: + explicit BertTokenizer(framework::Vocab* vocab, bool do_lower_case = false, + const wstring& unk_token = L"[UNK]", + const wstring& pad_token = L"[PAD]", + const wstring& cls_token = L"[CLS]", + const wstring& mask_token = L"[MASK]", + const wstring& sep_token = L"[SEP]", + const string& padding_site = "right"); + + void Tokenize(const string& text, vector* split_tokens) const; + void BuildInputsWithSpecialTokens( + vector* res, const vector& token_ids_0, + const vector& token_ids_1 = vector()) const; + void CreateTokenTypeIdsFromSequences( + vector* token_type_ids, const vector& token_ids_0, + const vector& token_ids_1 = vector()) const; + void TruncateSequence(vector* ids, vector* pair_ids, + const size_t num_tokens_to_remove = 0, + const size_t stride = 0) const; + int64_t GetNumSpecialTokensToAdd(const bool pair = false) const; + int Encode(unordered_map>* encoded_inputs, + const string& text, const string& text_pair = "", + bool is_split_into_words = false, const size_t max_seq_len = 0, + bool pad_to_max_seq_len = false) const; + void BatchEncode( + vector>>* batch_encode_inputs, + const vector& batch_text, + const vector& batch_text_pair = vector(), + bool is_split_into_words = false, const size_t max_seq_len = 0, + bool pad_to_max_seq_len = false) const; + + int64_t GetPadTokenID() const; + + private: + bool do_lower_case_; + wstring unk_token_, pad_token_, cls_token_, mask_token_, sep_token_; + string padding_site_; + framework::Vocab* vocab_; + BasicTokenizer basic_tokenizer_; + WordPieceTokenizer word_piece_tokenizer_; + int64_t unk_token_id_, cls_token_id_, mask_token_id_, pad_token_id_, + sep_token_id_; + vector all_special_tokens_; + unordered_set all_special_token_ids_; + InvVocab inv_vocab_; +}; + +template +class FasterTokenizerKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* text = ctx.Input("Text"); + auto* vocab = ctx.Input("Vocab"); + + auto* input_ids = ctx.Output("InputIds"); + auto* seg_ids = ctx.Output("SegmentIds"); + + auto do_lower_case = static_cast(ctx.Attr("do_lower_case")); + auto is_split_into_words = + static_cast(ctx.Attr("is_split_into_words")); + auto max_seq_len = 
static_cast(ctx.Attr("max_seq_len")); + auto pad_to_max_seq_len = + static_cast(ctx.Attr("pad_to_max_seq_len")); + + auto* text_pair = ctx.Input("TextPair"); + if (text_pair && text->size() != text_pair->size()) { + VLOG(3) << "The input text(list[str]) and text pair (list[str]) must" + << "be the same number of text sequence. Please check the input!"; + return; + } + + BertTokenizer* tokenizer_ptr = + new BertTokenizer(const_cast(vocab), do_lower_case); + size_t batch_max_seq_len = 0; + size_t batch_size = text->size(); + + vector>> batch_encode_inputs( + batch_size); + if (text_pair) { + tokenizer_ptr->BatchEncode(&batch_encode_inputs, *text, *text_pair, + is_split_into_words, max_seq_len, + pad_to_max_seq_len); + } else { + tokenizer_ptr->BatchEncode(&batch_encode_inputs, *text, vector(), + is_split_into_words, max_seq_len, + pad_to_max_seq_len); + } + + for (size_t i = 0; i < batch_size; ++i) { + size_t seq_len = batch_encode_inputs[i]["input_ids"].size(); + if (seq_len > batch_max_seq_len) { + batch_max_seq_len = seq_len; + } + } + + input_ids->Resize( + framework::make_ddim({static_cast(batch_size), + static_cast(batch_max_seq_len)})); + auto* input_ids_data = input_ids->mutable_data(ctx.GetPlace()); + seg_ids->Resize( + framework::make_ddim({static_cast(batch_size), + static_cast(batch_max_seq_len)})); + auto* seg_ids_data = seg_ids->mutable_data(ctx.GetPlace()); + + auto pad_token_id = tokenizer_ptr->GetPadTokenID(); + for (size_t i = 0; i < batch_size; i++) { + auto& encoder_input_ids = batch_encode_inputs[i]["input_ids"]; + auto& encoder_seg_ids = batch_encode_inputs[i]["token_type_ids"]; + const size_t& seq_len = encoder_input_ids.size(); + // Copy the memory + std::memcpy(input_ids_data + i * batch_max_seq_len, + encoder_input_ids.data(), seq_len * sizeof(T)); + std::memcpy(seg_ids_data + i * batch_max_seq_len, encoder_seg_ids.data(), + seq_len * sizeof(T)); + std::memset(input_ids_data + i * batch_max_seq_len + seq_len, + pad_token_id, (batch_max_seq_len - seq_len) * sizeof(T)); + std::memset(seg_ids_data + i * batch_max_seq_len + seq_len, pad_token_id, + (batch_max_seq_len - seq_len) * sizeof(T)); + } + delete tokenizer_ptr; + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/string/unity_build_rule.cmake b/paddle/fluid/operators/string/unity_build_rule.cmake new file mode 100644 index 00000000000000..a4b209d2df13e6 --- /dev/null +++ b/paddle/fluid/operators/string/unity_build_rule.cmake @@ -0,0 +1,8 @@ +# This file records the Unity Build compilation rules. +# The source files in a `register_unity_group` called are compiled in a unity +# file. +# Generally, the combination rules in this file do not need to be modified. +# If there are some redefined error in compiling with the source file which +# in combination rule, you can remove the source file from the following rules. 
+register_unity_group(cc + faster_tokenizer_op.cc) \ No newline at end of file diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index f94afaa56b8dfd..8b01f02ee2c3a6 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -1875,6 +1875,12 @@ void BindImperative(py::module *m_ptr) { } else if (self.Var().IsType()) { return framework::vectorize( self.Var().Get().value().dims()); + } else if (self.Var().IsType()) { + return std::vector{static_cast( + self.Var().Get().size())}; + } else if (self.Var().IsType()) { + return std::vector{ + static_cast(self.Var().Get().size())}; } else { VLOG(2) << "It is meaningless to get shape of " "variable type " diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc index e02f25ff636a29..5193724ecedf5d 100644 --- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -185,6 +185,18 @@ void ZeroCopyTensorCreate( tensor.copy_from_cpu(static_cast(data.data())); } +/// \brief Experimental interface. +/// Create the Strings tensor from data. +/// \param tensor The tensor will be created and +/// the tensor value is same as data. +/// \param data The input text. +void ZeroCopyStringTensorCreate(ZeroCopyTensor &tensor, // NOLINT + const paddle_infer::Strings *data) { + size_t shape = data->size(); + tensor.ReshapeStrings(shape); + tensor.copy_strings_from_cpu(data); +} + template void PaddleInferTensorCreate( paddle_infer::Tensor &tensor, // NOLINT @@ -195,6 +207,19 @@ void PaddleInferTensorCreate( tensor.CopyFromCpu(static_cast(data.data())); } +/// \brief Experimental interface. +/// Create the Strings tensor from data. +/// \param tensor The tensor will be created and +/// the tensor value is same as data. +/// \param data The input text. 
+void PaddleInferStringTensorCreate(paddle_infer::Tensor &tensor, // NOLINT + const paddle_infer::Strings *data) { + VLOG(3) << "Create PaddleInferTensor, dtype = Strings "; + size_t shape = data->size(); + tensor.ReshapeStrings(shape); + tensor.CopyStringsFromCpu(data); +} + size_t PaddleGetDTypeSize(PaddleDType dt) { size_t size{0}; switch (dt) { @@ -726,11 +751,15 @@ void BindPaddleInferPredictor(py::module *m) { void BindZeroCopyTensor(py::module *m) { py::class_(*m, "ZeroCopyTensor") - .def("reshape", &ZeroCopyTensor::Reshape) + .def("reshape", py::overload_cast &>( + &ZeroCopyTensor::Reshape)) + .def("reshape", py::overload_cast( + &paddle_infer::Tensor::ReshapeStrings)) .def("copy_from_cpu", &ZeroCopyTensorCreate) .def("copy_from_cpu", &ZeroCopyTensorCreate) .def("copy_from_cpu", &ZeroCopyTensorCreate) .def("copy_from_cpu", &ZeroCopyTensorCreate) + .def("copy_from_cpu", &ZeroCopyStringTensorCreate) .def("copy_to_cpu", &ZeroCopyTensorToNumpy) .def("shape", &ZeroCopyTensor::shape) .def("set_lod", &ZeroCopyTensor::SetLoD) @@ -740,12 +769,16 @@ void BindZeroCopyTensor(py::module *m) { void BindPaddleInferTensor(py::module *m) { py::class_(*m, "PaddleInferTensor") - .def("reshape", &paddle_infer::Tensor::Reshape) + .def("reshape", py::overload_cast &>( + &paddle_infer::Tensor::Reshape)) + .def("reshape", py::overload_cast( + &paddle_infer::Tensor::ReshapeStrings)) .def("copy_from_cpu_bind", &PaddleInferTensorCreate) .def("copy_from_cpu_bind", &PaddleInferTensorCreate) .def("copy_from_cpu_bind", &PaddleInferTensorCreate) .def("copy_from_cpu_bind", &PaddleInferTensorCreate) + .def("copy_from_cpu_bind", &PaddleInferStringTensorCreate) .def("copy_to_cpu", &PaddleInferTensorToNumpy) .def("shape", &paddle_infer::Tensor::shape) .def("set_lod", &paddle_infer::Tensor::SetLoD) diff --git a/paddle/fluid/pybind/op_function_generator.cc b/paddle/fluid/pybind/op_function_generator.cc index 01d101909b549b..d031709b765811 100644 --- a/paddle/fluid/pybind/op_function_generator.cc +++ b/paddle/fluid/pybind/op_function_generator.cc @@ -68,6 +68,7 @@ std::map> op_ins_map = { {"sparse_momentum", {"Param", "Grad", "Velocity", "Index", "LearningRate"}}, {"rnn", {"Input", "PreState", "WeightList", "SequenceLength"}}, {"run_program", {"X", "Params"}}, + {"faster_tokenizer", {"Text", "Vocab", "TextPair"}}, {"matrix_rank", {"X", "TolTensor"}}, {"adam", {"Param", "Grad", "LearningRate", "Moment1", "Moment2", "Beta1Pow", diff --git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc index 99607d7f9750f5..984f3d1a31cce4 100644 --- a/paddle/fluid/pybind/protobuf.cc +++ b/paddle/fluid/pybind/protobuf.cc @@ -227,7 +227,10 @@ void BindVarDsec(pybind11::module *m) { .value("LOD_TENSOR_ARRAY", pd::proto::VarType::LOD_TENSOR_ARRAY) .value("PLACE_LIST", pd::proto::VarType::PLACE_LIST) .value("READER", pd::proto::VarType::READER) - .value("RAW", pd::proto::VarType::RAW); + .value("RAW", pd::proto::VarType::RAW) + .value("STRING", pd::proto::VarType::STRING) + .value("STRINGS", pd::proto::VarType::STRINGS) + .value("VOCAB", pd::proto::VarType::VOCAB); } void BindOpDesc(pybind11::module *m) { diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index f58c2a5db381c7..529e7c6dab8ceb 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -1239,6 +1239,18 @@ All parameter, weight, gradient are variables in Paddle. 
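      // The set_vocab/set_string_list and get_map_tensor/get_string_tensor bindings
      // added below are what the dygraph.jit change later in this patch relies on,
      // roughly:
      //   scr_tensor = param_or_buffer.value().get_map_tensor()
      //   tgt_var = scope.var(param_or_buffer.name)
      //   tgt_var.set_vocab(scr_tensor)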
[](Variable &self) { return py::bytes(*self.GetMutable()); }) + .def("set_string_list", + [](Variable &self, Strings str_list) { + *self.GetMutable() = str_list; + }) + .def("set_vocab", [](Variable &self, + Vocab vocab) { *self.GetMutable() = vocab; }) + .def("get_string_tensor", + [](Variable &self) { return self.GetMutable(); }, + py::return_value_policy::reference) + .def("get_map_tensor", + [](Variable &self) { return self.GetMutable(); }, + py::return_value_policy::reference) .def("get_lod_rank_table", [](Variable &self) { return self.GetMutable(); }, py::return_value_policy::reference) @@ -1872,20 +1884,20 @@ All parameter, weight, gradient are variables in Paddle. .def("__str__", string::to_string); py::class_(m, "Operator") - .def_static("create", - [](py::bytes protobin) { - proto::OpDesc desc; - PADDLE_ENFORCE_EQ(desc.ParsePartialFromString(protobin), - true, - platform::errors::InvalidArgument( - "Cannot parse user input to OpDesc")); - PADDLE_ENFORCE_EQ(desc.IsInitialized(), true, - platform::errors::InvalidArgument( - "The provided OpDesc is not " - "initialized, the reason is: %s", - desc.InitializationErrorString())); - return OpRegistry::CreateOp(desc); - }) + .def_static( + "create", + [](py::bytes protobin) { + proto::OpDesc desc; + PADDLE_ENFORCE_EQ(desc.ParsePartialFromString(protobin), true, + platform::errors::InvalidArgument( + "Cannot parse user input to OpDesc")); + PADDLE_ENFORCE_EQ( + desc.IsInitialized(), true, + platform::errors::InvalidArgument( + "The provided OpDesc is not initialized, the reason is: %s", + desc.InitializationErrorString())); + return OpRegistry::CreateOp(desc); + }) .def("run", [](OperatorBase &self, const Scope &scope, const platform::CPUPlace &place) { @@ -2139,7 +2151,12 @@ All parameter, weight, gradient are variables in Paddle. }); #endif - m.def("set_feed_variable", framework::SetFeedVariable); + m.def("set_feed_variable", + static_cast(&framework::SetFeedVariable)); + m.def("set_feed_variable", + static_cast(&framework::SetFeedVariable)); m.def("get_fetch_variable", [](const Scope &scope, const std::string &var_name, size_t index) -> py::object { diff --git a/python/paddle/fluid/dygraph/jit.py b/python/paddle/fluid/dygraph/jit.py index d41c373bf50938..2db9fb5d76a587 100644 --- a/python/paddle/fluid/dygraph/jit.py +++ b/python/paddle/fluid/dygraph/jit.py @@ -799,12 +799,17 @@ def fun(inputs): # 3. 
share parameters from Layer to scope & record var info for param_or_buffer in concrete_program.parameters: # share to scope - param_or_buffer_tensor = scope.var( - param_or_buffer.name).get_tensor() - #src_tensor = param_or_buffer.value().get_tensor() - src_tensor = state_var_dict[param_or_buffer.name].value( - ).get_tensor() - param_or_buffer_tensor._share_data_with(src_tensor) + if param_or_buffer.type == core.VarDesc.VarType.VOCAB: + scr_tensor = param_or_buffer.value().get_map_tensor() + tgt_var = scope.var(param_or_buffer.name) + tgt_var.set_vocab(scr_tensor) + else: + param_or_buffer_tensor = scope.var( + param_or_buffer.name).get_tensor() + #src_tensor = param_or_buffer.value().get_tensor() + src_tensor = state_var_dict[param_or_buffer.name].value( + ).get_tensor() + param_or_buffer_tensor._share_data_with(src_tensor) # record var info if param_or_buffer.name not in extra_var_info: extra_info_dict = dict() diff --git a/python/paddle/fluid/dygraph/layers.py b/python/paddle/fluid/dygraph/layers.py index e4b6bc01034268..694f9dc25e80c5 100644 --- a/python/paddle/fluid/dygraph/layers.py +++ b/python/paddle/fluid/dygraph/layers.py @@ -1409,13 +1409,22 @@ def _check_match(key, param): if state is None: raise ValueError("{} is not found in the provided dict.".format( key)) - state_shape = state.shape() if inspect.ismethod( - state.shape) else state.shape - if list(state_shape) != list(param.shape): - raise ValueError( - "{} receives a shape {}, but the expected shape is {}.". - format(key, list(state_shape), list(param.shape))) - return param, state + if (isinstance(state, dict) or isinstance(state, list)): + if (len(state) != len(param)): + raise ValueError("{} receieves the length of {}, " + "but the expected shape is {}".format( + key, len(state), len(param))) + else: + return param, state + else: + state_shape = state.shape() if inspect.ismethod( + state.shape) else state.shape + + if list(state_shape) != list(param.shape): + raise ValueError( + "{} receives a shape {}, but the expected shape is {}.". 
+ format(key, list(state_shape), list(param.shape))) + return param, state matched_param_state = [] for key, param in self.state_dict().items(): diff --git a/python/paddle/fluid/dygraph/math_op_patch.py b/python/paddle/fluid/dygraph/math_op_patch.py index b92e54d4868dfe..3731976ad18ab7 100644 --- a/python/paddle/fluid/dygraph/math_op_patch.py +++ b/python/paddle/fluid/dygraph/math_op_patch.py @@ -133,7 +133,12 @@ def _int_(var): return int(var.numpy().flatten()[0]) def _len_(var): - return var.shape[0] + if var.type == core.VarDesc.VarType.VOCAB: + return len(var.value().get_map_tensor()) + elif var.type == core.VarDesc.VarType.STRINGS: + return len(var.value().get_string_tensor()) + else: + return var.shape[0] def _index_(var): numel = np.prod(var.shape) diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py index 9d8b1500d5b02f..e2fd36448ba654 100644 --- a/python/paddle/fluid/dygraph/varbase_patch_methods.py +++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py @@ -146,25 +146,35 @@ def set_value(self, value): out = linear(t) # call with different weight """ - assert isinstance(value, (np.ndarray, core.VarBase)), \ - "Variable set_value function, arguments type only support Variable, numpy, VarBase" - - value_np = value - if isinstance(value, core.VarBase): - value_np = value.numpy() + assert isinstance(value, (np.ndarray, core.VarBase, dict, str)), \ + "Variable set_value function, arguments type only support Variable, numpy, VarBase, dict, string." + + if isinstance(value, (dict, str)): + assert len(self) == len( + value + ), "Variable length not match, Variable [ {} ] need tensor with length {} but load set tensor with length {}".format( + self.name, len(self), len(value)) + if isinstance(value, dict): + self.value().set_vocab(value) + else: + self.value().set_string_list(value) + else: + value_np = value + if isinstance(value, core.VarBase): + value_np = value.numpy() - self_tensor_np = self.numpy() + self_tensor_np = self.numpy() - assert self_tensor_np.shape == value_np.shape, \ - "Variable Shape not match, Variable [ {} ] need tensor with shape {} but load set tensor with shape {}".format( - self.name, self_tensor_np.shape, value_np.shape) + assert self_tensor_np.shape == value_np.shape, \ + "Variable Shape not match, Variable [ {} ] need tensor with shape {} but load set tensor with shape {}".format( + self.name, self_tensor_np.shape, value_np.shape) - assert self_tensor_np.dtype == value_np.dtype, \ - "Variable dtype not match, Variable [ {} ] need tensor with dtype {} but load tensor with dtype {}".format( - self.name, self_tensor_np.dtype, value_np.dtype) + assert self_tensor_np.dtype == value_np.dtype, \ + "Variable dtype not match, Variable [ {} ] need tensor with dtype {} but load tensor with dtype {}".format( + self.name, self_tensor_np.dtype, value_np.dtype) - self.value().get_tensor().set(value_np, - framework._current_expected_place()) + self.value().get_tensor().set(value_np, + framework._current_expected_place()) @framework.dygraph_only def backward(self, grad_tensor=None, retain_graph=False): diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index 17f8a7291ad8ff..6fba200f54099d 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -792,9 +792,11 @@ def _feed_data(self, program, feed, feed_var_name, scope): feed_target_name = op.desc.output('Out')[0] cur_feed = feed[feed_target_name] var = global_block.var(feed_target_name) - if not 
isinstance(cur_feed, core.LoDTensor): - cur_feed = _as_lodtensor(cur_feed, self.place, var.dtype) - check_feed_shape_type(var, cur_feed) + if var.dtype != core.VarDesc.VarType.STRINGS: + if not isinstance(cur_feed, core.LoDTensor): + cur_feed = _as_lodtensor(cur_feed, self.place, + var.dtype) + check_feed_shape_type(var, cur_feed) idx = op.desc.attr('col') core.set_feed_variable(scope, cur_feed, feed_var_name, idx) else: diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 60e00238f6cc99..a3cd34c32ebbf4 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -979,6 +979,10 @@ def __init__(self, if not isinstance(dtype, core.VarDesc.VarType): dtype = convert_np_dtype_to_dtype_(dtype) + if dtype == core.VarDesc.VarType.STRINGS: + type = core.VarDesc.VarType.STRINGS + lod_level = None + self.belong_to_optimizer = belong_to_optimizer self.error_clip = error_clip diff --git a/python/paddle/fluid/inference/wrapper.py b/python/paddle/fluid/inference/wrapper.py index 2c1b2c77504d92..6576ca785b6e15 100644 --- a/python/paddle/fluid/inference/wrapper.py +++ b/python/paddle/fluid/inference/wrapper.py @@ -29,10 +29,14 @@ def tensor_copy_from_cpu(self, data): ''' Support input type check based on tensor.copy_from_cpu. ''' - if not isinstance(data, np.ndarray): + if isinstance(data, np.ndarray) or (isinstance(data, list) and + len(data) > 0 and + isinstance(data[0], str)): + self.copy_from_cpu_bind(data) + else: raise TypeError( - "In copy_from_cpu, we only support numpy ndarray data type.") - self.copy_from_cpu_bind(data) + "In copy_from_cpu, we only support numpy ndarray and list[str] data type." + ) Tensor.copy_from_cpu = tensor_copy_from_cpu diff --git a/python/paddle/fluid/tests/unittests/test_faster_tokenizer_op.py b/python/paddle/fluid/tests/unittests/test_faster_tokenizer_op.py new file mode 100755 index 00000000000000..496f3505ec41bc --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_faster_tokenizer_op.py @@ -0,0 +1,393 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import io +import os +import unittest + +import numpy as np +import paddle +import paddle.nn as nn +from paddle.dataset.common import DATA_HOME +from paddle.fluid.framework import core, in_dygraph_mode +from paddle.fluid.layer_helper import LayerHelper + +import sys +sys.path.append("./tokenizer") +from tokenizer.bert_tokenizer import BertTokenizer + + +def to_string_tensor(string_values, name): + """ + Create the tensor that the value holds the list of string. + NOTICE: The value will be holded in the cpu place. + + Args: + string_values(list[string]): The value will be setted to the tensor. + name(string): The name of the tensor. 
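With the executor change above, a feed value for a STRINGS variable bypasses the usual LoDTensor conversion and shape/dtype check, so a plain Python list of strings can be fed directly in static mode. A condensed sketch, mirroring the test_feed_string_var case later in this test file:

import paddle
from paddle.fluid.framework import core

paddle.enable_static()
# Declare a Strings input; the executor feeds the raw list[str] through as-is.
x = paddle.static.data(name="x", shape=[-1], dtype=core.VarDesc.VarType.STRINGS)
exe = paddle.static.Executor(paddle.framework.CPUPlace())
exe.run(paddle.static.default_main_program(),
        feed={"x": ["选择珠江花园的原因就是方便"]})
paddle.disable_static()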
+ """ + tensor = paddle.Tensor(core.VarDesc.VarType.STRING, [], name, + core.VarDesc.VarType.STRINGS, False) + tensor.value().set_string_list(string_values) + return tensor + + +def to_map_tensor(string_dict, name): + """ + Create the tensor that the value holds the map, the type of key is the string + and the value is the int. + NOTICE: The value will be holded in the cpu place. + + Args: + string_dict(dict): The value will be setted to the tensor. + name(string): The name of the tensor. + """ + tensor = paddle.Tensor(core.VarDesc.VarType.RAW, [], name, + core.VarDesc.VarType.VOCAB, True) + tensor.value().set_vocab(string_dict) + return tensor + + +class FasterTokenizer(nn.Layer): + def __init__(self, vocab_dict): + super(FasterTokenizer, self).__init__() + vocab_tensor = to_map_tensor(vocab_dict, "vocab") + self.register_buffer("vocab", vocab_tensor, persistable=True) + + def forward(self, + text, + text_pair=None, + do_lower_case=True, + max_seq_len=-1, + is_split_into_words=False, + pad_to_max_seq_len=False): + if in_dygraph_mode(): + input_ids, seg_ids = core.ops.faster_tokenizer( + self.vocab, text, text_pair, "do_lower_case", do_lower_case, + "max_seq_len", max_seq_len, "pad_to_max_seq_len", + pad_to_max_seq_len, "is_split_into_words", is_split_into_words) + return input_ids, seg_ids + + attrs = { + "do_lower_case": do_lower_case, + "max_seq_len": max_seq_len, + "pad_to_max_seq_len": pad_to_max_seq_len, + "is_split_into_words": is_split_into_words, + } + helper = LayerHelper("faster_tokenizer") + input_ids = helper.create_variable_for_type_inference(dtype="int64") + seg_ids = helper.create_variable_for_type_inference(dtype="int64") + if text_pair is None: + helper.append_op( + type='faster_tokenizer', + inputs={'Vocab': self.vocab, + 'Text': text}, + outputs={'InputIds': input_ids, + 'SegmentIds': seg_ids}, + attrs=attrs) + else: + helper.append_op( + type='faster_tokenizer', + inputs={ + 'Vocab': self.vocab, + 'Text': text, + 'TextPair': text_pair + }, + outputs={'InputIds': input_ids, + 'SegmentIds': seg_ids}, + attrs=attrs) + return input_ids, seg_ids + + +class Predictor(object): + def __init__(self, model_dir): + model_file = os.path.join(model_dir, "inference.pdmodel") + params_file = os.path.join(model_dir, "inference.pdiparams") + if not os.path.exists(model_file): + raise ValueError("not find model file path {}".format(model_file)) + if not os.path.exists(params_file): + raise ValueError("not find params file path {}".format(params_file)) + config = paddle.inference.Config(model_file, params_file) + + # fast_tokenizer op only support cpu. 
+ config.disable_gpu() + config.set_cpu_math_library_num_threads(10) + + config.switch_use_feed_fetch_ops(False) + self.predictor = paddle.inference.create_predictor(config) + self.input_handles = [ + self.predictor.get_input_handle(name) + for name in self.predictor.get_input_names() + ] + self.output_handles = [ + self.predictor.get_output_handle(name) + for name in self.predictor.get_output_names() + ] + + def predict(self, data): + + self.input_handles[0].copy_from_cpu(data) + self.predictor.run() + input_ids = self.output_handles[0].copy_to_cpu() + token_type_ids = self.output_handles[1].copy_to_cpu() + return input_ids, token_type_ids + + +class TestBertTokenizerOp(unittest.TestCase): + def setUp(self): + self.bert_tokenizer = BertTokenizer.from_pretrained("bert-base-chinese") + self.faster_tokenizer = FasterTokenizer(self.bert_tokenizer.vocab) + self.init_data() + self.save_path = os.path.join(DATA_HOME, "fast_tokenizer") + self.param_path = os.path.join(self.save_path, "model.pdparams") + self.inference_path = os.path.join(self.save_path, "inference") + + def init_data(self): + self.text = [ + '选择珠江花园的原因就是方便,有电动扶梯直接到达海边,周围餐馆、食廊、商场、超市、摊位一应俱全。' + '酒店装修一般,但还算整洁。 泳池在大堂的屋顶,因此很小,不过女儿倒是喜欢。 包的早餐是西式的,' + '还算丰富。 服务吗,一般' + ] + self.text_pair = ['非常不错,服务很好,位于市中心区,交通方便,不过价格也高!'] + self.text_tensor = to_string_tensor(self.text, "text") + self.text_pair_tensor = to_string_tensor(self.text_pair, "text_pair") + self.texts = [ + '很好的地理位置,一蹋糊涂的服务,萧条的酒店。', + ' 选择珠江花园的原因就是方便,有电动扶梯直接到达海边,周围餐馆、食廊、商场、超市、摊位一应俱全。酒店装修一般,' + '但还算整洁。 泳池在大堂的屋顶,因此很小,不过女儿倒是喜欢。 包的早餐是西式的,还算丰富。 服务吗,一般', + 'Test bert tokenizer. The first text.' + ] + self.text_pairs = [ + '非常不错,服务很好,位于市中心区,交通方便,不过价格也高!', '房间太小。其他的都一般。。。。。。。。。', + 'Test bert tokenizer. The second text.' + ] + self.texts_tensor = to_string_tensor(self.texts, "texts") + self.text_pairs_tensor = to_string_tensor(self.text_pairs, "text_pairs") + + def test_padding(self): + + self.max_seq_len = 128 + self.pad_to_max_seq_len = True + self.is_split_into_words = False + + # case 1: only one text (batch_size = 1) + input_ids, token_type_ids = self.faster_tokenizer( + text=self.text_tensor, + do_lower_case=self.bert_tokenizer.do_lower_case, + max_seq_len=self.max_seq_len, + pad_to_max_seq_len=self.pad_to_max_seq_len, + is_split_into_words=self.is_split_into_words) + input_ids = input_ids.numpy() + token_type_ids = token_type_ids.numpy() + + encoded_inputs = self.bert_tokenizer( + text=self.text, + max_seq_len=self.max_seq_len, + pad_to_max_seq_len=self.pad_to_max_seq_len, + is_split_into_words=self.is_split_into_words) + py_input_ids = np.array(encoded_inputs[0]["input_ids"]).reshape([1, -1]) + py_token_type_ids = np.array(encoded_inputs[0][ + "token_type_ids"]).reshape([1, -1]) + self.assertTrue(np.allclose(input_ids, py_input_ids, rtol=0, atol=0.01)) + self.assertTrue( + np.allclose( + token_type_ids, py_token_type_ids, rtol=0, atol=0.01)) + + # case 2: only one text and one text_pair (batch_size = 1) + input_ids, token_type_ids = self.faster_tokenizer( + text=self.text_tensor, + text_pair=self.text_pair_tensor, + do_lower_case=self.bert_tokenizer.do_lower_case, + max_seq_len=self.max_seq_len, + pad_to_max_seq_len=self.pad_to_max_seq_len, + is_split_into_words=self.is_split_into_words) + input_ids = input_ids.numpy() + token_type_ids = token_type_ids.numpy() + + encoded_inputs = self.bert_tokenizer( + text=self.text, + text_pair=self.text_pair, + max_seq_len=self.max_seq_len, + pad_to_max_seq_len=self.pad_to_max_seq_len, + is_split_into_words=self.is_split_into_words) + 
py_input_ids = np.array(encoded_inputs[0]["input_ids"]).reshape([1, -1]) + py_token_type_ids = np.array(encoded_inputs[0][ + "token_type_ids"]).reshape([1, -1]) + self.assertTrue(np.allclose(input_ids, py_input_ids, rtol=0, atol=0.01)) + self.assertTrue( + np.allclose( + token_type_ids, py_token_type_ids, rtol=0, atol=0.01)) + + # case 3: only texts (batch_size = 3) + input_ids, token_type_ids = self.faster_tokenizer( + text=self.texts_tensor, + do_lower_case=self.bert_tokenizer.do_lower_case, + max_seq_len=self.max_seq_len, + pad_to_max_seq_len=self.pad_to_max_seq_len, + is_split_into_words=self.is_split_into_words) + input_ids = input_ids.numpy() + token_type_ids = token_type_ids.numpy() + + encoded_inputs = self.bert_tokenizer( + self.texts, + max_seq_len=self.max_seq_len, + pad_to_max_seq_len=self.pad_to_max_seq_len, + is_split_into_words=self.is_split_into_words) + py_input_ids = [i["input_ids"] for i in encoded_inputs] + py_token_type_ids = [i["token_type_ids"] for i in encoded_inputs] + py_input_ids = np.array(py_input_ids).reshape([3, -1]) + py_token_type_ids = np.array(py_token_type_ids).reshape([3, -1]) + self.assertTrue(np.allclose(input_ids, py_input_ids, rtol=0, atol=0.01)) + self.assertTrue( + np.allclose( + token_type_ids, py_token_type_ids, rtol=0, atol=0.01)) + + # case 4: texts and text pairs (batch_size = 3) + input_ids, token_type_ids = self.faster_tokenizer( + text=self.texts_tensor, + text_pair=self.text_pairs_tensor, + do_lower_case=self.bert_tokenizer.do_lower_case, + max_seq_len=self.max_seq_len, + pad_to_max_seq_len=self.pad_to_max_seq_len, + is_split_into_words=self.is_split_into_words) + input_ids = input_ids.numpy() + token_type_ids = token_type_ids.numpy() + + encoded_inputs = self.bert_tokenizer( + self.texts, + self.text_pairs, + max_seq_len=self.max_seq_len, + pad_to_max_seq_len=self.pad_to_max_seq_len, + is_split_into_words=self.is_split_into_words) + py_input_ids = [i["input_ids"] for i in encoded_inputs] + py_token_type_ids = [i["token_type_ids"] for i in encoded_inputs] + py_input_ids = np.array(py_input_ids).reshape([3, -1]) + py_token_type_ids = np.array(py_token_type_ids).reshape([3, -1]) + self.assertTrue(np.allclose(input_ids, py_input_ids, rtol=0, atol=0.01)) + self.assertTrue( + np.allclose( + token_type_ids, py_token_type_ids, rtol=0, atol=0.01)) + + def test_no_padding(self): + self.max_seq_len = 128 + self.pad_to_max_seq_len = False + self.is_split_into_words = False + + # case 1: only one text (batch_size = 1) + input_ids, token_type_ids = self.faster_tokenizer( + text=self.text_tensor, + do_lower_case=self.bert_tokenizer.do_lower_case, + max_seq_len=self.max_seq_len, + pad_to_max_seq_len=self.pad_to_max_seq_len, + is_split_into_words=self.is_split_into_words) + input_ids = input_ids.numpy() + token_type_ids = token_type_ids.numpy() + + encoded_inputs = self.bert_tokenizer( + self.text, + max_seq_len=self.max_seq_len, + pad_to_max_seq_len=self.pad_to_max_seq_len, + is_split_into_words=self.is_split_into_words) + py_input_ids = np.array(encoded_inputs[0]["input_ids"]).reshape([1, -1]) + py_token_type_ids = np.array(encoded_inputs[0][ + "token_type_ids"]).reshape([1, -1]) + self.assertTrue(np.allclose(input_ids, py_input_ids, rtol=0, atol=0.01)) + self.assertTrue( + np.allclose( + token_type_ids, py_token_type_ids, rtol=0, atol=0.01)) + + # case 2: only one text and one text_pair (batch_size = 1) + input_ids, token_type_ids = self.faster_tokenizer( + self.text_tensor, + self.text_pair_tensor, + do_lower_case=self.bert_tokenizer.do_lower_case, + 
max_seq_len=self.max_seq_len, + pad_to_max_seq_len=self.pad_to_max_seq_len, + is_split_into_words=self.is_split_into_words) + input_ids = input_ids.numpy() + token_type_ids = token_type_ids.numpy() + + encoded_inputs = self.bert_tokenizer( + self.text, + self.text_pair, + max_seq_len=self.max_seq_len, + pad_to_max_seq_len=self.pad_to_max_seq_len, + is_split_into_words=self.is_split_into_words) + py_input_ids = np.array(encoded_inputs[0]["input_ids"]).reshape([1, -1]) + py_token_type_ids = np.array(encoded_inputs[0][ + "token_type_ids"]).reshape([1, -1]) + self.assertTrue(np.allclose(input_ids, py_input_ids, rtol=0, atol=0.01)) + self.assertTrue( + np.allclose( + token_type_ids, py_token_type_ids, rtol=0, atol=0.01)) + + def test_is_split_into_words(self): + self.is_split_into_words = True + + input_ids, token_type_ids = self.faster_tokenizer( + self.text_tensor, + do_lower_case=self.bert_tokenizer.do_lower_case, + is_split_into_words=self.is_split_into_words) + input_ids = input_ids.numpy() + token_type_ids = token_type_ids.numpy() + encoded_inputs = self.bert_tokenizer( + list(self.text[0]), is_split_into_words=self.is_split_into_words) + py_input_ids = np.array(encoded_inputs["input_ids"]).reshape([1, -1]) + py_token_type_ids = np.array(encoded_inputs["token_type_ids"]).reshape( + [1, -1]) + self.assertTrue(np.allclose(input_ids, py_input_ids, rtol=0, atol=0.01)) + self.assertTrue( + np.allclose( + token_type_ids, py_token_type_ids, rtol=0, atol=0.01)) + + def test_inference(self): + if not os.path.exists(self.save_path): + os.makedirs(self.save_path, exist_ok=True) + paddle.save(self.faster_tokenizer.state_dict(), self.param_path) + state_dict = paddle.load(self.param_path) + self.faster_tokenizer.set_dict(state_dict) + + static_model = paddle.jit.to_static( + self.faster_tokenizer, + input_spec=[ + paddle.static.InputSpec( + shape=[None], dtype=core.VarDesc.VarType.STRINGS), # texts + ]) + # Save in static graph model. + paddle.jit.save(static_model, self.inference_path) + predictor = Predictor(self.save_path) + input_ids, token_type_ids = predictor.predict(self.text) + + encoded_inputs = self.bert_tokenizer(self.text) + py_input_ids = np.array(encoded_inputs[0]["input_ids"]).reshape([1, -1]) + py_token_type_ids = np.array(encoded_inputs[0][ + "token_type_ids"]).reshape([1, -1]) + self.assertTrue(np.allclose(input_ids, py_input_ids, rtol=0, atol=0.01)) + self.assertTrue( + np.allclose( + token_type_ids, py_token_type_ids, rtol=0, atol=0.01)) + + def test_feed_string_var(self): + paddle.enable_static() + x = paddle.static.data( + name="x", shape=[-1], dtype=core.VarDesc.VarType.STRINGS) + exe = paddle.static.Executor(paddle.framework.CPUPlace()) + exe.run(paddle.static.default_main_program(), feed={'x': self.text}) + paddle.disable_static() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/tokenizer/__init__.py b/python/paddle/fluid/tests/unittests/tokenizer/__init__.py new file mode 100644 index 00000000000000..b9a7651e449096 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/tokenizer/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/paddle/fluid/tests/unittests/tokenizer/bert_tokenizer.py b/python/paddle/fluid/tests/unittests/tokenizer/bert_tokenizer.py new file mode 100755 index 00000000000000..00d5f4e7725289 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/tokenizer/bert_tokenizer.py @@ -0,0 +1,517 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import io +import json +import os +import six +import unicodedata + +from tokenizer_utils import PretrainedTokenizer +from tokenizer_utils import convert_to_unicode, whitespace_tokenize, _is_whitespace, _is_control, _is_punctuation + + +class BasicTokenizer(object): + """ + Runs basic tokenization (punctuation splitting, lower casing, etc.). + Args: + do_lower_case (bool): + Whether or not to lowercase the input when tokenizing. + Defaults to `True`. + """ + + def __init__(self, do_lower_case=True): + """Constructs a BasicTokenizer.""" + + self.do_lower_case = do_lower_case + + def tokenize(self, text): + """ + Tokenizes a piece of text using basic tokenizer. + Args: + text (str): A piece of text. + Returns: + list(str): A list of tokens. + Examples: + .. code-block:: + from paddlenlp.transformers import BasicTokenizer + basictokenizer = BasicTokenizer() + tokens = basictokenizer.tokenize('He was a puppeteer') + ''' + ['he', 'was', 'a', 'puppeteer'] + ''' + """ + + text = convert_to_unicode(text) + text = self._clean_text(text) + text = self._tokenize_chinese_chars(text) + + orig_tokens = whitespace_tokenize(text) + split_tokens = [] + for token in orig_tokens: + if self.do_lower_case: + token = token.lower() + token = self._run_strip_accents(token) + split_tokens.extend(self._run_split_on_punc(token)) + + output_tokens = whitespace_tokenize(" ".join(split_tokens)) + return output_tokens + + def _run_strip_accents(self, text): + """ + Strips accents from a piece of text. + """ + text = unicodedata.normalize("NFD", text) + output = [] + for char in text: + cat = unicodedata.category(char) + if cat == "Mn": + continue + output.append(char) + return "".join(output) + + def _run_split_on_punc(self, text): + """ + Splits punctuation on a piece of text. 
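A standalone sketch of the accent-stripping step implemented in `_run_strip_accents` above: decompose the text to NFD, then drop combining marks (Unicode category "Mn").

import unicodedata

def strip_accents(text):
    # NFD-decompose, then remove combining marks, as _run_strip_accents does.
    return "".join(char for char in unicodedata.normalize("NFD", text)
                   if unicodedata.category(char) != "Mn")

assert strip_accents("héllo wörld") == "hello world"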
+ """ + chars = list(text) + i = 0 + start_new_word = True + output = [] + while i < len(chars): + char = chars[i] + if _is_punctuation(char): + output.append([char]) + start_new_word = True + else: + if start_new_word: + output.append([]) + start_new_word = False + output[-1].append(char) + i += 1 + + return ["".join(x) for x in output] + + def _tokenize_chinese_chars(self, text): + """ + Adds whitespace around any CJK character. + """ + output = [] + for char in text: + cp = ord(char) + if self._is_chinese_char(cp): + output.append(" ") + output.append(char) + output.append(" ") + else: + output.append(char) + return "".join(output) + + def _is_chinese_char(self, cp): + """ + Checks whether CP is the codepoint of a CJK character. + """ + + # This defines a "chinese character" as anything in the CJK Unicode block: + # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) + # + # Note that the CJK Unicode block is NOT all Japanese and Korean characters, + # despite its name. The modern Korean Hangul alphabet is a different block, + # as is Japanese Hiragana and Katakana. Those alphabets are used to write + # space-separated words, so they are not treated specially and handled + # like the all of the other languages. + if ((cp >= 0x4E00 and cp <= 0x9FFF) or # + (cp >= 0x3400 and cp <= 0x4DBF) or # + (cp >= 0x20000 and cp <= 0x2A6DF) or # + (cp >= 0x2A700 and cp <= 0x2B73F) or # + (cp >= 0x2B740 and cp <= 0x2B81F) or # + (cp >= 0x2B820 and cp <= 0x2CEAF) or + (cp >= 0xF900 and cp <= 0xFAFF) or # + (cp >= 0x2F800 and cp <= 0x2FA1F)): # + return True + + return False + + def _clean_text(self, text): + """ + Performs invalid character removal and whitespace cleanup on text. + """ + output = [] + for char in text: + cp = ord(char) + if cp == 0 or cp == 0xfffd or _is_control(char): + continue + if _is_whitespace(char): + output.append(" ") + else: + output.append(char) + return "".join(output) + + +class WordpieceTokenizer(object): + """ + Runs WordPiece tokenization. + Args: + vocab (Vocab|dict): + Vocab of the word piece tokenizer. + unk_token (str): + A specific token to replace all unknown tokens. + max_input_chars_per_word (int): + If a word's length is more than + max_input_chars_per_word, it will be dealt as unknown word. + Defaults to 100. + """ + + def __init__(self, vocab, unk_token, max_input_chars_per_word=100): + self.vocab = vocab + self.unk_token = unk_token + self.max_input_chars_per_word = max_input_chars_per_word + + def tokenize(self, text): + """ + Tokenizes a piece of text into its word pieces. + This uses a greedy longest-match-first algorithm to perform tokenization + using the given vocabulary. + Args: + text: A single token or whitespace separated tokens. This should have + already been passed through `BasicTokenizer`. + Returns: + list (str): A list of wordpiece tokens. + Examples: + .. 
code-block:: + from paddlenlp.transformers import BertTokenizer, WordpieceTokenizer + berttokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + vocab = berttokenizer.vocab + unk_token = berttokenizer.unk_token + wordpiecetokenizer = WordpieceTokenizer(vocab,unk_token) + inputs = wordpiecetokenizer.tokenize("unaffable") + print(inputs) + ''' + ["un", "##aff", "##able"] + ''' + """ + + output_tokens = [] + for token in whitespace_tokenize(text): + chars = list(token) + if len(chars) > self.max_input_chars_per_word: + output_tokens.append(self.unk_token) + continue + + is_bad = False + start = 0 + sub_tokens = [] + while start < len(chars): + end = len(chars) + cur_substr = None + while start < end: + substr = "".join(chars[start:end]) + if start > 0: + substr = "##" + substr + if substr in self.vocab: + cur_substr = substr + break + end -= 1 + if cur_substr is None: + is_bad = True + break + sub_tokens.append(cur_substr) + start = end + + if is_bad: + output_tokens.append(self.unk_token) + else: + output_tokens.extend(sub_tokens) + return output_tokens + + +class BertTokenizer(PretrainedTokenizer): + """ + Constructs a BERT tokenizer. It uses a basic tokenizer to do punctuation + splitting, lower casing and so on, and follows a WordPiece tokenizer to + tokenize as subwords. + Args: + vocab_file (str): + The vocabulary file path (ends with '.txt') required to instantiate + a `WordpieceTokenizer`. + do_lower_case (bool): + Whether or not to lowercase the input when tokenizing. + Defaults to`True`. + unk_token (str): + A special token representing the *unknown (out-of-vocabulary)* token. + An unknown token is set to be `unk_token` inorder to be converted to an ID. + Defaults to "[UNK]". + sep_token (str): + A special token separating two different sentences in the same input. + Defaults to "[SEP]". + pad_token (str): + A special token used to make arrays of tokens the same size for batching purposes. + Defaults to "[PAD]". + cls_token (str): + A special token used for sequence classification. It is the last token + of the sequence when built with special tokens. Defaults to "[CLS]". + mask_token (str): + A special token representing a masked token. This is the token used + in the masked language modeling task which the model tries to predict the original unmasked ones. + Defaults to "[MASK]". + Examples: + .. 
code-block:: + from paddlenlp.transformers import BertTokenizer + berttokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + inputs = berttokenizer.tokenize('He was a puppeteer') + print(inputs) + ''' + {'input_ids': [101, 2002, 2001, 1037, 13997, 11510, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0]} + ''' + """ + resource_files_names = {"vocab_file": "vocab.txt"} # for save_pretrained + pretrained_resource_files_map = { + "vocab_file": { + "bert-base-uncased": + "https://paddle-hapi.bj.bcebos.com/models/bert/bert-base-uncased-vocab.txt", + "bert-large-uncased": + "https://paddle-hapi.bj.bcebos.com/models/bert/bert-large-uncased-vocab.txt", + "bert-base-cased": + "https://paddle-hapi.bj.bcebos.com/models/bert/bert-base-cased-vocab.txt", + "bert-large-cased": + "https://paddle-hapi.bj.bcebos.com/models/bert/bert-large-cased-vocab.txt", + "bert-base-multilingual-uncased": + "https://paddle-hapi.bj.bcebos.com/models/bert/bert-base-multilingual-uncased-vocab.txt", + "bert-base-multilingual-cased": + "https://paddle-hapi.bj.bcebos.com/models/bert/bert-base-multilingual-cased-vocab.txt", + "bert-base-chinese": + "https://paddle-hapi.bj.bcebos.com/models/bert/bert-base-chinese-vocab.txt", + "bert-wwm-chinese": + "http://paddlenlp.bj.bcebos.com/models/transformers/bert/bert-wwm-chinese-vocab.txt", + "bert-wwm-ext-chinese": + "http://paddlenlp.bj.bcebos.com/models/transformers/bert/bert-wwm-ext-chinese-vocab.txt", + "macbert-large-chinese": + "https://paddle-hapi.bj.bcebos.com/models/bert/bert-base-chinese-vocab.txt", + "macbert-base-chinese": + "https://paddle-hapi.bj.bcebos.com/models/bert/bert-base-chinese-vocab.txt", + "simbert-base-chinese": + "https://paddlenlp.bj.bcebos.com/models/transformers/simbert/vocab.txt", + } + } + pretrained_init_configuration = { + "bert-base-uncased": { + "do_lower_case": True + }, + "bert-large-uncased": { + "do_lower_case": True + }, + "bert-base-cased": { + "do_lower_case": False + }, + "bert-large-cased": { + "do_lower_case": False + }, + "bert-base-multilingual-uncased": { + "do_lower_case": True + }, + "bert-base-multilingual-cased": { + "do_lower_case": False + }, + "bert-base-chinese": { + "do_lower_case": False + }, + "bert-wwm-chinese": { + "do_lower_case": False + }, + "bert-wwm-ext-chinese": { + "do_lower_case": False + }, + "macbert-large-chinese": { + "do_lower_case": False + }, + "macbert-base-chinese": { + "do_lower_case": False + }, + "simbert-base-chinese": { + "do_lower_case": True + }, + } + padding_side = 'right' + + def __init__(self, + vocab_file, + do_lower_case=True, + unk_token="[UNK]", + sep_token="[SEP]", + pad_token="[PAD]", + cls_token="[CLS]", + mask_token="[MASK]"): + + if not os.path.isfile(vocab_file): + raise ValueError( + "Can't find a vocabulary file at path '{}'. To load the " + "vocabulary from a pretrained model please use " + "`tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`" + .format(vocab_file)) + self.vocab = self.load_vocabulary(vocab_file, unk_token=unk_token) + self.do_lower_case = do_lower_case + self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case) + self.wordpiece_tokenizer = WordpieceTokenizer( + vocab=self.vocab, unk_token=unk_token) + self.special_tokens_map = { + 'unk_token': unk_token, + 'sep_token': sep_token, + 'pad_token': pad_token, + 'cls_token': cls_token, + 'mask_token': mask_token + } + + @property + def vocab_size(self): + """ + Return the size of vocabulary. + Returns: + int: The size of vocabulary. 
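The core of `WordpieceTokenizer.tokenize` above is a greedy longest-match-first loop over the vocabulary. The sketch below condenses that loop for a single token, using a toy (hypothetical) vocabulary:

def wordpiece(token, vocab, unk_token="[UNK]"):
    # Greedy longest-match-first: repeatedly take the longest prefix of the
    # remaining characters found in the vocab; continuation pieces get "##".
    chars = list(token)
    sub_tokens, start = [], 0
    while start < len(chars):
        end = len(chars)
        cur_substr = None
        while start < end:
            substr = "".join(chars[start:end])
            if start > 0:
                substr = "##" + substr
            if substr in vocab:
                cur_substr = substr
                break
            end -= 1
        if cur_substr is None:
            return [unk_token]  # no piece matches: the whole token is unknown
        sub_tokens.append(cur_substr)
        start = end
    return sub_tokens

assert wordpiece("unaffable", {"un", "##aff", "##able"}) == ["un", "##aff", "##able"]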
+ """ + + return len(self.vocab) + + def _tokenize(self, text): + """ + End-to-end tokenization for BERT models. + Args: + text (str): The text to be tokenized. + + Returns: + list: A list of string representing converted tokens. + """ + split_tokens = [] + for token in self.basic_tokenizer.tokenize(text): + for sub_token in self.wordpiece_tokenizer.tokenize(token): + split_tokens.append(sub_token) + return split_tokens + + def tokenize(self, text): + """ + Converts a string to a list of tokens. + Args: + text (str): The text to be tokenized. + + Returns: + List(str): A list of string representing converted tokens. + Examples: + .. code-block:: + from paddlenlp.transformers import BertTokenizer + berttokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + tokens = berttokenizer.tokenize('He was a puppeteer') + + ''' + ['he', 'was', 'a', 'puppet', '##eer'] + ''' + """ + + return self._tokenize(text) + + def num_special_tokens_to_add(self, pair=False): + """ + Returns the number of added tokens when encoding a sequence with special tokens. + Args: + pair(bool): + Whether the input is a sequence pair or a single sequence. + Defaults to `False` and the input is a single sequence. + Returns: + int: Number of tokens added to sequences. + """ + token_ids_0 = [] + token_ids_1 = [] + return len( + self.build_inputs_with_special_tokens(token_ids_0, token_ids_1 + if pair else None)) + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. + + A BERT sequence has the following format: + - single sequence: ``[CLS] X [SEP]`` + - pair of sequences: ``[CLS] A [SEP] B [SEP]`` + Args: + token_ids_0 (List[int]): + List of IDs to which the special tokens will be added. + token_ids_1 (List[int], optional): + Optional second list of IDs for sequence pairs. Defaults to None. + Returns: + List[int]: List of input_id with the appropriate special tokens. + """ + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + _cls = [self.cls_token_id] + _sep = [self.sep_token_id] + return _cls + token_ids_0 + _sep + token_ids_1 + _sep + + def create_token_type_ids_from_sequences(self, + token_ids_0, + token_ids_1=None): + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. + A BERT sequence pair mask has the following format: + :: + 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 + | first sequence | second sequence | + If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s). + Args: + token_ids_0 (List[int]): + A list of `inputs_ids` for the first sequence. + token_ids_1 (List[int], optional): + Optional second list of IDs for sequence pairs. Defaults to None. + Returns: + List[int]: List of token_type_id according to the given sequence(s). + """ + _sep = [self.sep_token_id] + _cls = [self.cls_token_id] + if token_ids_1 is None: + return len(_cls + token_ids_0 + _sep) * [0] + return len(_cls + token_ids_0 + _sep) * [0] + len(token_ids_1 + + _sep) * [1] + + def get_special_tokens_mask(self, + token_ids_0, + token_ids_1=None, + already_has_special_tokens=False): + """ + Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer ``encode`` methods. + Args: + token_ids_0 (List[int]): + A list of `inputs_ids` for the first sequence. 
+ token_ids_1 (List[int], optinal): + Optional second list of IDs for sequence pairs. Defaults to None. + already_has_special_tokens (bool, optional): Whether or not the token list is already + formatted with special tokens for the model. Defaults to None. + Returns: + List[int]: The list of integers either be 0 or 1: 1 for a special token, 0 for a sequence token. + """ + + if already_has_special_tokens: + if token_ids_1 is not None: + raise ValueError( + "You should not supply a second sequence if the provided sequence of " + "ids is already formatted with special tokens for the model." + ) + return list( + map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, + token_ids_0)) + + if token_ids_1 is not None: + return [1] + ([0] * len(token_ids_0)) + [1] + ( + [0] * len(token_ids_1)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1] diff --git a/python/paddle/fluid/tests/unittests/tokenizer/tokenizer_utils.py b/python/paddle/fluid/tests/unittests/tokenizer/tokenizer_utils.py new file mode 100644 index 00000000000000..7da3cd56e25b5d --- /dev/null +++ b/python/paddle/fluid/tests/unittests/tokenizer/tokenizer_utils.py @@ -0,0 +1,1244 @@ +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import io +import json +import os +import unicodedata +from shutil import copyfile +from typing import Iterable, Iterator, Optional, List, Any, Callable, Union + +from paddle.dataset.common import DATA_HOME +from paddle.utils.download import get_path_from_url + + +def convert_to_unicode(text): + """ + Converts `text` to Unicode (if it's not already), assuming utf-8 input. + Args: + text (str|bytes): Text to be converted to unicode. + Returns: + str: converted text. + """ + if isinstance(text, str): + return text + elif isinstance(text, bytes): + return text.decode("utf-8", "ignore") + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + + +def whitespace_tokenize(text): + """ + Runs basic whitespace cleaning and splitting on a peice of text. + Args: + text (str): Text to be tokened. + Returns: + list(str): Token list. + """ + text = text.strip() + if not text: + return [] + tokens = text.split() + return tokens + + +def _is_whitespace(char): + """ + Checks whether `chars` is a whitespace character. + """ + # \t, \n, and \r are technically contorl characters but we treat them + # as whitespace since they are generally considered as such. + if char == " " or char == "\t" or char == "\n" or char == "\r": + return True + cat = unicodedata.category(char) + if cat == "Zs": + return True + return False + + +def _is_control(char): + """Checks whether `chars` is a control character.""" + # These are technically control characters but we count them as whitespace + # characters. 
+ if char == "\t" or char == "\n" or char == "\r": + return False + cat = unicodedata.category(char) + if cat.startswith("C"): + return True + return False + + +def _is_punctuation(char): + """Checks whether `chars` is a punctuation character.""" + cp = ord(char) + # We treat all non-letter/number ASCII as punctuation. + # Characters such as "^", "$", and "`" are not in the Unicode + # Punctuation class but we treat them as punctuation anyways, for + # consistency. + if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or + (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): + return True + cat = unicodedata.category(char) + if cat.startswith("P"): + return True + return False + + +def is_chinese_char(cp): + """Checks whether CP is the codepoint of a CJK character.""" + # This defines a "chinese character" as anything in the CJK Unicode block: + # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) + # + # Note that the CJK Unicode block is NOT all Japanese and Korean characters, + # despite its name. The modern Korean Hangul alphabet is a different block, + # as is Japanese Hiragana and Katakana. Those alphabets are used to write + # space-separated words, so they are not treated specially and handled + # like the all of the other languages. + if ((cp >= 0x4E00 and cp <= 0x9FFF) or # + (cp >= 0x3400 and cp <= 0x4DBF) or # + (cp >= 0x20000 and cp <= 0x2A6DF) or # + (cp >= 0x2A700 and cp <= 0x2B73F) or # + (cp >= 0x2B740 and cp <= 0x2B81F) or # + (cp >= 0x2B820 and cp <= 0x2CEAF) or + (cp >= 0xF900 and cp <= 0xFAFF) or # + (cp >= 0x2F800 and cp <= 0x2FA1F)): # + return True + + return False + + +def tokenize_chinese_chars(text): + """Adds whitespace around any CJK character.""" + output = [] + buff = "" + for char in text: + cp = ord(char) + if is_chinese_char(cp): + if buff != "": + output.append(buff) + buff = "" + output.append(char) + else: + buff += char + + if buff != "": + output.append(buff) + + return output + + +class PretrainedTokenizer(object): + """ + The base class for all pretrained tokenizers. It mainly provides common methods + for loading (construction and loading) and saving pretrained tokenizers. Loading + and saving also rely on the following class attributes which should be overridden + by derived classes accordingly: + - **tokenizer_config_file** (str): Represents the file name of tokenizer + configuration for configuration saving and loading in local file system. + The value is `tokenizer_config.json`. + - **resource_files_names** (dict): Represents resources to specific file + names mapping for resource saving and loading in local file system. The + keys of dict representing resource items should be argument names in + tokenizer's `__init__` method, and the values are file names for saving + and loading corresponding resources. The mostly used resources here are + vocabulary file and sentence-piece model file. + - **pretrained_init_configuration** (dict): Provides the tokenizer configurations + of built-in pretrained tokenizers (contrasts to tokenizers in local file + system). It has pretrained tokenizer names as keys (the same as pretrained + model names, such as `bert-base-uncased`), and the values are dict preserving + corresponding configuration for tokenizer initialization. + - **pretrained_resource_files_map** (dict): Provides resource URLs of built-in + pretrained tokenizers (contrasts to tokenizers in local file system). 
It + has the same keys as `resource_files_names`, and the values are also `dict` + mapping specific pretrained tokenizer names (such as `bert-base-uncased`) + to corresponding resource URLs. + Moreover, methods common to tokenizers for tokenization, token/id conversion + and encoding as model inputs are also provided here. + Besides, metaclass `InitTrackerMeta` is used to create `PretrainedTokenizer`, + by which subclasses can track arguments for initialization automatically + and expose special tokens initialization used as attributes. + """ + tokenizer_config_file = "tokenizer_config.json" + pretrained_init_configuration = {} + resource_files_names = {} # keys are arguments of __init__ + pretrained_resource_files_map = {} + padding_side = 'right' + pad_token_type_id = 0 + + def __call__(self, + text, + text_pair=None, + max_seq_len: Optional[int]=None, + stride=0, + is_split_into_words=False, + pad_to_max_seq_len=False, + truncation_strategy="longest_first", + return_position_ids=False, + return_token_type_ids=True, + return_attention_mask=False, + return_length=False, + return_overflowing_tokens=False, + return_special_tokens_mask=False): + """ + Performs tokenization and uses the tokenized tokens to prepare model + inputs. It supports sequence or sequence pair as input, and batch input + is allowed. `self.encode()` or `self.batch_encode()` would be called + separately for single or batch input depending on input format and + `is_split_into_words` argument. + Args: + text (str, List[str] or List[List[str]]): + The sequence or batch of sequences to be processed. One sequence + is a string or a list of strings depending on whether it has been + pretokenized. If each sequence is provided as a list of strings + (pretokenized), you must set `is_split_into_words` as `True` to + disambiguate with a batch of sequences. + text_pair (str, List[str] or List[List[str]], optional): + Same as `text` argument, while it represents for the latter + sequence of the sequence pair. + max_seq_len (int, optional): + If set to a number, will limit the total sequence returned so + that it has a maximum length. If there are overflowing tokens, + those overflowing tokens will be added to the returned dictionary + when `return_overflowing_tokens` is `True`. Defaults to `None`. + stride (int, optional): + Only available for batch input of sequence pair and mainly for + question answering usage. When for QA, `text` represents questions + and `text_pair` represents contexts. If `stride` is set to a + positive number, the context will be split into multiple spans + where `stride` defines the number of (tokenized) tokens to skip + from the start of one span to get the next span, thus will produce + a bigger batch than inputs to include all spans. Moreover, 'overflow_to_sample' + and 'offset_mapping' preserving the original example and position + information will be added to the returned dictionary. Defaults to 0. + pad_to_max_seq_len (bool, optional): + If set to `True`, the returned sequences would be padded up to + `max_seq_len` specified length according to padding side + (`self.padding_side`) and padding token id. Defaults to `False`. + truncation_strategy (str, optional): + String selected in the following options: + - 'longest_first' (default) Iteratively reduce the inputs sequence + until the input is under `max_seq_len` starting from the longest + one at each token (when there is a pair of input sequences). + - 'only_first': Only truncate the first sequence. 
+ - 'only_second': Only truncate the second sequence. + - 'do_not_truncate': Do not truncate (raise an error if the input + sequence is longer than `max_seq_len`). + Defaults to 'longest_first'. + return_position_ids (bool, optional): + Whether to include tokens position ids in the returned dictionary. + Defaults to `False`. + return_token_type_ids (bool, optional): + Whether to include token type ids in the returned dictionary. + Defaults to `True`. + return_attention_mask (bool, optional): + Whether to include the attention mask in the returned dictionary. + Defaults to `False`. + return_length (bool, optional): + Whether to include the length of each encoded inputs in the + returned dictionary. Defaults to `False`. + return_overflowing_tokens (bool, optional): + Whether to include overflowing token information in the returned + dictionary. Defaults to `False`. + return_special_tokens_mask (bool, optional): + Whether to include special tokens mask information in the returned + dictionary. Defaults to `False`. + Returns: + dict or list[dict] (for batch input): + The dict has the following optional items: + - **input_ids** (list[int]): List of token ids to be fed to a model. + - **position_ids** (list[int], optional): List of token position ids to be + fed to a model. Included when `return_position_ids` is `True` + - **token_type_ids** (list[int], optional): List of token type ids to be + fed to a model. Included when `return_token_type_ids` is `True`. + - **attention_mask** (list[int], optional): List of integers valued 0 or 1, + where 0 specifies paddings and should not be attended to by the + model. Included when `return_attention_mask` is `True`. + - **seq_len** (int, optional): The input_ids length. Included when `return_length` + is `True`. + - **overflowing_tokens** (list[int], optional): List of overflowing tokens. + Included when if `max_seq_len` is specified and `return_overflowing_tokens` + is True. + - **num_truncated_tokens** (int, optional): The number of overflowing tokens. + Included when if `max_seq_len` is specified and `return_overflowing_tokens` + is True. + - **special_tokens_mask** (list[int], optional): List of integers valued 0 or 1, + with 0 specifying special added tokens and 1 specifying sequence tokens. + Included when `return_special_tokens_mask` is `True`. + - **offset_mapping** (list[int], optional): list of pair preserving the + index of start and end char in original input for each token. + For a special token, the index pair is `(0, 0)`. Included when + `stride` works. + - **overflow_to_sample** (int, optional): Index of example from which this + feature is generated. Included when `stride` works. 
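A short usage sketch of this `__call__` dispatch, following the unit test earlier in this patch: a single string is encoded on its own and yields one dict, while a list of strings is treated as a batch and yields a list of dicts.

from tokenizer.bert_tokenizer import BertTokenizer  # import path as in the unit test above

tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")

# Single example: returns one dict with input_ids / token_type_ids.
single = tokenizer("很好的地理位置", max_seq_len=16, pad_to_max_seq_len=True)
print(single["input_ids"], single["token_type_ids"])

# Batch input: returns one dict per example.
batch = tokenizer(["很好的地理位置", "房间太小"], max_seq_len=16,
                  pad_to_max_seq_len=True)
print(batch[0]["input_ids"], batch[1]["input_ids"])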
+ """ + # Input type checking for clearer error + assert isinstance(text, str) or ( + isinstance(text, (list, tuple)) and (len(text) == 0 or ( + isinstance(text[0], str) or + (isinstance(text[0], (list, tuple)) and + (len(text[0]) == 0 or isinstance(text[0][0], str))))) + ), ("text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) " + "or `List[List[str]]` (batch of pretokenized examples).") + + assert (text_pair is None or isinstance(text_pair, str) or ( + isinstance(text_pair, (list, tuple)) and (len(text_pair) == 0 or ( + isinstance(text_pair[0], str) or + (isinstance(text_pair[0], (list, tuple)) and + (len(text_pair[0]) == 0 or isinstance(text_pair[0][0], str))))) + )), ( + "text_pair input must of type `str` (single example), `List[str]` (batch or single pretokenized example) " + "or `List[List[str]]` (batch of pretokenized examples).") + + is_batched = bool( + (not is_split_into_words and isinstance(text, (list, tuple))) or + (is_split_into_words and isinstance(text, (list, tuple)) and + text and isinstance(text[0], (list, tuple)))) + + if is_batched: + batch_text_or_text_pairs = list(zip( + text, text_pair)) if text_pair is not None else text + return self.batch_encode( + batch_text_or_text_pairs=batch_text_or_text_pairs, + max_seq_len=max_seq_len, + stride=stride, + is_split_into_words=is_split_into_words, + pad_to_max_seq_len=pad_to_max_seq_len, + truncation_strategy="longest_first", + return_position_ids=return_position_ids, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_length=return_length, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask) + else: + return self.encode( + text=text, + text_pair=text_pair, + max_seq_len=max_seq_len, + pad_to_max_seq_len=pad_to_max_seq_len, + truncation_strategy="longest_first", + return_position_ids=return_position_ids, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_length=return_length, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask) + + @property + def all_special_tokens(self): + """ + list: All the special tokens ('', ''...) corresponding to + special token arguments in `__init__` (arguments end with '_end'). + """ + all_toks = [] + set_attr = self.special_tokens_map + for attr_value in set_attr.values(): + all_toks = all_toks + (list(attr_value) if isinstance(attr_value, ( + list, tuple)) else [attr_value]) + all_toks = list(set(all_toks)) + return all_toks + + @property + def all_special_ids(self): + """ + list: All the token ids corresponding to all the special tokens. + """ + all_toks = self.all_special_tokens + all_ids = self.convert_tokens_to_ids(all_toks) + return all_ids + + def convert_tokens_to_ids(self, tokens): + """ + Converts a sequence of tokens into ids using the `vocab` attribute (an + instance of `Vocab`). Override it if needed. + Args: + tokens (list[int]): List of token ids. + Returns: + list: Converted id list. 
+ """ + if isinstance(tokens, list): + token_ids = [] + for token in tokens: + token_id = self.vocab.get(token, self.unk_token_id) + token_ids.append(token_id) + return token_ids + elif isinstance(tokens, str): + token_id = self.vocab.get(tokens, self.unk_token_id) + token_ids.append(token_id) + return token_ids + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): + """ + Creates an instance of `PretrainedTokenizer`. Related resources are loaded + by specifying name of a built-in pretrained model, or a community-contributed + pretrained model, or a local file directory path. + Args: + pretrained_model_name_or_path (str): Name of pretrained model or dir path + to load from. The string can be: + - Name of built-in pretrained model + - Name of a community-contributed pretrained model. + - Local directory path which contains tokenizer related resources + and tokenizer config file ("tokenizer_config.json"). + *args (tuple): position arguments for model `__init__`. If provided, + use these as position argument values for tokenizer initialization. + **kwargs (dict): keyword arguments for model `__init__`. If provided, + use these to update pre-defined keyword argument values for tokenizer + initialization. + Returns: + PretrainedTokenizer: An instance of `PretrainedTokenizer`. + Example: + .. code-block:: + from paddlenlp.transformers import BertTokenizer + # Name of built-in pretrained model + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + # Name of community-contributed pretrained model + tokenizer = BertTokenizer.from_pretrained('yingyibiao/bert-base-uncased-sst-2-finetuned') + # Load from local directory path + tokenizer = BertTokenizer.from_pretrained('./my_bert/') + """ + pretrained_models = list(cls.pretrained_init_configuration.keys()) + vocab_files = {} + init_configuration = {} + # From built-in pretrained models + if pretrained_model_name_or_path in pretrained_models: + for file_id, map_list in cls.pretrained_resource_files_map.items(): + vocab_files[file_id] = map_list[pretrained_model_name_or_path] + init_configuration = copy.deepcopy( + cls.pretrained_init_configuration[ + pretrained_model_name_or_path]) + # From local dir path + elif os.path.isdir(pretrained_model_name_or_path): + for file_id, file_name in cls.resource_files_names.items(): + full_file_name = os.path.join(pretrained_model_name_or_path, + file_name) + vocab_files[file_id] = full_file_name + vocab_files["tokenizer_config_file"] = os.path.join( + pretrained_model_name_or_path, cls.tokenizer_config_file) + + default_root = os.path.join(DATA_HOME, pretrained_model_name_or_path) + resolved_vocab_files = {} + for file_id, file_path in vocab_files.items(): + if file_path is None or os.path.isfile(file_path): + resolved_vocab_files[file_id] = file_path + continue + path = os.path.join(default_root, file_path.split('/')[-1]) + if os.path.exists(path): + print("Already cached %s" % path) + resolved_vocab_files[file_id] = path + else: + print("Downloading %s and saved to %s" % + (file_path, default_root)) + try: + resolved_vocab_files[file_id] = get_path_from_url( + file_path, default_root) + except RuntimeError as err: + print(err) + raise RuntimeError( + f"Can't load tokenizer for '{pretrained_model_name_or_path}'.\n" + f"Please make sure that '{pretrained_model_name_or_path}' is:\n" + "- a correct model-identifier of built-in pretrained models,\n" + "- or a correct model-identifier of community-contributed pretrained models,\n" + "- or the correct path to a directory 
containing relevant tokenizer files.\n" + ) + + # Prepare tokenizer initialization kwargs + # Did we saved some inputs and kwargs to reload ? + tokenizer_config_file = resolved_vocab_files.pop( + "tokenizer_config_file", None) + if tokenizer_config_file is not None: + with io.open(tokenizer_config_file, encoding="utf-8") as f: + init_kwargs = json.load(f) + else: + init_kwargs = init_configuration + # position args are stored in kwargs, maybe better not include + init_args = init_kwargs.pop("init_args", ()) + init_kwargs.pop("init_class", None) + + # Update with newly provided args and kwargs + init_args = init_args if not args else args + init_kwargs.update(kwargs) + + # Merge resolved_vocab_files arguments in init_kwargs if not including. + # Maybe need more ways to load resources. + for args_name, file_path in resolved_vocab_files.items(): + # when `pretrained_model_name_or_path` is a pretrained model name, + # use pretrained_init_configuration as `init_kwargs` to init which + # does not include the vocab file in it, thus add vocab file into + # args. + if args_name not in init_kwargs: + init_kwargs[args_name] = file_path + # when `pretrained_model_name_or_path` is a pretrained model dir, + # use tokenizer_config_file.json as `init_kwargs` to init which + # does include a vocab file path in it. However, if the vocab file + # path included in json does not exist, such as was deleted, to make + # it still work, use the vocab file under this dir. + elif not os.path.isfile(init_kwargs[args_name]) and os.path.isfile( + file_path): + init_kwargs[args_name] = file_path + # TODO(guosheng): avoid reduplication of position args and key word args + tokenizer = cls(*init_args, **init_kwargs) + return tokenizer + + def save_pretrained(self, save_directory): + """ + Save tokenizer configuration and related resources to files under + `save_directory`. The tokenizer configuration would be saved into + `tokenizer_config_file` indicating file (thus `tokenizer_config.json`), + and resources would be saved into `resource_files_names` indicating files + by using `self.save_resources(save_directory)`. + + The `save_directory` can be used in `from_pretrained` as argument value + of `pretrained_model_name_or_path` to re-load the tokenizer. + Args: + save_directory (str): Directory to save files into. + Example: + .. code-block:: + from paddlenlp.transformers import BertTokenizer + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + tokenizer.save_pretrained('trained_model') + # reload from save_directory + tokenizer = BertTokenizer.from_pretrained('trained_model') + """ + assert not os.path.isfile( + save_directory + ), "Saving directory ({}) should be a directory, not a file".format( + save_directory) + os.makedirs(save_directory, exist_ok=True) + + tokenizer_config_file = os.path.join(save_directory, + self.tokenizer_config_file) + # init_config is set in metaclass created `__init__`, + tokenizer_config = self.init_config + with io.open(tokenizer_config_file, "w", encoding="utf-8") as f: + f.write(json.dumps(tokenizer_config, ensure_ascii=False)) + + self.save_resources(save_directory) + + def save_resources(self, save_directory): + """ + Save tokenizer related resources to `resource_files_names` indicating + files under `save_directory` by copying directly. Override it if necessary. + Args: + save_directory (str): Directory to save files into. 
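+        The snippet below is a minimal sketch; the directory name is arbitrary and
+        is created beforehand, since this method only copies files.
+        Example:
+            .. code-block::
+                import os
+                from paddlenlp.transformers import BertTokenizer
+                tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+                os.makedirs('./my_bert_resources/', exist_ok=True)
+                tokenizer.save_resources('./my_bert_resources/')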
+ """ + for name, file_name in self.resource_files_names.items(): + src_path = self.init_config[name] + dst_path = os.path.join(save_directory, file_name) + if os.path.abspath(src_path) != os.path.abspath(dst_path): + copyfile(src_path, dst_path) + + @staticmethod + def load_vocabulary(filepath, + unk_token=None, + pad_token=None, + bos_token=None, + eos_token=None, + **kwargs): + """ + Instantiate an instance of `Vocab` from a file reserving all tokens + by using `Vocab.from_dict`. The file contains a token per line, and the + line number would be the index of corresponding token. + Args: + filepath (str): path of file to construct vocabulary. + unk_token (str): special token for unknown token. If no need, it also + could be `None`. Defaults to `None`. + pad_token (str): special token for padding token. If no need, it also + could be `None`. Defaults to `None`. + bos_token (str): special token for bos token. If no need, it also + could be `None`. Defaults to `None`. + eos_token (str): special token for eos token. If no need, it also + could be `None`. Defaults to `None`. + **kwargs (dict): keyword arguments for `Vocab.from_dict`. + Returns: + Vocab: An instance of `Vocab`. + """ + token_to_idx = {} + with io.open(filepath, 'r', encoding='utf-8') as f: + for index, line in enumerate(f): + token = line.rstrip('\n') + token_to_idx[token] = int(index) + return token_to_idx + + def __getattr__(self, name): + if name.endswith('_token'): + return self.special_tokens_map[name] + elif name.endswith('_token_id'): + return self.vocab[self.special_tokens_map[name[:-3]]] + raise AttributeError("'{}' object has no attribute '{}'".format( + type(self).__name__, name)) + + def truncate_sequences(self, + ids, + pair_ids=None, + num_tokens_to_remove=0, + truncation_strategy='longest_first', + stride=0): + """ + Truncates a sequence pair in place to the maximum length. + Args: + ids: list of tokenized input ids. Can be obtained from a string by chaining the + `tokenize` and `convert_tokens_to_ids` methods. + pair_ids: Optional second list of input ids. Can be obtained from a string by chaining the + `tokenize` and `convert_tokens_to_ids` methods. + num_tokens_to_remove (:obj:`int`, `optional`, defaults to ``0``): + number of tokens to remove using the truncation strategy + truncation_strategy: string selected in the following options: + - 'longest_first' (default) Iteratively reduce the inputs sequence until the input is under max_seq_len + starting from the longest one at each token (when there is a pair of input sequences). + Overflowing tokens only contains overflow from the first sequence. + - 'only_first': Only truncate the first sequence. raise an error if the first sequence is shorter or equal to than num_tokens_to_remove. + - 'only_second': Only truncate the second sequence + - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_seq_len) + stride (:obj:`int`, `optional`, defaults to ``0``): + If set to a number along with max_seq_len, the overflowing tokens returned will contain some tokens + from the main sequence returned. The value of this argument defines the number of additional tokens. 
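+        The example below is an illustration only, with made-up ids, showing the
+        default 'longest_first' strategy (`tokenizer` is any `PretrainedTokenizer`).
+        Example:
+            .. code-block::
+                ids, pair_ids, overflow = tokenizer.truncate_sequences(
+                    [1, 2, 3, 4, 5], pair_ids=[6, 7, 8], num_tokens_to_remove=3)
+                # Each step pops from `ids` while it is strictly longer than
+                # `pair_ids`, otherwise from `pair_ids`, so with these inputs:
+                # ids == [1, 2, 3], pair_ids == [6, 7], overflow == [4, 5]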
+ """ + if num_tokens_to_remove <= 0: + return ids, pair_ids, [] + + if truncation_strategy == 'longest_first': + overflowing_tokens = [] + for _ in range(num_tokens_to_remove): + if pair_ids is None or len(ids) > len(pair_ids): + overflowing_tokens = [ids[-1]] + overflowing_tokens + ids = ids[:-1] + else: + pair_ids = pair_ids[:-1] + window_len = min(len(ids), stride) + if window_len > 0: + overflowing_tokens = ids[-window_len:] + overflowing_tokens + elif truncation_strategy == 'only_first': + assert len(ids) > num_tokens_to_remove + window_len = min(len(ids), stride + num_tokens_to_remove) + overflowing_tokens = ids[-window_len:] + ids = ids[:-num_tokens_to_remove] + elif truncation_strategy == 'only_second': + assert pair_ids is not None and len(pair_ids) > num_tokens_to_remove + window_len = min(len(pair_ids), stride + num_tokens_to_remove) + overflowing_tokens = pair_ids[-window_len:] + pair_ids = pair_ids[:-num_tokens_to_remove] + elif truncation_strategy == 'do_not_truncate': + raise ValueError( + "Input sequence are too long for max_length. Please select a truncation strategy." + ) + else: + raise ValueError( + "Truncation_strategy should be selected in ['longest_first', 'only_first', 'only_second', 'do_not_truncate']" + ) + return (ids, pair_ids, overflowing_tokens) + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. + Should be overridden in a subclass if the model has a special way of building those. + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (:obj:`List[int]`, `optional`): + Optional second list of IDs for sequence pairs. + Returns: + List[int]: List of input_id with the appropriate special tokens. + """ + if token_ids_1 is None: + return token_ids_0 + + return token_ids_0 + token_ids_1 + + def build_offset_mapping_with_special_tokens(self, + offset_mapping_0, + offset_mapping_1=None): + """ + Build offset map from a pair of offset map by concatenating and adding offsets of special tokens. + Should be overridden in a subclass if the model has a special way of building those. + Args: + offset_mapping_0 (List[tuple]): + List of char offsets to which the special tokens will be added. + offset_mapping_1 (List[tuple], optional): + Optional second list of char offsets for offset mapping pairs. + Returns: + List[tuple]: List of char offsets with the appropriate offsets of special tokens. + """ + if offset_mapping_1 is None: + return offset_mapping_0 + + return offset_mapping_0 + offset_mapping_1 + + def get_special_tokens_mask(self, + token_ids_0, + token_ids_1=None, + already_has_special_tokens=False): + """ + Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer ``encode`` methods. + Args: + token_ids_0 (List[int]): List of ids of the first sequence. + token_ids_1 (List[int], optional): List of ids of the second sequence. + already_has_special_tokens (bool, optional): Whether or not the token list is already + formatted with special tokens for the model. Defaults to None. + Returns: + results (List[int]): The list of integers in the range [0, 1]: + 1 for a special token, 0 for a sequence token. 
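+        The base implementation adds no special tokens, so the mask is all zeros
+        (subclasses that add [CLS]/[SEP]-style tokens override this). A tiny sketch:
+        Example:
+            .. code-block::
+                # made-up ids; `tokenizer` is any `PretrainedTokenizer` instance
+                tokenizer.get_special_tokens_mask([5, 6], [7])  # -> [0, 0, 0]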
+ """ + return [0] * ((len(token_ids_1) + if token_ids_1 else 0) + len(token_ids_0)) + + def create_token_type_ids_from_sequences(self, + token_ids_0, + token_ids_1=None): + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. + Should be overridden in a subclass if the model has a special way of building those. + If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s). + Args: + token_ids_0 (List[int]): + List of IDs. + token_ids_1 (List[int], optional): + Optional second list of IDs for sequence pairs. + Returns: + List[int]: List of token_type_id according to the given sequence(s). + """ + if token_ids_1 is None: + return len(token_ids_0) * [0] + return [0] * len(token_ids_0) + [1] * len(token_ids_1) + + def num_special_tokens_to_add(self, pair): + """ + Returns the number of added tokens when encoding a sequence with special tokens. + Args: + pair (bool, optional): + Whether the number of added tokens should be computed in the case of a sequence pair or a single + sequence. Defaults to `False`. + Returns: + int: Number of special tokens added to sequences. + """ + token_ids_0 = [] + token_ids_1 = [] + return len( + self.build_inputs_with_special_tokens(token_ids_0, token_ids_1 + if pair else None)) + + def encode(self, + text, + text_pair=None, + max_seq_len=512, + pad_to_max_seq_len=False, + truncation_strategy="longest_first", + return_position_ids=False, + return_token_type_ids=True, + return_attention_mask=False, + return_length=False, + return_overflowing_tokens=False, + return_special_tokens_mask=False): + """ + Performs tokenization and uses the tokenized tokens to prepare model + inputs. It supports sequence or sequence pair as input, and batch input + is not allowed. + Args: + text (str, List[str] or List[int]): + The sequence to be processed. One sequence is a string, a list + of strings, or a list of integers depending on whether it has + been pretokenized and converted to ids. + text_pair (str, List[str] or List[List[str]]): + Same as `text` argument, while it represents for the latter + sequence of the sequence pair. + max_seq_len (int, optional): + If set to a number, will limit the total sequence returned so + that it has a maximum length. If there are overflowing tokens, + those overflowing tokens will be added to the returned dictionary + when `return_overflowing_tokens` is `True`. Defaults to `None`. + stride (int, optional): + Only available for batch input of sequence pair and mainly for + question answering usage. When for QA, `text` represents questions + and `text_pair` represents contexts. If `stride` is set to a + positive number, the context will be split into multiple spans + where `stride` defines the number of (tokenized) tokens to skip + from the start of one span to get the next span, thus will produce + a bigger batch than inputs to include all spans. Moreover, 'overflow_to_sample' + and 'offset_mapping' preserving the original example and position + information will be added to the returned dictionary. Defaults to 0. + pad_to_max_seq_len (bool, optional): + If set to `True`, the returned sequences would be padded up to + `max_seq_len` specified length according to padding side + (`self.padding_side`) and padding token id. Defaults to `False`. 
+ truncation_strategy (str, optional): + String selected in the following options: + - 'longest_first' (default) Iteratively reduce the inputs sequence + until the input is under `max_seq_len` starting from the longest + one at each token (when there is a pair of input sequences). + - 'only_first': Only truncate the first sequence. + - 'only_second': Only truncate the second sequence. + - 'do_not_truncate': Do not truncate (raise an error if the input + sequence is longer than `max_seq_len`). + Defaults to 'longest_first'. + return_position_ids (bool, optional): + Whether to include tokens position ids in the returned dictionary. + Defaults to `False`. + return_token_type_ids (bool, optional): + Whether to include token type ids in the returned dictionary. + Defaults to `True`. + return_attention_mask (bool, optional): + Whether to include the attention mask in the returned dictionary. + Defaults to `False`. + return_length (bool, optional): + Whether to include the length of each encoded inputs in the + returned dictionary. Defaults to `False`. + return_overflowing_tokens (bool, optional): + Whether to include overflowing token information in the returned + dictionary. Defaults to `False`. + return_special_tokens_mask (bool, optional): + Whether to include special tokens mask information in the returned + dictionary. Defaults to `False`. + Returns: + dict: + The dict has the following optional items: + - **input_ids** (list[int]): List of token ids to be fed to a model. + - **position_ids** (list[int], optional): List of token position ids to be + fed to a model. Included when `return_position_ids` is `True` + - **token_type_ids** (list[int], optional): List of token type ids to be + fed to a model. Included when `return_token_type_ids` is `True`. + - **attention_mask** (list[int], optional): List of integers valued 0 or 1, + where 0 specifies paddings and should not be attended to by the + model. Included when `return_attention_mask` is `True`. + - **seq_len** (int, optional): The input_ids length. Included when `return_length` + is `True`. + - **overflowing_tokens** (list[int], optional): List of overflowing tokens. + Included when if `max_seq_len` is specified and `return_overflowing_tokens` + is True. + - **num_truncated_tokens** (int, optional): The number of overflowing tokens. + Included when if `max_seq_len` is specified and `return_overflowing_tokens` + is True. + - **special_tokens_mask** (list[int], optional): List of integers valued 0 or 1, + with 0 specifying special added tokens and 1 specifying sequence tokens. + Included when `return_special_tokens_mask` is `True`. + """ + + def get_input_ids(text): + if isinstance(text, str): + tokens = self._tokenize(text) + return self.convert_tokens_to_ids(tokens) + elif isinstance(text, + (list, tuple)) and len(text) > 0 and isinstance( + text[0], str): + return self.convert_tokens_to_ids(text) + elif isinstance(text, + (list, tuple)) and len(text) > 0 and isinstance( + text[0], int): + return text + else: + raise ValueError( + "Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers." 
+ ) + + ids = get_input_ids(text) + pair_ids = get_input_ids(text_pair) if text_pair is not None else None + + pair = bool(pair_ids is not None) + len_ids = len(ids) + len_pair_ids = len(pair_ids) if pair else 0 + + encoded_inputs = {} + + # Truncation: Handle max sequence length + total_len = len_ids + len_pair_ids + (self.num_special_tokens_to_add( + pair=pair)) + if max_seq_len and total_len > max_seq_len: + + ids, pair_ids, overflowing_tokens = self.truncate_sequences( + ids, + pair_ids=pair_ids, + num_tokens_to_remove=total_len - max_seq_len, + truncation_strategy=truncation_strategy, ) + if return_overflowing_tokens: + encoded_inputs["overflowing_tokens"] = overflowing_tokens + encoded_inputs["num_truncated_tokens"] = total_len - max_seq_len + + # Add special tokens + + sequence = self.build_inputs_with_special_tokens(ids, pair_ids) + token_type_ids = self.create_token_type_ids_from_sequences(ids, + pair_ids) + + # Build output dictionnary + encoded_inputs["input_ids"] = sequence + if return_token_type_ids: + encoded_inputs["token_type_ids"] = token_type_ids + if return_special_tokens_mask: + encoded_inputs[ + "special_tokens_mask"] = self.get_special_tokens_mask(ids, + pair_ids) + if return_length: + encoded_inputs["seq_len"] = len(encoded_inputs["input_ids"]) + + # Check lengths + assert max_seq_len is None or len(encoded_inputs[ + "input_ids"]) <= max_seq_len + + # Padding + needs_to_be_padded = pad_to_max_seq_len and \ + max_seq_len and len(encoded_inputs["input_ids"]) < max_seq_len + + if needs_to_be_padded: + difference = max_seq_len - len(encoded_inputs["input_ids"]) + if self.padding_side == 'right': + if return_attention_mask: + encoded_inputs["attention_mask"] = [1] * len(encoded_inputs[ + "input_ids"]) + [0] * difference + if return_token_type_ids: + encoded_inputs["token_type_ids"] = ( + encoded_inputs["token_type_ids"] + + [self.pad_token_type_id] * difference) + if return_special_tokens_mask: + encoded_inputs["special_tokens_mask"] = encoded_inputs[ + "special_tokens_mask"] + [1] * difference + encoded_inputs["input_ids"] = encoded_inputs[ + "input_ids"] + [self.pad_token_id] * difference + elif self.padding_side == 'left': + if return_attention_mask: + encoded_inputs["attention_mask"] = [0] * difference + [ + 1 + ] * len(encoded_inputs["input_ids"]) + if return_token_type_ids: + encoded_inputs["token_type_ids"] = ( + [self.pad_token_type_id] * difference + + encoded_inputs["token_type_ids"]) + if return_special_tokens_mask: + encoded_inputs["special_tokens_mask"] = [ + 1 + ] * difference + encoded_inputs["special_tokens_mask"] + encoded_inputs["input_ids"] = [ + self.pad_token_id + ] * difference + encoded_inputs["input_ids"] + else: + if return_attention_mask: + encoded_inputs["attention_mask"] = [1] * len(encoded_inputs[ + "input_ids"]) + + if return_position_ids: + encoded_inputs["position_ids"] = list( + range(len(encoded_inputs["input_ids"]))) + + return encoded_inputs + + def batch_encode(self, + batch_text_or_text_pairs, + max_seq_len=512, + pad_to_max_seq_len=False, + stride=0, + is_split_into_words=False, + truncation_strategy="longest_first", + return_position_ids=False, + return_token_type_ids=True, + return_attention_mask=False, + return_length=False, + return_overflowing_tokens=False, + return_special_tokens_mask=False): + """ + Performs tokenization and uses the tokenized tokens to prepare model + inputs. It supports batch inputs of sequence or sequence pair. 
+ Args: + batch_text_or_text_pairs (list): + The element of list can be sequence or sequence pair, and the + sequence is a string or a list of strings depending on whether + it has been pretokenized. If each sequence is provided as a list + of strings (pretokenized), you must set `is_split_into_words` as + `True` to disambiguate with a sequence pair. + max_seq_len (int, optional): + If set to a number, will limit the total sequence returned so + that it has a maximum length. If there are overflowing tokens, + those overflowing tokens will be added to the returned dictionary + when `return_overflowing_tokens` is `True`. Defaults to `None`. + stride (int, optional): + Only available for batch input of sequence pair and mainly for + question answering usage. When for QA, `text` represents questions + and `text_pair` represents contexts. If `stride` is set to a + positive number, the context will be split into multiple spans + where `stride` defines the number of (tokenized) tokens to skip + from the start of one span to get the next span, thus will produce + a bigger batch than inputs to include all spans. Moreover, 'overflow_to_sample' + and 'offset_mapping' preserving the original example and position + information will be added to the returned dictionary. Defaults to 0. + pad_to_max_seq_len (bool, optional): + If set to `True`, the returned sequences would be padded up to + `max_seq_len` specified length according to padding side + (`self.padding_side`) and padding token id. Defaults to `False`. + truncation_strategy (str, optional): + String selected in the following options: + - 'longest_first' (default) Iteratively reduce the inputs sequence + until the input is under `max_seq_len` starting from the longest + one at each token (when there is a pair of input sequences). + - 'only_first': Only truncate the first sequence. + - 'only_second': Only truncate the second sequence. + - 'do_not_truncate': Do not truncate (raise an error if the input + sequence is longer than `max_seq_len`). + Defaults to 'longest_first'. + return_position_ids (bool, optional): + Whether to include tokens position ids in the returned dictionary. + Defaults to `False`. + return_token_type_ids (bool, optional): + Whether to include token type ids in the returned dictionary. + Defaults to `True`. + return_attention_mask (bool, optional): + Whether to include the attention mask in the returned dictionary. + Defaults to `False`. + return_length (bool, optional): + Whether to include the length of each encoded inputs in the + returned dictionary. Defaults to `False`. + return_overflowing_tokens (bool, optional): + Whether to include overflowing token information in the returned + dictionary. Defaults to `False`. + return_special_tokens_mask (bool, optional): + Whether to include special tokens mask information in the returned + dictionary. Defaults to `False`. + Returns: + list[dict]: + The dict has the following optional items: + - **input_ids** (list[int]): List of token ids to be fed to a model. + - **position_ids** (list[int], optional): List of token position ids to be + fed to a model. Included when `return_position_ids` is `True` + - **token_type_ids** (list[int], optional): List of token type ids to be + fed to a model. Included when `return_token_type_ids` is `True`. + - **attention_mask** (list[int], optional): List of integers valued 0 or 1, + where 0 specifies paddings and should not be attended to by the + model. Included when `return_attention_mask` is `True`. 
+ - **seq_len** (int, optional): The input_ids length. Included when `return_length` + is `True`. + - **overflowing_tokens** (list[int], optional): List of overflowing tokens. + Included when if `max_seq_len` is specified and `return_overflowing_tokens` + is True. + - **num_truncated_tokens** (int, optional): The number of overflowing tokens. + Included when if `max_seq_len` is specified and `return_overflowing_tokens` + is True. + - **special_tokens_mask** (list[int], optional): List of integers valued 0 or 1, + with 0 specifying special added tokens and 1 specifying sequence tokens. + Included when `return_special_tokens_mask` is `True`. + - **offset_mapping** (list[int], optional): list of pair preserving the + index of start and end char in original input for each token. + For a sqecial token, the index pair is `(0, 0)`. Included when + `stride` works. + - **overflow_to_sample** (int, optional): Index of example from which this + feature is generated. Included when `stride` works. + """ + + def get_input_ids(text): + if isinstance(text, str): + tokens = self._tokenize(text) + return self.convert_tokens_to_ids(tokens) + elif isinstance(text, + (list, tuple)) and len(text) > 0 and isinstance( + text[0], str): + return self.convert_tokens_to_ids(text) + elif isinstance(text, + (list, tuple)) and len(text) > 0 and isinstance( + text[0], int): + return text + else: + raise ValueError( + "Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers." + ) + + batch_encode_inputs = [] + for example_id, tokens_or_pair_tokens in enumerate( + batch_text_or_text_pairs): + if not isinstance(tokens_or_pair_tokens, (list, tuple)): + text, text_pair = tokens_or_pair_tokens, None + elif is_split_into_words and not isinstance( + tokens_or_pair_tokens[0], (list, tuple)): + text, text_pair = tokens_or_pair_tokens, None + else: + text, text_pair = tokens_or_pair_tokens + + first_ids = get_input_ids(text) + second_ids = get_input_ids( + text_pair) if text_pair is not None else None + + if stride > 0 and second_ids is not None: + + max_len_for_pair = max_seq_len - len( + first_ids) - self.num_special_tokens_to_add(pair=True) + + token_offset_mapping = self.get_offset_mapping(text) + token_pair_offset_mapping = self.get_offset_mapping(text_pair) + + offset = 0 + while offset < len(second_ids): + encoded_inputs = {} + length = len(second_ids) - offset + if length > max_len_for_pair: + length = max_len_for_pair + + ids = first_ids + pair_ids = second_ids[offset:offset + length] + + mapping = token_offset_mapping + pair_mapping = token_pair_offset_mapping[offset:offset + + length] + + offset_mapping = self.build_offset_mapping_with_special_tokens( + mapping, pair_mapping) + sequence = self.build_inputs_with_special_tokens(ids, + pair_ids) + token_type_ids = self.create_token_type_ids_from_sequences( + ids, pair_ids) + + # Build output dictionnary + encoded_inputs["input_ids"] = sequence + if return_token_type_ids: + encoded_inputs["token_type_ids"] = token_type_ids + if return_special_tokens_mask: + encoded_inputs[ + "special_tokens_mask"] = self.get_special_tokens_mask( + ids, pair_ids) + if return_length: + encoded_inputs["seq_len"] = len(encoded_inputs[ + "input_ids"]) + + # Check lengths + assert max_seq_len is None or len(encoded_inputs[ + "input_ids"]) <= max_seq_len + + # Padding + needs_to_be_padded = pad_to_max_seq_len and \ + max_seq_len and len(encoded_inputs["input_ids"]) < max_seq_len + + encoded_inputs['offset_mapping'] = offset_mapping + + if needs_to_be_padded: + 
difference = max_seq_len - len(encoded_inputs[ + "input_ids"]) + if self.padding_side == 'right': + if return_attention_mask: + encoded_inputs["attention_mask"] = [1] * len( + encoded_inputs[ + "input_ids"]) + [0] * difference + if return_token_type_ids: + # 0 for padding token mask + encoded_inputs["token_type_ids"] = ( + encoded_inputs["token_type_ids"] + + [self.pad_token_type_id] * difference) + if return_special_tokens_mask: + encoded_inputs[ + "special_tokens_mask"] = encoded_inputs[ + "special_tokens_mask"] + [1 + ] * difference + encoded_inputs["input_ids"] = encoded_inputs[ + "input_ids"] + [self.pad_token_id] * difference + encoded_inputs['offset_mapping'] = encoded_inputs[ + 'offset_mapping'] + [(0, 0)] * difference + elif self.padding_side == 'left': + if return_attention_mask: + encoded_inputs["attention_mask"] = [ + 0 + ] * difference + [1] * len(encoded_inputs[ + "input_ids"]) + if return_token_type_ids: + # 0 for padding token mask + encoded_inputs["token_type_ids"] = ( + [self.pad_token_type_id] * difference + + encoded_inputs["token_type_ids"]) + if return_special_tokens_mask: + encoded_inputs["special_tokens_mask"] = [ + 1 + ] * difference + encoded_inputs[ + "special_tokens_mask"] + encoded_inputs["input_ids"] = [ + self.pad_token_id + ] * difference + encoded_inputs["input_ids"] + encoded_inputs['offset_mapping'] = [ + (0, 0) + ] * difference + encoded_inputs['offset_mapping'] + else: + if return_attention_mask: + encoded_inputs["attention_mask"] = [1] * len( + encoded_inputs["input_ids"]) + + if return_position_ids: + encoded_inputs["position_ids"] = list( + range(len(encoded_inputs["input_ids"]))) + + encoded_inputs['overflow_to_sample'] = example_id + batch_encode_inputs.append(encoded_inputs) + if offset + length == len(second_ids): + break + offset += min(length, stride) + + else: + batch_encode_inputs.append( + self.encode( + first_ids, + second_ids, + max_seq_len=max_seq_len, + pad_to_max_seq_len=pad_to_max_seq_len, + truncation_strategy=truncation_strategy, + return_position_ids=return_position_ids, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_length=return_length, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask)) + + return batch_encode_inputs + + def get_offset_mapping(self, text): + """ + Returns the map of tokens and the start and end index of their start and end character. + Modified from https://github.com/bojone/bert4keras/blob/master/bert4keras/tokenizers.py#L372 + Args: + text (str): + Input text. + Returns: + list: The offset map of input text. 
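+        The example below is a sketch only: the actual split depends on the
+        vocabulary, so the tokens and offsets shown are hypothetical.
+        Example:
+            .. code-block::
+                # assuming "unaffable" is wordpiece-split into un / ##aff / ##able
+                tokenizer.get_offset_mapping("unaffable")
+                # -> [(0, 2), (2, 5), (5, 9)]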
+ + """ + split_tokens = [] + for token in self.basic_tokenizer.tokenize(text): + for sub_token in self.wordpiece_tokenizer.tokenize(token): + split_tokens.append(sub_token + if sub_token != self.unk_token else token) + + normalized_text, char_mapping = '', [] + + for i, ch in enumerate(text): + if self.basic_tokenizer.do_lower_case: + ch = ch.lower() + ch = unicodedata.normalize('NFD', ch) + ch = ''.join([c for c in ch if unicodedata.category(c) != 'Mn']) + + ch = ''.join([ + c for c in ch + if not (ord(c) == 0 or ord(c) == 0xfffd or _is_control(c)) + ]) + normalized_text += ch + + char_mapping.extend([i] * len(ch)) + + text, token_mapping, offset = normalized_text, [], 0 + + for token in split_tokens: + if token[:2] == '##': + token = token[2:] + + start = text[offset:].index(token) + offset + end = start + len(token) + + token_mapping.append( + (char_mapping[start], char_mapping[end - 1] + 1)) + offset = end + + return token_mapping diff --git a/python/paddle/framework/io.py b/python/paddle/framework/io.py index 7fdce2af646765..8b72f05f363cba 100644 --- a/python/paddle/framework/io.py +++ b/python/paddle/framework/io.py @@ -43,7 +43,10 @@ def _build_saved_state_dict(state_dict): name_table = {} for key, value in state_dict.items(): if isinstance(value, (Variable, core.VarBase)): - save_dict[key] = value.numpy() + if value.type == core.VarDesc.VarType.VOCAB: + save_dict[key] = value.value().get_map_tensor() + else: + save_dict[key] = value.numpy() name_table[key] = value.name else: save_dict[key] = value @@ -938,8 +941,9 @@ def load(path, **configs): if "StructuredToParameterName@@" in load_result: for key in load_result["StructuredToParameterName@@"]: - load_result[key] = _ndarray_to_tensor( - load_result[key], config.return_numpy) + if isinstance(load_result[key], np.ndarray): + load_result[key] = _ndarray_to_tensor( + load_result[key], config.return_numpy) if not config.keep_name_table and "StructuredToParameterName@@" in load_result: del load_result["StructuredToParameterName@@"] From fc5db55a39efe1891c6d4baadf27e97536950334 Mon Sep 17 00:00:00 2001 From: Wilber Date: Wed, 20 Oct 2021 15:59:00 +0800 Subject: [PATCH 226/298] fix fc fuse proble (#36568) --- paddle/fluid/framework/ir/fc_fuse_pass.cc | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/framework/ir/fc_fuse_pass.cc b/paddle/fluid/framework/ir/fc_fuse_pass.cc index 4510aea925e788..bb78cdab677526 100644 --- a/paddle/fluid/framework/ir/fc_fuse_pass.cc +++ b/paddle/fluid/framework/ir/fc_fuse_pass.cc @@ -51,7 +51,12 @@ FCFusePass::FCFusePass() { .IsTensor() .End() .AddAttr("axis") - .IsNumGE(1) + .IsNumMatch([](int axis) -> bool { + if (axis == -1 || axis >= 1) { + return true; + } + return false; + }) .End(); AddOpCompat(OpCompat("relu")) From 6a572a194102a4c01a8b403bb25b86edd72476ff Mon Sep 17 00:00:00 2001 From: ronnywang <524019753@qq.com> Date: Wed, 20 Oct 2021 16:01:18 +0800 Subject: [PATCH 227/298] [NPU] Add kldiv_loss_op for npu (#36494) --- paddle/fluid/operators/kldiv_loss_op_npu.cc | 163 ++++++++++++++++++ .../unittests/npu/test_kldiv_loss_op_npu.py | 154 +++++++++++++++++ 2 files changed, 317 insertions(+) create mode 100644 paddle/fluid/operators/kldiv_loss_op_npu.cc create mode 100644 python/paddle/fluid/tests/unittests/npu/test_kldiv_loss_op_npu.py diff --git a/paddle/fluid/operators/kldiv_loss_op_npu.cc b/paddle/fluid/operators/kldiv_loss_op_npu.cc new file mode 100644 index 00000000000000..7d7cdd4c786712 --- /dev/null +++ b/paddle/fluid/operators/kldiv_loss_op_npu.cc @@ -0,0 
+1,163 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the Licnse. */ + +#include "paddle/fluid/operators/kldiv_loss_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class KLDivLossNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("X"); + auto* target = ctx.Input("Target"); + auto* loss = ctx.Output("Loss"); + auto reduction = ctx.Attr("reduction"); + loss->mutable_data(ctx.GetPlace()); + + auto& dev_ctx = ctx.template device_context(); + auto stream = dev_ctx.stream(); + + if ("none" == reduction) { + // log(label) + auto ones_tensor = ctx.AllocateTmpTensor( + target->dims(), dev_ctx); + const auto& ones_runner = + NpuOpRunner("OnesLike", {*target}, {ones_tensor}, {}); + ones_runner.Run(stream); + + auto sub_tensor = ctx.AllocateTmpTensor( + target->dims(), dev_ctx); + const auto& sub_runner = + NpuOpRunner("Sub", {*target, ones_tensor}, {sub_tensor}, {}); + sub_runner.Run(stream); + + auto log_target = ctx.AllocateTmpTensor( + target->dims(), dev_ctx); + const auto& log_runner = + NpuOpRunner("Log1p", {sub_tensor}, {log_target}, {}); + log_runner.Run(stream); + + // log(label) - input + const auto& sub_runner2 = + NpuOpRunner("Sub", {log_target, *input}, {*loss}, {}); + sub_runner2.Run(stream); + + // label * (log(label) - input) + auto min_value = + ctx.AllocateTmpTensor({1}, dev_ctx); + auto max_value = + ctx.AllocateTmpTensor({1}, dev_ctx); + FillNpuTensorWithConstant(&min_value, static_cast(0)); + FillNpuTensorWithConstant(&max_value, std::numeric_limits::max()); + + auto cliped_target = ctx.AllocateTmpTensor( + target->dims(), dev_ctx); + const auto& clip_runner = NpuOpRunner( + "ClipByValue", {*target, min_value, max_value}, {cliped_target}, {}); + clip_runner.Run(stream); + + const auto& mul_runner = + NpuOpRunner("Mul", {*loss, cliped_target}, {*loss}, {}); + mul_runner.Run(stream); + } else if ("batchmean" == reduction || "sum" == reduction) { + const auto& runner = NpuOpRunner("KLDiv", {*input, *target}, {*loss}, + {{"reduction", reduction}}); + runner.Run(stream); + } else if ("mean" == reduction) { + const auto& runner = NpuOpRunner("KLDiv", {*input, *target}, {*loss}, + {{"reduction", std::string("sum")}}); + runner.Run(stream); + + const int numel = input->numel(); + const auto& muls_runner = + NpuOpRunner("Muls", {*loss}, {*loss}, + {{"value", static_cast(1.0 / numel)}}); + muls_runner.Run(stream); + } + } +}; + +template +class KLDivLossGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* target = ctx.Input("Target"); + auto* loss_grad = ctx.Input(framework::GradVarName("Loss")); + auto* input_grad = ctx.Output(framework::GradVarName("X")); + auto reduction = ctx.Attr("reduction"); + input_grad->mutable_data(ctx.GetPlace()); + + auto& dev_ctx = ctx.template 
device_context(); + auto stream = dev_ctx.stream(); + + Tensor loss_grad_transformed; + if ("none" == reduction) { + loss_grad_transformed.ShareDataWith(*loss_grad); + } else { + loss_grad_transformed.mutable_data(input_grad->dims(), ctx.GetPlace()); + + NpuOpRunner broadcast_runner; + broadcast_runner.SetType("BroadcastTo"); + broadcast_runner.AddInput(*loss_grad); + broadcast_runner.AddInput(framework::vectorize(input_grad->dims())); + broadcast_runner.AddOutput(loss_grad_transformed); + broadcast_runner.Run(stream); + } + auto min_value = + ctx.AllocateTmpTensor({1}, dev_ctx); + auto max_value = + ctx.AllocateTmpTensor({1}, dev_ctx); + FillNpuTensorWithConstant(&min_value, static_cast(0)); + FillNpuTensorWithConstant(&max_value, std::numeric_limits::max()); + + auto cliped_target = ctx.AllocateTmpTensor( + target->dims(), dev_ctx); + const auto& clip_runner = NpuOpRunner( + "ClipByValue", {*target, min_value, max_value}, {cliped_target}, {}); + clip_runner.Run(stream); + + const auto& mul_runner = NpuOpRunner( + "Mul", {cliped_target, loss_grad_transformed}, {*input_grad}, {}); + mul_runner.Run(stream); + + float k = -1.0f; + + if ("mean" == reduction) { + k = static_cast(-1.0 / input_grad->numel()); + } else if ("batchmean" == reduction) { + k = static_cast(-1.0 / input_grad->dims()[0]); + } + + const auto& muls_runner = + NpuOpRunner("Muls", {*input_grad}, {*input_grad}, {{"value", k}}); + muls_runner.Run(stream); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(kldiv_loss, ops::KLDivLossNPUKernel, + ops::KLDivLossNPUKernel); + +REGISTER_OP_NPU_KERNEL(kldiv_loss_grad, ops::KLDivLossGradNPUKernel, + ops::KLDivLossGradNPUKernel); diff --git a/python/paddle/fluid/tests/unittests/npu/test_kldiv_loss_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_kldiv_loss_op_npu.py new file mode 100644 index 00000000000000..7ed1775fa5e6db --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_kldiv_loss_op_npu.py @@ -0,0 +1,154 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
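+
+# NOTE: the cases below reuse the numpy reference `kldiv_loss(x, target, reduction)`
+# from test_kldiv_loss_op. `x` is taken to be in log-space, and the reference
+# evaluates (roughly) target * (log(target) - x) before applying the chosen
+# reduction ('none' / 'batchmean' / 'mean' / 'sum'), which is what the NPU kernel
+# above is checked against.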
+ +from __future__ import print_function, division + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid +from test_kldiv_loss_op import kldiv_loss + +paddle.enable_static() + + +class TestKLDivLossOp(OpTest): + def set_npu(self): + self.__class__.use_npu = True + self.place = paddle.NPUPlace(0) + + def init_dtype(self): + self.dtype = 'float32' + + def setUp(self): + self.set_npu() + self.init_dtype() + self.initTestCase() + self.op_type = 'kldiv_loss' + x = np.random.uniform(-10, 10, self.x_shape).astype(self.dtype) + target = np.random.uniform(-10, 10, self.x_shape).astype(self.dtype) + + self.attrs = {"reduction": self.reduction} + + self.inputs = { + 'X': x, + 'Target': target, + } + loss = kldiv_loss(x, target, self.reduction) + self.outputs = {'Loss': loss.astype(self.dtype)} + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place( + self.place, ['X'], + 'Loss', + no_grad_set=set(["Target"]), + max_relative_error=0.15) + + def initTestCase(self): + self.x_shape = (4, 5, 5) + self.reduction = 'batchmean' + + +class TestKLDivLossOp2(TestKLDivLossOp): + def initTestCase(self): + self.x_shape = (3, 2, 7, 7) + self.reduction = 'none' + + +class TestKLDivLossOp3(TestKLDivLossOp): + def initTestCase(self): + self.x_shape = (2, 3, 5, 7, 9) + self.reduction = 'mean' + + +class TestKLDivLossOp4(TestKLDivLossOp): + def initTestCase(self): + self.x_shape = (5, 20) + self.reduction = 'sum' + + +class TestKLDivLossOp_fp16(TestKLDivLossOp): + def init_dtype(self): + self.dtype = 'float16' + + def test_check_output(self): + self.check_output_with_place(self.place, atol=3e-1) + + def test_check_grad(self): + input_grad = -self.inputs['Target'] * ( + self.inputs['Target'] > 0) / self.inputs['Target'].shape[0] + self.check_grad_with_place( + self.place, ['X'], + 'Loss', + no_grad_set=set(["Target"]), + max_relative_error=0.2, + user_defined_grads=[input_grad]) + + +class TestKLDivLossDygraph(unittest.TestCase): + def run_kl_loss(self, reduction, shape=(5, 20)): + x = np.random.uniform(-10, 10, shape).astype('float32') + target = np.random.uniform(-10, 10, shape).astype('float32') + gt_loss = kldiv_loss(x, target, reduction) + + with paddle.fluid.dygraph.guard(paddle.NPUPlace(0)): + kldiv_criterion = paddle.nn.KLDivLoss(reduction) + pred_loss = kldiv_criterion( + paddle.to_tensor(x), paddle.to_tensor(target)) + self.assertTrue(np.allclose(pred_loss.numpy(), gt_loss)) + + def test_kl_loss_batchmean(self): + self.run_kl_loss('batchmean') + + def test_kl_loss_batchmean_shape(self): + self.run_kl_loss('batchmean', ()) + + def test_kl_loss_mean(self): + self.run_kl_loss('mean') + + def test_kl_loss_sum(self): + self.run_kl_loss('sum') + + def test_kl_loss_none(self): + self.run_kl_loss('none') + + def test_kl_loss_static_api(self): + input = paddle.fluid.data(name='input', shape=[5, 20]) + label = paddle.fluid.data(name='label', shape=[5, 20]) + + pred_loss = paddle.nn.functional.kl_div(input, label) + + +class TestKLDivLossTypePromotion(unittest.TestCase): + def test_kl_div_promotion(self): + with paddle.fluid.dygraph.guard(paddle.NPUPlace(0)): + x1 = paddle.rand([5, 20], dtype='float32') + target1 = paddle.rand([5, 20], dtype='float32') + + kldiv_criterion = paddle.nn.KLDivLoss() + pred_loss1 = kldiv_criterion(x1, target1) + + x2 = paddle.rand([5, 20], dtype='float32') + target2 = paddle.rand([5, 20], dtype='float32') + pred_loss2 = 
paddle.nn.functional.kl_div(x2, target2) + + +if __name__ == "__main__": + unittest.main() From 17b4dd70a95b9eeec52237c8aa1c6b122b5e93a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=8E=E5=AD=A3?= <2042519524@qq.com> Date: Wed, 20 Oct 2021 16:13:22 +0800 Subject: [PATCH 228/298] Fix global gather and global scatter operators (#36517) * fix global gather and global scatter operators --- .../collective/global_scatter_op.cu.cc | 8 ++++---- python/paddle/distributed/utils.py | 20 +++++++------------ 2 files changed, 11 insertions(+), 17 deletions(-) diff --git a/paddle/fluid/operators/collective/global_scatter_op.cu.cc b/paddle/fluid/operators/collective/global_scatter_op.cu.cc index 64765b549e5c1f..bec984c6b57e19 100644 --- a/paddle/fluid/operators/collective/global_scatter_op.cu.cc +++ b/paddle/fluid/operators/collective/global_scatter_op.cu.cc @@ -47,8 +47,8 @@ class GlobalScatterOpCUDAKernel : public framework::OpKernel { if (platform::is_cpu_place(local_count->place())) { cpu_local_count_data = local_count->data(); } else { - framework::TensorCopy(*local_count, platform::CPUPlace(), - &cpu_local_count); + framework::TensorCopySync(*local_count, platform::CPUPlace(), + &cpu_local_count); cpu_local_count_data = cpu_local_count.data(); } auto global_count_len = 0; @@ -57,8 +57,8 @@ class GlobalScatterOpCUDAKernel : public framework::OpKernel { cpu_global_count_data = global_count->data(); global_count_len = global_count->numel(); } else { - framework::TensorCopy(*global_count, platform::CPUPlace(), - &cpu_global_count); + framework::TensorCopySync(*global_count, platform::CPUPlace(), + &cpu_global_count); cpu_global_count_data = cpu_global_count.data(); global_count_len = cpu_global_count.numel(); } diff --git a/python/paddle/distributed/utils.py b/python/paddle/distributed/utils.py index 63585e167e8e32..31d5748ce392e7 100644 --- a/python/paddle/distributed/utils.py +++ b/python/paddle/distributed/utils.py @@ -65,14 +65,11 @@ def global_scatter(x, to global_count. Args: - x (Tensor): Tensor. Every element in the list must be a Tensor whose data type - should be float16, float32, float64, int32 or int64. + x (Tensor): Tensor. The tensor data type should be float16, float32, float64, int32 or int64. local_count (Tensor): Tensor which have n_expert * world_size elements that indicates - how many data needed to be sent. Every element in the list must be a Tensor whose - data type should be int64. + how many data needed to be sent. The tensor data type should be int64. global_count (Tensor): Tensor which have n_expert * world_size elements that indicates - how many data needed to be received. Every element in the list must be a Tensor whose - data type should be int64. + how many data needed to be received. The tensor data type should be int64. group (Group, optional): The group instance return by new_group or None for global default group. Default: None. use_calc_stream (bool, optional): Wether to use calculation stream (True) or communication stream. Default: True. @@ -161,19 +158,16 @@ def global_gather(x, to global_count. Args: - x (Tensor): Tensor. Every element in the list must be a Tensor whose data type - should be float16, float32, float64, int32 or int64. + x (Tensor): Tensor. Tensor whose data type should be float16, float32, float64, int32 or int64. local_count (Tensor): Tensor which have n_expert * world_size elements that indicates - how many data needed to be received. Every element in the list must be a Tensor whose - data type should be int64. 
+ how many data needed to be received. Tensor data type should be int64. global_count (Tensor): Tensor which have n_expert * world_size elements that indicates - how many data needed to be sent. Every element in the list must be a Tensor whose - data type should be int64. + how many data needed to be sent. Tensor data type should be int64. group (Group, optional): The group instance return by new_group or None for global default group. Default: None. use_calc_stream (bool, optional): Wether to use calculation stream (True) or communication stream. Default: True. Returns: - None. + out (Tensor): The data received from all experts. Examples: .. code-block:: python From 6a3941e3cb9a1752df2374561a4defc7b908fa62 Mon Sep 17 00:00:00 2001 From: Haohongxiang <86215757+haohongxiang@users.noreply.github.com> Date: Wed, 20 Oct 2021 19:46:03 +0800 Subject: [PATCH 229/298] fix bugs of ClipGradByGlobalNorm in HybridParallel (#36555) * fix bugs of ClipGradByGlobalNorm * add unittests * add unittests --- .../hybrid_parallel_optimizer.py | 78 ++++++++++++++----- .../unittests/hybrid_parallel_mp_fp16.py | 59 ++++++++++++++ .../tests/unittests/hybrid_parallel_pp_amp.py | 4 + .../unittests/hybrid_parallel_pp_fp16.py | 4 + .../test_parallel_dygraph_tensor_parallel.py | 3 + 5 files changed, 128 insertions(+), 20 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/hybrid_parallel_mp_fp16.py diff --git a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py index 6cd875905864bd..e7108b3f4f3432 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py @@ -50,8 +50,11 @@ def __init__(self, clip, hcg): @imperative_base.no_grad def _dygraph_clip(self, params_grads): params_and_grads = [] - sum_square_list_dist = [] - sum_square_list_not_dist = [] + + sum_square_dist_fp16 = [] + sum_square_dist_fp32 = [] + sum_square_not_dist_fp16 = [] + sum_square_not_dist_fp32 = [] for p, g in params_grads: if g is None: @@ -71,20 +74,51 @@ def _dygraph_clip(self, params_grads): if not_shared_enable: if p.is_distributed: - sum_square_list_dist.append(sum_square) + if p.dtype == paddle.float16: + sum_square_dist_fp16.append(sum_square) + elif p.dtype == paddle.float32: + sum_square_dist_fp32.append(sum_square) else: - sum_square_list_not_dist.append(sum_square) - - global_norm_var_dist = layers.concat(sum_square_list_dist) if len( - sum_square_list_dist) != 0 else layers.concat( - [paddle.to_tensor([0.])]) - global_norm_var_dist = layers.reduce_sum(global_norm_var_dist) - - global_norm_var_not_dist = layers.concat( - sum_square_list_not_dist) if len( - sum_square_list_not_dist) != 0 else layers.concat( - [paddle.to_tensor([0.])]) - global_norm_var_not_dist = layers.reduce_sum(global_norm_var_not_dist) + if p.dtype == paddle.float16: + sum_square_not_dist_fp16.append(sum_square) + elif p.dtype == paddle.float32: + sum_square_not_dist_fp32.append(sum_square) + + # global norm of distributed FP16 params_and_grads + if len(sum_square_dist_fp16) == 0: + global_norm_dist_fp16 = paddle.to_tensor([0.], dtype=paddle.float32) + else: + global_norm_dist_fp16 = layers.concat(sum_square_dist_fp16) + global_norm_dist_fp16 = layers.reduce_sum(global_norm_dist_fp16) + global_norm_dist_fp16 = paddle.cast( + global_norm_dist_fp16, dtype=paddle.float32) 
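+            # The FP16 partial sums are reduced and cast to FP32 here so that the
+            # global-norm accumulation below is carried out entirely in FP32.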
+ + # global norm of non-distributed FP16 params_and_grads + if len(sum_square_not_dist_fp16) == 0: + global_norm_not_dist_fp16 = paddle.to_tensor( + [0.], dtype=paddle.float32) + else: + global_norm_not_dist_fp16 = layers.concat(sum_square_not_dist_fp16) + global_norm_not_dist_fp16 = layers.reduce_sum( + global_norm_not_dist_fp16) + global_norm_not_dist_fp16 = paddle.cast( + global_norm_not_dist_fp16, dtype=paddle.float32) + + # global norm of distributed FP32 params_and_grads + global_norm_dist_fp32 = layers.concat(sum_square_dist_fp32) if len( + sum_square_dist_fp32) != 0 else paddle.to_tensor( + [0.], dtype=paddle.float32) + global_norm_dist_fp32 = layers.reduce_sum(global_norm_dist_fp32) + + # global norm of non-distributed FP32 params_and_grads + global_norm_not_dist_fp32 = layers.concat( + sum_square_not_dist_fp32) if len( + sum_square_not_dist_fp32) != 0 else paddle.to_tensor( + [0.], dtype=paddle.float32) + global_norm_not_dist_fp32 = layers.reduce_sum(global_norm_not_dist_fp32) + + global_norm_var_dist = global_norm_dist_fp16 + global_norm_dist_fp32 + global_norm_var_not_dist = global_norm_not_dist_fp16 + global_norm_not_dist_fp32 # add all reduce to get global norm of distributed params_and_grads if self._hcg.get_model_parallel_world_size() > 1: @@ -105,22 +139,26 @@ def _dygraph_clip(self, params_grads): global_norm_var_not_dist, group=self._hcg.get_sharding_parallel_group()) - global_norm_var = layers.sqrt(global_norm_var_dist + - global_norm_var_not_dist) + global_norm_var_fp32 = layers.sqrt(global_norm_var_dist + + global_norm_var_not_dist) max_global_norm = layers.fill_constant( - shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm) + shape=[1], dtype=global_norm_var_fp32.dtype, value=self.clip_norm) clip_var = layers.elementwise_div( x=max_global_norm, y=layers.elementwise_max( - x=global_norm_var, y=max_global_norm)) + x=global_norm_var_fp32, y=max_global_norm)) + clip_var_fp16 = paddle.cast(clip_var, paddle.float16) for p, g in params_grads: if g is None: continue if getattr(p, 'need_clip', True) is False: params_and_grads.append((p, g)) continue - new_grad = layers.elementwise_mul(x=g, y=clip_var) + if p.dtype == paddle.float16: + new_grad = layers.elementwise_mul(x=g, y=clip_var_fp16) + else: + new_grad = layers.elementwise_mul(x=g, y=clip_var) params_and_grads.append((p, new_grad)) return params_and_grads diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_fp16.py b/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_fp16.py new file mode 100644 index 00000000000000..3e5eedbec9aea3 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_fp16.py @@ -0,0 +1,59 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
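+
+# NOTE: this test runs the tensor-parallel model from hybrid_parallel_mp_model
+# under pure-FP16 (AMP level 'O2') together with ClipGradByGlobalNorm and a
+# distributed GradScaler, exercising the mixed FP16/FP32 grad-clip path fixed above.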
+ +from __future__ import division +from __future__ import print_function + +import paddle +import numpy as np +from hybrid_parallel_mp_model import TestDistMPTraning +import paddle.distributed.fleet as fleet +import unittest + + +class TestMPFP16(TestDistMPTraning): + def build_optimizer(self, model): + grad_clip = paddle.nn.ClipGradByGlobalNorm(1.0) + scheduler = paddle.optimizer.lr.ExponentialDecay( + learning_rate=0.001, gamma=0.999, verbose=True) + optimizer = paddle.optimizer.SGD(scheduler, + grad_clip=grad_clip, + parameters=model.parameters()) + + model, optimizer = paddle.amp.decorate( + models=model, + optimizers=optimizer, + level='O2', + save_dtype='float32') + + return optimizer + + def train_batch(self, batch, model, optimizer, is_mp): + scaler = paddle.amp.GradScaler(init_loss_scaling=5160) + if is_mp: + scaler = fleet.distributed_scaler(scaler) + with paddle.amp.auto_cast(enable=True, level="O2"): + output = model(batch) + loss = output.mean() + + scaled = scaler.scale(loss) + scaled.backward() + scaler.step(optimizer) + scaler.update() + optimizer.clear_grad() + return scaled + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_amp.py b/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_amp.py index 33a04a5e7e1838..84d11670027fef 100644 --- a/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_amp.py +++ b/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_amp.py @@ -61,11 +61,14 @@ def test_pp_model(self): rank_id = dist.get_rank() set_random_seed(1024, dp_id, rank_id) + grad_clip = paddle.nn.ClipGradByGlobalNorm(1.0) + #construct model a model_a = AlexNet(10) scheduler_a = paddle.optimizer.lr.PiecewiseDecay( boundaries=[2], values=[0.001, 0.002], verbose=True) optimizer_a = paddle.optimizer.SGD(learning_rate=scheduler_a, + grad_clip=grad_clip, parameters=model_a.parameters()) scaler_a = paddle.amp.GradScaler(init_loss_scaling=2**5) @@ -80,6 +83,7 @@ def test_pp_model(self): scheduler_b = paddle.optimizer.lr.PiecewiseDecay( boundaries=[2], values=[0.001, 0.002], verbose=True) optimizer_b = paddle.optimizer.SGD(learning_rate=scheduler_b, + grad_clip=grad_clip, parameters=model_b.parameters()) model_b = fleet.distributed_model(model_b) optimizer_b = fleet.distributed_optimizer(optimizer_b) diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_fp16.py b/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_fp16.py index 571459365addfc..9042cdba976753 100644 --- a/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_fp16.py +++ b/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_fp16.py @@ -61,11 +61,14 @@ def test_pp_model(self): rank_id = dist.get_rank() set_random_seed(1024, dp_id, rank_id) + grad_clip = paddle.nn.ClipGradByGlobalNorm(1.0) + #construct model a model_a = AlexNet(10) scheduler_a = paddle.optimizer.lr.PiecewiseDecay( boundaries=[2], values=[0.001, 0.002], verbose=True) optimizer_a = paddle.optimizer.SGD(learning_rate=scheduler_a, + grad_clip=grad_clip, parameters=model_a.parameters()) scaler_a = paddle.amp.GradScaler(init_loss_scaling=2**5) @@ -75,6 +78,7 @@ def test_pp_model(self): scheduler_b = paddle.optimizer.lr.PiecewiseDecay( boundaries=[2], values=[0.001, 0.002], verbose=True) optimizer_b = paddle.optimizer.SGD(learning_rate=scheduler_b, + grad_clip=grad_clip, parameters=model_b.parameters()) param_len = len(model_a.parameters()) diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_tensor_parallel.py 
b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_tensor_parallel.py index 4b9d6764bbb3b6..3705deb5ad856f 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_tensor_parallel.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_tensor_parallel.py @@ -30,6 +30,9 @@ def test_hybrid_parallel_mp_model(self): def test_hybrid_parallel_mp_amp(self): self.run_mnist_2gpu('hybrid_parallel_mp_amp.py') + def test_hybrid_parallel_mp_fp16(self): + self.run_mnist_2gpu('hybrid_parallel_mp_fp16.py') + def test_hybrid_parallel_mp_clip_grad(self): self.run_mnist_2gpu('hybrid_parallel_mp_clip_grad.py') From ded3e705ef34e5660de17d8aeb7ded3818abb63b Mon Sep 17 00:00:00 2001 From: danleifeng <52735331+danleifeng@users.noreply.github.com> Date: Wed, 20 Oct 2021 20:21:19 +0800 Subject: [PATCH 230/298] [heterps]fix heterps pipeline training (#36512) * split into PreBuildTask and BuildPull; slove endpass bug;test=develop * change buildcpu into prebuild and buildcpu into build;test=develop --- .../fluid/framework/fleet/ps_gpu_wrapper.cc | 54 ++++++++++++------- paddle/fluid/framework/fleet/ps_gpu_wrapper.h | 19 +++---- 2 files changed, 45 insertions(+), 28 deletions(-) diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc index d3990c1f3dd769..4fb98e526d5fc4 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc @@ -40,7 +40,7 @@ namespace framework { std::shared_ptr PSGPUWrapper::s_instance_ = NULL; bool PSGPUWrapper::is_initialized_ = false; -void PSGPUWrapper::BuildTask(std::shared_ptr gpu_task) { +void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task) { VLOG(3) << "PSGPUWrapper::BuildGPUPSTask begin"; platform::Timer timeline; timeline.Start(); @@ -49,17 +49,7 @@ void PSGPUWrapper::BuildTask(std::shared_ptr gpu_task) { auto& local_keys = gpu_task->feature_keys_; auto& local_ptr = gpu_task->value_ptr_; - auto& device_keys = gpu_task->device_keys_; - auto& device_vals = gpu_task->device_values_; - auto& device_mutex = gpu_task->mutex_; - std::vector threads; -#ifdef PADDLE_WITH_PSLIB - auto fleet_ptr = FleetWrapper::GetInstance(); -#endif -#ifdef PADDLE_WITH_PSCORE - auto fleet_ptr = paddle::distributed::Communicator::GetInstance(); -#endif // data should be in input channel thread_keys_.resize(thread_keys_thread_num_); @@ -181,6 +171,25 @@ void PSGPUWrapper::BuildTask(std::shared_ptr gpu_task) { VLOG(3) << "GpuPs shard: " << i << " key len: " << local_keys[i].size(); local_ptr[i].resize(local_keys[i].size()); } +} + +void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { + platform::Timer timeline; + int device_num = heter_devices_.size(); + auto& local_keys = gpu_task->feature_keys_; + auto& local_ptr = gpu_task->value_ptr_; + + auto& device_keys = gpu_task->device_keys_; + auto& device_vals = gpu_task->device_values_; + auto& device_mutex = gpu_task->mutex_; + + std::vector threads(thread_keys_shard_num_); +#ifdef PADDLE_WITH_PSLIB + auto fleet_ptr = FleetWrapper::GetInstance(); +#endif +#ifdef PADDLE_WITH_PSCORE + auto fleet_ptr = paddle::distributed::Communicator::GetInstance(); +#endif #ifdef PADDLE_WITH_PSLIB // get day_id: day nums from 1970 @@ -482,29 +491,32 @@ void PSGPUWrapper::LoadIntoMemory(bool is_shuffle) { void PSGPUWrapper::start_build_thread() { running_ = true; VLOG(3) << "start build CPU&GPU ps thread."; - build_cpu_threads_ = std::thread([this] { build_cpu_thread(); }); - build_gpu_threads_ = std::thread([this] { build_gpu_thread(); }); 
+ pre_build_threads_ = std::thread([this] { pre_build_thread(); }); + build_threads_ = std::thread([this] { build_thread(); }); } -void PSGPUWrapper::build_cpu_thread() { +void PSGPUWrapper::pre_build_thread() { + // prebuild: process load_data while (running_) { std::shared_ptr gpu_task = nullptr; if (!data_ready_channel_->Get(gpu_task)) { continue; } - VLOG(3) << "thread BuildTask start."; + VLOG(3) << "thread PreBuildTask start."; platform::Timer timer; timer.Start(); // build cpu ps data process - BuildTask(gpu_task); + PreBuildTask(gpu_task); timer.Pause(); - VLOG(1) << "thread BuildTask end, cost time: " << timer.ElapsedSec() << "s"; + VLOG(1) << "thread PreBuildTask end, cost time: " << timer.ElapsedSec() + << "s"; buildcpu_ready_channel_->Put(gpu_task); } VLOG(3) << "build cpu thread end"; } -void PSGPUWrapper::build_gpu_thread() { +void PSGPUWrapper::build_thread() { + // build: build_pull + build_gputask while (running_) { std::shared_ptr gpu_task = nullptr; if (!gpu_free_channel_->Get(gpu_task)) { @@ -516,12 +528,14 @@ void PSGPUWrapper::build_gpu_thread() { VLOG(3) << "thread BuildGPUTask start."; platform::Timer timer; timer.Start(); + BuildPull(gpu_task); + timer.Pause(); + timer.Start(); BuildGPUTask(gpu_task); timer.Pause(); VLOG(1) << "thread BuildGPUTask end, cost time: " << timer.ElapsedSec() << "s"; - gpu_task_pool_.Push(gpu_task); train_ready_channel_->Put(gpu_task); } VLOG(3) << "build gpu thread end"; @@ -557,6 +571,8 @@ void PSGPUWrapper::EndPass() { if (keysize_max != 0) { HeterPs_->end_pass(); } + + gpu_task_pool_.Push(current_task_); current_task_ = nullptr; gpu_free_channel_->Put(current_task_); timer.Pause(); diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h index 6f785cad33e2d2..c1f83d2fe9274d 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h @@ -84,13 +84,14 @@ class PSGPUWrapper { const int batch_size); void BuildGPUTask(std::shared_ptr gpu_task); - void BuildTask(std::shared_ptr gpu_task); + void PreBuildTask(std::shared_ptr gpu_task); + void BuildPull(std::shared_ptr gpu_task); void LoadIntoMemory(bool is_shuffle); void BeginPass(); void EndPass(); void start_build_thread(); - void build_cpu_thread(); - void build_gpu_thread(); + void pre_build_thread(); + void build_thread(); void Finalize() { VLOG(3) << "PSGPUWrapper Begin Finalize."; @@ -102,10 +103,10 @@ class PSGPUWrapper { gpu_free_channel_->Close(); train_ready_channel_->Close(); running_ = false; - VLOG(3) << "begin stop build_cpu_threads_"; - build_cpu_threads_.join(); - VLOG(3) << "begin stop build_gpu_threads_"; - build_gpu_threads_.join(); + VLOG(3) << "begin stop pre_build_threads_"; + pre_build_threads_.join(); + VLOG(3) << "begin stop build_threads_"; + build_threads_.join(); s_instance_ = nullptr; VLOG(3) << "PSGPUWrapper Finalize Finished."; } @@ -310,8 +311,8 @@ class PSGPUWrapper { train_ready_channel_ = paddle::framework::MakeChannel>(); std::shared_ptr current_task_ = nullptr; - std::thread build_cpu_threads_; - std::thread build_gpu_threads_; + std::thread pre_build_threads_; + std::thread build_threads_; bool running_ = false; protected: From e82c3a5f6da3348845a65670d412d5607c7b9c14 Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Thu, 21 Oct 2021 10:10:49 +0800 Subject: [PATCH 231/298] Support No DataTransform From GetKernelTypeForVar (#36571) * Add kQueueSync.synchronize_run_ logic * Support No DataTransform From GetKernelTypeForVar --- 
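Note (commentary placed in this patch's "---" area, not part of the commit): below is a minimal standalone sketch of the bookkeeping this change introduces, written with made-up names. Inputs whose GetKernelTypeForVar place already matches the expected kernel place get no transform (memcpy) op; their variable ids are recorded in a no_data_transform_index set, and the stream analyzer then skips cross-stream event waits for exactly those variables. The Place enum and var ids here are illustrative stand-ins, not the Paddle API.

    #include <iostream>
    #include <unordered_set>
    #include <utility>
    #include <vector>

    enum class Place { kCPU, kGPU };

    int main() {
      // Hypothetical op inputs: (var_id, place the data currently lives on).
      std::vector<std::pair<int, Place>> inputs = {
          {0, Place::kGPU}, {1, Place::kCPU}, {2, Place::kGPU}};
      Place expected_kernel_place = Place::kGPU;

      // Step 1: record inputs that need no data transform (same place).
      std::unordered_set<int> no_data_transform_index;
      for (const auto& in : inputs) {
        if (in.second == expected_kernel_place) {
          no_data_transform_index.emplace(in.first);
        }
      }

      // Step 2: keep cross-stream events only for vars that were transformed,
      // mirroring the extra condition this patch adds in ParseEventVarIds.
      std::vector<int> event_var_ids;
      for (const auto& in : inputs) {
        if (no_data_transform_index.count(in.first) == 0) {
          event_var_ids.push_back(in.first);
        }
      }
      for (int id : event_var_ids) {
        std::cout << "event var: " << id << "\n";  // prints only var 1
      }
      return 0;
    }
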
.../fluid/framework/new_executor/interpretercore.cc | 2 ++ .../framework/new_executor/interpretercore_util.cc | 12 ++++++++++-- .../fluid/framework/new_executor/new_executor_defs.h | 3 +++ .../fluid/framework/new_executor/stream_analyzer.cc | 3 ++- 4 files changed, 17 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index f6157367cd4e2e..b26d213ddf7740 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -118,6 +118,8 @@ void InterpreterCore::Convert() { temp_inst.input_index_ = vec_func_list_[i].input_index; temp_inst.output_index_ = vec_func_list_[i].output_index; temp_inst.type_ = vec_func_list_[i].type_; + temp_inst.no_data_transform_index_ = + vec_func_list_[i].no_data_transform_index; OpInOutInfo info; diff --git a/paddle/fluid/framework/new_executor/interpretercore_util.cc b/paddle/fluid/framework/new_executor/interpretercore_util.cc index 3438fc3bd4dcd1..7bb0429c6228b2 100644 --- a/paddle/fluid/framework/new_executor/interpretercore_util.cc +++ b/paddle/fluid/framework/new_executor/interpretercore_util.cc @@ -278,6 +278,7 @@ void build_op_func_list(const platform::Place& place, // step 3. Insert memcpy_op if needed VariableValueMap& ins_map_temp = runtime_context.inputs; + std::unordered_set no_data_transform_index; for (auto& var_name_item : ins_map_temp) { for (size_t i = 0; i < var_name_item.second.size(); ++i) { auto var = var_name_item.second[i]; @@ -289,8 +290,14 @@ void build_op_func_list(const platform::Place& place, static_cast(op_base) ->GetKernelTypeForVar(var_name_item.first, *tensor_in, expected_kernel_key); - if (!platform::is_same_place(kernel_type_for_var.place_, - expected_kernel_key.place_)) { + if (platform::is_same_place(kernel_type_for_var.place_, + expected_kernel_key.place_)) { + // record no need data transformer input var_id + auto& var_name = inputs_names[var_name_item.first][i]; + VLOG(3) << op->Type() << " found no data_transform var: " << var_name + << " with id: " << var_scope->name2id[var_name]; + no_data_transform_index.emplace(var_scope->name2id[var_name]); + } else { if (op_base->Type() == "fetch_v2") { op_base->SetAttr("deepcopy", false); } @@ -385,6 +392,7 @@ void build_op_func_list(const platform::Place& place, } } } + op_func_node.no_data_transform_index = std::move(no_data_transform_index); // step 4. 
Run op kernel op_list->push_back(op_base); VLOG(3) << op_base->Type() diff --git a/paddle/fluid/framework/new_executor/new_executor_defs.h b/paddle/fluid/framework/new_executor/new_executor_defs.h index 19b7b6d5dc299f..e6cff353a659d7 100644 --- a/paddle/fluid/framework/new_executor/new_executor_defs.h +++ b/paddle/fluid/framework/new_executor/new_executor_defs.h @@ -511,6 +511,8 @@ struct Instruction { std::map> input_index_; std::map> output_index_; + std::unordered_set no_data_transform_index_; + std::vector gc_check_var_list; NextInstruction next_instruction_; @@ -527,6 +529,7 @@ struct OpFuncNode { // int unsed; std::map> input_index; std::map> output_index; + std::unordered_set no_data_transform_index; OpKernelComputeFunc kernel_func_; platform::DeviceContext* dev_ctx_; // not owned diff --git a/paddle/fluid/framework/new_executor/stream_analyzer.cc b/paddle/fluid/framework/new_executor/stream_analyzer.cc index a9322d8fc88edb..ffc2da499e1f7b 100644 --- a/paddle/fluid/framework/new_executor/stream_analyzer.cc +++ b/paddle/fluid/framework/new_executor/stream_analyzer.cc @@ -38,7 +38,8 @@ std::vector StreamAnalyzer::ParseEventVarIds( std::vector new_event_var_ids; for (auto& item : next_instr.input_index_) { for (auto var_id : item.second) { - if (unique_var_ids.count(var_id) > 0) { + if (unique_var_ids.count(var_id) > 0 && + next_instr.no_data_transform_index_.count(var_id) == 0) { new_event_var_ids.push_back(var_id); } } From 1d38a01347cc7017ba65d93a3283fd7eaa415e2a Mon Sep 17 00:00:00 2001 From: zhangbo9674 <82555433+zhangbo9674@users.noreply.github.com> Date: Thu, 21 Oct 2021 10:20:41 +0800 Subject: [PATCH 232/298] refine comments for GradScaler state_dict (#36522) --- python/paddle/amp/grad_scaler.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/python/paddle/amp/grad_scaler.py b/python/paddle/amp/grad_scaler.py index 83f57fc74e89ae..ca08ce196a983f 100644 --- a/python/paddle/amp/grad_scaler.py +++ b/python/paddle/amp/grad_scaler.py @@ -579,11 +579,15 @@ def state_dict(self): Reurns: A dict of scaler includes: - init_loss_scaling (float, optional): The initial loss scaling factor. - incr_ratio(float, optional): The multiplier to use when increasing the loss scaling. - decr_ratio(float, optional): The less-than-one-multiplier to use when decreasing the loss scaling. - incr_every_n_steps(int, optional): Increases loss scaling every n consecutive steps with finite gradients. - decr_every_n_nan_or_inf(int, optional): Decreases loss scaling every n accumulated steps with nan or inf gradients. + scale (tensor): The loss scaling factor. + incr_ratio(float): The multiplier to use when increasing the loss scaling. + decr_ratio(float): The less-than-one-multiplier to use when decreasing the loss scaling. + incr_every_n_steps(int): Increases loss scaling every n consecutive steps with finite gradients. + decr_every_n_nan_or_inf(int): Decreases loss scaling every n accumulated steps with nan or inf gradients. + incr_count(int): The number of recent consecutive unskipped steps. + decr_count(int): The number of recent consecutive skipped steps. + use_dynamic_loss_scaling(bool): Whether to use dynamic loss scaling. If False, fixed loss_scaling is used. If True, the loss scaling is updated dynamicly. Default is True. 
+ Examples: From f69857749a34755de641444aab324e483eff79a0 Mon Sep 17 00:00:00 2001 From: YipZLF <22539457+YipZLF@users.noreply.github.com> Date: Thu, 21 Oct 2021 10:41:56 +0800 Subject: [PATCH 233/298] Fixed unit test for auto parallel cost model (#36574) --- .../test_auto_parallel_cost_model.py | 53 +++++++++---------- 1 file changed, 25 insertions(+), 28 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_cost_model.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_cost_model.py index 58d033ad658315..000b1db61381e3 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_cost_model.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_cost_model.py @@ -16,6 +16,7 @@ import unittest +import copy import paddle import paddle.nn as nn import paddle.static as static @@ -141,28 +142,24 @@ def get_dist_prog(train_program, startup_program, dist_context, rank_id): loss, train_program, startup_program = mlp_forward(train_program, startup_program) + dist_strategy = fleet.DistributedStrategy() + # auto completion complete_train_program = auto.complete_annotation(train_program, dist_context) + partitioner = Partitioner(dist_strategy, dist_context, rank_id) + # logical partition + auto_parallel_main_prog, auto_parallel_startup_prog = partitioner.transpile_forward( + complete_train_program, startup_program) + dist_params_grads = partitioner.apply_backward( + loss, complete_train_program, startup_program, auto_parallel_main_prog, + auto_parallel_startup_prog) + optimizer = paddle.fluid.optimizer.AdamOptimizer() + opt_ops = partitioner.apply_optimize(optimizer, dist_params_grads, + auto_parallel_main_prog, + auto_parallel_startup_prog) - dist_strategy = fleet.DistributedStrategy() - dist_main_prog = [] - dist_startup_prog = [] - for rank_id in range(NUM_RANKS): - partitioner = Partitioner(dist_strategy, dist_context, rank_id) - # logical partition - auto_parallel_main_prog, auto_parallel_startup_prog = partitioner.transpile_forward( - complete_train_program, startup_program) - dist_params_grads = partitioner.apply_backward( - loss, complete_train_program, startup_program, - auto_parallel_main_prog, auto_parallel_startup_prog) - optimizer = paddle.fluid.optimizer.AdamOptimizer() - opt_ops = partitioner.apply_optimize(optimizer, dist_params_grads, - auto_parallel_main_prog, - auto_parallel_startup_prog) - dist_main_prog.append(auto_parallel_main_prog) - dist_startup_prog.append(auto_parallel_startup_prog) - return dist_main_prog, dist_startup_prog + return auto_parallel_main_prog, auto_parallel_startup_prog def check_runtime_estimation(cost): @@ -210,20 +207,20 @@ def test_empty_program_cost_model(self): self.assertTrue(check_empty_program_memory(cost)) def test_auto_parallel_cost_model(self): - train_program = paddle.static.Program() - startup_program = paddle.static.Program() - dist_context = DistributedContext() standalone_cost_data = get_single_node_data() - distributed_program, dist_startup_prog = get_dist_prog( - train_program, startup_program, dist_context, 0) + dist_program = [] for rank_id in range(NUM_RANKS): - complete_backward_annotation(distributed_program[rank_id], - dist_context) - reshard(distributed_program[rank_id], dist_startup_prog[rank_id], - rank_id, dist_context) + train_program = paddle.static.Program() + startup_program = paddle.static.Program() + dist_context = DistributedContext() + distributed_program, dist_startup_prog = get_dist_prog( + train_program, startup_program, dist_context, rank_id) + 
reshard(distributed_program, dist_startup_prog, rank_id, + dist_context) + dist_program.append(distributed_program) cluster = None cost = estimate_cost( - distributed_program, + dist_program, cluster=cluster, pipeline_config=pp_cfg, standalone_cost_data=standalone_cost_data, From 72533986d9c0885720c3793b2e4ed5e02cca39cd Mon Sep 17 00:00:00 2001 From: liutiexing <74819124+liutiexing@users.noreply.github.com> Date: Thu, 21 Oct 2021 11:07:43 +0800 Subject: [PATCH 234/298] Fix flame graph (#36578) * add align for WorkQueue * add spinlock * merge develop * merge * Add EventsWaiter * Revert "Add EventsWaiter" This reverts commit e206173aa9be7401b83a53581627bfaf557c8fb2. * adjust multithread using, fix flame graph * update --- .../framework/new_executor/interpretercore.cc | 35 +++++++++++-------- .../framework/new_executor/interpretercore.h | 3 +- 2 files changed, 23 insertions(+), 15 deletions(-) diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index b26d213ddf7740..7e16c3619d61c4 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -376,7 +376,8 @@ void InterpreterCore::ExecuteInstructionList( vec_instr.size(), op_run_number_.load())); } -void InterpreterCore::RunNextInstruction(const Instruction& instr) { +void InterpreterCore::RunNextInstructions( + const Instruction& instr, std::queue* reserved_next_ops) { auto& next_instr = instr.next_instruction_; auto& atomic_deps = async_work_queue_.AtomicDeps(); auto IsReady = [&](size_t next_id) { @@ -395,12 +396,12 @@ void InterpreterCore::RunNextInstruction(const Instruction& instr) { // keep all async_ops running in current thread for (auto next_id : next_instr.direct_run_) { if (IsReady(next_id)) { - RunInstructionAsync(next_id); + reserved_next_ops->push(next_id); } } for (auto next_id : next_instr.event_wait_run_) { if (IsReady(next_id)) { - RunInstructionAsync(next_id); + reserved_next_ops->push(next_id); } } } else { @@ -428,25 +429,31 @@ void InterpreterCore::RunNextInstruction(const Instruction& instr) { [&, next_id] { RunInstructionAsync(next_id); }); } } - if (first_op != 0) RunInstructionAsync(first_op); + if (first_op != 0) reserved_next_ops->push(first_op); } } void InterpreterCore::RunInstructionAsync(size_t instr_id) { - auto& instr_node = vec_instruction_[instr_id]; - platform::RecordEvent instruction_event( - instr_node.kernel_func_.operator_base_->Type()); - event_manager_.WaitEvent(instr_node, place_); + std::queue ready_ops; + ready_ops.push(instr_id); + while (!ready_ops.empty()) { + instr_id = ready_ops.front(); + ready_ops.pop(); + auto& instr_node = vec_instruction_[instr_id]; + platform::RecordEvent instruction_event( + instr_node.kernel_func_.operator_base_->Type()); + event_manager_.WaitEvent(instr_node, place_); - RunInstruction(instr_node); + RunInstruction(instr_node); - event_manager_.RecordEvent(instr_node, place_); - op_run_number_.fetch_add(1, std::memory_order_relaxed); + event_manager_.RecordEvent(instr_node, place_); + op_run_number_.fetch_add(1, std::memory_order_relaxed); - // GC infomation - CheckGC(instr_id, instr_node.gc_check_var_list); + // GC infomation + CheckGC(instr_id, instr_node.gc_check_var_list); - RunNextInstruction(instr_node); + RunNextInstructions(instr_node, &ready_ops); + } } void InterpreterCore::CheckGC(size_t instr_id, diff --git a/paddle/fluid/framework/new_executor/interpretercore.h 
b/paddle/fluid/framework/new_executor/interpretercore.h index 47f23aff4f00e7..d6c916b9ddc4c8 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.h +++ b/paddle/fluid/framework/new_executor/interpretercore.h @@ -68,7 +68,8 @@ class InterpreterCore { void CheckGC(size_t instr_id, const std::vector& gc_check_list); void RunInstructionAsync(size_t instr_id); - void RunNextInstruction(const Instruction& instr_id); + void RunNextInstructions(const Instruction& instr_id, + std::queue* reserved_next_ops); void AddFetch(const std::vector& fetch_names); void BuildSkipShareLoDInfo(); From d64f7b3bda82cba9b8cd77573fda6a0be1a83887 Mon Sep 17 00:00:00 2001 From: zhaocaibei123 <48509226+zhaocaibei123@users.noreply.github.com> Date: Thu, 21 Oct 2021 11:18:25 +0800 Subject: [PATCH 235/298] add ctr table depends (#36465) * add ctr table depends * code style * fix * fix * fix naming * rename * rename --- .../fluid/distributed/common/local_random.h | 65 +++++ paddle/fluid/distributed/ps.proto | 68 +++++ paddle/fluid/distributed/table/CMakeLists.txt | 6 +- .../distributed/table/depends/feature_value.h | 167 ++++++++++++ .../distributed/table/depends/sparse_utils.h | 5 +- .../distributed/table/sparse_sgd_rule.cc | 243 ++++++++++++++++++ .../fluid/distributed/table/sparse_sgd_rule.h | 134 ++++++++++ paddle/fluid/distributed/test/CMakeLists.txt | 6 + .../distributed/test/feature_value_test.cc | 55 ++++ .../distributed/test/sparse_sgd_rule_test.cc | 191 ++++++++++++++ 10 files changed, 937 insertions(+), 3 deletions(-) create mode 100644 paddle/fluid/distributed/common/local_random.h create mode 100644 paddle/fluid/distributed/table/depends/feature_value.h create mode 100644 paddle/fluid/distributed/table/sparse_sgd_rule.cc create mode 100644 paddle/fluid/distributed/table/sparse_sgd_rule.h create mode 100644 paddle/fluid/distributed/test/feature_value_test.cc create mode 100644 paddle/fluid/distributed/test/sparse_sgd_rule_test.cc diff --git a/paddle/fluid/distributed/common/local_random.h b/paddle/fluid/distributed/common/local_random.h new file mode 100644 index 00000000000000..96b8d2d21a5605 --- /dev/null +++ b/paddle/fluid/distributed/common/local_random.h @@ -0,0 +1,65 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include + +namespace paddle { +namespace distributed { + +// Get time in seconds. 
+inline double current_realtime() { + struct timespec tp; + clock_gettime(CLOCK_REALTIME, &tp); + return tp.tv_sec + tp.tv_nsec * 1e-9; +} + +inline std::default_random_engine& local_random_engine() { + struct engine_wrapper_t { + std::default_random_engine engine; + engine_wrapper_t() { + static std::atomic x(0); // NOLINT + std::seed_seq sseq = { + x++, x++, x++, (unsigned long)(current_realtime() * 1000)}; // NOLINT + engine.seed(sseq); + } + }; + thread_local engine_wrapper_t r; + return r.engine; +} + +template +std::uniform_real_distribution& local_uniform_real_distribution() { + thread_local std::uniform_real_distribution distr; + assert(distr.a() == 0.0 && distr.b() == 1.0); + return distr; +} + +template +T uniform_real() { + return local_uniform_real_distribution()(local_random_engine()); +} + +template +T uniform_real(T a, T b) { + if (a == b) { + return a; + } + return (T)(a + uniform_real() * (b - a)); +} +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/ps.proto b/paddle/fluid/distributed/ps.proto index 862ae4a504d9b4..002be15b003eb3 100644 --- a/paddle/fluid/distributed/ps.proto +++ b/paddle/fluid/distributed/ps.proto @@ -119,10 +119,41 @@ message TableParameter { message TableAccessorParameter { optional string accessor_class = 1; + // optional SparseSGDRuleParameter sparse_sgd_param = 2; optional uint32 fea_dim = 4 [ default = 11 ]; optional uint32 embedx_dim = 5 [ default = 8 ]; optional uint32 embedx_threshold = 6 [ default = 10 ]; + optional CtrAccessorParameter ctr_accessor_param = 7; repeated TableAccessorSaveParameter table_accessor_save_param = 8; + // optional SparseCommonSGDRuleParameter sparse_commonsgd_param = 9; + optional SparseCommonSGDRuleParameter embed_sgd_param = 10; + optional SparseCommonSGDRuleParameter embedx_sgd_param = 11; +} + +message CtrAccessorParameter { + optional float nonclk_coeff = 1 + [ default = 0.1 ]; // to calculate show_click_score + optional float click_coeff = 2 + [ default = 1 ]; // to calculate show_click_score + optional float base_threshold = 3 [ + default = 1.5 + ]; // show_click_score > base_threshold, this feature can be saved + optional float delta_threshold = 4 + [ default = + 0.25 ]; // delta_score > delta_threshold, this feature can be saved + optional float delta_keep_days = 5 + [ default = + 16 ]; // unseen_day < delta_keep_days, this feature can be saved + optional float show_click_decay_rate = 6 [ + default = 0.98 + ]; // show/click will update to show/click * show_click_decay_rate after a day + optional float delete_threshold = 7 + [ default = 0.8 ]; // threshold to shrink a feasign + optional float delete_after_unseen_days = 8 + [ default = 30 ]; // unseen_day > delete_after_unseen_days, this feature + // will be delete in shrink_model + optional int32 ssd_unseenday_threshold = 9 + [ default = 1 ]; // threshold to save ssd } message TensorAccessorParameter { @@ -150,3 +181,40 @@ message TableAccessorSaveParameter { optional string converter = 2; optional string deconverter = 3; } + +// message SparseSGDRuleParameter { +// optional double learning_rate = 1 [default = 0.05]; +// optional double initial_g2sum = 2 [default = 3.0]; +// optional double initial_range = 3 [default = 0.0001]; +// repeated float weight_bounds = 4; +//} + +message SparseCommonSGDRuleParameter { + optional string name = 1; + optional SparseNaiveSGDRuleParameter naive = 2; + optional SparseAdagradSGDRuleParameter adagrad = 3; + optional SparseAdamSGDParameter adam = 4; +} + +message 
SparseNaiveSGDRuleParameter { // SparseNaiveSGDRule + optional double learning_rate = 1 [ default = 0.05 ]; + optional double initial_range = 2 [ default = 0.0001 ]; + repeated float weight_bounds = 3; +} + +message + SparseAdagradSGDRuleParameter { // SparseAdaGradSGDRule|StdAdaGradSGDRule + optional double learning_rate = 1 [ default = 0.05 ]; + optional double initial_g2sum = 2 [ default = 3.0 ]; + optional double initial_range = 3 [ default = 0.0001 ]; + repeated float weight_bounds = 4; +} + +message SparseAdamSGDParameter { // SparseAdamSGDRule + optional double learning_rate = 1 [ default = 0.001 ]; + optional double initial_range = 2 [ default = 0.0001 ]; + optional double beta1_decay_rate = 3 [ default = 0.9 ]; + optional double beta2_decay_rate = 4 [ default = 0.999 ]; + optional double ada_epsilon = 5 [ default = 1e-08 ]; + repeated float weight_bounds = 6; +} diff --git a/paddle/fluid/distributed/table/CMakeLists.txt b/paddle/fluid/distributed/table/CMakeLists.txt index c928ebe90ceb9e..b4b87e652b7dbc 100644 --- a/paddle/fluid/distributed/table/CMakeLists.txt +++ b/paddle/fluid/distributed/table/CMakeLists.txt @@ -35,4 +35,8 @@ cc_library(tensor_accessor SRCS tensor_accessor.cc DEPS ${TABLE_DEPS} eigen3 ps_ cc_library(tensor_table SRCS tensor_table.cc DEPS eigen3 ps_framework_proto executor scope device_context tensor ${TABLE_DEPS}) set_source_files_properties(table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_library(table SRCS table.cc DEPS common_table tensor_accessor tensor_table ps_framework_proto string_helper device_context gflags glog boost) +set_source_files_properties(sparse_sgd_rule.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +cc_library(sparse_sgd_rule SRCS sparse_sgd_rule.cc DEPS ${TABLE_DEPS} ps_framework_proto) + + +cc_library(table SRCS table.cc DEPS common_table tensor_accessor tensor_table ps_framework_proto string_helper device_context gflags glog boost sparse_sgd_rule) diff --git a/paddle/fluid/distributed/table/depends/feature_value.h b/paddle/fluid/distributed/table/depends/feature_value.h new file mode 100644 index 00000000000000..ad037a86bce80c --- /dev/null +++ b/paddle/fluid/distributed/table/depends/feature_value.h @@ -0,0 +1,167 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include // NOLINT +#include +#include +#include // NOLINT +#include +#include +#include +#include +#include "gflags/gflags.h" + +#include "butil/object_pool.h" +#include "paddle/fluid/distributed/common/utils.h" +#include "paddle/fluid/distributed/table/depends/initializers.h" +#include "paddle/fluid/distributed/thirdparty/round_robin.h" +#include "paddle/fluid/framework/generator.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/rw_lock.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/threadpool.h" +#include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/platform/port.h" +#include "paddle/fluid/string/printf.h" +#include "paddle/fluid/string/string_helper.h" + +namespace paddle { +namespace distributed { + +static const int CTR_SPARSE_SHARD_BUCKET_NUM_BITS = 6; +static const size_t CTR_SPARSE_SHARD_BUCKET_NUM = + static_cast(1) << CTR_SPARSE_SHARD_BUCKET_NUM_BITS; + +class FixedFeatureValue { + public: + FixedFeatureValue() {} + ~FixedFeatureValue() {} + float *data() { return data_.data(); } + size_t size() { return data_.size(); } + void resize(size_t size) { data_.resize(size); } + void shrink_to_fit() { data_.shrink_to_fit(); } + + private: + std::vector data_; +}; + +class SparseTableShard { + public: + typedef typename robin_hood::unordered_map + map_type; + SparseTableShard() {} + ~SparseTableShard() {} + + FixedFeatureValue *Init(const uint64_t &id) { + size_t hash = hasher_(id); + size_t bucket = compute_bucket(hash); + auto &table = values_[bucket]; + + FixedFeatureValue *value = nullptr; + value = butil::get_object(); + table[id] = value; + return value; + } + + // dont judge if (has(id)) + float *Get(const uint64_t &id) { + size_t hash = hasher_(id); + size_t bucket = compute_bucket(hash); + auto &table = values_[bucket]; + + // auto &value = table.at(id); + // return value->data_.data(); + auto res = table.find(id); + FixedFeatureValue *value = res->second; + return value->data(); + } + + // for load, to reset count, unseen_days + FixedFeatureValue *GetValue(const uint64_t &id) { + size_t hash = hasher_(id); + size_t bucket = compute_bucket(hash); + + auto &table = values_[bucket]; + auto res = table.find(id); + return res->second; + } + + void erase(uint64_t feasign) { + size_t hash = hasher_(feasign); + size_t bucket = compute_bucket(hash); + auto &table = values_[bucket]; + + auto iter = table.find(feasign); + if (iter != table.end()) { + butil::return_object(iter->second); + iter = table.erase(iter); + } + } + + void clear() {} + + size_t compute_bucket(size_t hash) { + if (CTR_SPARSE_SHARD_BUCKET_NUM == 1) { + return 0; + } else { + return hash >> (sizeof(size_t) * 8 - CTR_SPARSE_SHARD_BUCKET_NUM_BITS); + } + } + + map_type::iterator end() { + return values_[CTR_SPARSE_SHARD_BUCKET_NUM - 1].end(); + } + + map_type::iterator Find(uint64_t id) { + size_t hash = hasher_(id); + size_t bucket = compute_bucket(hash); + auto &table = values_[bucket]; + + auto got = table.find(id); + if (got == table.end()) { + return end(); + } else { + return got; + } + } + + private: + bool Has(const uint64_t id) { + size_t hash = hasher_(id); + size_t bucket = compute_bucket(hash); + auto &table = values_[bucket]; + + auto got = table.find(id); + if (got == table.end()) { + return false; + 
} else { + return true; + } + } + + public: + map_type values_[CTR_SPARSE_SHARD_BUCKET_NUM]; + std::hash hasher_; +}; + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/table/depends/sparse_utils.h b/paddle/fluid/distributed/table/depends/sparse_utils.h index c185dd17d792e4..708f7786bf3b09 100644 --- a/paddle/fluid/distributed/table/depends/sparse_utils.h +++ b/paddle/fluid/distributed/table/depends/sparse_utils.h @@ -31,8 +31,9 @@ struct PullSparseValue { feasigns_(nullptr), frequencies_(nullptr) {} - explicit PullSparseValue(std::vector feasigns, - std::vector frequencies, int dim) { + explicit PullSparseValue(std::vector& feasigns, // NOLINT + std::vector& frequencies, // NOLINT + int dim) { numel_ = feasigns.size(); dim_ = dim; is_training_ = true; diff --git a/paddle/fluid/distributed/table/sparse_sgd_rule.cc b/paddle/fluid/distributed/table/sparse_sgd_rule.cc new file mode 100644 index 00000000000000..614656a5a85d30 --- /dev/null +++ b/paddle/fluid/distributed/table/sparse_sgd_rule.cc @@ -0,0 +1,243 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/distributed/table/sparse_sgd_rule.h" +#include +#include "glog/logging.h" + +DEFINE_bool(enable_show_scale_gradient, true, "enable show scale gradient"); + +namespace paddle { +namespace distributed { + +void SparseNaiveSGDRule::load_config(const SparseCommonSGDRuleParameter& param, + size_t emb_dim) { + _embedding_dim = emb_dim; + auto naive_param = param.naive(); + learning_rate_ = naive_param.learning_rate(); + _initial_range = naive_param.initial_range(); + if (naive_param.weight_bounds_size() == 0) { + _min_bound = -std::numeric_limits::max(); + _max_bound = std::numeric_limits::max(); + } else { + CHECK(naive_param.weight_bounds_size() >= 2) + << "invalid repeated size for weight_bounds:" + << naive_param.weight_bounds_size(); + _min_bound = naive_param.weight_bounds(0); + _max_bound = naive_param.weight_bounds(1); + } +} + +void SparseNaiveSGDRule::update_value_work(float* w, float* sgd, + const float* push_value, + float scale) { + for (size_t i = 0; i < _embedding_dim; ++i) { + w[i] -= learning_rate_ * push_value[i]; + bound_value(w[i]); + } +} + +void SparseNaiveSGDRule::init_value_work(float* value, float* sgd, + bool zero_init) { + if (zero_init) { + for (size_t i = 0; i < _embedding_dim; ++i) { + value[i] = 0; + } + } else { + for (size_t i = 0; i < _embedding_dim; ++i) { + value[i] = + (local_uniform_real_distribution()(local_random_engine()) * 2 - + 1) * + _initial_range; + bound_value(value[i]); + } + } +} +void SparseAdaGradSGDRule::load_config( + const SparseCommonSGDRuleParameter& param, size_t emb_dim) { + _embedding_dim = emb_dim; + auto adagrad_param = param.adagrad(); + learning_rate_ = adagrad_param.learning_rate(); + _initial_g2sum = adagrad_param.initial_g2sum(); + _initial_range = adagrad_param.initial_range(); + + if (adagrad_param.weight_bounds_size() == 0) { + _min_bound 
= -std::numeric_limits::max(); + _max_bound = std::numeric_limits::max(); + } else { + CHECK(adagrad_param.weight_bounds_size() >= 2) + << "invalid repeated size for weight_bounds:" + << adagrad_param.weight_bounds_size(); + _min_bound = adagrad_param.weight_bounds(0); + _max_bound = adagrad_param.weight_bounds(1); + } +} + +void SparseAdaGradSGDRule::update_value_work(float* w, float* sgd, + const float* grad, float scale) { + float& g2sum = sgd[g2sum_index()]; + double add_g2sum = 0; + + for (int i = 0; i < _embedding_dim; i++) { + double scaled_grad = grad[i] / scale; + w[i] -= learning_rate_ * scaled_grad * + sqrt(_initial_g2sum / (_initial_g2sum + g2sum)); + bound_value(w[i]); + add_g2sum += scaled_grad * scaled_grad; + } + + g2sum += add_g2sum / _embedding_dim; +} + +void SparseAdaGradSGDRule::init_value_work(float* value, float* sgd, + bool zero_init) { + for (int i = 0; i < _embedding_dim; ++i) { + if (zero_init) { + value[i] = 0.0; + bound_value(value[i]); + } else { + value[i] = + (local_uniform_real_distribution()(local_random_engine()) * + 2 - + 1) * + _initial_range; + bound_value(value[i]); + } + } + sgd[g2sum_index()] = 0; +} + +void StdAdaGradSGDRule::load_config(const SparseCommonSGDRuleParameter& param, + size_t emb_dim) { + _embedding_dim = emb_dim; + auto adagrad_param = param.adagrad(); + learning_rate_ = adagrad_param.learning_rate(); + _initial_g2sum = adagrad_param.initial_g2sum(); + _initial_range = adagrad_param.initial_range(); + + if (adagrad_param.weight_bounds_size() == 0) { + _min_bound = -std::numeric_limits::max(); + _max_bound = std::numeric_limits::max(); + } else { + CHECK(adagrad_param.weight_bounds_size() >= 2) + << "invalid repeated size for weight_bounds:" + << adagrad_param.weight_bounds_size(); + _min_bound = adagrad_param.weight_bounds(0); + _max_bound = adagrad_param.weight_bounds(1); + } +} + +void StdAdaGradSGDRule::update_value_work(float* w, float* sgd, + const float* grad, float scale) { + for (int i = 0; i < _embedding_dim; i++) { + float& g2sum = sgd[g2sum_index() + i]; + double scaled_grad = grad[i] / scale; + w[i] -= learning_rate_ * scaled_grad * + sqrt(_initial_g2sum / (_initial_g2sum + g2sum)); + bound_value(w[i]); + g2sum += scaled_grad * scaled_grad; + } +} + +void StdAdaGradSGDRule::init_value_work(float* value, float* sgd, + bool zero_init) { + for (int i = 0; i < _embedding_dim; ++i) { + if (zero_init) { + value[i] = 0.0; + bound_value(value[i]); + } else { + value[i] = + (local_uniform_real_distribution()(local_random_engine()) * + 2 - + 1) * + _initial_range; + bound_value(value[i]); + } + sgd[g2sum_index() + i] = 0; + } +} + +void SparseAdamSGDRule::load_config(const SparseCommonSGDRuleParameter& param, + size_t emb_dim) { + _embedding_dim = emb_dim; + auto adam_param = param.adam(); + learning_rate_ = adam_param.learning_rate(); + _initial_range = adam_param.initial_range(); + _beta1_decay_rate = adam_param.beta1_decay_rate(); + _beta2_decay_rate = adam_param.beta2_decay_rate(); + _ada_epsilon = adam_param.ada_epsilon(); + if (adam_param.weight_bounds_size() == 0) { + _min_bound = -std::numeric_limits::max(); + _max_bound = std::numeric_limits::max(); + } else { + CHECK(adam_param.weight_bounds_size() >= 2) + << "invalid repeated size for weight_bounds:" + << adam_param.weight_bounds_size(); + _min_bound = adam_param.weight_bounds(0); + _max_bound = adam_param.weight_bounds(1); + } +} + +void SparseAdamSGDRule::update_value_work(float* w, float* sgd, + const float* grad, float scale) { + float* gsum = sgd + gsum_index(); + 
float* g2sum = sgd + g2sum_index(); + float* beta1_pow = sgd + beta1_pow_index(); + float* beta2_pow = sgd + beta2_pow_index(); + const float* g = grad; + + float lr = learning_rate_; + float beta1_pow_ = *beta1_pow; + float beta2_pow_ = *beta2_pow; + + // lr not change in one update + lr *= sqrt(1 - beta2_pow_) / (1 - beta1_pow_); + for (int i = 0; i < _embedding_dim; i++) { + // Calculation + gsum[i] = _beta1_decay_rate * gsum[i] + (1 - _beta1_decay_rate) * g[i]; + g2sum[i] = + _beta2_decay_rate * g2sum[i] + (1 - _beta2_decay_rate) * g[i] * g[i]; + w[i] = w[i] - lr * (gsum[i] / (sqrt(g2sum[i]) + _ada_epsilon)); + bound_value(w[i]); + } + // update beta_pow_decay + (*beta1_pow) *= _beta1_decay_rate; + (*beta2_pow) *= _beta2_decay_rate; +} + +void SparseAdamSGDRule::init_value_work(float* value, float* sgd, + bool zero_init) { + for (int i = 0; i < _embedding_dim; ++i) { + if (zero_init) { + value[i] = 0.0; + bound_value(value[i]); + } else { + value[i] = + (local_uniform_real_distribution()(local_random_engine()) * + 2 - + 1) * + _initial_range; + bound_value(value[i]); + } + } + // init rule gsum and g2sum + for (int i = gsum_index(); i < beta1_pow_index(); i++) { + sgd[i] = 0.0; + } + // init beta1_pow and beta2_pow + *(sgd + beta1_pow_index()) = _beta1_decay_rate; + *(sgd + beta2_pow_index()) = _beta2_decay_rate; +} +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/table/sparse_sgd_rule.h b/paddle/fluid/distributed/table/sparse_sgd_rule.h new file mode 100644 index 00000000000000..ba2baa42f742ab --- /dev/null +++ b/paddle/fluid/distributed/table/sparse_sgd_rule.h @@ -0,0 +1,134 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include +#include +#include +#include "glog/logging.h" // for CHECK +#include "paddle/fluid/distributed/common/local_random.h" // for local_uniform_real_distribution +#include "paddle/fluid/distributed/common/registerer.h" +#include "paddle/fluid/distributed/ps.pb.h" + +namespace paddle { +namespace distributed { + +class SparseValueSGDRule { + public: + SparseValueSGDRule() {} + virtual ~SparseValueSGDRule() {} + virtual void load_config(const SparseCommonSGDRuleParameter& param, + size_t emb_dim) { + _embedding_dim = emb_dim; + _name = param.name(); + } + virtual void update_value_work(float* w, float* sgd, const float* push_value, + float scale) = 0; + virtual void init_value_work(float* value, float* sgd, bool zero_init) = 0; + virtual size_t dim() = 0; + const std::string& get_name() const { return _name; } + void init_value(float* value, float* sgd, bool zero_init = true) { + init_value_work(value, sgd, zero_init); + } + void update_value(float* w, float* sgd, const float* push_value, + float scale = 1) { + update_value_work(w, sgd, push_value, scale); + } + template + void bound_value(T& w) { // NOLINT + if (!(w >= _min_bound)) { + w = (T)_min_bound; + } else if (!(w <= _max_bound)) { + w = (T)_max_bound; + } + } + float& min_bound() { return _min_bound; } + float& max_bound() { return _max_bound; } + + protected: + float _min_bound; + float _max_bound; + float _initial_range; + size_t _embedding_dim; + + private: + std::string _name; +}; + +REGISTER_PSCORE_REGISTERER(SparseValueSGDRule); + +class SparseNaiveSGDRule : public SparseValueSGDRule { + public: + virtual void load_config(const SparseCommonSGDRuleParameter& param, + size_t emb_dim); + virtual void update_value_work(float* w, float* sgd, const float* push_value, + float scale); + virtual void init_value_work(float* value, float* sgd, bool zero_init); + virtual size_t dim() { return 0; } + + private: + float learning_rate_; +}; + +class SparseAdaGradSGDRule : public SparseValueSGDRule { + public: + virtual void load_config(const SparseCommonSGDRuleParameter& param, + size_t emb_dim); + virtual void update_value_work(float* w, float* sgd, const float* push_value, + float scale); + virtual void init_value_work(float* value, float* sgd, bool zero_init); + virtual size_t dim() { return 1; } + size_t g2sum_index() { return 0; } + + private: + float learning_rate_; + float _initial_g2sum; +}; + +class StdAdaGradSGDRule : public SparseValueSGDRule { + public: + virtual void load_config(const SparseCommonSGDRuleParameter& param, + size_t emb_dim); + virtual void update_value_work(float* w, float* sgd, const float* push_value, + float scale); + virtual void init_value_work(float* value, float* sgd, bool zero_init); + virtual size_t dim() { return _embedding_dim; } + size_t g2sum_index() { return 0; } + + private: + float learning_rate_; + float _initial_g2sum; +}; + +class SparseAdamSGDRule : public SparseValueSGDRule { + public: + virtual void load_config(const SparseCommonSGDRuleParameter& param, + size_t emb_dim); + virtual void update_value_work(float* w, float* sgd, const float* push_value, + float scale); + virtual void init_value_work(float* value, float* sgd, bool zero_init); + virtual size_t dim() { return _embedding_dim * 2 + 2; } + size_t gsum_index() { return 0; } + size_t g2sum_index() { return gsum_index() + _embedding_dim; } + size_t beta1_pow_index() { return g2sum_index() + _embedding_dim; } + size_t beta2_pow_index() { return beta1_pow_index() + 1; } + + protected: + float learning_rate_; + float 
_beta1_decay_rate; + float _beta2_decay_rate; + float _ada_epsilon; +}; +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/test/CMakeLists.txt b/paddle/fluid/distributed/test/CMakeLists.txt index af87e1b6cc61d1..832797ec2fc0ee 100644 --- a/paddle/fluid/distributed/test/CMakeLists.txt +++ b/paddle/fluid/distributed/test/CMakeLists.txt @@ -20,3 +20,9 @@ cc_test(brpc_utils_test SRCS brpc_utils_test.cc DEPS brpc_utils scope math_funct set_source_files_properties(graph_node_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_test(graph_node_test SRCS graph_node_test.cc DEPS graph_py_service scope server client communicator ps_service boost table ps_framework_proto ${COMMON_DEPS}) + +set_source_files_properties(feature_value_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +cc_test(feature_value_test SRCS feature_value_test.cc DEPS ${COMMON_DEPS} boost table) + +set_source_files_properties(sparse_sgd_rule_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +cc_test(sparse_sgd_rule_test SRCS sparse_sgd_rule_test.cc DEPS ${COMMON_DEPS} boost table) diff --git a/paddle/fluid/distributed/test/feature_value_test.cc b/paddle/fluid/distributed/test/feature_value_test.cc new file mode 100644 index 00000000000000..9c9f0ffcac321d --- /dev/null +++ b/paddle/fluid/distributed/test/feature_value_test.cc @@ -0,0 +1,55 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +#include +#include +#include // NOLINT +#include + +#include "google/protobuf/text_format.h" +#include "gtest/gtest.h" +#include "paddle/fluid/distributed/table/depends/feature_value.h" + +namespace paddle { +namespace distributed { + +TEST(BENCHMARK, LargeScaleKV) { + std::shared_ptr shard = + std::make_shared(); + uint64_t key = 1; + auto itr = shard->Find(key); + ASSERT_TRUE(itr == shard->end()); + + std::vector vec = {0.0, 0.1, 0.2, 0.3}; + + auto* feature_value = shard->Init(key); + feature_value->resize(vec.size()); + memcpy(feature_value->data(), vec.data(), vec.size() * sizeof(float)); + + itr = shard->Find(key); + ASSERT_TRUE(itr != shard->end()); + + feature_value = itr->second; + float* value_data = feature_value->data(); + + ASSERT_FLOAT_EQ(value_data[0], 0.0); + ASSERT_FLOAT_EQ(value_data[1], 0.1); + ASSERT_FLOAT_EQ(value_data[2], 0.2); + ASSERT_FLOAT_EQ(value_data[3], 0.3); +} + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/test/sparse_sgd_rule_test.cc b/paddle/fluid/distributed/test/sparse_sgd_rule_test.cc new file mode 100644 index 00000000000000..e86234f1bd9c76 --- /dev/null +++ b/paddle/fluid/distributed/test/sparse_sgd_rule_test.cc @@ -0,0 +1,191 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/distributed/table/sparse_sgd_rule.h" +#include +#include +#include "gtest/gtest.h" +#include "paddle/fluid/distributed/ps.pb.h" + +namespace paddle { +namespace distributed { + +TEST(sparse_value_naive_sgd_test, init_and_update) { + SparseNaiveSGDRule rule; + SparseCommonSGDRuleParameter param; + param.set_name("naive"); + auto* naive_param = param.mutable_naive(); + naive_param->set_learning_rate(0.1); + naive_param->set_initial_range(0.3); + naive_param->add_weight_bounds(-10.0); + naive_param->add_weight_bounds(10.0); + + rule.load_config(param, 10); + + // check init_value for zero + const int kItemSize = 10; + float w[kItemSize]; + float grad[kItemSize]; + rule.init_value(w, w + 9, true); + + for (auto i = 0u; i < kItemSize; ++i) { + ASSERT_FLOAT_EQ(w[i], 0); + } + + // check init_value for random + rule.init_value(w, w + 9, false); + for (auto i = 0u; i < kItemSize; ++i) { + ASSERT_TRUE(w[i] >= rule.min_bound() && w[i] <= rule.max_bound()); + } + + // check update_value for one field + for (auto i = 0u; i < kItemSize; ++i) { + w[i] = 0; + } + for (auto i = 0u; i < kItemSize; ++i) { + grad[i] = (i + 1) * 1.0; + } + float label[] = {-0.100000, -0.200000, -0.300000, -0.400000, -0.500000, + -0.600000, -0.700000, -0.800000, -0.900000, -1.000000}; + const float* ptr_grad = grad; + rule.update_value(w, w + 9, ptr_grad); + + for (auto i = 0u; i < kItemSize; ++i) { + VLOG(3) << w[i] << "\n"; + ASSERT_FLOAT_EQ(w[i], label[i]); + } +} + +TEST(downpour_sparse_adagrad_test, test_init_and_update) { + SparseAdaGradSGDRule rule; + SparseCommonSGDRuleParameter param; + param.set_name("adagrad"); + auto* adagrad_param = param.mutable_adagrad(); + adagrad_param->set_learning_rate(0.1); + adagrad_param->set_initial_g2sum(0.2); + adagrad_param->set_initial_range(0.3); + adagrad_param->add_weight_bounds(-10.0); + adagrad_param->add_weight_bounds(10.0); + + rule.load_config(param, 10); + + // check init_value for zero + const int kValueSize = 11; + int kEmbSize = 10; + float w[kValueSize]; + + rule.init_value(w, w + 10, true); + + for (auto i = 0u; i < kEmbSize; ++i) { + ASSERT_FLOAT_EQ(w[i], 0); + } + ASSERT_FLOAT_EQ(w[kEmbSize], 0); + + // check init_value for random + rule.init_value(w, w + 10, false); + for (auto i = 0u; i < kEmbSize; ++i) { + ASSERT_TRUE(w[i] >= rule.min_bound() && w[i] <= rule.max_bound()); + } + ASSERT_FLOAT_EQ(w[kEmbSize], 0); + + // check update_value for one field + for (auto i = 0u; i < kEmbSize; ++i) { + w[i] = 0; + } + w[kEmbSize] = 0; + float grad[kEmbSize]; + for (auto i = 0u; i < kEmbSize; ++i) { + grad[i] = (i + 1) * 1.0; + } + + const float* ptr_grad = grad; + rule.update_value(w, w + 10, ptr_grad); + float label[] = {-0.100000, -0.200000, -0.300000, -0.400000, + -0.500000, -0.600000, -0.700000, -0.800000, + -0.900000, -1.000000, 38.500000}; + for (auto i = 0u; i < kValueSize; ++i) { + ASSERT_FLOAT_EQ(w[i], label[i]); + } +} + +TEST(downpour_sparse_adam_test, test_init_and_update) { + const int embed_dim = 10; // dims of parameters + SparseCommonSGDRuleParameter param; + param.set_name("adam"); + auto* adam_param = param.mutable_adam(); + 
adam_param->set_learning_rate(0.1); + adam_param->set_initial_range(0.3); + adam_param->set_beta1_decay_rate(0.9); + adam_param->set_beta2_decay_rate(0.999); + adam_param->set_ada_epsilon(1e-08); + adam_param->add_weight_bounds(-10.0); + adam_param->add_weight_bounds(10.0); + + ASSERT_FLOAT_EQ(param.adam().learning_rate(), 0.1); + ASSERT_FLOAT_EQ(param.adam().initial_range(), 0.3); + ASSERT_FLOAT_EQ(param.adam().beta1_decay_rate(), 0.9); + ASSERT_FLOAT_EQ(param.adam().beta2_decay_rate(), 0.999); + ASSERT_FLOAT_EQ(param.adam().ada_epsilon(), 1e-08); + + SparseAdamSGDRule rule; + + rule.load_config(param, embed_dim); + + // check init_value for zero + const int rule_dim = + rule.dim(); // dims of gsum + g2sum + beta1_pow + beta2_pow in adam + const int value_dim = embed_dim + rule_dim; // total dims of w + rule + float* value = new float[value_dim]; + rule.init_value(value, value + embed_dim, true); + for (auto i = 0u; i < rule.beta1_pow_index(); ++i) { + ASSERT_FLOAT_EQ(value[i], 0); + } + ASSERT_FLOAT_EQ(*(value + embed_dim + rule.beta1_pow_index()), 0.9); + ASSERT_FLOAT_EQ(*(value + embed_dim + rule.beta2_pow_index()), 0.999); + + // check init_value for random + rule.init_value(value, value + embed_dim, false); + for (auto i = 0u; i < embed_dim; ++i) { + ASSERT_TRUE(value[i] >= rule.min_bound() && value[i] <= rule.max_bound()); + } + for (auto i = rule.gsum_index(); i < rule.beta1_pow_index(); ++i) { + ASSERT_FLOAT_EQ(value[i + embed_dim], 0); + } + ASSERT_FLOAT_EQ(*(value + embed_dim + rule.beta1_pow_index()), 0.9); + ASSERT_FLOAT_EQ(*(value + embed_dim + rule.beta2_pow_index()), 0.999); + + // check update_value + rule.init_value(value, value + embed_dim, true); + float* grad = new float[embed_dim]; + for (auto i = 0u; i < embed_dim; ++i) { + grad[i] = (i + 1) * 1.0; + } + + float label[] = {-0.0999999642, -0.099999994, -0.099999994, -0.099999994, + -0.099999994, -0.099999994, -0.099999994, -0.100000001, + -0.100000009, -0.100000001, 0.100000024, 0.200000048, + 0.300000072, 0.400000095, 0.500000119, 0.600000143, + 0.700000167, 0.800000191, 0.900000215, 1.00000024, + 0.000999987125, 0.0039999485, 0.00899988413, 0.015999794, + 0.0249996781, 0.0359995365, 0.0489993691, 0.063999176, + 0.0809989572, 0.0999987125, 0.809999943, 0.998001039}; + + rule.update_value(value, value + embed_dim, grad); + + for (auto i = 0u; i < value_dim; ++i) { // check update + ASSERT_FLOAT_EQ(value[i], label[i]) << "i is " << i; + } +} +} // namespace distributed +} // namespace paddle From 5eb640c6c3d9baa66e7a960f0d213420e2b792d4 Mon Sep 17 00:00:00 2001 From: seemingwang Date: Thu, 21 Oct 2021 11:19:01 +0800 Subject: [PATCH 236/298] Graph engine4 (#36587) --- .../distributed/service/graph_brpc_client.cc | 58 ++++- .../distributed/service/graph_brpc_client.h | 3 +- .../distributed/service/graph_brpc_server.cc | 204 +++++++++++++++++- .../distributed/service/graph_brpc_server.h | 9 + .../distributed/service/graph_py_service.cc | 1 + .../fluid/distributed/service/sendrecv.proto | 1 + paddle/fluid/distributed/service/server.h | 3 +- .../distributed/table/common_graph_table.cc | 18 +- .../distributed/table/common_graph_table.h | 5 +- .../fluid/distributed/test/graph_node_test.cc | 6 + 10 files changed, 292 insertions(+), 16 deletions(-) diff --git a/paddle/fluid/distributed/service/graph_brpc_client.cc b/paddle/fluid/distributed/service/graph_brpc_client.cc index 68d9c9669b6972..9f65a66708def0 100644 --- a/paddle/fluid/distributed/service/graph_brpc_client.cc +++ 
b/paddle/fluid/distributed/service/graph_brpc_client.cc @@ -304,7 +304,63 @@ std::future GraphBrpcClient::remove_graph_node( // char* &buffer,int &actual_size std::future GraphBrpcClient::batch_sample_neighboors( uint32_t table_id, std::vector node_ids, int sample_size, - std::vector>> &res) { + std::vector>> &res, + int server_index) { + if (server_index != -1) { + res.resize(node_ids.size()); + DownpourBrpcClosure *closure = new DownpourBrpcClosure(1, [&](void *done) { + int ret = 0; + auto *closure = (DownpourBrpcClosure *)done; + if (closure->check_response(0, PS_GRAPH_SAMPLE_NODES_FROM_ONE_SERVER) != + 0) { + ret = -1; + } else { + auto &res_io_buffer = closure->cntl(0)->response_attachment(); + butil::IOBufBytesIterator io_buffer_itr(res_io_buffer); + size_t bytes_size = io_buffer_itr.bytes_left(); + std::unique_ptr buffer_wrapper(new char[bytes_size]); + char *buffer = buffer_wrapper.get(); + io_buffer_itr.copy_and_forward((void *)(buffer), bytes_size); + + size_t node_num = *(size_t *)buffer; + int *actual_sizes = (int *)(buffer + sizeof(size_t)); + char *node_buffer = buffer + sizeof(size_t) + sizeof(int) * node_num; + + int offset = 0; + for (size_t node_idx = 0; node_idx < node_num; ++node_idx) { + int actual_size = actual_sizes[node_idx]; + int start = 0; + while (start < actual_size) { + res[node_idx].push_back( + {*(uint64_t *)(node_buffer + offset + start), + *(float *)(node_buffer + offset + start + + GraphNode::id_size)}); + start += GraphNode::id_size + GraphNode::weight_size; + } + offset += actual_size; + } + } + closure->set_promise_value(ret); + }); + auto promise = std::make_shared>(); + closure->add_promise(promise); + std::future fut = promise->get_future(); + ; + closure->request(0)->set_cmd_id(PS_GRAPH_SAMPLE_NODES_FROM_ONE_SERVER); + closure->request(0)->set_table_id(table_id); + closure->request(0)->set_client_id(_client_id); + closure->request(0)->add_params((char *)node_ids.data(), + sizeof(uint64_t) * node_ids.size()); + closure->request(0)->add_params((char *)&sample_size, sizeof(int)); + ; + // PsService_Stub rpc_stub(get_cmd_channel(server_index)); + GraphPsService_Stub rpc_stub = + getServiceStub(get_cmd_channel(server_index)); + closure->cntl(0)->set_log_id(butil::gettimeofday_ms()); + rpc_stub.service(closure->cntl(0), closure->request(0), + closure->response(0), closure); + return fut; + } std::vector request2server; std::vector server2request(server_size, -1); res.clear(); diff --git a/paddle/fluid/distributed/service/graph_brpc_client.h b/paddle/fluid/distributed/service/graph_brpc_client.h index 8acb2047b8e972..1fbb3fa9b0550e 100644 --- a/paddle/fluid/distributed/service/graph_brpc_client.h +++ b/paddle/fluid/distributed/service/graph_brpc_client.h @@ -64,7 +64,8 @@ class GraphBrpcClient : public BrpcPsClient { // given a batch of nodes, sample graph_neighboors for each of them virtual std::future batch_sample_neighboors( uint32_t table_id, std::vector node_ids, int sample_size, - std::vector>>& res); + std::vector>>& res, + int server_index = -1); virtual std::future pull_graph_list(uint32_t table_id, int server_index, int start, diff --git a/paddle/fluid/distributed/service/graph_brpc_server.cc b/paddle/fluid/distributed/service/graph_brpc_server.cc index 110d4406fc5569..b404082f7c4102 100644 --- a/paddle/fluid/distributed/service/graph_brpc_server.cc +++ b/paddle/fluid/distributed/service/graph_brpc_server.cc @@ -61,6 +61,10 @@ int32_t GraphBrpcServer::initialize() { return 0; } +brpc::Channel *GraphBrpcServer::get_cmd_channel(size_t 
server_index) { + return _pserver_channels[server_index].get(); +} + uint64_t GraphBrpcServer::start(const std::string &ip, uint32_t port) { std::unique_lock lock(mutex_); @@ -80,6 +84,42 @@ uint64_t GraphBrpcServer::start(const std::string &ip, uint32_t port) { return 0; } +int32_t GraphBrpcServer::build_peer2peer_connection(int rank) { + this->rank = rank; + auto _env = environment(); + brpc::ChannelOptions options; + options.protocol = "baidu_std"; + options.timeout_ms = 500000; + options.connection_type = "pooled"; + options.connect_timeout_ms = 10000; + options.max_retry = 3; + + std::vector server_list = _env->get_ps_servers(); + _pserver_channels.resize(server_list.size()); + std::ostringstream os; + std::string server_ip_port; + for (size_t i = 0; i < server_list.size(); ++i) { + server_ip_port.assign(server_list[i].ip.c_str()); + server_ip_port.append(":"); + server_ip_port.append(std::to_string(server_list[i].port)); + _pserver_channels[i].reset(new brpc::Channel()); + if (_pserver_channels[i]->Init(server_ip_port.c_str(), "", &options) != 0) { + VLOG(0) << "GraphServer connect to Server:" << server_ip_port + << " Failed! Try again."; + std::string int_ip_port = + GetIntTypeEndpoint(server_list[i].ip, server_list[i].port); + if (_pserver_channels[i]->Init(int_ip_port.c_str(), "", &options) != 0) { + LOG(ERROR) << "GraphServer connect to Server:" << int_ip_port + << " Failed!"; + return -1; + } + } + os << server_ip_port << ","; + } + LOG(INFO) << "servers peer2peer connection success:" << os.str(); + return 0; +} + int32_t GraphBrpcService::clear_nodes(Table *table, const PsRequestMessage &request, PsResponseMessage &response, @@ -160,6 +200,9 @@ int32_t GraphBrpcService::initialize() { &GraphBrpcService::remove_graph_node; _service_handler_map[PS_GRAPH_SET_NODE_FEAT] = &GraphBrpcService::graph_set_node_feat; + _service_handler_map[PS_GRAPH_SAMPLE_NODES_FROM_ONE_SERVER] = + &GraphBrpcService::sample_neighboors_across_multi_servers; + // shard初始化,server启动后才可从env获取到server_list的shard信息 initialize_shard_info(); @@ -172,10 +215,10 @@ int32_t GraphBrpcService::initialize_shard_info() { if (_is_initialize_shard_info) { return 0; } - size_t shard_num = _server->environment()->get_ps_servers().size(); + server_size = _server->environment()->get_ps_servers().size(); auto &table_map = *(_server->table()); for (auto itr : table_map) { - itr.second->set_shard(_rank, shard_num); + itr.second->set_shard(_rank, server_size); } _is_initialize_shard_info = true; } @@ -209,7 +252,9 @@ void GraphBrpcService::service(google::protobuf::RpcController *cntl_base, int service_ret = (this->*handler_func)(table, *request, *response, cntl); if (service_ret != 0) { response->set_err_code(service_ret); - response->set_err_msg("server internal error"); + if (!response->has_err_msg()) { + response->set_err_msg("server internal error"); + } } } @@ -403,7 +448,156 @@ int32_t GraphBrpcService::graph_get_node_feat(Table *table, return 0; } - +int32_t GraphBrpcService::sample_neighboors_across_multi_servers( + Table *table, const PsRequestMessage &request, PsResponseMessage &response, + brpc::Controller *cntl) { + // sleep(5); + CHECK_TABLE_EXIST(table, request, response) + if (request.params_size() < 2) { + set_response_code( + response, -1, + "graph_random_sample request requires at least 2 arguments"); + return 0; + } + size_t node_num = request.params(0).size() / sizeof(uint64_t), + size_of_size_t = sizeof(size_t); + uint64_t *node_data = (uint64_t *)(request.params(0).c_str()); + int sample_size = *(uint64_t 
*)(request.params(1).c_str()); + // std::vector res = ((GraphTable + // *)table).filter_out_non_exist_nodes(node_data, sample_size); + std::vector request2server; + std::vector server2request(server_size, -1); + std::vector local_id; + std::vector local_query_idx; + size_t rank = get_rank(); + for (int query_idx = 0; query_idx < node_num; ++query_idx) { + int server_index = + ((GraphTable *)table)->get_server_index_by_id(node_data[query_idx]); + if (server2request[server_index] == -1) { + server2request[server_index] = request2server.size(); + request2server.push_back(server_index); + } + } + if (server2request[rank] != -1) { + auto pos = server2request[rank]; + std::swap(request2server[pos], + request2server[(int)request2server.size() - 1]); + server2request[request2server[pos]] = pos; + server2request[request2server[(int)request2server.size() - 1]] = + request2server.size() - 1; + } + size_t request_call_num = request2server.size(); + std::vector> local_buffers; + std::vector local_actual_sizes; + std::vector seq; + std::vector> node_id_buckets(request_call_num); + std::vector> query_idx_buckets(request_call_num); + for (int query_idx = 0; query_idx < node_num; ++query_idx) { + int server_index = + ((GraphTable *)table)->get_server_index_by_id(node_data[query_idx]); + int request_idx = server2request[server_index]; + node_id_buckets[request_idx].push_back(node_data[query_idx]); + query_idx_buckets[request_idx].push_back(query_idx); + seq.push_back(request_idx); + } + size_t remote_call_num = request_call_num; + if (request2server.size() != 0 && request2server.back() == rank) { + remote_call_num--; + local_buffers.resize(node_id_buckets.back().size()); + local_actual_sizes.resize(node_id_buckets.back().size()); + } + cntl->response_attachment().append(&node_num, sizeof(size_t)); + auto local_promise = std::make_shared>(); + std::future local_fut = local_promise->get_future(); + std::vector failed(server_size, false); + std::function func = [&, node_id_buckets, query_idx_buckets, + request_call_num](void *done) { + local_fut.get(); + std::vector actual_size; + auto *closure = (DownpourBrpcClosure *)done; + std::vector> res( + remote_call_num); + size_t fail_num = 0; + for (size_t request_idx = 0; request_idx < remote_call_num; ++request_idx) { + if (closure->check_response(request_idx, PS_GRAPH_SAMPLE_NEIGHBOORS) != + 0) { + ++fail_num; + failed[request2server[request_idx]] = true; + } else { + auto &res_io_buffer = closure->cntl(request_idx)->response_attachment(); + size_t node_size; + res[request_idx].reset(new butil::IOBufBytesIterator(res_io_buffer)); + size_t num; + res[request_idx]->copy_and_forward(&num, sizeof(size_t)); + } + } + int size; + int local_index = 0; + for (size_t i = 0; i < node_num; i++) { + if (fail_num > 0 && failed[seq[i]]) { + size = 0; + } else if (request2server[seq[i]] != rank) { + res[seq[i]]->copy_and_forward(&size, sizeof(int)); + } else { + size = local_actual_sizes[local_index++]; + } + actual_size.push_back(size); + } + cntl->response_attachment().append(actual_size.data(), + actual_size.size() * sizeof(int)); + + local_index = 0; + for (size_t i = 0; i < node_num; i++) { + if (fail_num > 0 && failed[seq[i]]) { + continue; + } else if (request2server[seq[i]] != rank) { + char temp[actual_size[i] + 1]; + res[seq[i]]->copy_and_forward(temp, actual_size[i]); + cntl->response_attachment().append(temp, actual_size[i]); + } else { + char *temp = local_buffers[local_index++].get(); + cntl->response_attachment().append(temp, actual_size[i]); + } + } + 
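
Both ends of this new RPC rely on the same packed attachment layout: a size_t node count, then one int actual_size per node, then the concatenated neighbor records, each record an id followed by a weight (GraphNode::id_size and GraphNode::weight_size in the patch). A minimal standalone sketch of packing and parsing that layout, assuming 8-byte ids and 4-byte float weights:

#include <cstdint>
#include <cstdio>
#include <cstring>
#include <string>
#include <utility>
#include <vector>

// Packed layout assumed by the sampling response (sketch):
//   [size_t node_num][int actual_sizes[node_num]][records: (uint64_t id, float weight)...]
using Neighbors = std::vector<std::vector<std::pair<uint64_t, float>>>;

std::string Pack(const Neighbors &res) {
  std::string buf;
  size_t node_num = res.size();
  buf.append(reinterpret_cast<const char *>(&node_num), sizeof(size_t));
  for (const auto &one : res) {
    int actual_size =
        static_cast<int>(one.size() * (sizeof(uint64_t) + sizeof(float)));
    buf.append(reinterpret_cast<const char *>(&actual_size), sizeof(int));
  }
  for (const auto &one : res) {
    for (const auto &kv : one) {
      buf.append(reinterpret_cast<const char *>(&kv.first), sizeof(uint64_t));
      buf.append(reinterpret_cast<const char *>(&kv.second), sizeof(float));
    }
  }
  return buf;
}

Neighbors Unpack(const char *buffer) {
  size_t node_num = 0;
  std::memcpy(&node_num, buffer, sizeof(size_t));
  std::vector<int> actual_sizes(node_num);
  std::memcpy(actual_sizes.data(), buffer + sizeof(size_t),
              sizeof(int) * node_num);
  const char *node_buffer = buffer + sizeof(size_t) + sizeof(int) * node_num;
  Neighbors res(node_num);
  int offset = 0;
  const int record = static_cast<int>(sizeof(uint64_t) + sizeof(float));
  for (size_t i = 0; i < node_num; ++i) {
    for (int start = 0; start < actual_sizes[i]; start += record) {
      uint64_t id = 0;
      float weight = 0.f;
      std::memcpy(&id, node_buffer + offset + start, sizeof(uint64_t));
      std::memcpy(&weight, node_buffer + offset + start + sizeof(uint64_t),
                  sizeof(float));
      res[i].push_back({id, weight});
    }
    offset += actual_sizes[i];
  }
  return res;
}

int main() {
  Neighbors in = {{{96, 0.5f}, {37, 0.25f}}, {{7, 1.0f}}};
  Neighbors out = Unpack(Pack(in).data());
  std::printf("node 0: %zu neighbors, node 1: %zu neighbors\n",
              out[0].size(), out[1].size());
  return 0;
}
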
closure->set_promise_value(0); + }; + + DownpourBrpcClosure *closure = new DownpourBrpcClosure(remote_call_num, func); + + auto promise = std::make_shared>(); + closure->add_promise(promise); + std::future fut = promise->get_future(); + + for (int request_idx = 0; request_idx < remote_call_num; ++request_idx) { + int server_index = request2server[request_idx]; + closure->request(request_idx)->set_cmd_id(PS_GRAPH_SAMPLE_NEIGHBOORS); + closure->request(request_idx)->set_table_id(request.table_id()); + closure->request(request_idx)->set_client_id(rank); + size_t node_num = node_id_buckets[request_idx].size(); + + closure->request(request_idx) + ->add_params((char *)node_id_buckets[request_idx].data(), + sizeof(uint64_t) * node_num); + closure->request(request_idx) + ->add_params((char *)&sample_size, sizeof(int)); + PsService_Stub rpc_stub( + ((GraphBrpcServer *)get_server())->get_cmd_channel(server_index)); + // GraphPsService_Stub rpc_stub = + // getServiceStub(get_cmd_channel(server_index)); + closure->cntl(request_idx)->set_log_id(butil::gettimeofday_ms()); + rpc_stub.service(closure->cntl(request_idx), closure->request(request_idx), + closure->response(request_idx), closure); + } + if (server2request[rank] != -1) { + ((GraphTable *)table) + ->random_sample_neighboors(node_id_buckets.back().data(), sample_size, + local_buffers, local_actual_sizes); + } + local_promise.get()->set_value(0); + if (remote_call_num == 0) func(closure); + fut.get(); + return 0; +} int32_t GraphBrpcService::graph_set_node_feat(Table *table, const PsRequestMessage &request, PsResponseMessage &response, @@ -412,7 +606,7 @@ int32_t GraphBrpcService::graph_set_node_feat(Table *table, if (request.params_size() < 3) { set_response_code( response, -1, - "graph_set_node_feat request requires at least 2 arguments"); + "graph_set_node_feat request requires at least 3 arguments"); return 0; } size_t node_num = request.params(0).size() / sizeof(uint64_t); diff --git a/paddle/fluid/distributed/service/graph_brpc_server.h b/paddle/fluid/distributed/service/graph_brpc_server.h index 6b4853fa679923..817fe08331165d 100644 --- a/paddle/fluid/distributed/service/graph_brpc_server.h +++ b/paddle/fluid/distributed/service/graph_brpc_server.h @@ -32,6 +32,8 @@ class GraphBrpcServer : public PSServer { virtual ~GraphBrpcServer() {} PsBaseService *get_service() { return _service.get(); } virtual uint64_t start(const std::string &ip, uint32_t port); + virtual int32_t build_peer2peer_connection(int rank); + virtual brpc::Channel *get_cmd_channel(size_t server_index); virtual int32_t stop() { std::unique_lock lock(mutex_); if (stoped_) return 0; @@ -50,6 +52,7 @@ class GraphBrpcServer : public PSServer { mutable std::mutex mutex_; std::condition_variable cv_; bool stoped_ = false; + int rank; brpc::Server _server; std::shared_ptr _service; std::vector> _pserver_channels; @@ -113,12 +116,18 @@ class GraphBrpcService : public PsBaseService { int32_t print_table_stat(Table *table, const PsRequestMessage &request, PsResponseMessage &response, brpc::Controller *cntl); + int32_t sample_neighboors_across_multi_servers( + Table *table, const PsRequestMessage &request, + PsResponseMessage &response, brpc::Controller *cntl); + private: bool _is_initialize_shard_info; std::mutex _initialize_shard_mutex; std::unordered_map _msg_handler_map; std::vector _ori_values; const int sample_nodes_ranges = 23; + size_t server_size; + std::shared_ptr<::ThreadPool> task_pool; }; } // namespace distributed diff --git 
a/paddle/fluid/distributed/service/graph_py_service.cc b/paddle/fluid/distributed/service/graph_py_service.cc index b4159627013174..498805136417f2 100644 --- a/paddle/fluid/distributed/service/graph_py_service.cc +++ b/paddle/fluid/distributed/service/graph_py_service.cc @@ -107,6 +107,7 @@ void GraphPyServer::start_server(bool block) { empty_vec.push_back(empty_prog); pserver_ptr->configure(server_proto, _ps_env, rank, empty_vec); pserver_ptr->start(ip, port); + pserver_ptr->build_peer2peer_connection(rank); std::condition_variable* cv_ = pserver_ptr->export_cv(); if (block) { std::mutex mutex_; diff --git a/paddle/fluid/distributed/service/sendrecv.proto b/paddle/fluid/distributed/service/sendrecv.proto index 696c950d9b33ba..42e25258ec3fe1 100644 --- a/paddle/fluid/distributed/service/sendrecv.proto +++ b/paddle/fluid/distributed/service/sendrecv.proto @@ -56,6 +56,7 @@ enum PsCmdID { PS_GRAPH_ADD_GRAPH_NODE = 35; PS_GRAPH_REMOVE_GRAPH_NODE = 36; PS_GRAPH_SET_NODE_FEAT = 37; + PS_GRAPH_SAMPLE_NODES_FROM_ONE_SERVER = 38; } message PsRequestMessage { diff --git a/paddle/fluid/distributed/service/server.h b/paddle/fluid/distributed/service/server.h index 89b089386f5018..dffe19545ce52b 100644 --- a/paddle/fluid/distributed/service/server.h +++ b/paddle/fluid/distributed/service/server.h @@ -147,7 +147,7 @@ class PsBaseService : public PsService { public: PsBaseService() : _rank(0), _server(NULL), _config(NULL) {} virtual ~PsBaseService() {} - + virtual size_t get_rank() { return _rank; } virtual int32_t configure(PSServer *server) { _server = server; _rank = _server->rank(); @@ -167,6 +167,7 @@ class PsBaseService : public PsService { } virtual int32_t initialize() = 0; + PSServer *get_server() { return _server; } protected: size_t _rank; diff --git a/paddle/fluid/distributed/table/common_graph_table.cc b/paddle/fluid/distributed/table/common_graph_table.cc index 41f4b0dac4d96e..2c20e79b3b2d34 100644 --- a/paddle/fluid/distributed/table/common_graph_table.cc +++ b/paddle/fluid/distributed/table/common_graph_table.cc @@ -305,12 +305,12 @@ Node *GraphTable::find_node(uint64_t id) { return node; } uint32_t GraphTable::get_thread_pool_index(uint64_t node_id) { - return node_id % shard_num % shard_num_per_table % task_pool_size_; + return node_id % shard_num % shard_num_per_server % task_pool_size_; } uint32_t GraphTable::get_thread_pool_index_by_shard_index( uint64_t shard_index) { - return shard_index % shard_num_per_table % task_pool_size_; + return shard_index % shard_num_per_server % task_pool_size_; } int32_t GraphTable::clear_nodes() { @@ -575,6 +575,11 @@ int32_t GraphTable::pull_graph_list(int start, int total_size, actual_size = size; return 0; } + +int32_t GraphTable::get_server_index_by_id(uint64_t id) { + return id % shard_num / shard_num_per_server; +} + int32_t GraphTable::initialize() { _shards_task_pool.resize(task_pool_size_); for (size_t i = 0; i < _shards_task_pool.size(); ++i) { @@ -611,13 +616,12 @@ int32_t GraphTable::initialize() { shard_num = _config.shard_num(); VLOG(0) << "in init graph table shard num = " << shard_num << " shard_idx" << _shard_idx; - shard_num_per_table = sparse_local_shard_num(shard_num, server_num); - shard_start = _shard_idx * shard_num_per_table; - shard_end = shard_start + shard_num_per_table; + shard_num_per_server = sparse_local_shard_num(shard_num, server_num); + shard_start = _shard_idx * shard_num_per_server; + shard_end = shard_start + shard_num_per_server; VLOG(0) << "in init graph table shard idx = " << _shard_idx << " shard_start " << 
shard_start << " shard_end " << shard_end; - // shards.resize(shard_num_per_table); - shards = std::vector(shard_num_per_table, GraphShard(shard_num)); + shards = std::vector(shard_num_per_server, GraphShard(shard_num)); return 0; } } // namespace distributed diff --git a/paddle/fluid/distributed/table/common_graph_table.h b/paddle/fluid/distributed/table/common_graph_table.h index f643337a80f7c2..d681262c664807 100644 --- a/paddle/fluid/distributed/table/common_graph_table.h +++ b/paddle/fluid/distributed/table/common_graph_table.h @@ -94,6 +94,7 @@ class GraphTable : public SparseTable { int32_t remove_graph_node(std::vector &id_list); + int32_t get_server_index_by_id(uint64_t id); Node *find_node(uint64_t id); virtual int32_t pull_sparse(float *values, @@ -128,9 +129,11 @@ class GraphTable : public SparseTable { const std::vector &feature_names, const std::vector> &res); + size_t get_server_num() { return server_num; } + protected: std::vector shards; - size_t shard_start, shard_end, server_num, shard_num_per_table, shard_num; + size_t shard_start, shard_end, server_num, shard_num_per_server, shard_num; const int task_pool_size_ = 24; const int random_sample_nodes_ranges = 3; diff --git a/paddle/fluid/distributed/test/graph_node_test.cc b/paddle/fluid/distributed/test/graph_node_test.cc index 810530cdbec94d..613770220f9d79 100644 --- a/paddle/fluid/distributed/test/graph_node_test.cc +++ b/paddle/fluid/distributed/test/graph_node_test.cc @@ -138,6 +138,10 @@ void testSingleSampleNeighboor( for (auto g : s) { ASSERT_EQ(true, s1.find(g) != s1.end()); } + vs.clear(); + pull_status = worker_ptr_->batch_sample_neighboors(0, {96, 37}, 4, vs, 0); + pull_status.wait(); + ASSERT_EQ(vs.size(), 2); } void testAddNode( @@ -356,6 +360,7 @@ void RunServer() { pserver_ptr_->configure(server_proto, _ps_env, 0, empty_vec); LOG(INFO) << "first server, run start(ip,port)"; pserver_ptr_->start(ip_, port_); + pserver_ptr_->build_peer2peer_connection(0); LOG(INFO) << "init first server Done"; } @@ -373,6 +378,7 @@ void RunServer2() { empty_vec2.push_back(empty_prog2); pserver_ptr2->configure(server_proto2, _ps_env2, 1, empty_vec2); pserver_ptr2->start(ip2, port2); + pserver_ptr2->build_peer2peer_connection(1); } void RunClient( From 921c0917a37b6d5012f6290b6c061a1266d10a22 Mon Sep 17 00:00:00 2001 From: niuliling123 <51102941+niuliling123@users.noreply.github.com> Date: Thu, 21 Oct 2021 11:45:38 +0800 Subject: [PATCH 237/298] Fix a bug in ReadData, ReadDataBc and ReadDataReduce when NX != 1 (#36373) * Update the implement of reduceAnyKernel according to kernel primitive api * Fix a bug in ReadData, ReadDataBc and ReadDataReduce when NX != 1 --- .../elementwise/elementwise_op_broadcast.cu.h | 2 +- .../fluid/operators/fused/attn_bias_add.cu.h | 4 +- .../kernel_primitives/compute_primitives.h | 74 +++-- .../kernel_primitives/datamover_primitives.h | 286 +++++++++++++----- .../fluid/operators/reduce_ops/reduce_op.cu.h | 59 ++-- 5 files changed, 286 insertions(+), 139 deletions(-) diff --git a/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h b/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h index 53ac85802a6f43..549a6be0b4507e 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h @@ -171,7 +171,7 @@ __device__ __forceinline__ void LoadData( // num: how many data will be deal with in this time if (need_broadcast) { kps::ReadDataBc(dst, src, block_offset, - config, numel, 1, 1); + config, 
numel); } else { kps::ReadData(dst, src + block_offset, num); } diff --git a/paddle/fluid/operators/fused/attn_bias_add.cu.h b/paddle/fluid/operators/fused/attn_bias_add.cu.h index fa3eb19b29995a..18ae932c9325a9 100644 --- a/paddle/fluid/operators/fused/attn_bias_add.cu.h +++ b/paddle/fluid/operators/fused/attn_bias_add.cu.h @@ -72,14 +72,14 @@ __global__ void BroadcastKernelBinary( // load in0 if (use_broadcast[0]) { kernel_primitives::ReadDataBc( - arg0, in0, fix, configlists[0], numel, 1, 1); + arg0, in0, fix, configlists[0], numel); } else { kernel_primitives::ReadData(arg0, in0 + fix, num); } // load in1 if (use_broadcast[1]) { kernel_primitives::ReadDataBc( - arg1, in1, fix, configlists[1], numel, 1, 1); + arg1, in1, fix, configlists[1], numel); } else { kernel_primitives::ReadData(arg1, in1 + fix, num); } diff --git a/paddle/fluid/operators/kernel_primitives/compute_primitives.h b/paddle/fluid/operators/kernel_primitives/compute_primitives.h index a36c76d7881737..73316d66b6cf26 100644 --- a/paddle/fluid/operators/kernel_primitives/compute_primitives.h +++ b/paddle/fluid/operators/kernel_primitives/compute_primitives.h @@ -135,17 +135,16 @@ __device__ __forceinline__ T BlockYReduce(T val, ReduceOp reducer) { } // namespace details /** - * @brief Perform unary calculation according to OpFunc. Size of input and + * @brief Perform unary calculation according to OpFunc. Shape of input and * output are the same. * * @template paraments - * InT: Data type of in. - * OutT: Data type of out. + * InT: The data type of in. + * OutT: The data type of out. * NX: The number of data columns loaded by each thread. * NY: The number of data rows loaded by each thread. * BlockSize: Identifies the current device thread index method. For GPU, - * threadIdx.x is used as the thread index, and for xpu, core_id() is used as - * the index. Currently only GPU was supported. + * threadIdx.x is used as the thread index. Currently only GPU was supported. * OpFunc: Compute functor which has an operator() as following: * template * struct XxxFunctor { @@ -170,21 +169,20 @@ __device__ __forceinline__ void ElementwiseUnary(OutT* out, const InT* in, } /** - * @brief Binary calculation according to OpFunc. Size of The input and output + * @brief Binary calculation according to OpFunc. Shape of The input and output * are the same. * * @template paraments - * InT: Data type of in1 and in2. - * OutT: Data type of out. - * NX: The number of data columns loaded by each thread. - * NY: The number of data rows loaded by each thread. + * InT: The data type of in1 and in2. + * OutT: The data type of out. + * NX: The number of data columns computed by each thread. + * NY: The number of data rows computed by each thread. * BlockSize: Identifies the current device thread index method. For GPU, - * threadIdx.x is used as the thread index, and for xpu, core_id() is used as - * the index. Currently only GPU was supported. + * threadIdx.x is used as the thread index. Currently only GPU was supported. * OpFunc: Compute functor which has an operator() as following: - * template + * template * struct XxxFunctor { - * HOSTDEVICE OutT operator()(const InT& a, const InT& b) const { + * HOSTDEVICE InT operator()(const InT& a, const InT& b) const { * return ...; * } * }; @@ -193,7 +191,7 @@ __device__ __forceinline__ void ElementwiseUnary(OutT* out, const InT* in, * out: The register pointer of out, the size is NX * NY. * in1: The register pointer of fist input, size is NX * NY. * in2: The register pointer of second input, size is NX * NY. 
- * compute: Compute function which was declared like OpFunc(). + * compute: Compute function which was declared like OpFunc(). */ template @@ -207,21 +205,20 @@ __device__ __forceinline__ void ElementwiseBinary(OutT* out, const InT* in1, } /** - * @brief Ternary calculation according to OpFunc. Size of input and output + * @brief Ternary calculation according to OpFunc. Shape of input and output * are the same. * * @template paraments - * InT: Data type of in1 and in2. - * OutT: Data type of out. + * InT: The data type of in1 and in2. + * OutT: The data type of out. * NX: The number of data columns loaded by each thread. * NY: The number of data rows loaded by each thread. * BlockSize: Identifies the current device thread index method. For GPU, - * threadIdx.x is used as the thread index, and for xpu, core_id() is used as - * the index. Currently only GPU was supported. + * threadIdx.x is used as the thread index. Currently only GPU was supported. * OpFunc: Compute functor which has an operator() as following - * template + * template * struct XxxFunctor { - * HOSTDEVICE OutT operator()(const InT& a, const InT& b, const InT& c) + * HOSTDEVICE InT operator()(const InT& a, const InT& b, const InT& c) * const { * return ...; * } @@ -232,7 +229,7 @@ __device__ __forceinline__ void ElementwiseBinary(OutT* out, const InT* in1, * in1: The register pointer of fist input, size is NX * NY. * in2: The register pointer of second input, size is NX * NY. * in3: The register pointer of third input, size is NX * NY. - * compute: Compute function which was declared like OpFunc(). + * compute: Compute function which was declared like OpFunc(). */ template @@ -247,30 +244,29 @@ __device__ __forceinline__ void ElementwiseTernary(OutT* out, const InT* in1, } /** - * @brief Multivariate calculation according to OpFunc. Size of input and output - * are the same. + * @brief Multivariate calculation according to OpFunc. Shape of inputs and + * output are the same. * * @template paraments - * InT: Data type of in1, in2 and in3. - * OutT: Data type of out. + * InT: The data type of in1, in2 and in3. + * OutT: The data type of out. * NX: The number of data columns loaded by each thread. * NY: The number of data rows loaded by each thread. * BlockSize: Identifies the current device thread index method. For GPU, - * threadIdx.x is used as the thread index, and for xpu, core_id() is used as - * the index. Currently only GPU was supported. - * Arity: The size of ins + * threadIdx.x is used as the thread index. Currently only GPU was supported. + * Arity: The size of ins. * OpFunc: Compute functor which has an operator() as following: - * template + * template * struct XxxFunctor { - * HOSTDEVICE OutT operator()(const InT* args) const { + * HOSTDEVICE InT operator()(const InT* args) const { * return ...; * } * }; * * @param * out: The register pointer of out, the size is NX * NY. - * ins: An array of pointers consisting of multiple inputs. - * compute: Compute function which was declared like OpFunc(). + * ins: A pointers of array consisting of multiple inputs. + * compute: Compute function which was declared like OpFunc(). */ template @@ -293,13 +289,12 @@ __device__ __forceinline__ void ElementwiseAny(OutT* out, InT (*ins)[NX * NY], * shape is [NY, NX]. * * @template paraments - * InT: Data type of in1 and in2. - * OutT: Data type of out. + * InT: The data type of in1 and in2. + * OutT: The data type of out. * NX: The number of data columns loaded by each thread. * NY: The number of data rows loaded by each thread. 
* BlockSize: Identifies the current device thread index method. For GPU, - * threadIdx.x is used as the thread index, and for xpu, core_id() is used as - * the index. Currently only GPU was supported. + * threadIdx.x is used as the thread index. Currently only GPU was supported. * OpFunc: Compute functor which has an operator() as following * template * struct XxxFunctor { @@ -339,8 +334,7 @@ __device__ __forceinline__ void CycleBinary(OutT* out, const InT* in1, * NX: The number of data continuously loaded by each thread. * NY: The number of data rows loaded by each thread, only NY = 1 was supported. * BlockSize: Identifies the current device thread index method. For GPU, - * threadIdx.x is used as the thread index, and for xpu, core_id() is used as - * the index. Currently only GPU was supported. + * threadIdx.x is used as the thread index. Currently only GPU was supported. * ReduceFunctor: Compute functor which has an operator() as following * template * struct ReduceFunctor { diff --git a/paddle/fluid/operators/kernel_primitives/datamover_primitives.h b/paddle/fluid/operators/kernel_primitives/datamover_primitives.h index c720bedf0a3afc..860072bd0c52ec 100644 --- a/paddle/fluid/operators/kernel_primitives/datamover_primitives.h +++ b/paddle/fluid/operators/kernel_primitives/datamover_primitives.h @@ -118,8 +118,8 @@ struct BroadcastConfig { } // namespace details /** - * @brief Read 2D data from global memory to registers according to Tx type, and - * store it as Ty type. + * @brief Read 2D data from global memory to register according to Tx type, and + * store it as Ty type into register. * * @template paraments * Tx: The type of data stored in the global memory. @@ -127,8 +127,7 @@ struct BroadcastConfig { * NX: The number of data columns loaded by each thread. * NY: The number of data rows loaded by each thread. * BlockSize: Identifies the current device thread index method. For GPU, - * threadIdx.x is used as the thread index, and for xpu, core_id() is used as - * the index. Currently only GPU was supported. + * threadIdx.x is used as the thread index. Currently only GPU was supported. * IsBoundary: Indicates whether to perform block access storage out-of-bounds * judgment. When the number of data processed by the block is less than * NX x NY x blockDim, boundary judgment is required to avoid memory access @@ -136,20 +135,20 @@ struct BroadcastConfig { * * @param: * dst: The register pointer of the thread, the size is NX * NY. - * src: Data pointer of the current block. - * size_nx: The current block needs to load size_nx columns of data, this - * parameter will be used when IsBoundary = true. - * size_ny: The current block needs to load size_ny rows of data. This parameter - * will be used when IsBoundary = true. - * stride_nx: The stride of cols. - * stride_ny: The stride of rows. + * src: The data pointer of the current block. + * size_nx: The maximum offset of the current block is size_nx elements in the + * lowest dimension. The parameters are only calculated when isboundary = true. + * size_ny: The maximum offset of the current block is size_ny elements in the + * first dimension. The parameters are only calculated when isboundary = true. + * stride_nx: Each read one element stride stride_nx elements in the last dim. + * stride_ny: Each read one element stride stride_ny elements in the first dim. 
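
The NX != 1 fix in this commit comes down to the addressing visible in the body below: element (idx, idy) of a thread lives at thread_offset + idx * stride_nx + idy * stride_ny, so the boundary guards must compare the strided offsets (idx * stride_nx, idy * stride_ny) against the remaining extent rather than the bare loop indices. A minimal host-side sketch of that guarded strided gather with a single simulated thread (the shapes and strides below are illustrative):

#include <cstdio>
#include <vector>

// Host-side sketch of the strided 2D gather behind kps::ReadData: element
// (idx, idy) of one thread lives at
//   src[thread_offset + idx * stride_nx + idy * stride_ny],
// so the bound check compares idx * stride_nx (not idx) against the columns
// left for this thread, and idy * stride_ny against size_ny.
template <int NX, int NY, bool IsBoundary>
void Read2D(float *dst, const float *src, int thread_offset, int size_nx,
            int size_ny, int stride_nx, int stride_ny) {
  int left_size_nx = size_nx - thread_offset;
  for (int idx = 0; idx < NX; ++idx) {
    if (IsBoundary && idx * stride_nx >= left_size_nx) break;
    for (int idy = 0; idy < NY; ++idy) {
      if (IsBoundary && idy * stride_ny >= size_ny) break;
      dst[idy * NX + idx] =
          src[thread_offset + idx * stride_nx + idy * stride_ny];
    }
  }
}

int main() {
  std::vector<float> src(64);
  for (int i = 0; i < 64; ++i) src[i] = static_cast<float>(i);
  float dst[2 * 2] = {0.f, 0.f, 0.f, 0.f};
  // A thread at offset 3 reads a 2x2 tile with column stride 4 and row stride 16.
  Read2D<2, 2, true>(dst, src.data(), /*thread_offset=*/3, /*size_nx=*/8,
                     /*size_ny=*/32, /*stride_nx=*/4, /*stride_ny=*/16);
  std::printf("%g %g %g %g\n", dst[0], dst[1], dst[2], dst[3]);
  return 0;
}
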
*/ template __device__ __forceinline__ void ReadData(Ty* dst, const Tx* __restrict__ src, int size_nx, int size_ny, int stride_nx, int stride_ny) { - int thread_offset = threadIdx.x * NX; + int thread_offset = threadIdx.x; int left_size_nx = size_nx - thread_offset; // Each branch is added for better performance @@ -165,7 +164,7 @@ __device__ __forceinline__ void ReadData(Ty* dst, const Tx* __restrict__ src, #pragma unroll for (int idy = 0; idy < NY; ++idy) { if (IsBoundary) { - if (idy >= size_ny) { + if (idy * stride_ny >= size_ny) { break; } } @@ -175,7 +174,7 @@ __device__ __forceinline__ void ReadData(Ty* dst, const Tx* __restrict__ src, #pragma unroll for (int idx = 0; idx < NX; ++idx) { if (IsBoundary) { - if (idx >= left_size_nx) { + if (idx * stride_nx >= left_size_nx) { break; } } @@ -185,14 +184,14 @@ __device__ __forceinline__ void ReadData(Ty* dst, const Tx* __restrict__ src, #pragma unroll for (int idx = 0; idx < NX; ++idx) { if (IsBoundary) { - if (idx >= left_size_nx) { + if (idx * stride_nx >= left_size_nx) { break; } } #pragma unroll for (int idy = 0; idy < NY; ++idy) { if (IsBoundary) { - if (idy >= size_ny) { + if (idy * stride_ny >= size_ny) { break; } } @@ -223,25 +222,24 @@ __device__ __forceinline__ void Init(T* dst, T init_data) { } /** - * @brief Read 2D data from global memory to registers. When IsBoundary = true + * @brief Read 1D data from global memory to register. When IsBoundary = true * and (NX % 4 == 0 or Nx % 2 == 0), vectorized load data will be used to * improve memory access efficiency. * * @template paraments - * T: Data type of src and dst. - * NX: The number of data continuously loaded by each thread. - * NY: The number of data rows loaded by each thread, only NY = 1 was supported. + * T: The type of data. + * NX: Each thread load NX data from global memory continuously. + * NY: Each thread need to load NY rows, only NY = 1 was supported. * BlockSize: Identifies the current device thread index method. For GPU, - * threadIdx.x is used as the thread index, and for xpu, core_id() is used as - * the index. Currently only GPU was supported. + * threadIdx.x is used as the thread index. Currently only GPU was supported. * IsBoundary: Whether to make an out-of-bounds judgment on access to memory. * When the number of data processed by this block is less than - * NX x NY x blockDim, boundary judgment is required to avoid memory access + * NX x NY x blockDim.x, boundary judgment is required to avoid memory access * crossing the boundary. * * @param: * dst: The register pointer of the thread, the size is NX * NY. - * src: Data pointer of the current block. + * src: The data pointer of the current block. * size: The current block needs to load size data continuously. */ template @@ -276,31 +274,29 @@ __device__ __forceinline__ void ReadData(T* dst, const T* __restrict__ src, } /** - * @brief Read 2D data from global memory to registers for broadcast. + * @brief Read 2D data from global memory to registers with broadcast form. * * @template paraments * T: The type of data stored in the global memory. * NX: The number of data columns loaded by each thread. * NY: The number of data rows loaded by each thread. * BlockSize: Identifies the current device thread index method. For GPU, - * threadIdx.x is used as the thread index, and for xpu, core_id() is used as - * the index. Currently only GPU was supported. + * threadIdx.x is used as the thread index. Currently only GPU was supported. * Rank: The shape size of out. eg in[1, 35], out[32, 35] then shape size is 2. 
* IsBoundary: Indicates whether to perform block access storage out-of-bounds * judgment. When the number of data processed by the block is less than - * NX x NY x blockDim, boundary judgment is required to avoid memory access + * NX x NY x blockDim.x, boundary judgment is required to avoid memory access * crossing the boundary. * * @param: * dst: The register pointer of the thread, the size is NX * NY. - * src: Raw input data pointer of kernel. - * block_offset: Data offset of this block, blockDim.x * blockIdx.x * NX; + * src: The original input data pointer of this kernel. + * block_offset: The data offset of this block, blockDim.x * blockIdx.x * NX. * config: Calculation configuration of broadcast. It is used to calculate the - * coordinate mapping relationship between output data and input data. Please - * refer to the sample code for specific usage. + * coordinate mapping relationship between output data and input data. * total_num_output: Total number of original output. - * stride_nx: The stride of cols. - * stride_ny: The stride of rows. + * stride_nx: Each read one element stride stride_nx elements in the last dim. + * stride_ny: Each read one element stride stride_ny elements in the first dim. */ template @@ -308,7 +304,7 @@ __device__ __forceinline__ void ReadDataBc( T* dst, const T* __restrict__ src, uint32_t block_offset, details::BroadcastConfig config, int total_num_output, int stride_nx, int stride_ny) { - uint32_t thread_offset = block_offset + threadIdx.x * NX; + uint32_t thread_offset = block_offset + threadIdx.x; uint32_t index_src = 0; #pragma unroll @@ -334,37 +330,33 @@ __device__ __forceinline__ void ReadDataBc( } /** - * @brief Read 2D data from global memory to registers for reduce. + * @brief Read 2D data from global memory to register with reduce form. * * @template paraments - * T: The type of data stored in the global memory. + * T: The type of data. * NX: The number of data columns loaded by each thread. * NY: The number of data rows loaded by each thread. * BlockSize: Identifies the current device thread index method. For GPU, - * threadIdx.x is used as the thread index, and for xpu, core_id() is used as - * the index. Currently only GPU was supported. + * threadIdx.x is used as the thread index. Currently only GPU was supported. * Rank: The shape size of out. eg in[1, 35], out[32, 35] then shape size is 2. * IsBoundary: Indicates whether to perform block access storage out-of-bounds * judgment. When the number of data processed by the block is less than - * NX x NY x blockDim, boundary judgment is required to avoid memory access + * NX x NY x blockDim.x, boundary judgment is required to avoid memory access * crossing the boundary. * * @param: * dst: The register pointer of the thread, the size is NX * NY. - * src: Raw input data pointer of kernel. - * block_offset: Data offset of this block, blockDim.x * blockIdx.x * NX; + * src: The input data pointer of this block. + * block_offset: The data offset of this block, blockDim.x * blockIdx.x * NX. * index_cal: Calculation configuration of Reduce. It is used to calculate the - * coordinate mapping relationship between output data and input data. Please - * refer to the sample code for specific usage. - * block_offset: data offset of this block, blockDim.x * blockIdx.x * NX; - * index_cal: get the global index in src, attention config was declared in - * host; + * coordinate mapping relationship between output data and input data. 
* size_nx: The current block needs to load size_nx columns of data, this - * parameter will be used when IsBoundary = true. - * size_ny: The current block needs to load size_ny rows of data. This parameter + * parameter will participate in the calculation when isboundary = true. + * size_ny: The current block needs to load size_ny rows of data, this parameter + * will participate in the calculation when isboundary = true. * will be used when IsBoundary = true. - * stride_nx: The stride of cols. - * stride_ny: The stride of rows. + * stride_nx: Each read one element stride stride_nx columns. + * stride_ny: Each read one element stride stride_ny raws. * reduce_last_dim: Used to indicate whether the dimension of reduce contains * the lowest dimension. */ @@ -375,10 +367,13 @@ __device__ __forceinline__ void ReadDataReduce( const IndexCal& index_cal, int size_nx, int size_ny, int stride_nx, int stride_ny, bool reduce_last_dim) { int thread_offset = 0; + int left_idx = 0; if (reduce_last_dim) { - thread_offset = block_offset + threadIdx.x; + thread_offset = threadIdx.x; + left_idx = threadIdx.y; } else { - thread_offset = block_offset + threadIdx.y; + thread_offset = threadIdx.y; + left_idx = threadIdx.x; } if (NX == 1) { @@ -389,30 +384,25 @@ __device__ __forceinline__ void ReadDataReduce( break; } } - uint32_t index_src = index_cal(thread_offset); + uint32_t index_src = index_cal(thread_offset + block_offset); dst[ny] = src[index_src]; thread_offset += stride_ny; } } else { #pragma unroll for (int nx = 0; nx < NX; ++nx) { - if (IsBoundary) { - if (nx * stride_nx >= size_nx) { - break; - } - } #pragma unroll for (int ny = 0; ny < NY; ++ny) { if (IsBoundary) { - if (nx * stride_nx >= size_nx) { + if ((thread_offset >= size_ny) || + (left_idx + nx * stride_nx >= size_nx)) { break; } } - uint32_t index_src = index_cal(thread_offset); + uint32_t index_src = index_cal(thread_offset + block_offset); dst[nx + ny * NX] = src[index_src]; thread_offset += stride_ny; } - thread_offset += stride_nx; } } } @@ -424,20 +414,19 @@ __device__ __forceinline__ void ReadDataReduce( * * @template paraments * T: The type of data. - * NX: The number of data continuously loaded by each thread. + * NX: The number of data continuously writed by each thread. * NY: The number of data rows loaded by each thread, only NY = 1 was supported. * BlockSize: Identifies the current device thread index method. For GPU, - * threadIdx.x is used as the thread index, and for xpu, core_id() is used as - * the index. Currently only GPU was supported. + * threadIdx.x is used as the thread index. Currently only GPU was supported. * IsBoundary: Indicates whether to perform block access storage out-of-bounds * judgment. When the number of data processed by the block is less than - * NX x NY x blockDim, boundary judgment is required to avoid memory access + * NX x NY x blockDim.x, boundary judgment is required to avoid memory access * crossing the boundary. * * @param: - * dst: Data pointer of the current block. - * src: The register pointer of the thread, the size is NX * NY. - * size: The current block needs to load size data continuously. + * dst: The data pointer of the current block. + * src: The register pointer, the size is NX * NY. + * size: The current block needs to load size elements continuously. 
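
For the 1D form described above, each thread handles NX consecutive elements and the IsBoundary specialization only has to protect the final, partially filled block. A small host-side sketch of that contiguous-chunk-with-tail pattern; the device-side guard lives in the unchanged body, and the plain offset + idx < num check used here is an assumption consistent with this comment:

#include <cstdio>
#include <vector>

// Host sketch of the contiguous 1D store pattern behind kps::WriteData: a
// "thread" writes its NX registers to dst starting at thread_offset, skipping
// elements past num when IsBoundary is set for the tail block.
template <int NX, bool IsBoundary>
void Write1D(float *dst, const float *src, int thread_offset, int num) {
  for (int idx = 0; idx < NX; ++idx) {
    if (IsBoundary && thread_offset + idx >= num) break;
    dst[thread_offset + idx] = src[idx];
  }
}

int main() {
  std::vector<float> dst(10, 0.f);
  const float regs[4] = {1.f, 2.f, 3.f, 4.f};
  Write1D<4, false>(dst.data(), regs, /*thread_offset=*/0, /*num=*/10);
  Write1D<4, false>(dst.data(), regs, /*thread_offset=*/4, /*num=*/10);
  // Tail: only 10 - 8 = 2 of this thread's 4 registers are in range.
  Write1D<4, true>(dst.data(), regs, /*thread_offset=*/8, /*num=*/10);
  for (float v : dst) std::printf("%g ", v);
  std::printf("\n");
  return 0;
}
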
*/ template __device__ __forceinline__ void WriteData(T* dst, T* __restrict__ src, @@ -467,6 +456,165 @@ __device__ __forceinline__ void WriteData(T* dst, T* __restrict__ src, } } +/** + * @brief Write 2D data from register to global memory according to Tx type, and + * store it as Ty type. + * + * @template paraments + * Tx: The type of data that needs to be stored in registers. + * Ty: The type of data that stored in the global memory. + * NX: The number of data columns loaded by each thread. + * NY: The number of data rows loaded by each thread. + * BlockSize: Identifies the current device thread index method. For GPU, + * threadIdx.x is used as the thread index. Currently only GPU was supported. + * IsBoundary: Indicates whether to perform block access storage out-of-bounds + * judgment. When the number of data processed by the block is less than + * NX x NY x blockDim.x, boundary judgment is required to avoid memory access + * crossing the boundary. + * + * @param: + * dst: The data pointer of the current block. + * src: The register pointer of the thread, the size is NX * NY. + * size_nx: The maximum offset of the current block is size_nx elements in the + * lowest dimension. The parameters are only calculated when isboundary = true. + * size_ny: The maximum offset of the current block is size_ny elements in the + * first dimension. The parameters are only calculated when isboundary = true. + * stride_nx: Each read one element stride stride_nx elements in the last dim. + * stride_ny: Each read one element stride stride_ny elements in the first dim. + */ +template +__device__ __forceinline__ void WriteData(Ty* dst, const Tx* __restrict__ src, + int size_nx, int size_ny, + int stride_nx, int stride_ny) { + int thread_offset = threadIdx.x; + int left_size_nx = size_nx - thread_offset; + + // Each branch is added for better performance + if (NX == 1 && NY == 1) { // for NX == 1 and NY == 1 + if (IsBoundary) { + if (left_size_nx > 0) { + dst[thread_offset] = static_cast(src[0]); + } + } else { + dst[thread_offset] = static_cast(src[0]); + } + } else if (NX == 1) { // for NX == 1 and NY != 1 +#pragma unroll + for (int idy = 0; idy < NY; ++idy) { + if (IsBoundary) { + if (idy * stride_ny >= size_ny) { + break; + } + } + dst[thread_offset + idy * stride_ny] = static_cast(src[idy]); + } + } else if (NY == 1) { // for NY == 1 and NX != 1 +#pragma unroll + for (int idx = 0; idx < NX; ++idx) { + if (IsBoundary) { + if (idx * stride_nx >= left_size_nx) { + break; + } + } + dst[thread_offset + idx * stride_nx] = static_cast(src[idx]); + } + } else { // for NX != 1 and NY != 1 +#pragma unroll + for (int idx = 0; idx < NX; ++idx) { + if (IsBoundary) { + if (idx * stride_nx >= left_size_nx) { + break; + } + } +#pragma unroll + for (int idy = 0; idy < NY; ++idy) { + if (IsBoundary) { + if (idy * stride_ny >= size_ny) { + break; + } + } + dst[thread_offset + idx * stride_nx + idy * stride_ny] = + static_cast(src[idy * NX + idx]); + } + } + } +} + +/** + * @brief Initialize register with init_data. + * + * @template paraments + * T: Data type of register. + * NX: Number of data to initialize. + * + * @param: + * dst: The register pointer of the thread, the size is NX. + * init_data: The register pointer of init data, the size is NX. 
+ */ +template +__device__ __forceinline__ void Init(T* dst, T* init_data, int num) { +#pragma unroll + for (int i = 0; i < NX; i++) { + if (IsBoundary) { + if (i >= num) { + break; + } + } + dst[i] = init_data[i]; + } +} + +/** + * @brief Read 1D data from global memory to register with broadcast form. + * + * @template paraments + * T: The type of data stored in the global memory. + * NX: The number of data continuously loaded by each thread. + * NY: The number of data rows loaded by each thread, only NY = 1 was supported. + * BlockSize: Identifies the current device thread index method. For GPU, + * threadIdx.x is used as the thread index. Currently only GPU was supported. + * Rank: The shape size of out. eg in[1, 35], out[32, 35] then shape size is 2. + * IsBoundary: Indicates whether to perform block access storage out-of-bounds + * judgment. When the number of data processed by the block is less than + * NX x NY x blockDim.x, boundary judgment is required to avoid memory access + * crossing the boundary. + * + * @param: + * dst: The register pointer of the thread, the size is NX * NY. + * src: The original input data pointer of kernel. + * block_offset: The data offset of this block, blockDim.x * blockIdx.x * NX; + * config: Calculation configuration of broadcast. It is used to calculate the + * coordinate mapping relationship between output data and input data. + * total_num_output: Total number of original output. + */ +template +__device__ __forceinline__ void ReadDataBc( + T* dst, const T* __restrict__ src, uint32_t block_offset, + details::BroadcastConfig config, int total_num_output) { + uint32_t thread_offset = block_offset + threadIdx.x * NX; + uint32_t index_src = 0; + +#pragma unroll + for (uint32_t nx = 0; nx < NX; ++nx) { + uint32_t index_output = thread_offset + nx; + index_src = 0; + if (IsBoundary) { + if (index_output >= total_num_output) { + break; + } + } +#pragma unroll + for (int i = 0; i < Rank; ++i) { + auto fast_divmoder = config.divmoders[i].Divmod(index_output); + index_output = fast_divmoder.val[0]; + index_src += fast_divmoder.val[1] * config.strides[i]; + } + dst[nx] = src[index_src]; + } +} + } // namespace kernel_primitives } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h index 28b6ebc2433224..bf451272a47b0a 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h @@ -529,6 +529,31 @@ __device__ void HigherDimDealSegment(const Tx* x, Ty* y, ReduceOp reducer, kps::WriteData(y + store_offset, &temp_data, size); } +template +__device__ void ReduceAnyKernelImpl(const Tx* input, MPType* reduce_var, + ReduceOp reducer, TransformOp transformer, + MPType init, int reduce_num, int input_idx, + bool reduce_last_dim, + const Calculator& reduce_index_calculator, + int stride, int num) { + Tx input_reg[REDUCE_VEC_SIZE]; + MPType input_compute[REDUCE_VEC_SIZE]; + MPType input_transform[REDUCE_VEC_SIZE]; + + kps::Init(&input_compute[0], init); + kps::ReadDataReduce( + &input_reg[0], input, input_idx, reduce_index_calculator, 1, reduce_num, + 1, stride, reduce_last_dim); + kps::ElementwiseUnary( + &input_transform[0], &input_reg[0], transformer); + kps::Init(input_compute, input_transform, + num); + kps::Reduce( + reduce_var, &input_compute[0], reducer, reduce_last_dim); +} + // when reduce_dim.size() == 1 and reduce_dim[0] == x_dim.size() - 1, or // when reduce_dim.size() != 1 and reduce_dim.size() != 
x_dim.size(), this // function will be used @@ -570,37 +595,17 @@ __global__ void ReduceAnyKernel(const Tx* x, Ty* y, ReduceOp reducer, // 1. reduce for each thread if (left_idx < left_num) { // load REDUCE_VEC_SIZE data once, and then compute - Tx input_reg[REDUCE_VEC_SIZE]; - MPType input_compute[REDUCE_VEC_SIZE]; int bound = reduce_num - (REDUCE_VEC_SIZE - 1) * stride; for (; input_idx + block_size < bound; input_idx += REDUCE_VEC_SIZE * stride) { - kps::ReadDataReduce( - &input_reg[0], input, input_idx, reduce_index_calculator, 1, - reduce_num, 1, stride, reduce_last_dim); - kps::ElementwiseUnary( - &input_compute[0], &input_reg[0], transformer); - kps::Reduce( - &reduce_var, &input_compute[0], reducer, reduce_last_dim); - } - - kps::Init(&input_compute[0], init); - kps::ReadDataReduce( - &input_reg[0], input, input_idx, reduce_index_calculator, 1, reduce_num, - 1, stride, reduce_last_dim); - input_idx += tid; -#pragma unroll - for (int i = 0; i < REDUCE_VEC_SIZE; ++i) { - if (input_idx >= reduce_num) { - break; - } - input_compute[i] = static_cast(transformer(input_reg[i])); - input_idx += stride; + ReduceAnyKernelImpl( + input, &reduce_var, reducer, transformer, init, reduce_num, input_idx, + reduce_last_dim, reduce_index_calculator, stride, reduce_num); } - kps::Reduce( - &reduce_var, &input_compute[0], reducer, reduce_last_dim); + int num = (reduce_num - input_idx - tid + stride - 1) / stride; + ReduceAnyKernelImpl( + input, &reduce_var, reducer, transformer, init, reduce_num - input_idx, + input_idx, reduce_last_dim, reduce_index_calculator, stride, num); } kps::Reduce( From b6e7f8e9365b0c092f9790722d3896979c82b14a Mon Sep 17 00:00:00 2001 From: xiongkun Date: Thu, 21 Oct 2021 14:07:13 +0800 Subject: [PATCH 238/298] User specified backend (#35745) --- paddle/fluid/framework/fleet/gloo_wrapper.h | 18 ++ paddle/fluid/imperative/gloo_context.cc | 115 ++++++++++- paddle/fluid/imperative/gloo_context.h | 8 + python/paddle/distributed/fleet/launch.py | 51 ++++- .../paddle/distributed/fleet/launch_utils.py | 63 +++++- python/paddle/distributed/parallel.py | 27 +-- python/paddle/distributed/spawn.py | 88 +++++++-- python/paddle/distributed/utils.py | 22 ++- .../fluid/tests/unittests/CMakeLists.txt | 18 ++ .../parallel_dygraph_gradient_check.py | 3 +- .../unittests/parallel_dygraph_se_resnext.py | 1 + .../tests/unittests/test_cpuonly_launch.sh | 42 ++++ .../tests/unittests/test_cpuonly_spawn.py | 72 +++++++ .../fluid/tests/unittests/test_dist_base.py | 179 +++++++++++++++++- .../test_parallel_dygraph_dataparallel.py | 65 +++++++ ..._parallel_dygraph_sparse_embedding_gloo.py | 59 ++++++ ...graph_sparse_embedding_over_height_gloo.py | 44 +++++ .../test_parallel_dygraph_transformer_gloo.py | 61 ++++++ ..._parallel_dygraph_unused_variables_gloo.py | 72 +++++++ .../test_spawn_and_init_parallel_env.py | 5 +- 20 files changed, 948 insertions(+), 65 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/test_cpuonly_launch.sh create mode 100644 python/paddle/fluid/tests/unittests/test_cpuonly_spawn.py create mode 100644 python/paddle/fluid/tests/unittests/test_parallel_dygraph_sparse_embedding_gloo.py create mode 100644 python/paddle/fluid/tests/unittests/test_parallel_dygraph_sparse_embedding_over_height_gloo.py create mode 100644 python/paddle/fluid/tests/unittests/test_parallel_dygraph_transformer_gloo.py create mode 100644 python/paddle/fluid/tests/unittests/test_parallel_dygraph_unused_variables_gloo.py diff --git a/paddle/fluid/framework/fleet/gloo_wrapper.h 
b/paddle/fluid/framework/fleet/gloo_wrapper.h index eafc991fbca0ae..f1ec042dbd7050 100644 --- a/paddle/fluid/framework/fleet/gloo_wrapper.h +++ b/paddle/fluid/framework/fleet/gloo_wrapper.h @@ -238,6 +238,24 @@ class GlooWrapper { return ret; } + // TODO(xiongkun03): support all gather array of + // numbers with different length + // can use AllgathervOptions, may be work in different + // occasion. Need some survey. + template + void AllGatherVector(T* input_ptr, T* output_ptr, + size_t element_num) { // NOLINT + CHECK_EQ(is_initialized_, true); +#ifdef PADDLE_WITH_GLOO + gloo::AllgatherOptions opts(context_); + opts.setInput(input_ptr, element_num); + opts.setOutput(output_ptr, element_num * size_); + gloo::allgather(opts); +#else + LOG(WARNING) << "AllGather does nothing when WITH_GLOO=OFF"; +#endif + } + protected: bool is_initialized_ = false; #ifdef PADDLE_WITH_GLOO diff --git a/paddle/fluid/imperative/gloo_context.cc b/paddle/fluid/imperative/gloo_context.cc index d7df6ec3c11641..0d93cdf57932fa 100644 --- a/paddle/fluid/imperative/gloo_context.cc +++ b/paddle/fluid/imperative/gloo_context.cc @@ -18,6 +18,7 @@ #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/string/split.h" +#include "paddle/fluid/string/string_helper.h" namespace paddle { namespace framework { @@ -67,8 +68,36 @@ void GLOOParallelContext::AllReduceByStream(const framework::Variable &src, framework::Variable *dst, int ring_id, bool use_calc_stream) { // AllReduce(src, dst, strategy_, ring_id, use_calc_stream); - auto src_tensor = src.Get(); - auto *dst_tensor = dst->GetMutable(); + if (src.IsType()) { + if (!dst->IsType()) { + dst->Clear(); + } + AllReduce(src.Get(), + dst->GetMutable()); + } else if (src.IsType()) { + if (&src != dst) { + if (!dst->IsType()) { + dst->Clear(); + } + AllReduce(src.Get(), + dst->GetMutable()); + } else { + // SelectedRows cannot be allreduce in-place + framework::Variable tmp_dst; + AllReduce(src.Get(), + tmp_dst.GetMutable()); + *dst = std::move(tmp_dst); + } + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Unsupported variable type %s for imperative allreduce, only " + "LoDTensor and SelectedRows are supported.", + platform::demangle(framework::ToTypeName(src.Type())))); + } +} + +void GLOOParallelContext::AllReduce(const framework::Tensor &src_tensor, + framework::Tensor *dst_tensor) { auto gloo_wrapper = framework::GlooWrapper::GetInstance(); dst_tensor->Resize(src_tensor.dims()); switch (src_tensor.type()) { @@ -84,6 +113,88 @@ void GLOOParallelContext::AllReduceByStream(const framework::Variable &src, gloo_wrapper->Barrier(); } +#define GLOO_ALL_GATHER_CASE(type, T, gw) \ + case type: { \ + const auto *src_tensor_ptr = src_tensor.data(); \ + gw->AllGatherVector(const_cast(src_tensor_ptr), \ + reinterpret_cast(dst_tensor_ptr), \ + value_sendcount); \ + break; \ + } + +void GLOOParallelContext::AllReduce(const framework::SelectedRows &src, + framework::SelectedRows *dst) { + // auto ; + // int local_rank = strategy_.local_rank_; + int nranks = strategy_.nranks_; + VLOG(3) << "SelectedRows AllReduce start"; + const auto &src_tensor = src.value(); + const auto &place = src_tensor.place(); + auto dtype = src_tensor.type(); + // 1. Gather rows number from all workers. 
Here use ncclAllGather to do this, + // but we can use other ways to implement is in the future + const auto &src_rows = src.rows(); + auto gloo_wrapper = framework::GlooWrapper::GetInstance(); + size_t local_row_num = src_rows.size(); + std::vector rows_num_vector = + gloo_wrapper->AllGather(local_row_num); + const auto *cpu_rows_num_ptr = rows_num_vector.data(); + auto rows_num = std::accumulate(cpu_rows_num_ptr, cpu_rows_num_ptr + nranks, + static_cast(0)); + dst->set_height(src.height()); + VLOG(3) << "Gather rows: " << string::join_strings(rows_num_vector, ',') + << ", total rows number: " << rows_num + << ", height: " << src.height(); + auto *dst_rows = dst->mutable_rows(); + dst_rows->resize(rows_num); + auto *dst_rows_ptr = dst_rows->MutableData(place); + const int64_t *src_rows_ptr = src_rows.Data(place); + + // VLOG(3) << "Selected Rows of src:" << string::join_strings(dst_rows, ',') + + auto *dst_tensor = dst->mutable_value(); + auto dims = src_tensor.dims(); + dims[0] = rows_num; + auto feature_size = framework::product(dims) / dims[0]; + dst_tensor->Resize(dims); + if (std::all_of(cpu_rows_num_ptr, cpu_rows_num_ptr + nranks, + [&](size_t row) { return row == cpu_rows_num_ptr[0]; })) { + // During sparse communication, the number of each card is same. + // Because gloo wrapper utility class currently don't support + // broadcast, so we only deal the-same case. + VLOG(3) << "Use the gloo all reduce to sync. SRC:" << src_tensor; + // framework::SerializeToStream(VLOG(4), src); + VLOG(3) << "allgather replaces broadcast to speed up in sparse allreduce"; + auto value_sendcount = cpu_rows_num_ptr[0] * feature_size; + auto *dst_tensor_ptr = dst_tensor->mutable_data(place, dtype); + + gloo_wrapper->AllGatherVector(const_cast(src_rows_ptr), + static_cast(dst_rows_ptr), + rows_num_vector[0]); + + switch (dtype) { + GLOO_ALL_GATHER_CASE(framework::proto::VarType::FP32, float, + gloo_wrapper); + GLOO_ALL_GATHER_CASE(framework::proto::VarType::FP64, double, + gloo_wrapper); + GLOO_ALL_GATHER_CASE(framework::proto::VarType::INT32, int, gloo_wrapper); + GLOO_ALL_GATHER_CASE(framework::proto::VarType::INT64, int64_t, + gloo_wrapper); + default: { + PADDLE_THROW(platform::errors::InvalidArgument( + "Invalid datatype for allreduce")); + } + } + VLOG(3) << "Selected Row DST:" << *dst_tensor; + VLOG(3) << "Selected Rows of DST:" + << string::join_strings(std::vector(*dst_rows), ','); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "The number of each card is not the same, gloo only support the-same" + "batch division")); + } +} + paddle::platform::DeviceContext *GLOOParallelContext::GetDeviceContext( int ring_id) { // return the CPUDeviceContext diff --git a/paddle/fluid/imperative/gloo_context.h b/paddle/fluid/imperative/gloo_context.h index f54dc1a406a92f..305a75a881153f 100644 --- a/paddle/fluid/imperative/gloo_context.h +++ b/paddle/fluid/imperative/gloo_context.h @@ -16,6 +16,9 @@ #include #include #include +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/variable.h" #include "paddle/fluid/imperative/parallel_context.h" #include "paddle/fluid/platform/device_context.h" @@ -52,6 +55,11 @@ class GLOOParallelContext : public ParallelContext { void SynchronizeCompute() override; + private: + void AllReduce(const framework::Tensor& src, framework::Tensor* dst); + void AllReduce(const framework::SelectedRows& src, + framework::SelectedRows* dst); + private: std::unique_ptr device_; }; diff --git 
a/python/paddle/distributed/fleet/launch.py b/python/paddle/distributed/fleet/launch.py index c0a1c359d17c63..16b39e0fc8e453 100644 --- a/python/paddle/distributed/fleet/launch.py +++ b/python/paddle/distributed/fleet/launch.py @@ -103,7 +103,12 @@ def _parse_args(): type=str, default="log", help="The path for each process's log. Default --log_dir=log/") - + base_group.add_argument( + "--backend", + type=str, + default="auto", + help="Specifize the backend, can be gloo|nccl|bkcl|auto. Default value is auto which perfers nccl or bkcl." + ) base_group.add_argument( "--nproc_per_node", type=int, @@ -230,8 +235,21 @@ def get_cluster_from_args(args, device_mode, devices_per_proc): devices_per_proc) +def cpuonly_check(args): + if args.ips and len(args.ips.split(',')) > 1: + raise RuntimeError( + "CPUONLY launch only support single trainer, that is len(ips)=1, but got %s." + % args.ips) + if args.run_mode: + assert args.run_mode == 'cpuonly', "CPUONLY launch only support run mode is CPUONLY" + if args.servers: + raise RuntimeError("CPUONLY launch can't have --servers as arguments.") + return True + + def launch_collective(args): # parse arguments, used for cloud-single-machine and local + if args.backend == 'gloo': cpuonly_check(args) (device_mode, devices_per_proc) = launch_utils.get_device_proc_info(args) trainers_num = cloud_utils.get_trainers_num() logger.debug("parsed from args trainerss_num:{} mode:{} devices:{}".format( @@ -265,6 +283,7 @@ def launch_collective(args): global_envs["PADDLE_WITH_GLOO"] = str(os.getenv("PADDLE_WITH_GLOO", "0")) global_envs["PADDLE_GLOO_RENDEZVOUS"] = "3" global_envs["PADDLE_GLOO_FS_PATH"] = gloo_rendezvous_dir + global_envs["PADDLE_DISTRI_BACKEND"] = args.backend procs = start_local_trainers( cluster, @@ -349,9 +368,12 @@ def which_distributed_mode(args): if fluid.core.is_compiled_with_cuda(): accelerators = fluid.core.get_cuda_device_count() + args.backend = 'nccl' elif fluid.core.is_compiled_with_npu(): + args.backend = 'unknown' accelerators = fluid.core.get_npu_device_count() elif fluid.core.is_compiled_with_xpu(): + args.backend = 'bkcl' accelerators = fluid.core.get_xpu_device_count() else: accelerators = 0 @@ -372,10 +394,14 @@ def which_distributed_mode(args): else: if not fluid.core.is_compiled_with_cuda( ) and not fluid.core.is_compiled_with_xpu(): - logger.warning( - "Not found distinct arguments and not compiled with cuda or xpu. Default use ps mode" - ) - return DistributeMode.PS + if args.servers: + logger.warning( + "Not found distinct arguments and not compiled with cuda or xpu. \ +But found args.servers not empty, default use ps mode") + return DistributeMode.PS + else: + args.backend = "gloo" + return DistributeMode.COLLECTIVE else: logger.warning( "Not found distinct arguments and compiled with cuda or xpu. 
Default use collective mode" @@ -556,7 +582,20 @@ def launch(): logger = get_logger() _print_arguments(args) - distribute_mode = which_distributed_mode(args) + if args.backend == 'auto': + distribute_mode = which_distributed_mode(args) + assert args.backend in [ + 'gloo', 'nccl', 'bkcl', 'unknown' + ] # which_distributed_mode must modify args.backend + else: + assert args.run_mode == 'collective' or args.run_mode == None, "When backend is not 'auto', run mode must be collective" + check_backend(args.backend) + distribute_mode = DistributeMode.COLLECTIVE + + block_windows_and_macos( + args.backend) # raise error when using gloo on windows or macos + if args.backend == 'gloo': + logger.warning("launch start with CPUONLY mode") if enable_elastic(args, distribute_mode): launch_elastic(args, distribute_mode) diff --git a/python/paddle/distributed/fleet/launch_utils.py b/python/paddle/distributed/fleet/launch_utils.py index e114670440c065..3aced0ab996cb5 100644 --- a/python/paddle/distributed/fleet/launch_utils.py +++ b/python/paddle/distributed/fleet/launch_utils.py @@ -22,6 +22,7 @@ import tempfile import shutil from contextlib import closing +import multiprocessing import socket import warnings import six @@ -30,6 +31,7 @@ import paddle import paddle.fluid as fluid from distutils.util import strtobool +import paddle.utils.cpp_extension.extension_utils as utils logger = logging.getLogger("root") logger.propagate = False @@ -669,29 +671,31 @@ def get_xpus(xpus): return res_xpus -def get_device_mode(): +def get_device_mode(backend): if fluid.core.is_compiled_with_npu() and \ fluid.core.get_npu_device_count() > 0: print("launch train in ascend npu mode!") return DeviceMode.ASCEND_NPU - if fluid.core.is_compiled_with_cuda() and \ + if backend == 'nccl' and \ fluid.core.get_cuda_device_count() > 0: print("launch train in GPU mode!") return DeviceMode.GPU - if fluid.core.is_compiled_with_xpu() and fluid.core.get_xpu_device_count( - ) > 0: + if backend == 'bkcl' and fluid.core.get_xpu_device_count() > 0: print("launch train in XPU mode") return DeviceMode.XPU - print("launch train in CPU mode") - return DeviceMode.CPU + if backend == 'gloo': + print("launch train in CPU mode") + return DeviceMode.CPU + + raise RuntimeError("Don't supported devices") def get_device_proc_info(args): # device_mode - device_mode = get_device_mode() + device_mode = get_device_mode(args.backend) # devices devices_per_proc = [] @@ -722,6 +726,9 @@ def get_device_proc_info(args): else: devices_per_proc = xpus elif device_mode == DeviceMode.CPU: + if hasattr(args, "paddle_cpuonly") and args.nproc_per_node is None: + #NOTE (xiongkun03) set it to cpu core number + args.nproc_per_node = multiprocessing.cpu_count() if args.nproc_per_node is None: devices_per_proc = [0] else: @@ -1237,3 +1244,45 @@ def start_pod_heter_worker(self, args, pod): tp.cmd = cmd self.procs["heter_worker"].append(tp) + + +def check_backend(backend): + if backend not in ['nccl', 'gloo', 'bkcl', 'auto']: + raise ValueError( + "paddle.distributed initialize error, " + "backend argument can only be one of 'nccl', 'gloo', 'bkcl', 'auto', but got %s" + % backend) + + if backend == 'nccl' and not fluid.core.is_compiled_with_cuda(): + raise ValueError( + "paddle.distributed initialize error, " + "your paddle is not compiled with cuda but you assign 'nccl' as backend." 
+ ) + + if backend == 'bkcl' and not fluid.core.is_compiled_with_xpu(): + raise ValueError( + "paddle.distributed initialize error, " + "your paddle is not compiled with xpu but you assign 'bkcl' as backend." + ) + + +def block_windows_and_macos(backend): + if backend != 'gloo': return + if utils.OS_NAME.startswith('darwin'): # MACOS , block + raise ValueError( + "You are going to using gloo on macos, but currently is not supported" + ) + if utils.IS_WINDOWS: # MACOS , block + raise ValueError( + "You are going to using gloo on windows, but currently is not supported" + ) + + +def get_backend_by_compile_flag(): + if fluid.core.is_compiled_with_cuda(): + return 'nccl' + + if fluid.core.is_compiled_with_xpu(): + return 'bkcl' + + return 'gloo' diff --git a/python/paddle/distributed/parallel.py b/python/paddle/distributed/parallel.py index 7789b17429c4eb..34c74ad30679e4 100644 --- a/python/paddle/distributed/parallel.py +++ b/python/paddle/distributed/parallel.py @@ -26,6 +26,7 @@ from paddle.fluid import core from paddle.fluid.framework import _set_expected_place from paddle.fluid.dygraph import parallel_helper +from paddle.distributed.fleet.launch_utils import check_backend from paddle.fluid.dygraph.parallel import ParallelEnv from paddle.distributed.fleet.base.private_helper_function import wait_server_ready # noqa: F401 @@ -55,25 +56,8 @@ def _start_kv_server(port, http_server_d, size): http_server.stop() -def _check_backend(backend): - if backend not in ['nccl', 'gloo', 'bkcl', 'auto']: - raise ValueError( - "paddle.distributed initialize error, " - "backend argument can only be one of 'nccl', 'gloo', 'bkcl', 'auto', but got %s" - % backend) - - if backend == 'nccl' and not core.is_compiled_with_cuda(): - raise ValueError( - "paddle.distributed initialize error, " - "your paddle is not compiled with cuda but you assign 'nccl' as backend." - ) - - if backend == 'bkcl' and not core.is_compiled_with_xpu(): - raise ValueError( - "paddle.distributed initialize error, " - "your paddle is not compiled with xpu but you assign 'bkcl' as backend." - ) - +def _is_cpuonly(backend): + check_backend(backend) if backend in ['auto', 'nccl', 'bkcl'] and (core.is_compiled_with_cuda() or core.is_compiled_with_xpu()): # passes 'auto' and can use cuda or xpu, use the default logics. so return False @@ -82,7 +66,7 @@ def _check_backend(backend): return True -def init_parallel_env(backend='auto'): +def init_parallel_env(): """ Initialize parallel training environment in dynamic graph mode. @@ -154,7 +138,8 @@ def train(): return # NOTE(xiongkun): support cpu gloo only, add this environment variable to # enable cpu only gloo prarllel training) - is_cpu_only = _check_backend(backend) + backend = os.environ.get('PADDLE_DISTRI_BACKEND', 'auto') + is_cpu_only = _is_cpuonly(backend) # 1. 
gpu xpu check, must be gpu or xpu, if not (is_cpu_only or core.is_compiled_with_cuda() or core.is_compiled_with_xpu()): diff --git a/python/paddle/distributed/spawn.py b/python/paddle/distributed/spawn.py index a60e4642e494da..cea831d9d90b55 100644 --- a/python/paddle/distributed/spawn.py +++ b/python/paddle/distributed/spawn.py @@ -24,8 +24,10 @@ from paddle.distributed.utils import _print_arguments from paddle.distributed.utils import _prepare_trainer_env from paddle.distributed.utils import get_host_name_ip -from paddle.distributed.cloud_utils import get_cluster_and_pod +from paddle.distributed.cloud_utils import get_cluster_and_pod, _get_trainers_num +from paddle.distributed.fleet.launch import get_cluster_from_args from paddle.distributed.fleet.cloud_utils import use_paddlecloud +from paddle.distributed.fleet.launch_utils import DeviceMode, check_backend, block_windows_and_macos from paddle.device import get_device # deprecated module import @@ -71,7 +73,9 @@ def _py_supported_check(): def _options_valid_check(options): # `print_config` keeped as a debug options, not show to users - supported_options = ['start_method', 'ips', 'gpus', 'xpus', 'print_config'] + supported_options = [ + 'start_method', 'ips', 'gpus', 'xpus', 'print_config', 'backend' + ] deprecated_options = [ 'selected_devices', 'started_port', 'cluster_node_ips', 'node_ip', 'use_paddlecloud' @@ -95,6 +99,22 @@ def _get_default_nprocs(): return core.get_cuda_device_count() elif 'xpu' in device: return core.get_xpu_device_count() + elif 'cpu' in device: + return multiprocessing.cpu_count() + else: + raise RuntimeError( + "`paddle.distributed.spawn` does not support parallel training on device `{}` now.". + format(device)) + + +def _get_default_backend(): + device = get_device() + if 'gpu' in device: + return 'nccl' + elif 'xpu' in device: + return 'bkcl' + elif 'cpu' in device: + return 'gloo' else: raise RuntimeError( "`paddle.distributed.spawn` does not support parallel training on device `{}` now.". @@ -112,6 +132,16 @@ def _get_node_ip(ips): def _get_subprocess_env_list(nprocs, options): + # NOTE (xiongkun03) Why put backend deduction here ? + # Becase _get_subprocess_env_list is used by many testcases. + # So for campability, we put backend deduction here + + # logic for handle backend option + if 'backend' not in options or options['backend'] == 'auto': + options['backend'] = _get_default_backend() + check_backend(options['backend']) + block_windows_and_macos(options['backend']) + # contruct processes env list processes_env_list = [] @@ -133,7 +163,7 @@ def _get_subprocess_env_list(nprocs, options): # if we set FLAGS_selected_gpus or FLAGS_selected_xpus to be `0,1,2,3`, it may cause error # when using `ParallelEnv` # NOTE(chenweihang): use absolute gpu or xpu card id - if core.is_compiled_with_cuda(): + if options['backend'] == 'nccl': args.selected_devices = options.get('gpus', None) if args.selected_devices is None: args.selected_devices = options.get('selected_devices', None) @@ -168,7 +198,7 @@ def _get_subprocess_env_list(nprocs, options): "CUDA_VISIBLE_DEVICES (%s)." % (card_id, ",".join(env_devices_list))) - elif core.is_compiled_with_xpu(): + elif options['backend'] == 'bkcl': args.selected_devices = options.get('xpus', None) if args.selected_devices is None: args.selected_devices = options.get('selected_devices', None) @@ -202,6 +232,23 @@ def _get_subprocess_env_list(nprocs, options): raise ValueError("The selected xpu card %s cannot found in " "XPU_VISIBLE_DEVICES (%s)." 
% (card_id, ",".join(env_devices_list))) + elif options['backend'] == 'gloo': + # TODO check gpu / xpu flag must not exist + warnings.warn( + "Your model will be trained under CPUONLY mode by using GLOO," + "because CPUPlace is specified manually or your installed PaddlePaddle only support CPU Device." + ) + args.paddle_cpuonly = True + args.selected_devices = None + args.ips = args.cluster_node_ips + assert options.get( + 'use_paddlecloud', + None) is None, "CPUONLY spawn doesn't support use paddle cloud" + assert len( + args.cluster_node_ips.split(',') + ) <= 1, "CPUONLY spawn only support single trainer, that is len(ips)=1, but got %s." + assert _get_trainers_num( + ) == 1, "CPUONLY spawn doesn't support multi-trainer" # set other inner args args.node_ip = options.get('node_ip', None) @@ -215,11 +262,17 @@ def _get_subprocess_env_list(nprocs, options): args.use_paddlecloud = use_paddlecloud() # get cluster and pod config - cluster, pod = get_cluster_and_pod(args) + if options['backend'] == 'gloo': + devices_per_proc = [x for x in range(0, nprocs)] + cluster, pod = get_cluster_from_args(args, DeviceMode.CPU, + devices_per_proc) + else: + cluster, pod = get_cluster_and_pod(args) # prepare subprocess env list for trainer in pod.trainers: - processes_env_list.append(_prepare_trainer_env(cluster, trainer)) + processes_env_list.append( + _prepare_trainer_env(cluster, trainer, options['backend'])) # [Debug] print config args.print_config = options.get('print_config', False) @@ -236,27 +289,35 @@ def _remove_risky_env(): os.environ.pop("https_proxy", None) -def _set_trainer_env(env_dict): +def _set_trainer_env(env_dict, backend): # NOTE(chenweihang): [ Why need set FLAGS_selected_gpus or FLAGS_selected_xpus here? ] # When the child process starts, it will inherit the configuration of the # main process and set the FLAGS once, but the environment variable has # not been set at this time, which leads to the FLAGS_selected_gpus or FLAGS_selected_xpus # is keep same with mainprocess(usually empty), so manually update the flags here - if core.is_compiled_with_cuda(): + + # NOTE(xiongkun): why put backend here? because if gloo, we shouldn't set FLAGS_selectedXXX + # + + if backend == 'nccl': set_flags({'FLAGS_selected_gpus': env_dict['FLAGS_selected_gpus']}) - elif core.is_compiled_with_xpu(): + elif backend == 'bkcl': set_flags({'FLAGS_selected_xpus': env_dict['FLAGS_selected_xpus']}) else: - raise ValueError("PaddlePaddle should be compiled with XPU or CUDA.") + #NOTE(xiongkun) why not raise Error ? + # So far, we added support for CPU parallel, and will be applied when paddle is not + # compiled with cuda or xp. just do nothing. 
+ pass + for var_name in env_dict: os.environ[var_name] = env_dict[var_name] -def _func_wrapper(func, args, error_queue, return_queue, env_dict): +def _func_wrapper(func, args, error_queue, return_queue, env_dict, backend): try: # config subprocess environment variables _remove_risky_env() - _set_trainer_env(env_dict) + _set_trainer_env(env_dict, backend) # execute function result = func(*args) # record function return value @@ -487,7 +548,8 @@ def train(print_result=False): return_queue = mp.SimpleQueue() process = mp.Process( target=_func_wrapper, - args=(func, args, error_queue, return_queue, procs_env_list[i])) + args=(func, args, error_queue, return_queue, procs_env_list[i], + options['backend'])) process.daemon = daemon process.start() error_queues.append(error_queue) diff --git a/python/paddle/distributed/utils.py b/python/paddle/distributed/utils.py index 31d5748ce392e7..1c27a0018fc025 100644 --- a/python/paddle/distributed/utils.py +++ b/python/paddle/distributed/utils.py @@ -25,6 +25,7 @@ from contextlib import closing import socket from paddle.fluid import core +from paddle.distributed.fleet.launch_utils import get_backend_by_compile_flag from distutils.util import strtobool from paddle.fluid.layer_helper import LayerHelper @@ -613,8 +614,10 @@ def __free_port(): return None -def _prepare_trainer_env(cluster, trainer): - if core.is_compiled_with_xpu(): +def _prepare_trainer_env(cluster, trainer, backend=None): + if backend is None: + backend = get_backend_by_compile_flag() # for compatibility + if backend == 'bkcl': proc_env = { "FLAGS_selected_xpus": "%s" % ",".join([str(g) for g in trainer.gpus]), @@ -623,7 +626,7 @@ def _prepare_trainer_env(cluster, trainer): "PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(), "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()) } - elif core.is_compiled_with_cuda(): + elif backend == 'nccl': proc_env = { "FLAGS_selected_gpus": "%s" % ",".join([str(g) for g in trainer.gpus]), @@ -632,6 +635,19 @@ def _prepare_trainer_env(cluster, trainer): "PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(), "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()) } + elif backend == 'gloo': + # NOTE (xiongkun) default fall back into cpu only + proc_env = { + "PADDLE_TRAINER_ID": "%d" % trainer.rank, + "PADDLE_CURRENT_ENDPOINT": "%s" % trainer.endpoint, + "PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(), + "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()), + "PADDLE_DISTRI_BACKEND": + backend, # only add here, other will be auto + } + else: + raise ValueError("backend must be one of 'gloo, nccl, bkcl'") + return proc_env diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index ac7471f8edfa4f..1c9ce2bef5e173 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -200,8 +200,14 @@ endif() list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_hybrid_parallel) +LIST(REMOVE_ITEM TEST_OPS test_parallel_dygraph_transformer_gloo) # NOTE: @xiongkun03, cpu is too slow, fix it in next PR + if (NOT WITH_GLOO) LIST(REMOVE_ITEM TEST_OPS test_parallel_dygraph_dataparallel_cpuonly) + + LIST(REMOVE_ITEM TEST_OPS test_parallel_dygraph_unused_variables_gloo) + LIST(REMOVE_ITEM TEST_OPS test_parallel_dygraph_sparse_embedding_over_height_gloo) + LIST(REMOVE_ITEM TEST_OPS test_parallel_dygraph_sparse_embedding_gloo) endif() if ((NOT WITH_GPU) AND (NOT WITH_ROCM)) @@ -491,6 +497,10 @@ if (APPLE OR 
WIN32) list(REMOVE_ITEM TEST_OPS test_multiprocess_dataloader_dataset) endif() +if (NOT WITH_GLOO) + LIST(REMOVE_ITEM TEST_OPS test_cpuonly_spawn) +endif() + if(NOT WITH_GPU OR WIN32 OR APPLE) list(REMOVE_ITEM TEST_OPS test_build_strategy_fusion_group_pass) endif() @@ -654,6 +664,9 @@ if(WITH_DISTRIBUTE) endforeach(TEST_OP) # solve it later. bash_test_modules(test_fleet_launch_ps START_BASH test_fleet_launch_ps.sh SERIAL LABELS "RUN_TYPE=EXCLUSIVE" ENVS "PADDLE_DIST_UT_PORT=${dist_ut_port}" PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR} ) + if (WITH_GLOO) + bash_test_modules(test_cpuonly_launch START_BASH test_cpuonly_launch.sh SERIAL LABELS "RUN_TYPE=EXCLUSIVE" ENVS "PADDLE_DIST_UT_PORT=${dist_ut_port}" PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR} ) + endif() bash_test_modules(test_new_group START_BASH test_new_group.sh SERIAL LABELS "RUN_TYPE=EXCLUSIVE" ENVS "PADDLE_DIST_UT_PORT=${dist_ut_port}+20" PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR} ) endif(NOT APPLE) endif() @@ -1070,3 +1083,8 @@ set_tests_properties(test_inplace_addto_strategy PROPERTIES TIMEOUT 120) set_tests_properties(test_eigvals_op PROPERTIES TIMEOUT 400) set_tests_properties(test_tensordot PROPERTIES TIMEOUT 1000) set_tests_properties(test_tensordot PROPERTIES LABELS "RUN_TYPE=NIGHTLY") +if (WITH_GLOO) + set_tests_properties(test_parallel_dygraph_unused_variables_gloo PROPERTIES TIMEOUT 120) + set_tests_properties(test_parallel_dygraph_sparse_embedding_gloo PROPERTIES TIMEOUT 120) + set_tests_properties(test_parallel_dygraph_sparse_embedding_over_height_gloo PROPERTIES TIMEOUT 120) +endif() diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_gradient_check.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_gradient_check.py index 048c9b399d8040..781d606f33b8fc 100644 --- a/python/paddle/fluid/tests/unittests/parallel_dygraph_gradient_check.py +++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_gradient_check.py @@ -66,8 +66,7 @@ def forward(self, x): class TestDistTraning(unittest.TestCase): def test_multiple_gpus(self): - backend = os.environ.get('PADDLE_DISTRI_BACKEND', 'auto') - dist.init_parallel_env(backend) + dist.init_parallel_env() self.trainer_id = dist.get_rank() model_a = SimpleNet(self.trainer_id) diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_se_resnext.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_se_resnext.py index 4ce67676c3e85e..0387de32c91454 100644 --- a/python/paddle/fluid/tests/unittests/parallel_dygraph_se_resnext.py +++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_se_resnext.py @@ -324,6 +324,7 @@ def run_one_loop(self, model, opt, data): bs = len(data) dy_x_data = np.array([x[0].reshape(3, 224, 224) for x in data]).astype('float32') + dy_x_data = dy_x_data / 255.0 y_data = np.array([x[1] for x in data]).astype('int64').reshape(bs, 1) img = to_variable(dy_x_data) label = to_variable(y_data) diff --git a/python/paddle/fluid/tests/unittests/test_cpuonly_launch.sh b/python/paddle/fluid/tests/unittests/test_cpuonly_launch.sh new file mode 100644 index 00000000000000..1c35166cf44344 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_cpuonly_launch.sh @@ -0,0 +1,42 @@ +#!/bin/bash + +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +function test_launch_cpuonly(){ + python -m paddle.distributed.launch --nproc_per_node=4 --backend=gloo \ + parallel_dygraph_gradient_check.py 2>ut.elog + if grep -q "ABORT" ut.elog; then + echo "test cpu only failed" + exit -1 + else + if grep -q "CPUONLY" ut.elog; then + echo "test_launch_cpuonly successfully" + else + echo "test_launch_cpuonly failed" + exit -1 + fi + fi +} +function test_launch_error_case1(){ + python -m paddle.distributed.launch --nproc_per_node=4 --backend=random_str \ + parallel_dygraph_gradient_check.py 2>ut.elog + if grep -q "ValueError" ut.elog; then + echo "test_launch_error_case1 successfully" + else + exit -1 + fi +} + +test_launch_cpuonly +test_launch_error_case1 diff --git a/python/paddle/fluid/tests/unittests/test_cpuonly_spawn.py b/python/paddle/fluid/tests/unittests/test_cpuonly_spawn.py new file mode 100644 index 00000000000000..1def2ffd82ad7a --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_cpuonly_spawn.py @@ -0,0 +1,72 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import unittest + +import paddle +import paddle.nn as nn +import paddle.optimizer as opt +import paddle.distributed as dist + + +class LinearNet(nn.Layer): + def __init__(self): + super(LinearNet, self).__init__() + self._linear1 = nn.Linear(10, 10) + self._linear2 = nn.Linear(10, 1) + + def forward(self, x): + return self._linear2(self._linear1(x)) + + +def train(print_result=False): + # 1. initialize parallel environment + dist.init_parallel_env() + + # 2. create data parallel layer & optimizer + layer = LinearNet() + dp_layer = paddle.DataParallel(layer) + + loss_fn = nn.MSELoss() + adam = opt.Adam(learning_rate=0.001, parameters=dp_layer.parameters()) + + # 3. 
run layer + inputs = paddle.randn([10, 10], 'float32') + outputs = dp_layer(inputs) + labels = paddle.randn([10, 1], 'float32') + loss = loss_fn(outputs, labels) + + if print_result is True: + print("loss:", loss.numpy()) + + loss.backward() + print("Grad is", layer._linear1.weight.grad) + adam.step() + adam.clear_grad() + + +class TestSpawn(unittest.TestCase): + def test_spawn(self): + dist.spawn(train, backend='gloo', nprocs=4) + + def test_wrong_backend(self): + try: + dist.spawn(train, backend='something', nprocs=4) + except ValueError as e: + self.assertEqual(type(e), ValueError) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index eceb484a0184c9..63985415c51f6d 100755 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -209,7 +209,11 @@ def run_use_fleet_api_20_trainer(self, args): def get_data(): origin_batch = next(reader_generator) - if args.update_method != "local" and args.use_reader_alloc: + if paddle.distributed.get_world_size( + ) == 1 and args.update_method == 'gloo': # Gloo single mode + return origin_batch + + elif args.update_method != "local" and args.use_reader_alloc: new_batch = [] for offset, item in enumerate(origin_batch): if offset % 2 == args.trainer_id: @@ -506,7 +510,10 @@ def run_one_loop(self, model, opt, data): "train_one_loop should be implemented by the child classes.") def _get_data(self, batch, args): - if args.update_method != "local": + if paddle.distributed.get_world_size( + ) == 1 and args.update_method == 'gloo': # Gloo single mode + return batch + elif args.update_method != "local": new_batch = [] for offset, item in enumerate(batch): if offset % 2 == args.trainer_id: @@ -518,14 +525,16 @@ def _get_data(self, batch, args): def run_trainer(self, args): seed = 90 - if fluid.core.is_compiled_with_cuda(): + if args.update_method == 'gloo': + place = fluid.CPUPlace() + elif fluid.core.is_compiled_with_cuda(): device_id = int(os.getenv("FLAGS_selected_gpus", "0")) place = fluid.CUDAPlace(device_id) elif fluid.core.is_compiled_with_xpu(): device_id = int(os.getenv("FLAGS_selected_xpus", "0")) place = fluid.XPUPlace(device_id) else: - assert ("Only support CUDAPlace or XPUPlace for now.") + assert ("Only support CUDAPlace or XPUPlace or CPU(Gloo) for now.") with fluid.dygraph.guard(place): fluid.default_startup_program().random_seed = seed @@ -554,6 +563,16 @@ def run_trainer(self, args): model = dygraph.parallel.DataParallel( model, strategy, find_unused_parameters=True) print_to_err(type(self).__name__, "model built in dygraph") + + elif args.update_method == "gloo": + paddle.distributed.init_parallel_env() + if not args.find_unused_parameters: + model = dygraph.parallel.DataParallel( + model, find_unused_parameters=False) + else: + model = dygraph.parallel.DataParallel( + model, find_unused_parameters=True) + out_losses = [] print_to_err(type(self).__name__, "begin to run dygraph training") for step_id, data in enumerate(train_reader()): @@ -588,12 +607,12 @@ def run_trainer_with_spawn(self, args): args.trainer_id = paddle.distributed.get_rank() # 3. init parallel env - if args.update_method == "nccl2": + if args.update_method in ["nccl2", "gloo"]: paddle.distributed.init_parallel_env() # 4. 
train model model, train_reader, opt = self.get_model() - if args.update_method == "nccl2": + if args.update_method in ["nccl2", "gloo"]: if args.find_unused_parameters: model = paddle.DataParallel(model, find_unused_parameters=True) else: @@ -668,7 +687,9 @@ def runtime_main(test_class): '--update_method', type=str, default="local", - choices=["pserver", "nccl2", "bkcl", "local", "nccl2_reduce_layer"]) + choices=[ + "pserver", "nccl2", "bkcl", "local", "nccl2_reduce_layer", "gloo" + ]) parser.add_argument('--trainer_id', type=int, required=False, default=0) parser.add_argument('--trainers', type=int, required=False, default=1) parser.add_argument('--nccl_comm_num', type=int, required=False, default=1) @@ -685,6 +706,7 @@ def runtime_main(test_class): '--current_endpoint', type=str, required=False, default="") parser.add_argument('--sync_mode', action='store_true') parser.add_argument('--use_cuda', action='store_true') + parser.add_argument('--use_cpu', action='store_true') parser.add_argument('--use_xpu', action='store_true') parser.add_argument('--use_dgc', action='store_true') parser.add_argument('--accumulate_gradient', action='store_true') @@ -713,6 +735,9 @@ def runtime_main(test_class): args = parser.parse_args() + if args.update_method == 'gloo': + paddle.set_device("cpu") + model = test_class() if args.role == "pserver" and args.update_method == "pserver": model.run_pserver(args) @@ -770,6 +795,7 @@ def setUp(self): self._use_reader_alloc = True self._nccl2_mode = False self._bkcl_mode = False + self._gloo_mode = False # now, support gloo backend self._pipeline_mode = False self._mp_mode = False # FIXME(typhoonzero): I added this stupid argument to enable @@ -875,7 +901,7 @@ def _run_local(self, batch_size=DEFAULT_BATCH_SIZE, batch_merge_repeat=1, log_name="", - devices="0"): + devices="1"): cmd = self._python_interp @@ -947,6 +973,21 @@ def _run_local(self, return pickle.loads(local_out) + def _run_local_gloo(self, + model, + envs, + check_error_log=False, + batch_size=DEFAULT_BATCH_SIZE, + batch_merge_repeat=1, + log_name="", + devices="0"): + saved_endpoints = self._ps_endpoints + self._ps_endpoints = self._ps_endpoints.split(',')[0] + result = self._run_cluster_gloo(model, envs, 'gloo', check_error_log, + log_name) + self._ps_endpoints = saved_endpoints + return result + def _run_cluster(self, model, envs, check_error_log, log_name): # Run dist train to compare with local results ps0, ps1, ps0_pipe, ps1_pipe = self.start_pserver( @@ -1037,6 +1078,62 @@ def _run_cluster(self, model, envs, check_error_log, log_name): return pickle.loads(tr0_out), pickle.loads(tr1_out) + def _get_gloo_trainer_cmd(self, model, ep, update_method, trainer_id, + trainer_num): + env = {} + tr_cmd = "%s -u" + + if os.getenv('WITH_COVERAGE', 'OFF') == 'ON': + tr_cmd += " -m coverage run --branch -p" + + tr_cmd += " %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --update_method %s --lr %f" + + tr_cmd = tr_cmd % \ + (self._python_interp, model, self._ps_endpoints, + trainer_id, ep, update_method, self._lr) + + if self._use_reduce: + tr_cmd += " --use_reduce" + if self._use_reader_alloc: + tr_cmd += " --use_reader_alloc" + #assert self._use_reduce == False, "gloo not support _use_reduce" + #assert self._use_reader_alloc == False, "gloo not support _use_reduce" + if self._save_model: + tr_cmd += " --save_model" + self.__use_cuda = False + self.__use_xpu = False + assert self.__use_cuda == False, "gloo not support use cuda" + assert self.__use_xpu == False, "gloo not support use xpu" 
+ tr_cmd += " --use_cpu" + env.update({ + "PADDLE_TRAINERS_NUM": "{}".format(trainer_num), + "PADDLE_TRAINER_ID": "{}".format(trainer_id), + "PADDLE_TRAINER_ENDPOINTS": self._ps_endpoints, + "PADDLE_CURRENT_ENDPOINT": ep, + "PADDLE_CURRENT_ENDPOINT": ep, + "PADDLE_DISTRI_BACKEND": "gloo", + "GLOG_v": "2", + }) + + assert self._use_dgc == False, "gloo not support use dgc" + if self._accumulate_gradient: + tr_cmd += " --accumulate_gradient" + + if self._find_unused_parameters: + tr_cmd += " --find_unused_parameters" + + assert self._pipeline_mode == False, "gloo not support use pipeline" + + if self._enable_backward_deps: # build strategy, save it + tr_cmd += " --enable_backward_deps" + + if self._fuse_all_reduce is not None: + tr_cmd += " --fuse_all_reduce {}".format(self._fuse_all_reduce) + + assert self._use_fleet_api == False, "gloo not support use fleet api" + assert self._use_fleet_api_20 == False, "gloo not support use fleet api" + return tr_cmd, env + def _get_nccl2_trainer_cmd(self, model, ep, update_method, trainer_id, trainer_num): env = {} @@ -1123,6 +1220,57 @@ def _get_nccl2_trainer_cmd(self, model, ep, update_method, trainer_id, return tr_cmd, env + def _run_cluster_gloo(self, model, envs, update_method, check_error_log, + log_name): + assert update_method == "gloo", "_run_cluster_gloo must have update_method: gloo, but get %s" % update_method + assert not self._use_hallreduce, "_run_cluster_gloo must have _use_hallreduce = false" + + worker_endpoints = self._ps_endpoints.split(",") + + trainer_num = len(worker_endpoints) + + procs = [] + pipes = [] + for i in range(0, trainer_num): + tr_cmd, tr_env = self._get_gloo_trainer_cmd( + model, worker_endpoints[i], update_method, i, trainer_num) + tr_env.update(envs) + tr_env["GLOG_vmodule"] = 'gloo_context=4' + tr_env["GLOG_v"] = '3' + print("use_hallreduce:{} tr_cmd:{}, env: {}".format( + self._use_hallreduce, tr_cmd, tr_env)) + + tr_pipe = open(log_name + "_tr{}_err.log".format(i), "wb") + + print_to_err( + type(self).__name__, + "going to start process {} with nccl2".format(i)) + tr_proc = subprocess.Popen( + tr_cmd.strip().split(" "), + stdout=subprocess.PIPE, + stderr=tr_pipe, + env=tr_env) + + procs.append(tr_proc) + pipes.append(tr_pipe) + + outs = [] + for i in range(0, trainer_num): + tr_out, tr_err = procs[i].communicate() + outs.append(tr_out) + pipes[i].close() + sys.stderr.write('trainer {} stderr: {}\n'.format(i, tr_err)) + + if trainer_num == 1: + if check_error_log: print("outs[0]:", outs[0]) + return pickle.loads(outs[0]) + + else: + if check_error_log: + print("outs[0]:", outs[0]) + print("outs[1]:", outs[1]) + return pickle.loads(outs[0]), pickle.loads(outs[1]) + def _run_cluster_nccl2(self, model, envs, update_method, check_error_log, log_name): if self._use_hallreduce: @@ -1262,7 +1410,12 @@ def check_with_place(self, required_envs = self._get_required_envs(check_error_log, need_envs) - local_losses \ + if self._gloo_mode: + local_losses \ + = self._run_local_gloo(model_file, required_envs, + check_error_log, log_name=log_name) + else: + local_losses \ = self._run_local(model_file, required_envs, check_error_log, log_name=log_name) @@ -1288,6 +1441,14 @@ def check_with_place(self, update_method='bkcl', check_error_log=check_error_log, log_name=log_name) + elif self._gloo_mode: + # gloo mode, cpu only parallel train @xiongkun03 + tr0_losses, tr1_losses = self._run_cluster_gloo( + model_file, + required_envs, + update_method='gloo', + check_error_log=check_error_log, + log_name=log_name) elif self._pipeline_mode: 
tr0_losses, tr1_losses = self._run_pipeline( diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py index c97cd56e8a7a40..edf9aed04f5e0a 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py @@ -49,6 +49,51 @@ def get_gpus(selected_gpus): return selected_gpus +def start_local_trainers_cpu(trainer_endpoints, + training_script, + training_script_args, + log_dir=None): + current_env = copy.copy(os.environ.copy()) + current_env.pop("http_proxy", None) + current_env.pop("https_proxy", None) + + procs = [] + n_rank = len(trainer_endpoints) + print(trainer_endpoints) + for rank_id, endpoint in enumerate(trainer_endpoints): + proc_env = { + "PADDLE_DISTRI_BACKEND": "gloo", + "PADDLE_TRAINER_ID": "%d" % rank_id, + "PADDLE_CURRENT_ENDPOINT": "%s" % endpoint, + "PADDLE_TRAINERS_NUM": "%d" % n_rank, + "PADDLE_TRAINER_ENDPOINTS": ",".join(trainer_endpoints) + } + + current_env.update(proc_env) + + print("trainer proc env:{}".format(current_env)) + + assert os.getenv('WITH_COVERAGE', + 'OFF') == 'OFF', "Gloo don't support WITH_COVERAGE." + cmd = "python -u " + training_script + + print("start trainer proc:{} env:{}".format(cmd, proc_env)) + + fn = None + + proc = subprocess.Popen(cmd.split(" "), env=current_env) + + tp = TrainerProc() + tp.proc = proc + tp.rank = rank_id + tp.log_fn = fn + tp.cmd = cmd + + procs.append(tp) + + return procs + + def start_local_trainers(cluster, pod, training_script, @@ -116,6 +161,26 @@ def run_mnist_2gpu(self, target_file_name): training_script=target_file_name, training_script_args=[]) + while True: + alive = watch_local_trainers(procs, cluster.trainers_endpoints()) + + if not alive: + print("Local procs complete, POD info:{}".format(pod)) + break + time.sleep(3) + + +class TestMultipleWithGloo(unittest.TestCase): + def run_mnist_2cpu(self, target_file_name): + + cluster, pod = get_cluster_from_args( + [0, 1]) #tmp use. for getting trainer_nranks() + + procs = start_local_trainers_cpu( + cluster.trainers_endpoints(), + training_script=target_file_name, + training_script_args=[]) + while True: alive = watch_local_trainers(procs, cluster.trainers_nranks()) diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sparse_embedding_gloo.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sparse_embedding_gloo.py new file mode 100644 index 00000000000000..56fcf806c47170 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sparse_embedding_gloo.py @@ -0,0 +1,59 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import os +import sys +import unittest + +import paddle.fluid as fluid +from test_dist_base import TestDistBase +from spawn_runner_base import TestDistSpawnRunner +from parallel_dygraph_sparse_embedding import TestSparseEmbedding +from parallel_dygraph_sparse_embedding_fp64 import TestSparseEmbeddingFP64 + +flag_name = os.path.splitext(__file__)[0] + + +class TestParallelDygraphSparseEmdedding_GLOO(TestDistBase): + def _setup_config(self): + self._sync_mode = False + self._gloo_mode = True + self._dygraph = True + + def test_sparse_embedding(self): + self.check_with_place( + "parallel_dygraph_sparse_embedding.py", + delta=1e-5, + check_error_log=True, + log_name=flag_name) + + +class TestParallelDygraphSparseEmdeddingFP64_GLOO(TestDistBase): + def _setup_config(self): + self._sync_mode = False + self._gloo_mode = True + self._dygraph = True + + def test_sparse_embedding_fp64(self): + self.check_with_place( + "parallel_dygraph_sparse_embedding_fp64.py", + delta=1e-5, + check_error_log=True, + log_name=flag_name) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sparse_embedding_over_height_gloo.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sparse_embedding_over_height_gloo.py new file mode 100644 index 00000000000000..ba43e26e23a4ec --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sparse_embedding_over_height_gloo.py @@ -0,0 +1,44 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import os +import sys +import unittest + +import paddle.fluid as fluid +from test_dist_base import TestDistBase +from spawn_runner_base import TestDistSpawnRunner +from parallel_dygraph_sparse_embedding_over_height import TestSparseEmbeddingOverHeight + +flag_name = os.path.splitext(__file__)[0] + + +class TestParallelDygraphSparseEmdeddingOverHeight_GLOO(TestDistBase): + def _setup_config(self): + self._sync_mode = False + self._gloo_mode = True + self._dygraph = True + + def test_sparse_embedding(self): + self.check_with_place( + "parallel_dygraph_sparse_embedding_over_height.py", + delta=1e-7, + check_error_log=True, + log_name=flag_name) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_transformer_gloo.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_transformer_gloo.py new file mode 100644 index 00000000000000..d3619cc1b9a00a --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_transformer_gloo.py @@ -0,0 +1,61 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import os +import sys +import unittest + +import paddle.fluid as fluid +from test_dist_base import TestDistBase +from spawn_runner_base import TestDistSpawnRunner +from parallel_dygraph_transformer import TestTransformer + +flag_name = os.path.splitext(__file__)[0] + + +class TestParallelDygraphTransformer_GLOO(TestDistBase): + def _setup_config(self): + self._sync_mode = False + self._gloo_mode = True + self._dygraph = True + + def test_transformer(self): + self.check_with_place( + "parallel_dygraph_transformer.py", + delta=1e-5, + check_error_log=True, + log_name=flag_name) + + +class TestParallelDygraphTransformerAccGrad_GLOO(TestDistBase): + def _setup_config(self): + self._sync_mode = False + self._gloo_mode = True + self._dygraph = True + self._accumulate_gradient = True + self._find_unused_parameters = False + + def test_transformer(self): + if fluid.core.is_compiled_with_cuda(): + self.check_with_place( + "parallel_dygraph_transformer.py", + delta=1e-5, + check_error_log=True, + log_name=flag_name) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_unused_variables_gloo.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_unused_variables_gloo.py new file mode 100644 index 00000000000000..89373fcb6eebc7 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_unused_variables_gloo.py @@ -0,0 +1,72 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import os +import sys +import unittest + +import paddle.fluid as fluid +from test_dist_base import TestDistBase +from spawn_runner_base import TestDistSpawnRunner +from parallel_dygraph_unused_variables import TestSparseEmbeddingUnusedVars + +flag_name = os.path.splitext(__file__)[0] + + +class TestParallelDygraphUnusedVar_GLOO(TestDistBase): + def _setup_config(self): + self._sync_mode = False + self._gloo_mode = True + self._dygraph = True + + def test_net(self): + self.check_with_place( + "parallel_dygraph_unused_variables.py", + delta=1e-5, + check_error_log=True, + log_name=flag_name) + + +class TestParallelDygraphNoVar_GLOO(TestDistBase): + def _setup_config(self): + self._sync_mode = False + self._gloo_mode = True + self._dygraph = True + + def test_net(self): + self.check_with_place( + "parallel_dygraph_none_var.py", + delta=1e-5, + check_error_log=True, + log_name=flag_name) + + +class TestParallelDygraphSharedUnusedVariables_GLOO(TestDistBase): + def _setup_config(self): + self._sync_mode = False + self._gloo_mode = True + self._dygraph = True + + def test_mnist(self): + self.check_with_place( + "parallel_dygraph_shared_unused_var.py", + delta=1e-5, + check_error_log=True, + log_name=flag_name) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_spawn_and_init_parallel_env.py b/python/paddle/fluid/tests/unittests/test_spawn_and_init_parallel_env.py index 14547eca5aca2c..dccc117f6bc159 100644 --- a/python/paddle/fluid/tests/unittests/test_spawn_and_init_parallel_env.py +++ b/python/paddle/fluid/tests/unittests/test_spawn_and_init_parallel_env.py @@ -24,6 +24,7 @@ from paddle.fluid import core from paddle.fluid.dygraph import parallel_helper +import multiprocessing # NOTE(chenweihang): Coverage CI is currently not able to count python3 # unittest, so the unittests here covers some cases that will only be @@ -89,8 +90,8 @@ def test_options_valid_check(self): def test_get_default_nprocs(self): paddle.set_device('cpu') - with self.assertRaises(RuntimeError): - nprocs = _get_default_nprocs() + nprocs = _get_default_nprocs() + self.assertEqual(nprocs, multiprocessing.cpu_count()) paddle.set_device('gpu') nprocs = _get_default_nprocs() From 7bf2aa3883066cb880e4bca8f8691dcdaf470c51 Mon Sep 17 00:00:00 2001 From: TTerror Date: Thu, 21 Oct 2021 14:28:24 +0800 Subject: [PATCH 239/298] add fill_any_like/flatten ops to train ssd on kunlun (#36550) * add some ops to train ssd on kunlun * update test_fill_any_like_op_xpu.py --- .../fluid/operators/fill_any_like_op_xpu.cc | 79 +++++ paddle/fluid/operators/flatten_op_xpu.cc | 67 ++++ paddle/fluid/platform/xpu/xpu2_op_list.h | 36 ++ .../fluid/tests/unittests/op_test_xpu.py | 24 +- .../xpu/test_fill_any_like_op_xpu.py | 77 +++++ .../unittests/xpu/test_flatten2_op_xpu.py | 83 +++++ .../test_flatten_contiguous_range_op_xpu.py | 320 ++++++++++++++++++ .../unittests/xpu/test_flatten_op_xpu.py | 77 +++++ 8 files changed, 761 insertions(+), 2 deletions(-) create mode 100644 paddle/fluid/operators/fill_any_like_op_xpu.cc create mode 100644 paddle/fluid/operators/flatten_op_xpu.cc create mode 100644 python/paddle/fluid/tests/unittests/xpu/test_fill_any_like_op_xpu.py create mode 100644 python/paddle/fluid/tests/unittests/xpu/test_flatten2_op_xpu.py create mode 100644 python/paddle/fluid/tests/unittests/xpu/test_flatten_contiguous_range_op_xpu.py create mode 100644 python/paddle/fluid/tests/unittests/xpu/test_flatten_op_xpu.py diff --git 
a/paddle/fluid/operators/fill_any_like_op_xpu.cc b/paddle/fluid/operators/fill_any_like_op_xpu.cc new file mode 100644 index 00000000000000..76cf339fbf5cca --- /dev/null +++ b/paddle/fluid/operators/fill_any_like_op_xpu.cc @@ -0,0 +1,79 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_WITH_XPU + +#include "paddle/fluid/operators/fill_any_like_op.h" + +namespace paddle { +namespace operators { + +template +class FillAnyLikeXPUKernel : public framework::OpKernel { + public: + using CommonType = typename std::common_type< + float, + typename std::conditional::value, + float, T>::type>::type; + using XPUInTDType = typename XPUTypeTrait::Type; + + void Compute(const framework::ExecutionContext& context) const override { + auto* out = context.Output("Out"); + out->mutable_data(context.GetPlace()); + + float value = context.Attr("value"); + + auto common_type_value = static_cast(value); + + PADDLE_ENFORCE_EQ( + (common_type_value >= + static_cast(std::numeric_limits::lowest())) && + (common_type_value <= + static_cast(std::numeric_limits::max())), + true, + platform::errors::InvalidArgument( + "The filled value is out of range for target type, " + "current kernel type is %s, the range should between %f " + "and %f, but now value is %f.", + typeid(T).name(), + static_cast(std::numeric_limits::lowest()), + static_cast(std::numeric_limits::max()), value)); + + PADDLE_ENFORCE_EQ( + std::isnan(value), false, + platform::errors::InvalidArgument("The filled value is NaN.")); + + auto& dev_ctx = + context.template device_context(); + auto out_data = reinterpret_cast(out->data()); + int ret = xpu::constant(dev_ctx.x_context(), out_data, out->numel(), + static_cast(value)); + PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, + platform::errors::External( + "XPU CONSTANT API return wrong value[%d %s].", ret, + XPUAPIErrorMsg[ret])); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_XPU_KERNEL(fill_any_like, ops::FillAnyLikeXPUKernel, + ops::FillAnyLikeXPUKernel, + ops::FillAnyLikeXPUKernel, + ops::FillAnyLikeXPUKernel); + +#endif diff --git a/paddle/fluid/operators/flatten_op_xpu.cc b/paddle/fluid/operators/flatten_op_xpu.cc new file mode 100644 index 00000000000000..53c0c688fd9e9d --- /dev/null +++ b/paddle/fluid/operators/flatten_op_xpu.cc @@ -0,0 +1,67 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef PADDLE_WITH_XPU + +#include "paddle/fluid/operators/flatten_op.h" + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_XPU_KERNEL( + flatten, ops::FlattenKernel, + ops::FlattenKernel, + ops::FlattenKernel, + ops::FlattenKernel); +REGISTER_OP_XPU_KERNEL( + flatten_grad, + ops::FlattenGradKernel, + ops::FlattenGradKernel, + ops::FlattenGradKernel, + ops::FlattenGradKernel); +REGISTER_OP_XPU_KERNEL( + flatten2, ops::Flatten2Kernel, + ops::Flatten2Kernel, + ops::Flatten2Kernel, + ops::Flatten2Kernel); +REGISTER_OP_XPU_KERNEL( + flatten2_grad, + ops::Flatten2GradKernel, + ops::Flatten2GradKernel, + ops::Flatten2GradKernel, + ops::Flatten2GradKernel); +REGISTER_OP_XPU_KERNEL( + flatten_contiguous_range, + ops::FlattenContiguousRangeKernel, + ops::FlattenContiguousRangeKernel, + ops::FlattenContiguousRangeKernel, + ops::FlattenContiguousRangeKernel, + ops::FlattenContiguousRangeKernel); +REGISTER_OP_XPU_KERNEL( + flatten_contiguous_range_grad, + ops::FlattenContiguousRangeGradKernel, + ops::FlattenContiguousRangeGradKernel, + ops::FlattenContiguousRangeGradKernel, + ops::FlattenContiguousRangeGradKernel, + ops::FlattenContiguousRangeGradKernel); +#endif diff --git a/paddle/fluid/platform/xpu/xpu2_op_list.h b/paddle/fluid/platform/xpu/xpu2_op_list.h index 5d45e5d9d5050e..0a9a9453b53e3d 100644 --- a/paddle/fluid/platform/xpu/xpu2_op_list.h +++ b/paddle/fluid/platform/xpu/xpu2_op_list.h @@ -119,6 +119,42 @@ XPUOpMap& get_kl2_ops() { {"slice_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace())})}, + {"fill_any_like", + XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, + {"flatten", XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::INT8, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, + {"flatten_grad", + XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::INT8, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, + {"flatten2", XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::INT8, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, + {"flatten2_grad", + XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::INT8, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, + + {"flatten_contiguous_range", + XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::INT8, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, + {"flatten_contiguous_range_grad", + XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::INT8, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, // AddMore }; diff --git a/python/paddle/fluid/tests/unittests/op_test_xpu.py b/python/paddle/fluid/tests/unittests/op_test_xpu.py index 133367a5f3625a..239708cc174492 100644 --- a/python/paddle/fluid/tests/unittests/op_test_xpu.py +++ b/python/paddle/fluid/tests/unittests/op_test_xpu.py @@ 
-91,11 +91,31 @@ def is_mkldnn_op_test(): # case in NO_FP64_CHECK_GRAD_CASES and op in NO_FP64_CHECK_GRAD_OP_LIST should be fixed if not hasattr(cls, "no_need_check_grad") \ and not is_empty_grad_op(cls.op_type): - if cls.dtype is not None and \ - cls.dtype != np.float32: + if cls.dtype is None or \ + (cls.dtype == np.float16 \ + and cls.op_type not in op_accuracy_white_list.NO_FP16_CHECK_GRAD_OP_LIST \ + and not hasattr(cls, "exist_check_grad")): raise AssertionError("This test of %s op needs check_grad." % cls.op_type) + # check for op test with fp64 precision, but not check mkldnn op test for now + if cls.dtype in [np.float32, np.float64] \ + and cls.op_type not in op_accuracy_white_list.NO_FP64_CHECK_GRAD_OP_LIST \ + and not hasattr(cls, 'exist_fp64_check_grad') \ + and not is_xpu_op_test() \ + and not is_mkldnn_op_test() \ + and not is_rocm_op_test() \ + and not is_npu_op_test(): + raise AssertionError( + "This test of %s op needs check_grad with fp64 precision." % + cls.op_type) + + if not cls.input_shape_is_large \ + and cls.op_type not in check_shape_white_list.NEED_TO_FIX_OP_LIST: + raise AssertionError( + "Input's shape should be large than or equal to 100 for " + + cls.op_type + " Op.") + def try_call_once(self, data_type): if not self.call_once: self.call_once = True diff --git a/python/paddle/fluid/tests/unittests/xpu/test_fill_any_like_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_fill_any_like_op_xpu.py new file mode 100644 index 00000000000000..27c101b20f6849 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_fill_any_like_op_xpu.py @@ -0,0 +1,77 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+from __future__ import print_function
+
+import sys
+sys.path.append("..")
+
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.fluid import Program, program_guard
+import paddle.compat as cpt
+import unittest
+import numpy as np
+from op_test import OpTest
+from op_test_xpu import XPUOpTest
+
+paddle.enable_static()
+
+
+class TestFillAnyLikeOp(OpTest):
+    def setUp(self):
+        self.op_type = "fill_any_like"
+        self.dtype = np.float32
+        self.use_xpu = True
+        self.use_mkldnn = False
+        self.value = 0.0
+        self.init()
+        self.inputs = {'X': np.random.random((219, 232)).astype(self.dtype)}
+        self.attrs = {'value': self.value, 'use_xpu': True}
+        self.outputs = {'Out': self.value * np.ones_like(self.inputs["X"])}
+
+    def init(self):
+        pass
+
+    def test_check_output(self):
+        if paddle.is_compiled_with_xpu():
+            place = paddle.XPUPlace(0)
+            self.check_output_with_place(place)
+
+
+class TestFillAnyLikeOpFloat32(TestFillAnyLikeOp):
+    def init(self):
+        self.dtype = np.float32
+        self.value = 0.0
+
+
+class TestFillAnyLikeOpValue1(TestFillAnyLikeOp):
+    def init(self):
+        self.value = 1.0
+
+
+class TestFillAnyLikeOpValue2(TestFillAnyLikeOp):
+    def init(self):
+        self.value = 1e-9
+
+
+class TestFillAnyLikeOpFloat16(TestFillAnyLikeOp):
+    def init(self):
+        self.dtype = np.float16
+        self.value = 0.05
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_flatten2_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_flatten2_op_xpu.py
new file mode 100644
index 00000000000000..9cbc83950d1e8f
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/xpu/test_flatten2_op_xpu.py
@@ -0,0 +1,83 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import sys
+sys.path.append("..")
+import numpy as np
+import paddle
+import paddle.fluid as fluid
+from op_test import OpTest
+from op_test_xpu import XPUOpTest
+paddle.enable_static()
+
+
+class TestFlatten2Op(XPUOpTest):
+    def setUp(self):
+        self.set_xpu()
+        self.op_type = "flatten2"
+        self.place = paddle.XPUPlace(0)
+        self.init_test_case()
+        self.inputs = {"X": np.random.random(self.in_shape).astype("float32")}
+        self.init_attrs()
+        self.outputs = {
+            "Out": self.inputs["X"].reshape(self.new_shape),
+            "XShape": np.random.random(self.in_shape).astype("float32")
+        }
+
+    def set_xpu(self):
+        self.__class__.use_xpu = True
+
+    def test_check_output(self):
+        self.check_output_with_place(self.place, no_check_set=["XShape"])
+
+    def test_check_grad(self):
+        self.check_grad_with_place(self.place, ["X"], "Out")
+
+    def init_test_case(self):
+        self.in_shape = (3, 2, 4, 5)
+        self.axis = 1
+        self.new_shape = (3, 40)
+
+    def init_attrs(self):
+        self.attrs = {"axis": self.axis}
+
+
+class TestFlatten2OpWithCornerAxis(TestFlatten2Op):
+    def init_test_case(self):
+        self.in_shape = (3, 2, 5, 4)
+        self.axis = 0
+        self.new_shape = (1, 120)
+
+
+class TestFlatten2OpWithDefaultAxis(TestFlatten2Op):
+    def init_test_case(self):
+        self.in_shape = (10, 2, 2, 3)
+        self.new_shape = (10, 12)
+
+    def init_attrs(self):
+        self.attrs = {}
+
+
+class TestFlatten2OpSixDims(TestFlatten2Op):
+    def init_test_case(self):
+        self.in_shape = (3, 2, 3, 2, 4, 4)
+        self.axis = 4
+        self.new_shape = (36, 16)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_flatten_contiguous_range_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_flatten_contiguous_range_op_xpu.py
new file mode 100644
index 00000000000000..dcad3c479f446e
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/xpu/test_flatten_contiguous_range_op_xpu.py
@@ -0,0 +1,320 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +from __future__ import print_function + +import sys +sys.path.append("..") + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +from op_test_xpu import XPUOpTest +import paddle +import paddle.fluid as fluid + +paddle.enable_static() + + +class TestFlattenOp(XPUOpTest): + def setUp(self): + self.set_xpu() + self.op_type = "flatten_contiguous_range" + self.place = paddle.XPUPlace(0) + self.use_xpu = True + self.use_mkldnn = False + + self.start_axis = 0 + self.stop_axis = -1 + self.dtype = np.float32 + self.init_test_case() + self.inputs = {"X": np.random.random(self.in_shape).astype(self.dtype)} + self.init_attrs() + self.outputs = { + "Out": self.inputs["X"].reshape(self.new_shape), + "XShape": np.random.random(self.in_shape).astype("float32") + } + + def set_xpu(self): + self.__class__.use_xpu = True + + def test_check_output(self): + self.check_output_with_place(self.place, no_check_set=["XShape"]) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ["X"], "Out") + + def init_test_case(self): + self.in_shape = (3, 2, 5, 4) + self.start_axis = 0 + self.stop_axis = -1 + self.new_shape = (120) + + def init_attrs(self): + self.attrs = { + "start_axis": self.start_axis, + "stop_axis": self.stop_axis, + 'use_xpu': True, + } + + +class TestFlattenOp_1(TestFlattenOp): + def init_test_case(self): + self.in_shape = (3, 2, 5, 4) + self.start_axis = 1 + self.stop_axis = 2 + self.new_shape = (3, 10, 4) + + def init_attrs(self): + self.attrs = { + "start_axis": self.start_axis, + "stop_axis": self.stop_axis + } + + +class TestFlattenOp_2(TestFlattenOp): + def init_test_case(self): + self.in_shape = (3, 2, 5, 4) + self.start_axis = 0 + self.stop_axis = 1 + self.new_shape = (6, 5, 4) + + def init_attrs(self): + self.attrs = { + "start_axis": self.start_axis, + "stop_axis": self.stop_axis + } + + +class TestFlattenOp_3(TestFlattenOp): + def init_test_case(self): + self.in_shape = (3, 2, 5, 4) + self.start_axis = 0 + self.stop_axis = 2 + self.new_shape = (30, 4) + + def init_attrs(self): + self.attrs = { + "start_axis": self.start_axis, + "stop_axis": self.stop_axis + } + + +class TestFlattenOp_4(TestFlattenOp): + def init_test_case(self): + self.in_shape = (3, 2, 5, 4) + self.start_axis = -2 + self.stop_axis = -1 + self.new_shape = (3, 2, 20) + + def init_attrs(self): + self.attrs = { + "start_axis": self.start_axis, + "stop_axis": self.stop_axis + } + + +class TestFlattenOp_5(TestFlattenOp): + def init_test_case(self): + self.in_shape = (3, 2, 5, 4) + self.start_axis = 2 + self.stop_axis = 2 + self.new_shape = (3, 2, 5, 4) + + def init_attrs(self): + self.attrs = { + "start_axis": self.start_axis, + "stop_axis": self.stop_axis + } + + +class TestFlattenOpSixDims(TestFlattenOp): + def init_test_case(self): + self.in_shape = (3, 2, 3, 2, 4, 4) + self.start_axis = 3 + self.stop_axis = 5 + self.new_shape = (3, 2, 3, 32) + + def init_attrs(self): + self.attrs = { + "start_axis": self.start_axis, + "stop_axis": self.stop_axis + } + + +class TestFlattenOp_Float32(TestFlattenOp): + def init_test_case(self): + self.in_shape = (3, 2, 5, 4) + self.start_axis = 0 + self.stop_axis = 1 + self.new_shape = (6, 5, 4) + self.dtype = np.float32 + + def init_attrs(self): + self.attrs = { + "start_axis": self.start_axis, + "stop_axis": self.stop_axis + } + + +class TestFlattenOp_int32(TestFlattenOp): + def init_test_case(self): + self.in_shape = (3, 2, 5, 4) + self.start_axis = 0 + self.stop_axis = 1 + self.new_shape = (6, 5, 4) + self.dtype = np.int32 
+ + def init_attrs(self): + self.attrs = { + "start_axis": self.start_axis, + "stop_axis": self.stop_axis, + 'use_xpu': True + } + + def test_check_grad(self): + pass + + +class TestFlattenOp_int8(TestFlattenOp): + def init_test_case(self): + self.in_shape = (3, 2, 5, 4) + self.start_axis = 0 + self.stop_axis = 1 + self.new_shape = (6, 5, 4) + self.dtype = np.int8 + + def init_attrs(self): + self.attrs = { + "start_axis": self.start_axis, + "stop_axis": self.stop_axis + } + + def test_check_grad(self): + pass + + +class TestFlattenOp_int64(TestFlattenOp): + def init_test_case(self): + self.in_shape = (3, 2, 5, 4) + self.start_axis = 0 + self.stop_axis = 1 + self.new_shape = (6, 5, 4) + self.dtype = np.int64 + + def init_attrs(self): + self.attrs = { + "start_axis": self.start_axis, + "stop_axis": self.stop_axis + } + + def test_check_grad(self): + pass + + +class TestFlatten2OpError(unittest.TestCase): + def test_errors(self): + image_shape = (2, 3, 4, 4) + x = np.arange(image_shape[0] * image_shape[1] * image_shape[2] * + image_shape[3]).reshape(image_shape) / 100. + x = x.astype('float32') + + def test_ValueError1(): + x_var = paddle.static.data( + name="x", shape=image_shape, dtype='float32') + out = paddle.flatten(x_var, start_axis=2, stop_axis=1) + + self.assertRaises(ValueError, test_ValueError1) + + def test_ValueError2(): + x_var = paddle.static.data( + name="x", shape=image_shape, dtype='float32') + paddle.flatten(x_var, start_axis=10, stop_axis=1) + + self.assertRaises(ValueError, test_ValueError2) + + def test_ValueError3(): + x_var = paddle.static.data( + name="x", shape=image_shape, dtype='float32') + paddle.flatten(x_var, start_axis=2, stop_axis=10) + + self.assertRaises(ValueError, test_ValueError3) + + def test_type(): + # dtype must be float32, float64, int8, int32, int64 + x2 = np.arange(image_shape[0] * image_shape[1] * image_shape[2] * + image_shape[3]).reshape(image_shape) / 100. + x2 = x2.astype('float16') + x2_var = paddle.fluid.data( + name='x2', shape=[3, 2, 4, 5], dtype='float16') + paddle.flatten(x2_var) + + self.assertRaises(TypeError, test_type) + + def test_InputError(): + out = paddle.flatten(x) + + self.assertRaises(ValueError, test_InputError) + + +class TestStaticFlattenPythonAPI(unittest.TestCase): + def execute_api(self, x, start_axis=0, stop_axis=-1): + return paddle.flatten(x, start_axis, stop_axis) + + def test_static_api(self): + paddle.enable_static() + np_x = np.random.rand(2, 3, 4, 4).astype('float32') + + main_prog = paddle.static.Program() + with paddle.static.program_guard(main_prog, paddle.static.Program()): + x = paddle.static.data( + name="x", shape=[2, 3, 4, 4], dtype='float32') + out = self.execute_api(x, start_axis=-2, stop_axis=-1) + + exe = paddle.static.Executor(place=paddle.XPUPlace(0)) + fetch_out = exe.run(main_prog, feed={"x": np_x}, fetch_list=[out]) + self.assertTrue((2, 3, 16) == fetch_out[0].shape) + + +class TestStaticInplaceFlattenPythonAPI(TestStaticFlattenPythonAPI): + def execute_api(self, x, start_axis=0, stop_axis=-1): + return x.flatten_(start_axis, stop_axis) + + +class TestFlattenPython(unittest.TestCase): + def test_python_api(self): + image_shape = (2, 3, 4, 4) + x = np.arange(image_shape[0] * image_shape[1] * image_shape[2] * + image_shape[3]).reshape(image_shape) / 100. 
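+        # A plain numpy ndarray is built here on purpose: passing it directly to
+        # paddle.flatten must raise ValueError, which test_InputError asserts below.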
+ x = x.astype('float32') + + def test_InputError(): + out = paddle.flatten(x) + + self.assertRaises(ValueError, test_InputError) + + def test_Negative(): + paddle.disable_static(paddle.XPUPlace(0)) + img = paddle.to_tensor(x) + out = paddle.flatten(img, start_axis=-2, stop_axis=-1) + return out.numpy().shape + + res_shape = test_Negative() + self.assertTrue((2, 3, 16) == res_shape) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_flatten_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_flatten_op_xpu.py new file mode 100644 index 00000000000000..ed435198353caa --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_flatten_op_xpu.py @@ -0,0 +1,77 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import sys +sys.path.append("..") +import numpy as np +import paddle +import paddle.fluid as fluid +from op_test import OpTest +from op_test_xpu import XPUOpTest +paddle.enable_static() + + +class TestFlattenOp(XPUOpTest): + def setUp(self): + self.op_type = "flatten" + self.use_xpu = True + self.place = paddle.XPUPlace(0) + self.init_test_case() + self.inputs = {"X": np.random.random(self.in_shape).astype("float32")} + self.init_attrs() + self.outputs = {"Out": self.inputs["X"].reshape(self.new_shape)} + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ["X"], "Out") + + def init_test_case(self): + self.in_shape = (3, 2, 2, 10) + self.axis = 1 + self.new_shape = (3, 40) + + def init_attrs(self): + self.attrs = {"axis": self.axis} + + +class TestFlattenOp1(TestFlattenOp): + def init_test_case(self): + self.in_shape = (3, 2, 2, 10) + self.axis = 0 + self.new_shape = (1, 120) + + +class TestFlattenOpWithDefaultAxis(TestFlattenOp): + def init_test_case(self): + self.in_shape = (10, 2, 2, 3) + self.new_shape = (10, 12) + + def init_attrs(self): + self.attrs = {} + + +class TestFlattenOpSixDims(TestFlattenOp): + def init_test_case(self): + self.in_shape = (3, 2, 3, 2, 4, 4) + self.axis = 4 + self.new_shape = (36, 16) + + +if __name__ == "__main__": + unittest.main() From 66f4b29220b1417ba65f25d9636eba84d280cc13 Mon Sep 17 00:00:00 2001 From: danleifeng <52735331+danleifeng@users.noreply.github.com> Date: Thu, 21 Oct 2021 15:23:17 +0800 Subject: [PATCH 240/298] fix hdfs download_dir (#36590) --- python/paddle/distributed/fleet/utils/fs.py | 4 ++-- python/paddle/fluid/tests/unittests/hdfs_test_utils.py | 2 +- python/paddle/fluid/tests/unittests/test_hdfs3.py | 1 + 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/python/paddle/distributed/fleet/utils/fs.py b/python/paddle/distributed/fleet/utils/fs.py index f56580f8ca2fe6..8895a529526f76 100644 --- a/python/paddle/distributed/fleet/utils/fs.py +++ b/python/paddle/distributed/fleet/utils/fs.py @@ -842,8 +842,8 @@ def __subprocess_download(local_path, 
datas): if self.is_file(fs_path): return self._try_download(fs_path, local_path) # download dir - _, all_files = self.ls_dir(fs_path) - + _, all_filenames = self.ls_dir(fs_path) + all_files = [fs_path + i for i in all_filenames] procs = [] for i in range(multi_processes): process_datas = self._split_files(all_files, i, multi_processes) diff --git a/python/paddle/fluid/tests/unittests/hdfs_test_utils.py b/python/paddle/fluid/tests/unittests/hdfs_test_utils.py index 6b49049073948f..69ccc7088b834e 100644 --- a/python/paddle/fluid/tests/unittests/hdfs_test_utils.py +++ b/python/paddle/fluid/tests/unittests/hdfs_test_utils.py @@ -195,7 +195,7 @@ def _test_download_dir(self, fs): fs.download(src_file, dst_file) local = LocalFS() - self.assertTrue(local.is_exist(dst_file)) + self.assertTrue(local.is_exist(file1)) local.delete(dst_file) fs.delete(src_file) diff --git a/python/paddle/fluid/tests/unittests/test_hdfs3.py b/python/paddle/fluid/tests/unittests/test_hdfs3.py index d214768b2e32f9..57b0b1ba45f244 100644 --- a/python/paddle/fluid/tests/unittests/test_hdfs3.py +++ b/python/paddle/fluid/tests/unittests/test_hdfs3.py @@ -40,6 +40,7 @@ def test_hdfs(self): self._test_upload(fs) self._test_upload_dir(fs) self._test_download(fs) + self._test_download_dir(fs) def test_local(self): fs = LocalFS() From 6072aecba10908241f8883a005d2fc12c2a24352 Mon Sep 17 00:00:00 2001 From: Jack Zhou Date: Thu, 21 Oct 2021 16:05:53 +0800 Subject: [PATCH 241/298] Add viterbi decode (#35778) * add viterbi decode cpu kernel * add viterbi decoder api in paddle.text * add a data buffer once to avoid create many small pieces of data buffer frequently * fix viterbi max_seq_length bug * fix seq_len=1 bug * fix device context * move split out of for loop * remove INVERSE_SUB * remove 2 GET_CAST_MASK * remove 1 loop * remove Functor * add to_static deploy code * use MAX_FUNC instead of ELE_MAX * add MaxFunctor * impl max_func * remove MaxFunctor * remove cast op * use REGISTER_OP_WITHOUT_GRADIENT * add viterbi cuda kernel * add FIX_BLOCKDIM_CASE macro * add MKL add, mul; add get data mask * add arange mkl impl * add CPU Argmax * add cpu gather * use EXECUTE_MKL_ELEMENT_BINARY_OP instead of some ADD, MUL * use SameDimsBinaryOP instead of EXECUTE_MKL_ELEMENT_BINARY_OP * use SAME_DIMS_ELEMENT_BINARY_OP * add SimpleBroadcastBinaryOP * use int instead of int64_t to accelerate * optimize SimpleBroadcastBinaryOP * optimize SimpleBroadcastBinaryOP * optimize performance in both single thread and multithread situation * remove useless line * remove useless code * add CREATE_TENSOR_BUFFER macro * add INIT_REQUIRED_TENSOR macro * add comment * fix windows ci * add viterbi unittest * remove cuda add functor * remove cuda equal * remove a template function * fix windows ci * fix windows dtype * remove some template instance * remove useless header file * remove some blockdim * remove transpose impl * accelerate cpu performance on single thread situation * viterbi_decode->crf_decode * rename crf params name * add viterbi api test * remove useless import * add enable_static * use viterbi decoder * fix viterbi len=1 * fix viterbi unittest * remove useless comments * reconstruct viterbi decode * remove ADD,SUB,MUL structure * fix coverage * remove CREATE_TENSOR * add name args * crf.py->ops.py; with_start_stop_tag->include_start_end_tag * update crf_decode en docs * fix viterbi decode en docs * fix some review comments * add FIXED_BLOCK_DIM_CASE in cuda * push_back->emplace_back * crf_decode->viterbi_decode; 
include_start_end_tag->include_bos_eos_tag * paddle.text.ops.viterbi_decode->paddle.text.viterbi_decode * fix viterbi_decode en docs --- .../elementwise/elementwise_op_function.h | 4 +- paddle/fluid/operators/viterbi_decode_op.cc | 109 +++++ paddle/fluid/operators/viterbi_decode_op.cu | 200 +++++++++ paddle/fluid/operators/viterbi_decode_op.h | 415 ++++++++++++++++++ .../tests/unittests/test_viterbi_decode_op.py | 134 ++++++ python/paddle/text/__init__.py | 6 +- python/paddle/text/viterbi_decode.py | 132 ++++++ 7 files changed, 996 insertions(+), 4 deletions(-) create mode 100644 paddle/fluid/operators/viterbi_decode_op.cc create mode 100644 paddle/fluid/operators/viterbi_decode_op.cu create mode 100644 paddle/fluid/operators/viterbi_decode_op.h create mode 100644 python/paddle/fluid/tests/unittests/test_viterbi_decode_op.py create mode 100644 python/paddle/text/viterbi_decode.py diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h b/paddle/fluid/operators/elementwise/elementwise_op_function.h index 312978a010b30c..2df7dd06f2cc89 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h @@ -240,7 +240,7 @@ inline void GetBroadcastDimsArrays(const framework::DDim &x_dims, x_dims, y_dims, x_dims_array[i], y_dims_array[i], i)); if ((x_dims_array[i] > 1 || y_dims_array[i] > 1) || (x_dims_array[i] == 1 && y_dims_array[i] == 1)) { - out_dims_array[i] = std::max(x_dims_array[i], y_dims_array[i]); + out_dims_array[i] = (std::max)(x_dims_array[i], y_dims_array[i]); } else { out_dims_array[i] = -1; } @@ -1779,7 +1779,7 @@ void CommonElementwiseBroadcastForward( const framework::Tensor *y, framework::Tensor *z, const framework::DDim &x_dims, const framework::DDim &y_dims, Functor func, int axis, const bool is_xsize_larger = true) { - int max_dim = std::max(x_dims.size(), y_dims.size()); + int max_dim = (std::max)(x_dims.size(), y_dims.size()); axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis); PADDLE_ENFORCE_GE( axis, 0, diff --git a/paddle/fluid/operators/viterbi_decode_op.cc b/paddle/fluid/operators/viterbi_decode_op.cc new file mode 100644 index 00000000000000..bf1cdeed65a842 --- /dev/null +++ b/paddle/fluid/operators/viterbi_decode_op.cc @@ -0,0 +1,109 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/viterbi_decode_op.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +class ViterbiDecodeOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "ViterbiDecode"); + OP_INOUT_CHECK(ctx->HasInput("Transition"), "Input", "Transition", + "ViterbiDecode"); + OP_INOUT_CHECK(ctx->HasInput("Length"), "Input", "Length", "ViterbiDecode"); + OP_INOUT_CHECK(ctx->HasOutput("Scores"), "Output", "Scores", + "ViterbiDecode"); + OP_INOUT_CHECK(ctx->HasOutput("Path"), "Output", "Path", "ViterbiDecode"); + auto in_dims = ctx->GetInputDim("Input"); + PADDLE_ENFORCE_EQ(in_dims.size(), 3, + platform::errors::InvalidArgument( + "The rank of Input in ViterbiDecode must be 3. But " + "received Input's rank is %d.", + in_dims.size())); + auto length_dims = ctx->GetInputDim("Length"); + PADDLE_ENFORCE_EQ(length_dims.size(), 1, + platform::errors::InvalidArgument( + "The rank of Length in ViterbiDecode must be 1. But " + "received Length's rank is %d.", + length_dims.size())); + auto transition_dims = ctx->GetInputDim("Transition"); + PADDLE_ENFORCE_EQ( + transition_dims.size(), 2, + platform::errors::InvalidArgument( + "The rank of Transition in ViterbiDecode must be 2. But " + "received Transition's rank is %d.", + transition_dims.size())); + if (ctx->IsRuntime()) { + PADDLE_ENFORCE_EQ( + in_dims[0], length_dims[0], + platform::errors::InvalidArgument( + "The batch size of Input and Length should be equal.")); + PADDLE_ENFORCE_EQ(in_dims[2], transition_dims[0], + platform::errors::InvalidArgument( + "The number of tags of Input (%d) and Transition " + "(%d) should be equal.", + transition_dims[0], in_dims[2])); + } + ctx->SetOutputDim("Scores", length_dims); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "Input"), + ctx.device_context()); + } +}; + +class ViterbiDecodeOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput( + "Input", + "The unary emission tensor. The shape of Input must be (batch_size," + "sequence_length, num_tags). "); + AddInput("Transition", + "The transition matrix. The shape of Transition must be ( " + "num_tags, num_tags). "); + AddInput("Length", + "The input length tensor storing real length of each sequence for " + "correctness. The shape of Length MUST be (batch_size)."); + AddOutput("Scores", + "The scores tensor containing the score for the Viterbi " + "sequence. The shape of Scores MUST be (batch_size)."); + AddOutput("Path", + "The paths tensor containing the highest scoring tag indices. 
" + "The shape of Scores MUST be (batch_size, sequence_length)."); + AddAttr("include_bos_eos_tag", + "If set to True, the last row and the last column of " + "transitions will be considered as start tag.") + .SetDefault(true); + AddComment(R"DOC( + )DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace platform = paddle::platform; +REGISTER_OP_WITHOUT_GRADIENT(viterbi_decode, ops::ViterbiDecodeOp, + ops::ViterbiDecodeOpMaker); +REGISTER_OP_CPU_KERNEL( + viterbi_decode, ops::ViterbiDecodeKernel, + ops::ViterbiDecodeKernel); diff --git a/paddle/fluid/operators/viterbi_decode_op.cu b/paddle/fluid/operators/viterbi_decode_op.cu new file mode 100644 index 00000000000000..086ff05b084612 --- /dev/null +++ b/paddle/fluid/operators/viterbi_decode_op.cu @@ -0,0 +1,200 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/elementwise/elementwise_functor.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" +#include "paddle/fluid/operators/gather.cu.h" +#include "paddle/fluid/operators/viterbi_decode_op.h" + +#ifdef __NVCC__ +#include "cub/cub.cuh" +#endif +#ifdef __HIPCC__ +#include +namespace cub = hipcub; +#endif + +namespace paddle { +namespace operators { + +#define FIXED_BLOCK_DIM_CASE_BASE(log2_block_dim, ...) \ + case (1 << (log2_block_dim)): { \ + constexpr auto kBlockDim = (1 << (log2_block_dim)); \ + __VA_ARGS__; \ + } break + +#define FIXED_BLOCK_DIM_CASE(...) \ + FIXED_BLOCK_DIM_CASE_BASE(10, ##__VA_ARGS__); \ + FIXED_BLOCK_DIM_CASE_BASE(9, ##__VA_ARGS__); \ + FIXED_BLOCK_DIM_CASE_BASE(8, ##__VA_ARGS__); \ + FIXED_BLOCK_DIM_CASE_BASE(7, ##__VA_ARGS__); \ + FIXED_BLOCK_DIM_CASE_BASE(6, ##__VA_ARGS__); \ + FIXED_BLOCK_DIM_CASE_BASE(5, ##__VA_ARGS__); \ + FIXED_BLOCK_DIM_CASE_BASE(4, ##__VA_ARGS__); \ + FIXED_BLOCK_DIM_CASE_BASE(3, ##__VA_ARGS__); + +int64_t ComputeBlockSize(int64_t col) { + if (col > 512) + return 1024; + else if (col > 256) + return 512; + else if (col > 128) + return 256; + else if (col > 64) + return 128; + else if (col > 32) + return 64; + else if (col > 16) + return 32; + else if (col > 8) + return 16; + else + return 8; +} + +template