From 1cfad65d60efbab1cd5fb568b448f8d8cb1a283c Mon Sep 17 00:00:00 2001 From: SigureMo Date: Fri, 26 Apr 2024 11:22:37 +0000 Subject: [PATCH 1/6] [PIR] Throw error when OP has no grad OP --- python/paddle/autograd/ir_backward.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/python/paddle/autograd/ir_backward.py b/python/paddle/autograd/ir_backward.py index 4614856ed86ae9..f4879e35f2511b 100644 --- a/python/paddle/autograd/ir_backward.py +++ b/python/paddle/autograd/ir_backward.py @@ -55,6 +55,8 @@ """ __all__ = ['grad', 'calc_gradient', 'calc_gradient_helper'] +ALLOW_NO_GRAD_OPS = ["pd_op.full_like"] + def append_full_like(float_value, copy_value, value, state, backward_ops): with paddle.amp.auto_cast(enable=False): @@ -834,7 +836,10 @@ def append_yield( else: state.op_to_opgrad[op] = [] else: - logging.warning("%s op has no grad op", op.name()) + if op.name() not in ALLOW_NO_GRAD_OPS: + raise ValueError( + f"op {op.name()} has no grad op, consider enable prim to decompose it." + ) state.op_to_opgrad[op] = [] if fwd_block != bwd_block: @@ -1202,9 +1207,11 @@ def append_backward(loss, parameter_list=None, no_grad_set=None): input_inputs_grad.append( ( input, - input_to_inputgrad_map[input][0][0] - if input_to_inputgrad_map[input] != [] - else None, + ( + input_to_inputgrad_map[input][0][0] + if input_to_inputgrad_map[input] != [] + else None + ), ) ) From db8b5753f49910424530745c474ed406882482e3 Mon Sep 17 00:00:00 2001 From: SigureMo Date: Fri, 26 Apr 2024 11:33:19 +0000 Subject: [PATCH 2/6] add ut --- test/dygraph_to_static/test_high_order_net.py | 58 +++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100644 test/dygraph_to_static/test_high_order_net.py diff --git a/test/dygraph_to_static/test_high_order_net.py b/test/dygraph_to_static/test_high_order_net.py new file mode 100644 index 00000000000000..9d116528ea649d --- /dev/null +++ b/test/dygraph_to_static/test_high_order_net.py @@ -0,0 +1,58 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +from dygraph_to_static_utils import ( + Dy2StTestBase, + test_ast_only, + test_pir_only, +) + +import paddle + + +class HighOrderNet(paddle.nn.Layer): + def __init__(self): + super().__init__() + self.linear = paddle.nn.Linear(3, 4, bias_attr=False) + + def forward(self, x): + y = self.linear(x) + z = paddle.pow(y, 2) + x_grad = paddle.grad(z, x, create_graph=True)[0] + x_grad_grad = paddle.grad(x_grad, x, create_graph=True)[0] + return x_grad_grad.mean() + + +class TestBackwardHasNoGradError(Dy2StTestBase): + @test_ast_only + @test_pir_only + def test_backward_has_no_grad_error(self): + net = HighOrderNet() + static_net = paddle.jit.to_static(net, full_graph=True) + + x = paddle.to_tensor([[1, 1, 1], [1, 1, 1]], 'float32') + x.stop_gradient = False + + with self.assertRaisesRegex( + ValueError, + "op 'pd_op.matmul_double_grad' has no grad op, consider enable prim to decompose it.", + ): + x_grad_grad = static_net(x) + x_grad_grad.backward() + + +if __name__ == "__main__": + unittest.main() From d68135f224175627d64f2734d056b8251031802d Mon Sep 17 00:00:00 2001 From: SigureMo Date: Fri, 26 Apr 2024 11:53:32 +0000 Subject: [PATCH 3/6] update error msg --- python/paddle/autograd/ir_backward.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/autograd/ir_backward.py b/python/paddle/autograd/ir_backward.py index f4879e35f2511b..413e0a3ad7a29d 100644 --- a/python/paddle/autograd/ir_backward.py +++ b/python/paddle/autograd/ir_backward.py @@ -838,7 +838,7 @@ def append_yield( else: if op.name() not in ALLOW_NO_GRAD_OPS: raise ValueError( - f"op {op.name()} has no grad op, consider enable prim to decompose it." + f"op '{op.name()}' has no grad op, consider enable prim to decompose it." ) state.op_to_opgrad[op] = [] From 5f6b38aa68b14067b769900597afe50eef16e1c9 Mon Sep 17 00:00:00 2001 From: SigureMo Date: Fri, 26 Apr 2024 14:31:56 +0000 Subject: [PATCH 4/6] update allow list --- python/paddle/autograd/ir_backward.py | 62 ++++++++++++++++++++++++++- 1 file changed, 60 insertions(+), 2 deletions(-) diff --git a/python/paddle/autograd/ir_backward.py b/python/paddle/autograd/ir_backward.py index 413e0a3ad7a29d..f3826c0c50fb27 100644 --- a/python/paddle/autograd/ir_backward.py +++ b/python/paddle/autograd/ir_backward.py @@ -55,7 +55,62 @@ """ __all__ = ['grad', 'calc_gradient', 'calc_gradient_helper'] -ALLOW_NO_GRAD_OPS = ["pd_op.full_like"] +# TODO: Consider a better way to mark these ops has no grad op. +# Such as use a new trait to mark these ops. 
+ALLOW_NO_GRAD_OPS = [ + # Compare ops + "pd_op.equal", + "pd_op.equal_", + "pd_op.not_equal", + "pd_op.not_equal_", + "pd_op.less_than", + "pd_op.less_than_", + "pd_op.less_equal", + "pd_op.less_equal_", + "pd_op.greater_than", + "pd_op.greater_than_", + "pd_op.greater_equal", + "pd_op.greater_equal_", + # Logical ops + "pd_op.logical_and", + "pd_op.logical_and_", + "pd_op.logical_not", + "pd_op.logical_not_", + "pd_op.logical_or", + "pd_op.logical_or_", + "pd_op.logical_xor", + "pd_op.logical_xor_", + # Array ops + "pd_op.assign_array", + "pd_op.array_length", + "pd_op.slice_array", + "pd_op.slice_array_dense", + "pd_op.assign_array", + "pd_op.assign_array_", + "pd_op.create_array", + "pd_op.create_array_like", + "pd_op.array_read", + "pd_op.array_write_", + "pd_op.array_pop", + # Others + "pd_op.remainder", + "pd_op.argmax", + "pd_op.print", + "pd_op.accuracy", + "pd_op.uniform", + "pd_op.gaussian", + "pd_op.bernoulli", + "pd_op.full_like", + "pd_op.assign_value_", + "pd_op.nextafter", + "pd_op.isnan", + "pd_op.isinf", +] + + +def is_builtin_op(op): + dialect_name, opname = op.name().split(".") + return dialect_name == "builtin" def append_full_like(float_value, copy_value, value, state, backward_ops): @@ -836,7 +891,10 @@ def append_yield( else: state.op_to_opgrad[op] = [] else: - if op.name() not in ALLOW_NO_GRAD_OPS: + if ( + not is_builtin_op(op) + and op.name() not in ALLOW_NO_GRAD_OPS + ): raise ValueError( f"op '{op.name()}' has no grad op, consider enable prim to decompose it." ) From f7fa24a14ef1ace12ed28a8944dd19f77f7c1951 Mon Sep 17 00:00:00 2001 From: SigureMo Date: Fri, 26 Apr 2024 15:27:59 +0000 Subject: [PATCH 5/6] update allow list --- python/paddle/autograd/ir_backward.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/python/paddle/autograd/ir_backward.py b/python/paddle/autograd/ir_backward.py index f3826c0c50fb27..a500419713d114 100644 --- a/python/paddle/autograd/ir_backward.py +++ b/python/paddle/autograd/ir_backward.py @@ -80,6 +80,19 @@ "pd_op.logical_or_", "pd_op.logical_xor", "pd_op.logical_xor_", + # Bitwise ops + "pd_op.bitwise_and", + "pd_op.bitwise_and_", + "pd_op.bitwise_left_shift", + "pd_op.bitwise_left_shift_", + "pd_op.bitwise_not", + "pd_op.bitwise_not_", + "pd_op.bitwise_or", + "pd_op.bitwise_or_", + "pd_op.bitwise_right_shift", + "pd_op.bitwise_right_shift_", + "pd_op.bitwise_xor", + "pd_op.bitwise_xor_", # Array ops "pd_op.assign_array", "pd_op.array_length", @@ -105,6 +118,8 @@ "pd_op.nextafter", "pd_op.isnan", "pd_op.isinf", + "pd_op.all", + "pd_op.any", ] From 22d34ea5f8f24d3001b63bd94ab1d3f510f81479 Mon Sep 17 00:00:00 2001 From: SigureMo Date: Sun, 28 Apr 2024 02:47:29 +0000 Subject: [PATCH 6/6] move constants to backward_utils.py --- python/paddle/autograd/backward_utils.py | 72 +++++++++++++++++++++++ python/paddle/autograd/ir_backward.py | 74 +----------------------- 2 files changed, 74 insertions(+), 72 deletions(-) diff --git a/python/paddle/autograd/backward_utils.py b/python/paddle/autograd/backward_utils.py index bdd2756e09cd66..3eb4c01406704c 100644 --- a/python/paddle/autograd/backward_utils.py +++ b/python/paddle/autograd/backward_utils.py @@ -25,6 +25,73 @@ ) from paddle.base.wrapped_decorator import signature_safe_contextmanager +# TODO: Consider a better way to mark these ops has no grad op. +# Such as use a new trait to mark these ops. 
+ALLOW_NO_GRAD_OPS = [ + # Compare ops + "pd_op.equal", + "pd_op.equal_", + "pd_op.not_equal", + "pd_op.not_equal_", + "pd_op.less_than", + "pd_op.less_than_", + "pd_op.less_equal", + "pd_op.less_equal_", + "pd_op.greater_than", + "pd_op.greater_than_", + "pd_op.greater_equal", + "pd_op.greater_equal_", + # Logical ops + "pd_op.logical_and", + "pd_op.logical_and_", + "pd_op.logical_not", + "pd_op.logical_not_", + "pd_op.logical_or", + "pd_op.logical_or_", + "pd_op.logical_xor", + "pd_op.logical_xor_", + # Bitwise ops + "pd_op.bitwise_and", + "pd_op.bitwise_and_", + "pd_op.bitwise_left_shift", + "pd_op.bitwise_left_shift_", + "pd_op.bitwise_not", + "pd_op.bitwise_not_", + "pd_op.bitwise_or", + "pd_op.bitwise_or_", + "pd_op.bitwise_right_shift", + "pd_op.bitwise_right_shift_", + "pd_op.bitwise_xor", + "pd_op.bitwise_xor_", + # Array ops + "pd_op.assign_array", + "pd_op.array_length", + "pd_op.slice_array", + "pd_op.slice_array_dense", + "pd_op.assign_array", + "pd_op.assign_array_", + "pd_op.create_array", + "pd_op.create_array_like", + "pd_op.array_read", + "pd_op.array_write_", + "pd_op.array_pop", + # Others + "pd_op.remainder", + "pd_op.argmax", + "pd_op.print", + "pd_op.accuracy", + "pd_op.uniform", + "pd_op.gaussian", + "pd_op.bernoulli", + "pd_op.full_like", + "pd_op.assign_value_", + "pd_op.nextafter", + "pd_op.isnan", + "pd_op.isinf", + "pd_op.all", + "pd_op.any", +] + class ValueWrapper: def __init__(self, value) -> None: @@ -281,6 +348,11 @@ def is_control_flow(op): return op.name() == "pd_op.if" or op.name() == "pd_op.while" +def is_builtin_op(op): + dialect_name, opname = op.name().split(".") + return dialect_name == "builtin" + + def update_no_grad_set_by_stopgradient(block, no_grad_set): for op in block.ops: if is_control_flow(op): diff --git a/python/paddle/autograd/ir_backward.py b/python/paddle/autograd/ir_backward.py index a500419713d114..07e283b7617f79 100644 --- a/python/paddle/autograd/ir_backward.py +++ b/python/paddle/autograd/ir_backward.py @@ -18,6 +18,7 @@ import paddle.pir from paddle.autograd.backward_utils import ( + ALLOW_NO_GRAD_OPS, State, ValueDict, ValueSet, @@ -32,6 +33,7 @@ get_real_op_inputs, get_split_op, inverse_sort_op, + is_builtin_op, is_control_flow, is_inplace_net, parent_total_ops, @@ -55,78 +57,6 @@ """ __all__ = ['grad', 'calc_gradient', 'calc_gradient_helper'] -# TODO: Consider a better way to mark these ops has no grad op. -# Such as use a new trait to mark these ops. 
-ALLOW_NO_GRAD_OPS = [ - # Compare ops - "pd_op.equal", - "pd_op.equal_", - "pd_op.not_equal", - "pd_op.not_equal_", - "pd_op.less_than", - "pd_op.less_than_", - "pd_op.less_equal", - "pd_op.less_equal_", - "pd_op.greater_than", - "pd_op.greater_than_", - "pd_op.greater_equal", - "pd_op.greater_equal_", - # Logical ops - "pd_op.logical_and", - "pd_op.logical_and_", - "pd_op.logical_not", - "pd_op.logical_not_", - "pd_op.logical_or", - "pd_op.logical_or_", - "pd_op.logical_xor", - "pd_op.logical_xor_", - # Bitwise ops - "pd_op.bitwise_and", - "pd_op.bitwise_and_", - "pd_op.bitwise_left_shift", - "pd_op.bitwise_left_shift_", - "pd_op.bitwise_not", - "pd_op.bitwise_not_", - "pd_op.bitwise_or", - "pd_op.bitwise_or_", - "pd_op.bitwise_right_shift", - "pd_op.bitwise_right_shift_", - "pd_op.bitwise_xor", - "pd_op.bitwise_xor_", - # Array ops - "pd_op.assign_array", - "pd_op.array_length", - "pd_op.slice_array", - "pd_op.slice_array_dense", - "pd_op.assign_array", - "pd_op.assign_array_", - "pd_op.create_array", - "pd_op.create_array_like", - "pd_op.array_read", - "pd_op.array_write_", - "pd_op.array_pop", - # Others - "pd_op.remainder", - "pd_op.argmax", - "pd_op.print", - "pd_op.accuracy", - "pd_op.uniform", - "pd_op.gaussian", - "pd_op.bernoulli", - "pd_op.full_like", - "pd_op.assign_value_", - "pd_op.nextafter", - "pd_op.isnan", - "pd_op.isinf", - "pd_op.all", - "pd_op.any", -] - - -def is_builtin_op(op): - dialect_name, opname = op.name().split(".") - return dialect_name == "builtin" - def append_full_like(float_value, copy_value, value, state, backward_ops): with paddle.amp.auto_cast(enable=False):
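Taken together, the series makes the PIR backward pass fail fast instead of silently skipping a forward op that has no grad op: builtin-dialect ops and ops on ALLOW_NO_GRAD_OPS are tolerated, anything else now raises a ValueError suggesting prim decomposition. The sketch below is a minimal, Paddle-free illustration of that decision as it stands after patch 6. FakeOp, check_has_grad_op, has_vjp, and the trimmed allow list are stand-ins invented for this sketch only; the is_builtin_op logic, the allow-list membership test, and the error string are copied from the patches above.

# Standalone sketch (assumption: no Paddle import) of the no-grad-op check
# installed by this series. FakeOp and the trimmed allow list are
# illustrative stand-ins, not real Paddle objects.

ALLOW_NO_GRAD_OPS = ["pd_op.full_like", "pd_op.equal", "pd_op.argmax"]  # excerpt


class FakeOp:
    """Stand-in for a pir.Operation exposing only name()."""

    def __init__(self, name: str) -> None:
        self._name = name

    def name(self) -> str:
        return self._name


def is_builtin_op(op: FakeOp) -> bool:
    # Mirrors the helper added in patch 4 and moved to backward_utils.py in
    # patch 6: ops from the "builtin" dialect never need a grad op.
    dialect_name, _ = op.name().split(".")
    return dialect_name == "builtin"


def check_has_grad_op(op: FakeOp, has_vjp: bool) -> None:
    # has_vjp stands for "a grad op / vjp rule was found for this op".
    if has_vjp:
        return
    if not is_builtin_op(op) and op.name() not in ALLOW_NO_GRAD_OPS:
        raise ValueError(
            f"op '{op.name()}' has no grad op, consider enable prim to decompose it."
        )


if __name__ == "__main__":
    check_has_grad_op(FakeOp("builtin.combine"), has_vjp=False)   # tolerated: builtin dialect
    check_has_grad_op(FakeOp("pd_op.full_like"), has_vjp=False)   # tolerated: allow-listed
    try:
        check_has_grad_op(FakeOp("pd_op.matmul_double_grad"), has_vjp=False)
    except ValueError as e:
        print(e)  # the message the new unit test asserts on

In the real code path this check runs inside ir_backward.py while building the backward program, which is why the added test_high_order_net.py hits it: the second-order paddle.grad call under PIR to_static reaches pd_op.matmul_double_grad, which has no registered grad op, and the test asserts that the ValueError (rather than the old logging.warning) is raised.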