From f58cb01864151e27ff45d9fc99b61b72cce3295e Mon Sep 17 00:00:00 2001 From: Chengmo Date: Thu, 25 Mar 2021 17:37:09 +0800 Subject: [PATCH 01/15] =?UTF-8?q?=E3=80=90Paddle.Fleet=E3=80=91fix=20datas?= =?UTF-8?q?et=20zip=20py3=20bug=20(#31441)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix zip py3 bug --- .../fleet/data_generator/data_generator.py | 26 +++++++----- .../tests/unittests/test_data_generator.py | 40 +++++++++++++++++++ 2 files changed, 56 insertions(+), 10 deletions(-) diff --git a/python/paddle/distributed/fleet/data_generator/data_generator.py b/python/paddle/distributed/fleet/data_generator/data_generator.py index 669d2ea24a0c78..9d743fc38bf398 100644 --- a/python/paddle/distributed/fleet/data_generator/data_generator.py +++ b/python/paddle/distributed/fleet/data_generator/data_generator.py @@ -32,11 +32,11 @@ def set_batch(self, batch_size): ''' Set batch size of current DataGenerator This is necessary only if a user wants to define generator_batch - + Example: .. code-block:: python - + import paddle.distributed.fleet.data_generator as dg class MyData(dg.DataGenerator): @@ -52,7 +52,7 @@ def local_iter(): yield ("words", s[1].extend([s[1][0]])) mydata = MyData() mydata.set_batch(128) - + ''' self.batch_size_ = batch_size @@ -63,7 +63,7 @@ def run_from_memory(self): Example: .. code-block:: python - + import paddle.distributed.fleet.data_generator as dg class MyData(dg.DataGenerator): @@ -100,9 +100,9 @@ def run_from_stdin(self): generated. Example: - + .. code-block:: python - + import paddle.distributed.fleet.data_generator as dg class MyData(dg.DataGenerator): @@ -161,7 +161,7 @@ def generate_sample(self, line): The data format is list or tuple: [(name, [feasign, ...]), ...] or ((name, [feasign, ...]), ...) - + For example: [("words", [1926, 08, 17]), ("label", [1])] or (("words", [1926, 08, 17]), ("label", [1])) @@ -174,7 +174,7 @@ def generate_sample(self, line): Example: .. 
code-block:: python - + import paddle.distributed.fleet.data_generator as dg class MyData(dg.DataGenerator): @@ -206,7 +206,7 @@ def generate_batch(self, samples): Example: .. code-block:: python - + import paddle.distributed.fleet.data_generator as dg class MyData(dg.DataGenerator): @@ -259,6 +259,9 @@ def _gen_str(self, line): Returns: Return a string data that can be read directly by the MultiSlotDataFeed. ''' + if sys.version > '3' and isinstance(line, zip): + line = list(line) + if not isinstance(line, list) and not isinstance(line, tuple): raise ValueError( "the output of process() must be in list or tuple type" @@ -289,7 +292,7 @@ def _gen_str(self, line): >>> [ids_num id1 id2 ...] ... The proto_info will be in this format: >>> [(name, type), ...] - + For example, if the input is like this: >>> [("words", [1926, 08, 17]), ("label", [1])] >>> or (("words", [1926, 08, 17]), ("label", [1])) @@ -304,6 +307,9 @@ def _gen_str(self, line): Returns: Return a string data that can be read directly by the MultiSlotDataFeed. 
''' + if sys.version > '3' and isinstance(line, zip): + line = list(line) + if not isinstance(line, list) and not isinstance(line, tuple): raise ValueError( "the output of process() must be in list or tuple type" diff --git a/python/paddle/fluid/tests/unittests/test_data_generator.py b/python/paddle/fluid/tests/unittests/test_data_generator.py index 6381cb36402636..69d8e01fd464af 100644 --- a/python/paddle/fluid/tests/unittests/test_data_generator.py +++ b/python/paddle/fluid/tests/unittests/test_data_generator.py @@ -95,6 +95,32 @@ def data_iter(): return data_iter +class MyMultiSlotStringDataGenerator_zip(fleet.MultiSlotStringDataGenerator): + def generate_sample(self, line): + def data_iter(): + for i in range(40): + if i == 1: + yield None + feature_name = ["words", "label"] + data = [["1", "2", "3", "4"], ["0"]] + yield zip(feature_name, data) + + return data_iter + + +class MyMultiSlotDataGenerator_zip(fleet.MultiSlotDataGenerator): + def generate_sample(self, line): + def data_iter(): + for i in range(40): + if i == 1: + yield None + feature_name = ["words", "label"] + data = [[1, 2, 3, 4], [0]] + yield zip(feature_name, data) + + return data_iter + + class TestMultiSlotDataGenerator(unittest.TestCase): def test_MultiSlotDataGenerator_basic(self): my_ms_dg = MyMultiSlotDataGenerator() @@ -149,5 +175,19 @@ def test_MultiSlotDataGenerator_error(self): my_ms_dg.run_from_memory() +class TestMultiSlotStringDataGeneratorZip(unittest.TestCase): + def test_MultiSlotStringDataGenerator_zip(self): + my_ms_dg = MyMultiSlotStringDataGenerator_zip() + my_ms_dg.set_batch(1) + my_ms_dg.run_from_memory() + + +class TestMultiSlotDataGeneratorZip(unittest.TestCase): + def test_MultiSlotDataGenerator_zip(self): + my_ms_dg = MyMultiSlotDataGenerator_zip() + my_ms_dg.set_batch(1) + my_ms_dg.run_from_memory() + + if __name__ == '__main__': unittest.main() From e804f08559d96a87b8c7eb50120eef68402e4313 Mon Sep 17 00:00:00 2001 From: tianshuo78520a <707759223@qq.com> Date: Fri, 26 
Mar 2021 13:43:48 +0800 Subject: [PATCH 02/15] delete include framework.pb.h (#31859) * delete include framework.pb.h * fix error --- paddle/fluid/framework/custom_operator.cc | 1 - paddle/fluid/framework/executor_gc_helper.cc | 1 - paddle/fluid/framework/ir/graph_pattern_detector.h | 1 - paddle/fluid/framework/ir/layer_norm_fuse_pass.cc | 1 - paddle/fluid/framework/ir/layer_norm_fuse_pass_tester.cc | 1 - paddle/fluid/framework/op_info.h | 1 - paddle/fluid/framework/op_proto_maker.h | 1 - paddle/fluid/framework/op_version_registry.h | 1 - paddle/fluid/framework/operator.h | 1 - paddle/fluid/framework/program_desc.h | 1 - paddle/fluid/framework/reader.h | 1 - paddle/fluid/framework/tensor_util.h | 1 - paddle/fluid/framework/var_type.h | 1 - paddle/fluid/framework/var_type_traits.h | 1 - paddle/fluid/framework/variable_helper.h | 1 - paddle/fluid/imperative/gradient_accumulator.cc | 1 - paddle/fluid/inference/analysis/analysis_pass.h | 1 - paddle/fluid/inference/analysis/helper.cc | 1 - paddle/fluid/inference/analysis/helper.h | 1 - paddle/fluid/inference/engine.h | 1 - paddle/fluid/operators/cast_op.h | 1 - paddle/fluid/operators/distributed_ops/recv_save_op.cc | 1 - .../operators/fused/fused_embedding_eltwise_layernorm_op.cu | 1 - paddle/fluid/operators/inplace_abn_op.cc | 1 - paddle/fluid/operators/one_hot_op.cc | 1 - paddle/fluid/operators/one_hot_op_xpu.cc | 1 - paddle/fluid/operators/one_hot_v2_op.cc | 1 - paddle/fluid/operators/one_hot_v2_op_xpu.cc | 1 - paddle/fluid/operators/reader/create_py_reader_op.cc | 1 - paddle/fluid/operators/reader/read_op.cc | 1 - paddle/fluid/operators/save_combine_op.h | 1 - paddle/fluid/operators/save_op.h | 1 - paddle/fluid/pybind/pybind.cc | 1 - 33 files changed, 33 deletions(-) diff --git a/paddle/fluid/framework/custom_operator.cc b/paddle/fluid/framework/custom_operator.cc index 1ebb8998c854eb..97d58df6dc5738 100644 --- a/paddle/fluid/framework/custom_operator.cc +++ b/paddle/fluid/framework/custom_operator.cc @@ -28,7 +28,6 
@@ limitations under the License. */ #include "paddle/fluid/extension/include/ext_tensor.h" #include "paddle/fluid/framework/attribute.h" #include "paddle/fluid/framework/custom_tensor_utils.h" -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/op_meta_info_helper.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" diff --git a/paddle/fluid/framework/executor_gc_helper.cc b/paddle/fluid/framework/executor_gc_helper.cc index c8bc735790400b..c06a3d4a183799 100644 --- a/paddle/fluid/framework/executor_gc_helper.cc +++ b/paddle/fluid/framework/executor_gc_helper.cc @@ -18,7 +18,6 @@ #include "glog/logging.h" #include "paddle/fluid/framework/block_desc.h" -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/no_need_buffer_vars_inference.h" #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/operator.h" diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index 2e518c1d4df72a..b6c1074d90dd2a 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -28,7 +28,6 @@ #include #include -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/node.h" #include "paddle/fluid/inference/analysis/dot.h" diff --git a/paddle/fluid/framework/ir/layer_norm_fuse_pass.cc b/paddle/fluid/framework/ir/layer_norm_fuse_pass.cc index 69edc3d87f97d6..18d2e9817ebec8 100644 --- a/paddle/fluid/framework/ir/layer_norm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/layer_norm_fuse_pass.cc @@ -14,7 +14,6 @@ #include -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/ir/layer_norm_fuse_pass.h" #include "paddle/fluid/framework/op_version_registry.h" diff --git 
a/paddle/fluid/framework/ir/layer_norm_fuse_pass_tester.cc b/paddle/fluid/framework/ir/layer_norm_fuse_pass_tester.cc index 5fd47b21733b54..5fe71fbc21451f 100644 --- a/paddle/fluid/framework/ir/layer_norm_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/layer_norm_fuse_pass_tester.cc @@ -17,7 +17,6 @@ #include #include "paddle/fluid/framework/block_desc.h" -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/ir/layer_norm_fuse_pass.h" #include "paddle/fluid/framework/ir/pass_test_util.h" #include "paddle/fluid/framework/naive_executor.h" diff --git a/paddle/fluid/framework/op_info.h b/paddle/fluid/framework/op_info.h index af657232e91a68..ddd84bfd81abf5 100644 --- a/paddle/fluid/framework/op_info.h +++ b/paddle/fluid/framework/op_info.h @@ -20,7 +20,6 @@ limitations under the License. */ #include #include "paddle/fluid/framework/attribute.h" -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/no_need_buffer_vars_inference.h" #include "paddle/fluid/framework/type_defs.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/framework/op_proto_maker.h b/paddle/fluid/framework/op_proto_maker.h index 912e82f60ef5df..506c3eb1e0ad09 100644 --- a/paddle/fluid/framework/op_proto_maker.h +++ b/paddle/fluid/framework/op_proto_maker.h @@ -16,7 +16,6 @@ limitations under the License. */ #include #include "glog/logging.h" #include "paddle/fluid/framework/attribute.h" -#include "paddle/fluid/framework/framework.pb.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/op_version_registry.h b/paddle/fluid/framework/op_version_registry.h index b9ec5507612099..5ae8f255d63be5 100644 --- a/paddle/fluid/framework/op_version_registry.h +++ b/paddle/fluid/framework/op_version_registry.h @@ -20,7 +20,6 @@ limitations under the License. 
*/ #include #include -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/op_version_proto.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index e9ecf9b5a83978..bf27a8e37e0b32 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -27,7 +27,6 @@ limitations under the License. */ #include "glog/logging.h" // For VLOG #include "paddle/fluid/framework/attribute.h" #include "paddle/fluid/framework/block_desc.h" -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/op_kernel_type.h" diff --git a/paddle/fluid/framework/program_desc.h b/paddle/fluid/framework/program_desc.h index cfef80b8d37778..4ceb0c5c824814 100644 --- a/paddle/fluid/framework/program_desc.h +++ b/paddle/fluid/framework/program_desc.h @@ -20,7 +20,6 @@ limitations under the License. */ #include #include "paddle/fluid/framework/block_desc.h" -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/proto_desc.h" #include "paddle/fluid/platform/macros.h" diff --git a/paddle/fluid/framework/reader.h b/paddle/fluid/framework/reader.h index a4207deb7e8113..e7c23eab1fa5fc 100644 --- a/paddle/fluid/framework/reader.h +++ b/paddle/fluid/framework/reader.h @@ -20,7 +20,6 @@ #include #include "paddle/fluid/framework/ddim.h" -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/platform/place.h" diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h index 8a127e0ed59295..fd0f98784ceb0a 100644 --- a/paddle/fluid/framework/tensor_util.h +++ b/paddle/fluid/framework/tensor_util.h @@ -18,7 +18,6 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/dlpack_tensor.h" #include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/device_context.h" diff --git a/paddle/fluid/framework/var_type.h b/paddle/fluid/framework/var_type.h index 8affeda67b3d07..2e35f9b845ac73 100644 --- a/paddle/fluid/framework/var_type.h +++ b/paddle/fluid/framework/var_type.h @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/lod_rank_table.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor_array.h" diff --git a/paddle/fluid/framework/var_type_traits.h b/paddle/fluid/framework/var_type_traits.h index b0d8f43a90f35f..fc754cbaf177c9 100644 --- a/paddle/fluid/framework/var_type_traits.h +++ b/paddle/fluid/framework/var_type_traits.h @@ -21,7 +21,6 @@ #include #include "paddle/fluid/framework/feed_fetch_type.h" -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/platform/place.h" #ifdef PADDLE_WITH_CUDA diff --git a/paddle/fluid/framework/variable_helper.h b/paddle/fluid/framework/variable_helper.h index 6e65bc2c932877..4cdfba29249ccf 100644 --- a/paddle/fluid/framework/variable_helper.h +++ b/paddle/fluid/framework/variable_helper.h @@ -15,7 +15,6 @@ limitations under the License. 
*/ #include -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/variable.h" namespace paddle { diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc index deb504a1b657e4..b9df88b1f1eeaa 100644 --- a/paddle/fluid/imperative/gradient_accumulator.cc +++ b/paddle/fluid/imperative/gradient_accumulator.cc @@ -18,7 +18,6 @@ #include #include -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/imperative/layer.h" diff --git a/paddle/fluid/inference/analysis/analysis_pass.h b/paddle/fluid/inference/analysis/analysis_pass.h index d5a972fab3beae..14a1c3eea34174 100644 --- a/paddle/fluid/inference/analysis/analysis_pass.h +++ b/paddle/fluid/inference/analysis/analysis_pass.h @@ -18,7 +18,6 @@ limitations under the License. */ #include #include -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/inference/analysis/argument.h" #include "paddle/fluid/inference/analysis/helper.h" diff --git a/paddle/fluid/inference/analysis/helper.cc b/paddle/fluid/inference/analysis/helper.cc index 368ef2e5583fe2..ede0402f816765 100644 --- a/paddle/fluid/inference/analysis/helper.cc +++ b/paddle/fluid/inference/analysis/helper.cc @@ -13,7 +13,6 @@ // limitations under the License. #include "paddle/fluid/inference/analysis/helper.h" -#include "paddle/fluid/framework/framework.pb.h" namespace paddle { namespace inference { diff --git a/paddle/fluid/inference/analysis/helper.h b/paddle/fluid/inference/analysis/helper.h index ab4949935140c6..cace420d87c9df 100644 --- a/paddle/fluid/inference/analysis/helper.h +++ b/paddle/fluid/inference/analysis/helper.h @@ -25,7 +25,6 @@ limitations under the License. 
*/ #include #include -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/inference/engine.h b/paddle/fluid/inference/engine.h index 1a13ba510384c0..e29162cf5b23ba 100644 --- a/paddle/fluid/inference/engine.h +++ b/paddle/fluid/inference/engine.h @@ -15,7 +15,6 @@ limitations under the License. */ #pragma once #include -#include "paddle/fluid/framework/framework.pb.h" namespace paddle { namespace inference { diff --git a/paddle/fluid/operators/cast_op.h b/paddle/fluid/operators/cast_op.h index 8fa0416049f8fa..cd60c7707cb0aa 100644 --- a/paddle/fluid/operators/cast_op.h +++ b/paddle/fluid/operators/cast_op.h @@ -15,7 +15,6 @@ limitations under the License. */ #pragma once #include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/transform.h" diff --git a/paddle/fluid/operators/distributed_ops/recv_save_op.cc b/paddle/fluid/operators/distributed_ops/recv_save_op.cc index d194fcda36a474..d6da818e1df51d 100644 --- a/paddle/fluid/operators/distributed_ops/recv_save_op.cc +++ b/paddle/fluid/operators/distributed_ops/recv_save_op.cc @@ -20,7 +20,6 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/data_type_transform.h" -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/selected_rows.h" diff --git a/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu b/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu index 9711cc8d811d5d..14a6608836a8a7 100644 --- a/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu +++ b/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu @@ -14,7 +14,6 @@ #include #include -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/operators/math/bert_encoder_functor.h" diff --git a/paddle/fluid/operators/inplace_abn_op.cc b/paddle/fluid/operators/inplace_abn_op.cc index 652c071be6b33f..8234d63d681ff0 100644 --- a/paddle/fluid/operators/inplace_abn_op.cc +++ b/paddle/fluid/operators/inplace_abn_op.cc @@ -16,7 +16,6 @@ #include #include #include -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/operators/batch_norm_op.h" namespace paddle { diff --git a/paddle/fluid/operators/one_hot_op.cc b/paddle/fluid/operators/one_hot_op.cc index 9c321832f8489a..64323e588c6285 100644 --- a/paddle/fluid/operators/one_hot_op.cc +++ b/paddle/fluid/operators/one_hot_op.cc @@ -15,7 +15,6 @@ #include "paddle/fluid/operators/one_hot_op.h" #include #include -#include "paddle/fluid/framework/framework.pb.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/one_hot_op_xpu.cc b/paddle/fluid/operators/one_hot_op_xpu.cc index 14ecd11d114d0b..3e214aa8bf8221 100644 --- a/paddle/fluid/operators/one_hot_op_xpu.cc +++ b/paddle/fluid/operators/one_hot_op_xpu.cc @@ -16,7 +16,6 @@ #include #include -#include "paddle/fluid/framework/framework.pb.h" 
#include "paddle/fluid/operators/one_hot_op.h" namespace paddle { diff --git a/paddle/fluid/operators/one_hot_v2_op.cc b/paddle/fluid/operators/one_hot_v2_op.cc index 29fe6f10c72f43..c42db1e6f449c0 100644 --- a/paddle/fluid/operators/one_hot_v2_op.cc +++ b/paddle/fluid/operators/one_hot_v2_op.cc @@ -15,7 +15,6 @@ #include "paddle/fluid/operators/one_hot_v2_op.h" #include #include -#include "paddle/fluid/framework/framework.pb.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/one_hot_v2_op_xpu.cc b/paddle/fluid/operators/one_hot_v2_op_xpu.cc index 6fec597db17290..e24be3bead6889 100644 --- a/paddle/fluid/operators/one_hot_v2_op_xpu.cc +++ b/paddle/fluid/operators/one_hot_v2_op_xpu.cc @@ -16,7 +16,6 @@ #include #include -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/operators/one_hot_op.h" namespace paddle { diff --git a/paddle/fluid/operators/reader/create_py_reader_op.cc b/paddle/fluid/operators/reader/create_py_reader_op.cc index c04bdb2f10930e..a7d177f326e511 100644 --- a/paddle/fluid/operators/reader/create_py_reader_op.cc +++ b/paddle/fluid/operators/reader/create_py_reader_op.cc @@ -13,7 +13,6 @@ // limitations under the License. #include "paddle/fluid/framework/ddim.h" -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/operators/reader/py_reader.h" #include "paddle/fluid/operators/reader/reader_op_registry.h" diff --git a/paddle/fluid/operators/reader/read_op.cc b/paddle/fluid/operators/reader/read_op.cc index 9086291e17db89..38894495b4ca05 100644 --- a/paddle/fluid/operators/reader/read_op.cc +++ b/paddle/fluid/operators/reader/read_op.cc @@ -12,7 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/platform/profiler.h" diff --git a/paddle/fluid/operators/save_combine_op.h b/paddle/fluid/operators/save_combine_op.h index 0246c42d433255..939768693a2431 100644 --- a/paddle/fluid/operators/save_combine_op.h +++ b/paddle/fluid/operators/save_combine_op.h @@ -22,7 +22,6 @@ limitations under the License. */ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/data_type_transform.h" -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device_context.h" diff --git a/paddle/fluid/operators/save_op.h b/paddle/fluid/operators/save_op.h index fbde722a425bc3..e44a5c77bd8410 100644 --- a/paddle/fluid/operators/save_op.h +++ b/paddle/fluid/operators/save_op.h @@ -19,7 +19,6 @@ limitations under the License. */ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/data_type_transform.h" -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/selected_rows.h" diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index c8ca3bf2c8fa27..e1ff69e7485eb8 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -29,7 +29,6 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/feed_fetch_method.h" #include "paddle/fluid/framework/feed_fetch_type.h" -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/garbage_collector.h" #include "paddle/fluid/framework/io/fs.h" #include "paddle/fluid/framework/ir/coalesce_grad_tensor_pass.h" From 70b67f1029a8ddfa68cf2a6f0d5631b95ff591bd Mon Sep 17 00:00:00 2001 From: Wilber Date: Fri, 26 Mar 2021 13:45:31 +0800 Subject: [PATCH 03/15] fix go api bug. (#31857) --- go/README_cn.md | 1 + go/demo/mobilenet.go | 2 +- go/paddle/common.go | 2 +- go/paddle/config.go | 2 +- go/paddle/predictor.go | 4 ++-- go/paddle/tensor.go | 4 ++-- paddle/fluid/inference/capi/pd_predictor.cc | 9 ++++++--- 7 files changed, 14 insertions(+), 10 deletions(-) diff --git a/go/README_cn.md b/go/README_cn.md index a184ecbb8dea1a..040540e939bc3a 100644 --- a/go/README_cn.md +++ b/go/README_cn.md @@ -50,6 +50,7 @@ output_data := value.Interface().([][]float32) 运行 ```bash +go mod init github.com/paddlepaddle export LD_LIBRARY_PATH=`pwd`/paddle_c/paddle/lib:$LD_LIBRARY_PATH go run ./demo/mobilenet.go ``` diff --git a/go/demo/mobilenet.go b/go/demo/mobilenet.go index 1b42fe8049a584..c1ca2e967f72dc 100644 --- a/go/demo/mobilenet.go +++ b/go/demo/mobilenet.go @@ -13,7 +13,7 @@ // limitations under the License. 
package main -import "../paddle" +import "github.com/paddlepaddle/paddle" import "strings" import "io/ioutil" import "strconv" diff --git a/go/paddle/common.go b/go/paddle/common.go index 4bf94765931282..cbbde6a45f59b8 100644 --- a/go/paddle/common.go +++ b/go/paddle/common.go @@ -15,7 +15,7 @@ package paddle // #cgo CFLAGS: -I${SRCDIR}/../paddle_c/paddle/include -// #cgo LDFLAGS: -L${SRCDIR}/../paddle_c/paddle/lib -lpaddle_fluid_c +// #cgo LDFLAGS: -L${SRCDIR}/../paddle_c/paddle/lib -lpaddle_inference_c // #include // #include import "C" diff --git a/go/paddle/config.go b/go/paddle/config.go index 89f7d7e63ff2a8..68a31230997bed 100644 --- a/go/paddle/config.go +++ b/go/paddle/config.go @@ -15,7 +15,7 @@ package paddle // #cgo CFLAGS: -I${SRCDIR}/../paddle_c/paddle/include -// #cgo LDFLAGS: -L${SRCDIR}/../paddle_c/paddle/lib -lpaddle_fluid_c +// #cgo LDFLAGS: -L${SRCDIR}/../paddle_c/paddle/lib -lpaddle_inference_c // #include // #include // #include diff --git a/go/paddle/predictor.go b/go/paddle/predictor.go index 59bad908e6a508..5f2b2c81a60549 100644 --- a/go/paddle/predictor.go +++ b/go/paddle/predictor.go @@ -15,7 +15,7 @@ package paddle // #cgo CFLAGS: -I${SRCDIR}/../paddle_c/paddle/include -// #cgo LDFLAGS: -L${SRCDIR}/../paddle_c/paddle/lib -lpaddle_fluid_c +// #cgo LDFLAGS: -L${SRCDIR}/../paddle_c/paddle/lib -lpaddle_inference_c // #include // #include "paddle_c_api.h" import "C" @@ -88,7 +88,7 @@ func (predictor *Predictor) GetInputNames() []string { } func (predictor *Predictor) GetOutputNames() []string { - names := make([]string, predictor.GetInputNum()) + names := make([]string, predictor.GetOutputNum()) for i := 0; i < len(names); i++ { names[i] = predictor.GetOutputName(i) } diff --git a/go/paddle/tensor.go b/go/paddle/tensor.go index e6e2c53fef1af5..6fbcf039f88a7c 100644 --- a/go/paddle/tensor.go +++ b/go/paddle/tensor.go @@ -15,7 +15,7 @@ package paddle // #cgo CFLAGS: -I${SRCDIR}/../paddle_c/paddle/include -// #cgo LDFLAGS: 
-L${SRCDIR}/../paddle_c/paddle/lib -lpaddle_fluid_c +// #cgo LDFLAGS: -L${SRCDIR}/../paddle_c/paddle/lib -lpaddle_inference_c // #include // #include // #include @@ -209,7 +209,7 @@ func DecodeTensor(r *bytes.Reader, shape []int32, t reflect.Type, ptr reflect.Va value := reflect.Indirect(ptr) value.Set(reflect.MakeSlice(t, int(shape[0]), int(shape[0]))) if len(shape) == 1 && value.Len() > 0 { - switch value.Index(1).Kind() { + switch value.Index(0).Kind() { case reflect.Uint8, reflect.Int32, reflect.Int64, reflect.Float32: binary.Read(r, Endian(), value.Interface()) return diff --git a/paddle/fluid/inference/capi/pd_predictor.cc b/paddle/fluid/inference/capi/pd_predictor.cc index c1bf4c974fac8c..c4e195b6ec8fab 100644 --- a/paddle/fluid/inference/capi/pd_predictor.cc +++ b/paddle/fluid/inference/capi/pd_predictor.cc @@ -207,13 +207,16 @@ int PD_GetOutputNum(const PD_Predictor* predictor) { } const char* PD_GetInputName(const PD_Predictor* predictor, int n) { - static std::vector names = predictor->predictor->GetInputNames(); + static std::vector names; + names.resize(predictor->predictor->GetInputNames().size()); + names[n] = predictor->predictor->GetInputNames()[n]; return names[n].c_str(); } const char* PD_GetOutputName(const PD_Predictor* predictor, int n) { - static std::vector names = - predictor->predictor->GetOutputNames(); + static std::vector names; + names.resize(predictor->predictor->GetOutputNames().size()); + names[n] = predictor->predictor->GetOutputNames()[n]; return names[n].c_str(); } From 01aa252624a639552116a0c46188ca7f5c43a1ee Mon Sep 17 00:00:00 2001 From: zlsh80826 Date: Fri, 26 Mar 2021 15:58:50 +0800 Subject: [PATCH 04/15] [Paddle-TRT] multiclass nms (#31742) * add multiclass_nms * add multiclass_nms unittest * add default enable_tensorrt_oss option * refine multiclas nms unittest and add serialization/dynamic test * change super to InferencePassTest for python2 compatibility * refine multiclass nms unittest * move out dynamic shape test due 
to ci timelimit --- .../fluid/inference/api/analysis_predictor.cc | 2 +- .../inference/tensorrt/convert/CMakeLists.txt | 2 +- .../tensorrt/convert/multiclass_nms_op.cc | 133 ++++++++++++++++ paddle/fluid/inference/tensorrt/op_teller.cc | 34 ++++- .../ir/inference/inference_pass_test.py | 3 + .../inference/test_trt_multiclass_nms_op.py | 144 ++++++++++++++++++ 6 files changed, 315 insertions(+), 3 deletions(-) create mode 100644 paddle/fluid/inference/tensorrt/convert/multiclass_nms_op.cc create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_trt_multiclass_nms_op.py diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 8f2b217a2fde0a..0007582e2c73d2 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -1192,7 +1192,7 @@ USE_TRT_CONVERTER(scale); USE_TRT_CONVERTER(stack); USE_TRT_CONVERTER(clip); USE_TRT_CONVERTER(gather); - +USE_TRT_CONVERTER(multiclass_nms); USE_TRT_CONVERTER(nearest_interp); #endif diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index b0d0229ec0531f..be7fa0548d9f34 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -6,7 +6,7 @@ nv_library(tensorrt_converter shuffle_channel_op.cc swish_op.cc instance_norm_op.cc stack_op.cc transpose_op.cc flatten_op.cc emb_eltwise_layernorm.cc skip_layernorm.cc scale_op.cc slice_op.cc hard_sigmoid_op.cc hard_swish_op.cc clip_op.cc gather_op.cc - + multiclass_nms_op.cc nearest_interp_op.cc DEPS tensorrt_engine tensorrt_plugin operator scope framework_proto op_registry) diff --git a/paddle/fluid/inference/tensorrt/convert/multiclass_nms_op.cc b/paddle/fluid/inference/tensorrt/convert/multiclass_nms_op.cc new file mode 100644 index 00000000000000..b0d67a5bf90ca9 --- /dev/null +++ 
b/paddle/fluid/inference/tensorrt/convert/multiclass_nms_op.cc @@ -0,0 +1,133 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" + +namespace paddle { +namespace framework { +class Scope; +namespace proto { +class OpDesc; +} // namespace proto +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace inference { +namespace tensorrt { + +class MultiClassNMSOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + VLOG(3) << "convert a fluid multiclassNMS op to tensorrt plugin"; + + // for now, only work for static shape and regular tensor + framework::OpDesc op_desc(op, nullptr); + + std::string bboxes = op_desc.Input("BBoxes").front(); + std::string scores = op_desc.Input("Scores").front(); + std::string output_name = op_desc.Output("Out").front(); + + auto* bboxes_tensor = engine_->GetITensor(bboxes); + auto* scores_tensor = engine_->GetITensor(scores); + + int background_label = + BOOST_GET_CONST(int, op_desc.GetAttr("background_label")); + float score_threshold = + BOOST_GET_CONST(float, op_desc.GetAttr("score_threshold")); + int nms_top_k = BOOST_GET_CONST(int, op_desc.GetAttr("nms_top_k")); + float nms_threshold = + BOOST_GET_CONST(float, op_desc.GetAttr("nms_threshold")); + int keep_top_k = BOOST_GET_CONST(int, 
op_desc.GetAttr("keep_top_k")); + bool normalized = BOOST_GET_CONST(bool, op_desc.GetAttr("normalized")); + int num_classes = scores_tensor->getDimensions().d[0]; + + auto bboxes_dims = bboxes_tensor->getDimensions(); + nvinfer1::Dims3 bboxes_expand_dims(bboxes_dims.d[0], 1, bboxes_dims.d[1]); + auto* bboxes_expand_layer = + TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *bboxes_tensor); + bboxes_expand_layer->setReshapeDimensions(bboxes_expand_dims); + + nvinfer1::Permutation permutation{1, 0}; + auto* scores_transpose_layer = + TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *scores_tensor); + scores_transpose_layer->setFirstTranspose(permutation); + + std::vector batch_nms_inputs; + batch_nms_inputs.push_back(bboxes_expand_layer->getOutput(0)); + batch_nms_inputs.push_back(scores_transpose_layer->getOutput(0)); + + constexpr bool shareLocation = true; + constexpr bool clip_boxes = false; + + const std::vector fields{ + {"shareLocation", &shareLocation, nvinfer1::PluginFieldType::kINT32, 1}, + {"backgroundLabelId", &background_label, + nvinfer1::PluginFieldType::kINT32, 1}, + {"numClasses", &num_classes, nvinfer1::PluginFieldType::kINT32, 1}, + {"topK", &nms_top_k, nvinfer1::PluginFieldType::kINT32, 1}, + {"keepTopK", &keep_top_k, nvinfer1::PluginFieldType::kINT32, 1}, + {"scoreThreshold", &score_threshold, + nvinfer1::PluginFieldType::kFLOAT32, 1}, + {"iouThreshold", &nms_threshold, nvinfer1::PluginFieldType::kFLOAT32, + 1}, + {"isNormalized", &normalized, nvinfer1::PluginFieldType::kINT32, 1}, + {"clipBoxes", &clip_boxes, nvinfer1::PluginFieldType::kINT32, 1}, + }; + + nvinfer1::PluginFieldCollection* plugin_collections = + static_cast( + malloc(sizeof(*plugin_collections) + + fields.size() * sizeof(nvinfer1::PluginField))); + plugin_collections->nbFields = static_cast(fields.size()); + plugin_collections->fields = fields.data(); + + auto creator = GetPluginRegistry()->getPluginCreator("BatchedNMS_TRT", "1"); + auto batch_nms_plugin = + creator->createPlugin("BatchNMSPlugin", 
plugin_collections); + free(plugin_collections); + + auto batch_nms_layer = engine_->network()->addPluginV2( + batch_nms_inputs.data(), batch_nms_inputs.size(), *batch_nms_plugin); + auto nmsed_boxes = batch_nms_layer->getOutput(1); + auto nmsed_scores = batch_nms_layer->getOutput(2); + auto nmsed_classes = batch_nms_layer->getOutput(3); + + auto nmsed_scores_transpose_layer = + TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *nmsed_scores); + nmsed_scores_transpose_layer->setReshapeDimensions( + nvinfer1::Dims2(keep_top_k, 1)); + auto nmsed_classes_reshape_layer = + TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *nmsed_classes); + nmsed_classes_reshape_layer->setReshapeDimensions( + nvinfer1::Dims2(keep_top_k, 1)); + + std::vector concat_inputs; + concat_inputs.push_back(nmsed_classes_reshape_layer->getOutput(0)); + concat_inputs.push_back(nmsed_scores_transpose_layer->getOutput(0)); + concat_inputs.push_back(nmsed_boxes); + + auto nms_concat_layer = TRT_ENGINE_ADD_LAYER( + engine_, Concatenation, concat_inputs.data(), concat_inputs.size()); + nms_concat_layer->setAxis(1); + + RreplenishLayerAndOutput(nms_concat_layer, "multiclass_nms", {output_name}, + test_mode); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(multiclass_nms, MultiClassNMSOpConverter); diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 11752d71a45e1b..82f58254fe8e0d 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -111,7 +111,7 @@ struct SimpleOpTypeSetTeller : public Teller { "flatten2", "flatten", "gather", - + "multiclass_nms", "nearest_interp", }; }; @@ -195,6 +195,38 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, // current not support axis from input, use default 0 if (!with_dynamic_shape || desc.Input("Axis").size() > 0) return false; } + + if (op_type == "multiclass_nms") { + if 
(with_dynamic_shape) return false; + auto* block = desc.Block(); + for (auto& param_name : desc.Inputs()) { + for (auto& var_name : param_name.second) { + auto* var_desc = block->FindVar(var_name); + const auto shape = var_desc->GetShape(); + if (shape.size() != 3) { + VLOG(1) << "multiclass_nms op dims != 3 not supported in tensorrt, " + "but got dims " + << shape.size() << ", so jump it."; + return false; + } + } + } + bool has_attrs = + (desc.HasAttr("background_label") && + desc.HasAttr("score_threshold") && desc.HasAttr("nms_top_k") && + desc.HasAttr("keep_top_k") && desc.HasAttr("normalized")); + if (has_attrs == false) return false; + + auto nms_top_k = BOOST_GET_CONST(int, desc.GetAttr("nms_top_k")); + if (nms_top_k < 0) return false; + + auto keep_top_k = BOOST_GET_CONST(int, desc.GetAttr("keep_top_k")); + if (keep_top_k < 0) return false; + + auto registry = GetPluginRegistry(); + if (registry == nullptr) return false; + } + if (op_type == "fc" || op_type == "mul") { const int x_num_col_dims = desc.HasAttr("x_num_col_dims") diff --git a/python/paddle/fluid/tests/unittests/ir/inference/inference_pass_test.py b/python/paddle/fluid/tests/unittests/ir/inference/inference_pass_test.py index 993493a3ccf2b6..010086bfbbc47f 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/inference_pass_test.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/inference_pass_test.py @@ -46,6 +46,7 @@ def __init__(self, methodName='runTest'): self.enable_mkldnn = False self.enable_mkldnn_bfloat16 = False self.enable_trt = False + self.enable_tensorrt_oss = True self.trt_parameters = None self.dynamic_shape_params = None self.enable_lite = False @@ -133,6 +134,8 @@ def _get_analysis_config(self, self.dynamic_shape_params.max_input_shape, self.dynamic_shape_params.optim_input_shape, self.dynamic_shape_params.disable_trt_plugin_fp16) + if self.enable_tensorrt_oss: + config.enable_tensorrt_oss() elif use_mkldnn: config.enable_mkldnn() diff --git 
a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_multiclass_nms_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_multiclass_nms_op.py new file mode 100644 index 00000000000000..3ca6985985985e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_multiclass_nms_op.py @@ -0,0 +1,144 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import itertools +import numpy as np +from inference_pass_test import InferencePassTest +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.core import PassVersionChecker +from paddle.fluid.core import AnalysisConfig + + +class TensorRTMultiClassNMSTest(InferencePassTest): + def setUp(self): + self.enable_trt = True + self.enable_tensorrt_oss = True + self.precision = AnalysisConfig.Precision.Float32 + self.serialize = False + self.bs = 1 + self.background_label = -1 + self.score_threshold = .5 + self.nms_top_k = 8 + self.nms_threshold = .3 + self.keep_top_k = 8 + self.normalized = False + self.num_classes = 8 + self.num_boxes = 8 + self.trt_parameters = InferencePassTest.TensorRTParam( + 1 << 30, self.bs, 2, self.precision, self.serialize, False) + + def build(self): + with fluid.program_guard(self.main_program, self.startup_program): + boxes = fluid.data( + name='bboxes', shape=[-1, self.num_boxes, 4], dtype='float32') + scores = fluid.data( + name='scores', + shape=[-1, 
self.num_classes, self.num_boxes], + dtype='float32') + multiclass_nms_out = fluid.layers.multiclass_nms( + bboxes=boxes, + scores=scores, + background_label=self.background_label, + score_threshold=self.score_threshold, + nms_top_k=self.nms_top_k, + nms_threshold=self.nms_threshold, + keep_top_k=self.keep_top_k, + normalized=self.normalized) + mutliclass_nms_out = multiclass_nms_out + 1. + multiclass_nms_out = fluid.layers.reshape( + multiclass_nms_out, [self.bs, 1, self.keep_top_k, 6], + name='reshape') + out = fluid.layers.batch_norm(multiclass_nms_out, is_test=True) + + boxes_data = np.arange(self.num_boxes * 4).reshape( + [self.bs, self.num_boxes, 4]).astype('float32') + scores_data = np.arange(1 * self.num_classes * self.num_boxes).reshape( + [self.bs, self.num_classes, self.num_boxes]).astype('float32') + self.feeds = { + 'bboxes': boxes_data, + 'scores': scores_data, + } + self.fetch_list = [out] + + def run_test(self): + self.build() + self.check_output() + + def run_test_all(self): + precision_opt = [ + AnalysisConfig.Precision.Float32, AnalysisConfig.Precision.Half + ] + serialize_opt = [False, True] + max_shape = { + 'bboxes': [self.bs, self.num_boxes, 4], + 'scores': [self.bs, self.num_classes, self.num_boxes], + } + opt_shape = max_shape + dynamic_shape_opt = [ + None, InferencePassTest.DynamicShapeParam({ + 'bboxes': [1, 1, 4], + 'scores': [1, 1, 1] + }, max_shape, opt_shape, False) + ] + for precision, serialize, dynamic_shape in itertools.product( + precision_opt, serialize_opt, dynamic_shape_opt): + self.precision = precision + self.serialize = serialize + self.dynamic_shape_params = dynamic_shape + self.build() + self.check_output() + + def check_output(self): + if core.is_compiled_with_cuda(): + use_gpu = True + self.check_output_with_option(use_gpu) + self.assertTrue( + PassVersionChecker.IsCompatible('tensorrt_subgraph_pass')) + + def test_base(self): + self.run_test() + + def test_fp16(self): + self.precision = AnalysisConfig.Precision.Half + 
self.run_test() + + def test_serialize(self): + self.serialize = True + self.run_test() + + def test_dynamic(self): + max_shape = { + 'bboxes': [self.bs, self.num_boxes, 4], + 'scores': [self.bs, self.num_classes, self.num_boxes], + } + opt_shape = max_shape + self.dynamic_shape_params = InferencePassTest.DynamicShapeParam({ + 'bboxes': [1, 1, 4], + 'scores': [1, 1, 1] + }, max_shape, opt_shape, False) + self.run_test() + + def test_background(self): + self.background = 7 + self.run_test() + + def test_disable_oss(self): + self.diable_tensorrt_oss = False + self.run_test() + + +if __name__ == "__main__": + unittest.main() From c3974d0e2a6353f3a134e8925aeb15cac7f0e48b Mon Sep 17 00:00:00 2001 From: lilong12 Date: Fri, 26 Mar 2021 18:11:51 +0800 Subject: [PATCH 05/15] [3D-parallel] Reformat pipeline parallel (#31786) * update, test=develop --- paddle/fluid/framework/section_worker.cc | 20 +- .../fleet/meta_optimizers/common.py | 41 +- .../meta_optimizers/pipeline_optimizer.py | 308 +++--- .../contrib/mixed_precision/fp16_utils.py | 10 +- python/paddle/fluid/device_worker.py | 2 +- python/paddle/fluid/executor.py | 23 +- python/paddle/fluid/optimizer.py | 954 +++++++++++------- .../fluid/tests/unittests/pipeline_mnist.py | 27 +- 8 files changed, 816 insertions(+), 569 deletions(-) diff --git a/paddle/fluid/framework/section_worker.cc b/paddle/fluid/framework/section_worker.cc index 90a371e474756e..e740771e5ca9fc 100644 --- a/paddle/fluid/framework/section_worker.cc +++ b/paddle/fluid/framework/section_worker.cc @@ -39,13 +39,13 @@ void SectionWorker::RunForward( int op_role = op->Attr(std::string("op_role")); // We run op with op_role = kLRSched only for the first microbatch // to avoid increasing the @LR_DECAY_STEP@ multiple times. 
- bool run_first_mbatch = op_role == static_cast(OpRole::kForward) || - op_role == (static_cast(OpRole::kForward) | - static_cast(OpRole::kLoss)) || - op_role == static_cast(OpRole::kLRSched); - bool run_others = op_role == static_cast(OpRole::kForward) || - op_role == (static_cast(OpRole::kForward) | - static_cast(OpRole::kLoss)); + bool run_first_mbatch = (op_role == static_cast(OpRole::kForward)) || + (op_role == (static_cast(OpRole::kForward) | + static_cast(OpRole::kLoss))) || + (op_role == static_cast(OpRole::kLRSched)); + bool run_others = (op_role == static_cast(OpRole::kForward)) || + (op_role == (static_cast(OpRole::kForward) | + static_cast(OpRole::kLoss))); if ((micro_id == 0 && run_first_mbatch) || (micro_id != 0 && run_others)) { VLOG(3) << "Forward: running op " << op->Type() << " for micro-batch " << micro_id; @@ -64,9 +64,9 @@ void SectionWorker::RunBackward( &unused_vars_) { for (auto &op : ops_) { int op_role = op->Attr(std::string("op_role")); - if (op_role == static_cast(OpRole::kBackward) || - op_role == (static_cast(OpRole::kBackward) | - static_cast(OpRole::kLoss))) { + if ((op_role == static_cast(OpRole::kBackward)) || + (op_role == (static_cast(OpRole::kBackward) | + static_cast(OpRole::kLoss)))) { VLOG(3) << "Backward: running op " << op->Type() << " for micro-batch " << micro_id; op->Run(*microbatch_scopes_[micro_id], place_); diff --git a/python/paddle/distributed/fleet/meta_optimizers/common.py b/python/paddle/distributed/fleet/meta_optimizers/common.py index 00d58cbd997fb4..c3d27bcc4ea551 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/common.py +++ b/python/paddle/distributed/fleet/meta_optimizers/common.py @@ -47,7 +47,7 @@ def is_optimizer_op(op): class CollectiveHelper(object): - def __init__(self, role_maker, nrings=1, wait_port='6174'): + def __init__(self, role_maker, nrings=1, wait_port=True): self.nrings = nrings self.wait_port = wait_port self.role_maker = role_maker @@ -65,14 +65,48 @@ def 
update_startup_program(self, startup_program=None): self.role_maker._worker_index(), ring_id, self.wait_port) self._broadcast_params() - def _init_communicator(self, program, current_endpoint, endpoints, rank, - ring_id, wait_port): + def _init_communicator(self, + program, + current_endpoint, + endpoints, + rank, + ring_id, + wait_port, + global_ring_id=None, + sync=True): nranks = len(endpoints) other_endpoints = endpoints[:] other_endpoints.remove(current_endpoint) if rank == 0 and wait_port: wait_server_ready(other_endpoints) + def _add_sync_by_allreduce(block): + sync_var = block.create_var( + name=unique_name.generate('sync_var'), + dtype=core.VarDesc.VarType.INT32, + persistable=False, + stop_gradient=True) + block.append_op( + type='fill_constant', + inputs={}, + outputs={'Out': [sync_var]}, + attrs={ + 'shape': [1], + 'dtype': sync_var.dtype, + 'value': 1, + 'force_cpu': False, + OP_ROLE_KEY: OpRole.Forward + }) + block.append_op( + type='c_allreduce_sum', + inputs={'X': [sync_var]}, + outputs={'Out': [sync_var]}, + attrs={ + 'ring_id': global_ring_id, + 'use_calc_stream': True, + OP_ROLE_KEY: OpRole.Forward + }) + block = program.global_block() if core.is_compiled_with_cuda(): comm_id_var = block.create_var( @@ -128,6 +162,7 @@ def _init_communicator(self, program, current_endpoint, endpoints, rank, raise ValueError( "comm_id must be generated in paddlepaddle-xpu or paddlepaddle-xpu." 
) + if sync: _add_sync_by_allreduce(block) def _wait(self, current_endpoint, endpoints): assert (self.wait_port) diff --git a/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py index 9535c9ef53c2e6..6f435bb86ba5ac 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py @@ -19,130 +19,21 @@ from ..base.private_helper_function import wait_server_ready from paddle.fluid.optimizer import PipelineOptimizer as PO from .meta_optimizer_base import MetaOptimizerBase -from .common import OpRole, OP_ROLE_KEY, OP_ROLE_VAR_KEY, CollectiveHelper, is_update_op, is_loss_grad_op, is_backward_op, is_optimizer_op - - -def _get_node_num(endpoints): - ss = set() - for ep in endpoints: - ip = ep.split(":")[0].strip() - if ip not in ss: - ss.add(ip) - return len(ss) - - -class PipelineHelper(object): - def __init__(self, role_maker, wait_port='6174'): - self.wait_port = wait_port - self.role_maker = role_maker - - def update_startup_program(self, - startup_program=None, - inner_parallelism=None): - self.startup_program = startup_program - - nranks = self.role_maker._worker_num() - rank = self.role_maker._worker_index() - endpoints = self.role_maker._get_trainer_endpoints() - current_endpoint = endpoints[rank] - node_num = _get_node_num(endpoints) - assert nranks % node_num == 0 - - # Create ring 0 for all gpus in the same pipeline - if inner_parallelism > 1: - pipeline_rank = rank % inner_parallelism - pipeline_id = rank // inner_parallelism - start_index = pipeline_id * inner_parallelism - pipeline_endpoints = endpoints[start_index:start_index + - inner_parallelism] - self._init_communicator(self.startup_program, current_endpoint, - pipeline_endpoints, pipeline_rank, 0, - self.wait_port) - - pipeline_num = len(endpoints) // inner_parallelism - if pipeline_num == 1: return - # Create rings for 
gpus with the same pipeline id for data parallel - eps = [] - pipeline_rank = rank % inner_parallelism - ring_id = pipeline_rank + 1 - for i in range(pipeline_num): - eps.append(endpoints[i * inner_parallelism + pipeline_rank]) - # rank in a ring of gpus with the same pipeline id for data parallel - dp_rank = rank // inner_parallelism - self._init_communicator(self.startup_program, current_endpoint, eps, - dp_rank, ring_id, self.wait_port) - self._broadcast_params(ring_id) - - def _init_communicator(self, program, current_endpoint, endpoints, rank, - ring_id, wait_port): - nranks = len(endpoints) - other_endpoints = endpoints[:] - other_endpoints.remove(current_endpoint) - if rank == 0 and wait_port: - wait_server_ready(other_endpoints) - - block = program.global_block() - nccl_id_var = block.create_var( - name=unique_name.generate('nccl_id'), - persistable=True, - type=core.VarDesc.VarType.RAW) - block.append_op( - type='c_gen_nccl_id', - inputs={}, - outputs={'Out': nccl_id_var}, - attrs={ - 'rank': rank, - 'endpoint': current_endpoint, - 'other_endpoints': other_endpoints, - OP_ROLE_KEY: OpRole.Forward, - }) - block.append_op( - type='c_comm_init', - inputs={'X': nccl_id_var}, - outputs={}, - attrs={ - 'nranks': nranks, - 'rank': rank, - 'ring_id': ring_id, - OP_ROLE_KEY: OpRole.Forward, - }) - - def _broadcast_params(self, ring_id): - block = self.startup_program.global_block() - for var_name in block.vars: - if "nccl_id" in var_name: continue - param = block.var(var_name) - if not param.persistable: - continue - - block.append_op( - type='c_broadcast', - inputs={'X': param}, - outputs={'Out': param}, - attrs={ - 'ring_id': ring_id, - 'root': 0, - OP_ROLE_KEY: OpRole.Forward - }) - - block.append_op( - type='c_sync_comm_stream', - inputs={'X': param}, - outputs={'Out': param}, - attrs={'ring_id': ring_id, - OP_ROLE_KEY: OpRole.Forward}) +from .common import OpRole, OP_ROLE_KEY, OP_ROLE_VAR_KEY, CollectiveHelper, is_loss_grad_op, is_backward_op, is_optimizer_op 
class PipelineOptimizer(MetaOptimizerBase): def __init__(self, optimizer): super(PipelineOptimizer, self).__init__(optimizer) self.inner_opt = optimizer - # we do not allow meta optimizer to be inner optimizer currently self.meta_optimizers_white_list = [ "RecomputeOptimizer", "AMPOptimizer", ] self.meta_optimizers_black_list = ["GraphExecutionOptimizer", ] + self.global_ring_id = 1 + self.dp_ring_id = 2 + self.start_pipeline_ring_id = 20 # Just a magic number def _set_basic_info(self, loss, role_maker, user_defined_optimizer, user_defined_strategy): @@ -165,7 +56,11 @@ def _can_apply(self): def _disable_strategy(self, dist_strategy): dist_strategy.pipeline = False - dist_strategy.pipeline_configs = {} + dist_strategy.pipeline_configs = { + "micro_batch_size": 1, + "accumulate_steps": 1, + "schedule_mode": "1F1B", + } def _enable_strategy(self, dist_strategy, context): dist_strategy.pipeline = True @@ -175,61 +70,134 @@ def _enable_strategy(self, dist_strategy, context): "schedule_mode": "1F1B", } + def _broadcast_params(self, ring_id): + block = self.startup_program.global_block() + param = None + for param in block.iter_parameters(): + if param.is_distributed: + continue + + block.append_op( + type='c_broadcast', + inputs={'X': param}, + outputs={'Out': param}, + attrs={ + 'ring_id': ring_id, + 'root': 0, + OP_ROLE_KEY: OpRole.Forward + }) + + if not param: return # no parameter on this device + block.append_op( + type='c_sync_comm_stream', + inputs={'X': param}, + outputs={'Out': param}, + attrs={'ring_id': ring_id, + OP_ROLE_KEY: OpRole.Forward}) + + def _get_process_group_info(self): + # global ring info + self.global_endpoints = self.endpoints + self.global_rank = self.rank + self.global_nranks = self.nranks + + # data parallel ring info + if self.pipeline_num > 1: + self.dp_rank = self.rank // self.inner_parallelism + self.dp_nranks = self.nranks // self.inner_parallelism + start_index = self.rank % self.inner_parallelism + self.dp_endpoints = [ + 
self.endpoints[start_index + i * self.inner_parallelism] + for i in range(self.pipeline_num) + ] + + def _init_process_group(self, pipeline_pair, pipeline_ring_map): + self._get_process_group_info() + collective_helper = CollectiveHelper(self.role_maker, wait_port=False) + # Create global ring for all gpus (ring_id = 0) + collective_helper._init_communicator( + self.startup_program, self.current_endpoint, self.global_endpoints, + self.global_rank, self.global_ring_id, True, self.global_ring_id, + True) + # Create pipeline rings + if self.inner_parallelism > 1: + pipeline_id = self.rank // self.inner_parallelism + start_index = pipeline_id * self.inner_parallelism + for pair in pipeline_pair: + pair_key = pair[0] * 1000 + pair[1] + ring_id = pipeline_ring_map[pair_key] + assert ring_id >= self.start_pipeline_ring_id + first_node = pair[0] + start_index + second_node = pair[1] + start_index + if self.rank != first_node and self.rank != second_node: + continue + pipeline_endpoints = [ + self.endpoints[first_node], self.endpoints[second_node] + ] + pipeline_rank = 0 if self.rank == first_node else 1 + pipeline_nranks = 2 + collective_helper._init_communicator( + self.startup_program, self.current_endpoint, + pipeline_endpoints, pipeline_rank, ring_id, False, + self.global_ring_id, True) + + # Create dp rings + if self.pipeline_num > 1: + collective_helper._init_communicator( + self.startup_program, self.current_endpoint, self.dp_endpoints, + self.dp_rank, self.dp_ring_id, True, self.global_ring_id, True) + self._broadcast_params(self.dp_ring_id) + def minimize_impl(self, loss, startup_program=None, parameter_list=None, no_grad_set=None): - endpoints = self.role_maker._get_trainer_endpoints() - current_endpoint = endpoints[self.role_maker._worker_index()] - self.wrapped_opt = PO(self.inner_opt, - num_microbatches=self.num_microbatches) - node_num = _get_node_num(endpoints) - gpus_per_node = len(endpoints) // node_num - self.startup_program = startup_program - if 
startup_program is None: - self.startup_program = fluid.default_startup_program() - + self.endpoints = self.role_maker._get_trainer_endpoints() + self.current_endpoint = self.endpoints[self.role_maker._worker_index()] self.rank = self.role_maker._worker_index() self.nranks = self.role_maker._worker_num() - assert self.nranks % node_num == 0 - loss.block.program._pipeline_opt = dict() - loss.block.program._pipeline_opt['local_rank'] = self.rank - loss.block.program._pipeline_opt[ - 'micro_batch_size'] = self.micro_batch_size - loss.block.program._pipeline_opt['schedule_mode'] = self.schedule_mode - optimize_ops, params_grads, prog_list = self.wrapped_opt.minimize( + self.wrapped_opt = PO(self.inner_opt, + num_microbatches=self.num_microbatches) + orig_startup_program = startup_program if startup_program else fluid.default_startup_program( + ) + block = loss.block + program = block.program + + program._pipeline_opt = dict() + program._pipeline_opt['local_rank'] = self.rank + program._pipeline_opt['global_ring_id'] = self.global_ring_id + program._pipeline_opt['ring_id'] = self.start_pipeline_ring_id + program._pipeline_opt['micro_batch_size'] = self.micro_batch_size + program._pipeline_opt['schedule_mode'] = self.schedule_mode + optimize_ops, params_grads, prog_list, pp_pair, ring_map = self.wrapped_opt.minimize( loss, startup_program, parameter_list, no_grad_set) - assert prog_list - - self.main_program_list = prog_list - self.main_program = loss.block.program - self.inner_parallelism = loss.block.program._pipeline_opt[ - 'inner_parallelism'] + self.startup_program = orig_startup_program._pipeline_opt[ + 'startup_program'] + self.inner_parallelism = program._pipeline_opt['inner_parallelism'] assert self.nranks % self.inner_parallelism == 0 + assert prog_list + self.pipeline_num = len(self.endpoints) // self.inner_parallelism - pipeline_helper = PipelineHelper(self.role_maker) - pipeline_helper.update_startup_program( - 
self.startup_program._pipeline_opt["startup_program"], - self.inner_parallelism) + self._init_process_group(pp_pair, ring_map) - pipeline_num = self.nranks // self.inner_parallelism - self._transpile_main_program(loss, pipeline_num, self.inner_parallelism) + self.main_program_list = prog_list + self.main_program = program + if self.pipeline_num > 1: + self._transpile_main_program(loss) return optimize_ops, params_grads - def _transpile_main_program(self, loss, pipeline_num, inner_parallelism): - if pipeline_num <= 1: return - self._insert_loss_grad_ops(loss, pipeline_num) - for ring_id in range(1, inner_parallelism + 1): - self._insert_allreduce_ops(ring_id) + def _transpile_main_program(self, loss): + self._insert_loss_grad_ops(loss, self.pipeline_num) + self._insert_allreduce_ops(self.dp_ring_id) def _insert_loss_grad_ops(self, loss, pipeline_num): """ In order to keep the learning rate consistent in different numbers of training workers, we scale the loss grad by the number of workers """ - block = self.main_program_list[-1]['program'].global_block() + block = self.main_program_list[-1].global_block() for idx, op in reversed(list(enumerate(block.ops))): if is_loss_grad_op(op): loss_grad_var = block.vars[op.output_arg_names[0]] @@ -244,57 +212,53 @@ def _insert_loss_grad_ops(self, loss, pipeline_num): }) def _insert_allreduce_ops(self, ring_id): - block = self.main_program_list[ring_id - 1]['program'].global_block() + block = self.main_program._pipeline_opt['section_program'].global_block( + ) origin_block = self.main_program.global_block() grad = None processed_param_name = set() + first_optimize_op_idx = None + add_sync_calc_stream = False for idx, op in reversed(list(enumerate(block.ops))): + if is_backward_op(op) and not first_optimize_op_idx: + first_optimize_op_idx = idx + 1 + # no optimize phase + if first_optimize_op_idx == len(block.ops): return if is_backward_op(op) and \ OP_ROLE_VAR_KEY in op.attr_names: op_role_var = op.all_attrs()[OP_ROLE_VAR_KEY] if 
len(op_role_var) == 0: continue assert len(op_role_var) % 2 == 0 - offset = idx + offset = 0 for i in range(0, len(op_role_var), 2): param_name = op_role_var[i] param = block.vars[op_role_var[i]] if param_name in processed_param_name: continue processed_param_name.add(param_name) - grad = block.vars[op_role_var[i + 1]] + grad_name = op_role_var[i + 1] + if not 'MERGED' in grad_name: grad_name += '@MERGED' + grad = block.vars[grad_name] origin_param = origin_block.vars[op_role_var[i]] if origin_param.is_distributed: continue - if offset == idx: - offset += 1 + if not add_sync_calc_stream: + add_sync_calc_stream = True block._insert_op( - offset, + first_optimize_op_idx + offset, type='c_sync_calc_stream', inputs={'X': grad}, outputs={'Out': grad}, - attrs={OP_ROLE_KEY: OpRole.Backward}) + attrs={OP_ROLE_KEY: OpRole.Optimize}) offset += 1 block._insert_op( - offset, + first_optimize_op_idx + offset, type='c_allreduce_sum', inputs={'X': grad}, outputs={'Out': grad}, attrs={ 'ring_id': ring_id, - OP_ROLE_KEY: OpRole.Backward + 'use_calc_stream': True, + OP_ROLE_KEY: OpRole.Optimize }) - - if grad is None: - return - - for idx, op in enumerate(block.ops): - if is_optimizer_op(op): - block._insert_op( - idx, - type='c_sync_comm_stream', - inputs={'X': grad}, - outputs={'Out': grad}, - attrs={'ring_id': ring_id, - OP_ROLE_KEY: OpRole.Backward}) - break diff --git a/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py b/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py index f9c3a613c4053a..67e83a2ec4617c 100644 --- a/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py +++ b/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py @@ -123,7 +123,8 @@ def _insert_cast_op(block, op, idx, src_dtype, dest_dtype): outputs={"Out": out_var}, attrs={ "in_dtype": in_var.dtype, - "out_dtype": out_var.dtype + "out_dtype": out_var.dtype, + "op_device": op.attr("op_device") }) num_cast_ops += 1 _rename_arg(op, in_var.name, out_var.name) @@ -171,8 +172,11 @@ def 
_insert_cast_post_op(block, op, idx, src_dtype, dest_dtype, target_name, type="cast", inputs={"X": target_var}, outputs={"Out": cast_var}, - attrs={"in_dtype": target_var.dtype, - "out_dtype": cast_var.dtype}) + attrs={ + "in_dtype": target_var.dtype, + "out_dtype": cast_var.dtype, + "op_device": op.attr("op_device") + }) num_cast_ops += 1 op_var_rename_map[block.idx][target_var.name] = cast_var.name diff --git a/python/paddle/fluid/device_worker.py b/python/paddle/fluid/device_worker.py index b923f36af8d027..0f98af5772313d 100644 --- a/python/paddle/fluid/device_worker.py +++ b/python/paddle/fluid/device_worker.py @@ -427,7 +427,7 @@ def _gen_worker_desc(self, trainer_desc): section_param.schedule_mode = schedule_mode cfg = section_param.section_config program = pipeline_opt["section_program"] - cfg.program_desc.ParseFromString(program["program"]._get_desc() + cfg.program_desc.ParseFromString(program._get_desc() .serialize_to_string()) # TODO: why does not work # cfg.program_desc.CopyFrom(program.program._get_desc()) diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index 9b0b04a6ea7164..da326ec074c1d9 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -1458,7 +1458,7 @@ def _run_from_dataset(self, dataset._prepare_to_run() real_fetch_list = [] if program._pipeline_opt: - real_program = program._pipeline_opt["section_program"]['program'] + real_program = program._pipeline_opt["section_program"] for fetch_var in fetch_list: if isinstance(fetch_var, Variable): fetch_var_name = fetch_var.name @@ -1467,13 +1467,20 @@ def _run_from_dataset(self, if fetch_var_name in real_program.global_block().vars: real_fetch_list.append(fetch_var) - program._pipeline_opt["section_program"][ - 'program'] = self._add_feed_fetch_ops( - program=program._pipeline_opt["section_program"]['program'], - feed=[], - fetch_list=real_fetch_list, - feed_var_name='feed', - fetch_var_name='fetch') + 
program._pipeline_opt["section_program"] = self._add_feed_fetch_ops( + program=program._pipeline_opt["section_program"], + feed=[], + fetch_list=real_fetch_list, + feed_var_name='feed', + fetch_var_name='fetch') + main_block = program._pipeline_opt["section_program"].block(0) + for op in main_block.ops: + # set the op_role of fetch op to Optimize to avoid + # erase the fetched vars by gc for pipeline + if op.type == 'fetch': + op._set_attr( + 'op_role', + core.op_proto_and_checker_maker.OpRole.Optimize) fetch_list = None scope, trainer = self._prepare_trainer( diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 9c724cbfdd4a73..2aa918bf806616 100755 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -3784,6 +3784,12 @@ def __init__(self, optimizer, num_microbatches=1, start_cpu_core_id=0): "Optimizer, but the given type is {}.".format( type(optimizer))) self._optimizer = optimizer + + # Get the original optimizer defined by users, such as SGD + self._origin_optimizer = self._optimizer + while hasattr(self._origin_optimizer, "inner_opt"): + self._origin_optimizer = self._origin_optimizer.inner_opt + assert num_microbatches >= 1, ( "num_microbatches must be a positive value.") self._num_microbatches = num_microbatches @@ -3797,13 +3803,98 @@ def __init__(self, optimizer, num_microbatches=1, start_cpu_core_id=0): self._op_role_var_key = op_maker.kOpRoleVarAttrName() self._op_device_key = op_maker.kOpDeviceAttrName() self._param_device_map = None + self._pipeline_pair = [] + self._pp_ring_map = dict() + self._global_ring_id = None + + # insert allreduce op to sync global information for global + # gradient clip and amp + def _insert_allreduce_op(self, op_idx, block): + """ + Insert allreduce op to sync global information for global + gradient clip and amp. 
+ """ + op = block.ops[op_idx] + out_name = op.desc.output_arg_names()[0] + out_var = block.var(out_name) + offset = 0 + if op.type == "reduce_any": + # cast the bool var to int32 to use allreduce_max op + temp_var_name = unique_name.generate(out_name + "_cast_int32") + temp_var = block.create_var( + name=temp_var_name, shape=[1], dtype="int32") + block._insert_op( + op_idx + 1 + offset, + type='cast', + inputs={'X': out_var}, + outputs={'Out': temp_var}, + attrs={ + 'in_dtype': out_var.dtype, + 'out_dtype': temp_var.dtype, + self._op_role_key: self._op_role.Optimize + }) + offset += 1 + block._insert_op( + op_idx + 1 + offset, + type='c_allreduce_max' + if op.type == "reduce_any" else 'c_allreduce_sum', + inputs={'X': temp_var if op.type == "reduce_any" else out_var}, + outputs={'Out': temp_var if op.type == "reduce_any" else out_var}, + attrs={ + 'ring_id': self._global_ring_id, + self._op_role_key: self._op_role.Optimize, + 'use_calc_stream': True + }) + offset += 1 + if op.type == "reduce_any": + block._insert_op( + op_idx + 1 + offset, + type='cast', + inputs={'X': temp_var}, + outputs={'Out': out_var}, + attrs={ + 'in_dtype': temp_var.dtype, + 'out_dtype': out_var.dtype, + self._op_role_key: self._op_role.Optimize + }) + return offset def _create_vars(self, block, ori_block): - # Create vars for block, copied from main_program's global block + # Create vars for block, copied from ori_block used_var_set = set() - for op_idx in range(block.desc.op_size()): - op_desc = block.desc.op(op_idx) - vars = op_desc.input_arg_names() + op_desc.output_arg_names() + added_op_num = 0 + op_idx = 0 + op_size = block.desc.op_size() + while op_idx < op_size + added_op_num: + # Whether to insert allreduce_sum or allreduce_max op. + # For amp and global gradient clip strategies, we should + # get the global information, so allreduce op is needed. 
+ should_insert = False + op = block.ops[op_idx] + # For op process vars on all devices, remove its input + # vars not in this block + reserved_x = [] + if op.type == 'reduce_any' and self._is_optimize_op(op): + should_insert = True + elif op.type == 'concat' and self._is_optimize_op(op): + for input_name in op.desc.input("X"): + if block._find_var_recursive(input_name): + reserved_x.append(input_name) + op.desc.set_input('X', reserved_x) + elif op.type == 'update_loss_scaling': + for input_name in op.desc.input("X"): + if block._find_var_recursive(input_name): + reserved_x.append(input_name) + op.desc.set_input('X', reserved_x) + op.desc.set_output('Out', reserved_x) + elif op.type == 'sum' and self._is_gradient_clip_op(op): + for input_name in op.desc.input("X"): + if block._find_var_recursive(input_name): + reserved_x.append(input_name) + op.desc.set_input('X', reserved_x) + should_insert = True + + vars = op.desc.input_arg_names() + op.desc.output_arg_names() for var in vars: # a var whose name contains "blocking_queue" # only exists in startup program @@ -3813,27 +3904,39 @@ def _create_vars(self, block, ori_block): if block._find_var_recursive(str(var)): continue source_var = ori_block._var_recursive(str(var)) if source_var.type == core.VarDesc.VarType.READER: - block.create_var( + dest_var = block.create_var( name=var, type=core.VarDesc.VarType.READER, persistable=source_var.persistable) else: - block._clone_variable(source_var, False) + dest_var = block._clone_variable(source_var, False) + dest_var.stop_gradient = source_var.stop_gradient + # When use with sharding, allreduce_sum and allreduce_max + # used for global gradient clip and amp will be added by sharding. 
+ op_idx += 1 + if self.use_sharding or not should_insert: continue + inserted_ops = self._insert_allreduce_op(op_idx - 1, block) + added_op_num += inserted_ops + op_idx += inserted_ops + block._sync_with_cpp() def _is_loss_grad_op(self, op): - if self._op_role_key not in op.attr_names: - return False - op_role = int(op.all_attrs()[self._op_role_key]) + assert self._op_role_key in op.attr_names + op_role = int(op.attr(self._op_role_key)) return op_role & int(self._op_role.Backward) and op_role & int( self._op_role.Loss) def _is_backward_op(self, op): - return self._op_role_key in op.attr_names and int(op.all_attrs()[ - self._op_role_key]) & int(self._op_role.Backward) + return self._op_role_key in op.attr_names and ( + int(op.attr(self._op_role_key)) & int(self._op_role.Backward)) + + def _is_loss_op(self, op): + assert self._op_role_key in op.attr_names + return int(op.attr(self._op_role_key)) == int(self._op_role.Loss) def _is_optimize_op(self, op): - return self._op_role_key in op.attr_names and int(op.all_attrs()[ - self._op_role_key]) & int(self._op_role.Optimize) + return self._op_role_key in op.attr_names and ( + int(op.attr(self._op_role_key)) & int(self._op_role.Optimize)) def _is_update_op(self, op): return 'Param' in op.input_names and 'Grad' in op.input_names and ( @@ -3842,50 +3945,40 @@ def _is_update_op(self, op): def _split_program(self, main_program, devices): """ Split a program into sections according to devices that ops run on. - The ops of the role LRSched are copied to all sections. + The op whose op_device attr is "gpu:all" is copied to all sections. 
Args: main_program (Program): the main program devices: all used devices """ - programs = [] # Map from device to its corresponding section program info - device_program_map = dict() - for device in devices: - p = {'program': Program()} - device_program_map[device] = p + device_program_map = defaultdict(Program) block = main_program.block(0) for op in block.ops: device = op.attr(self._op_device_key) - op_role = op.attr(self._op_role_key) - if int(op_role) & int(self._op_role.LRSched): - # Copy ops of the role LRSched to all sections. - for device in device_program_map.keys(): - program = device_program_map[device] - op_desc = op.desc - ap_op = program["program"].block(0).desc.append_op() - ap_op.copy_from(op_desc) - # ap_op._set_attr(self._op_device_key, "") - elif op.type == "create_py_reader" or op.type == "read" or op.type == "create_double_buffer_reader": - # Copy read related ops to all section to make them exit after each epoch. - for device in device_program_map.keys(): + # Copy ops whose op_device set to "gpu:all" to all sections. 
+ if device == "gpu:all": + for device in devices: program = device_program_map[device] op_desc = op.desc - ap_op = program["program"].block(0).desc.append_op() + ap_op = program.global_block().desc.append_op() ap_op.copy_from(op_desc) + ap_op._set_attr(self._op_device_key, "") else: program = device_program_map[device] op_desc = op.desc - ap_op = program["program"].block(0).desc.append_op() + ap_op = program.global_block().desc.append_op() ap_op.copy_from(op_desc) + ap_op._set_attr(self._op_device_key, "") + program_list = [] for key in devices: program = device_program_map[key] - program['program']._sync_with_cpp() - programs.append(program) + program._sync_with_cpp() + program_list.append(program) - return programs + return program_list def _get_op_device_for_startup_program(self, var_name): """ @@ -3894,21 +3987,22 @@ def _get_op_device_for_startup_program(self, var_name): get the real op_device attribute of the fill_constant as the device where the corresponding parameters on. """ - assert "beta1_pow_acc" in var_name or "beta2_pow_acc" in var_name + assert "beta1_pow_acc" in var_name or "beta2_pow_acc" in var_name, \ + 'For accumulators for Adam, the name must contain beta1_pow_acc ' \ + 'or beta2_pow_acc.' 
param_name = var_name[0:var_name.index('_beta')] device = self._param_device_map[param_name] return device - def _split_startup_program(self, startup_program, local_rank): - block = startup_program.block(0) + def _split_startup_program(self, startup_program, device_id): + block = startup_program.global_block() new_startup_program = Program() for op in block.ops: device = op.attr(self._op_device_key) if device == "cpu": assert op.type == "fill_constant", ( - "For ops in startup " - "program that with the op_device attribute of cpu, " - "they must be fill_constant.") + "For ops in startup program with the op_device attribute " + "of cpu, they must be of type fill_constant.") output_var = op.output_arg_names[0] device = self._get_op_device_for_startup_program(output_var) @@ -3917,14 +4011,13 @@ def _split_startup_program(self, startup_program, local_rank): else: # LR related ops device = None - if device and device_index != local_rank: continue + if device and device_index != device_id: continue op_desc = op.desc - ap_op = new_startup_program.block(0).desc.append_op() + ap_op = new_startup_program.global_block().desc.append_op() ap_op.copy_from(op_desc) ap_op._set_attr(self._op_device_key, "") new_startup_program._sync_with_cpp() - self._create_vars( - new_startup_program.block(0), startup_program.global_block()) + self._create_vars(new_startup_program.global_block(), block) return new_startup_program def _find_post_op(self, ops, cur_op, var_name): @@ -3937,6 +4030,11 @@ def _find_post_op(self, ops, cur_op, var_name): var_name as output. var_name (string): Variable name. 
""" + # To skip the cast op added by amp which has no op_device set + if '.cast_fp32' in var_name: + var_name = var_name.replace('.cast_fp32', '') + elif '.cast_fp16' in var_name: + var_name = var_name.replace('.cast_fp16', '') post_op = [] before = True for op in ops: @@ -3965,7 +4063,8 @@ def _find_real_prev_op(self, ops, cur_op, var_name): """ prev_op = [] for op in ops: - if op.type == 'send_v2' or op.type == 'recv_v2': + if op.type == 'send_v2' or op.type == 'recv_v2' \ + or op.type == 'c_broadcast': continue if op == cur_op: break @@ -3980,11 +4079,8 @@ def _find_real_prev_op(self, ops, cur_op, var_name): return None def _rename_arg(self, op, old_name, new_name): - op_desc = op.desc - if isinstance(op_desc, tuple): - op_desc = op_desc[0] - op_desc._rename_input(old_name, new_name) - op_desc._rename_output(old_name, new_name) + op._rename_input(old_name, new_name) + op._rename_output(old_name, new_name) def _create_var(self, block, ref_var, name): """ @@ -3998,99 +4094,12 @@ def _create_var(self, block, ref_var, name): dtype=ref_var.dtype, type=ref_var.type, lod_level=ref_var.lod_level, - persistable=False, - is_data=False, + persistable=ref_var.persistable, + is_data=ref_var.is_data, need_check_feed=ref_var.desc.need_check_feed()) + new_var.stop_gradient = ref_var.stop_gradient return new_var - def _get_data_var_info(self, block): - """ - Get info of all vars whose is_data attribute are true. 
- """ - # map of data vars to devices that that data on - data_devices_map = dict() - for op in block.ops: - dev_spec = op.attr(self._op_device_key) - for var_name in op.input_arg_names: - if "blocking_queue" in var_name: continue - var = block.var(var_name) - if not var.is_data: - continue - if not var_name in data_devices_map: - data_devices_map[var_name] = [] - if not dev_spec in data_devices_map[var_name]: - data_devices_map[var_name].append(dev_spec) - return data_devices_map - - def _insert_sendrecv_for_data_var(self, main_block, programs, startup, - devices): - """ - Insert send and recv ops for data var that on other devices. - - Args: - main_block (Block): Global block for main program - programs (dict): Dictionary for section params - startup (Program): Startup program - devices (list): List of devices in the format (dev:dev_index) - """ - main_program = main_block.program - data_devices_map = self._get_data_var_info(main_block) - - first_prog = programs[0]['program'] - first_block = first_prog.block(0) - insert_index = 0 - for op in first_block.ops: - insert_index += 1 - if op.type == "read": - break - first_dev_spec = devices[0] - first_dev_index = int(first_dev_spec.split(':')[1]) - for var_name in data_devices_map.keys(): - for device in data_devices_map[var_name]: - if device == first_dev_spec: continue - main_var = main_block.var(var_name) - assert main_var.is_data - if not var_name in first_block.vars: - self._create_var(first_block, main_var, var_name) - dev_index = int(device.split(':')[1]) - first_block._insert_op( - index=insert_index, - type='send_v2', - inputs={'X': first_block.var(var_name)}, - attrs={ - self._op_device_key: first_dev_spec, - self._op_role_key: self._op_role.Forward, - 'use_calc_stream': True, - 'peer': dev_index, - }) - # Get the device that that data on - assert device in devices - prog_index = devices.index(device) - prog = programs[prog_index]['program'] - block = prog.block(0) - index = 0 - for op in block.ops: - index 
+= 1 - if op.type == "read": - break - source_var = main_program.block(0).var(var_name) - new_var = self._create_var(block, source_var, var_name) - new_var_shape = list(new_var.shape) - new_var_shape[0] = self.micro_batch_size if new_var_shape[ - 0] < 0 else new_var_shape[0] - block._insert_op( - index=index, - type='recv_v2', - outputs={'Out': [new_var]}, - attrs={ - 'out_shape': new_var_shape, - 'dtype': new_var.dtype, - self._op_device_key: device, - self._op_role_key: self._op_role.Forward, - 'peer': first_dev_index, - 'use_calc_stream': True, - }) - def _strip_grad_suffix(self, name): """ Strip the grad suffix from the given variable name @@ -4104,95 +4113,161 @@ def _append_grad_suffix(self, name): """ return name + core.grad_var_suffix() - def _add_opdevice_attr_for_regularization_clip(self, block): + def _get_op_device_attr(self, op): """ - Add op_device attribute for regulization and clip ops. + Get the op_device attribute of a op. """ - for op in block.ops: - # role for regularization and clip ops is optimize - if int(op.attr(self._op_role_key)) != int(self._op_role.Optimize): - continue - if op.has_attr(self._op_device_key) and ( - op.attr(self._op_device_key) != ""): - continue - assert self._op_role_var_key in op.attr_names - op_role_var = op.all_attrs()[self._op_role_var_key] - assert len(op_role_var) == 2 + device = op.attr(self._op_device_key) \ + if op.has_attr(self._op_device_key) else None + if device: + assert device[0:3] == 'gpu', "Now, only gpu devices are " \ + "supported in pipeline parallemism." + return device + + def _add_op_device_attr_for_op(self, op, idx, block): + """ + Add op_device attrribute for ops that have not that attribute set. + We use "gpu:all" to represent the op should be put on all + sub-programs, such as lr-related ops. Note that: "gpu:all" + is only used by pipeline as an indicator. 
+ """ + lrsched_role = int(self._op_role.LRSched) + if op.attr(self._op_role_key) == lrsched_role: + # For LRSched ops, we should put them on all sub-programs to + # make sure each sub-program update the lr correctly + op._set_attr(self._op_device_key, "gpu:all") + elif (op.type == "cast" or + op.type == "scale") and self._is_backward_op(op): + prev_op = self._find_real_prev_op(block.ops, op, + op.desc.input("X")[0]) + op._set_attr(self._op_device_key, prev_op.attr(self._op_device_key)) + elif op.type == "memcpy" and not self._is_optimize_op(op): + assert len(op.input_arg_names) == 1 and len( + op.output_arg_names) == 1 + input_name = op.input_arg_names[0] + output_name = op.output_arg_names[0] + if '@Fetch' in output_name: + post_op = self._find_post_op(block.ops, op, output_name) + op._set_attr(self._op_device_key, + post_op.attr(self._op_device_key)) + else: + prev_op = self._find_real_prev_op(block.ops, op, + op.desc.input("X")[0]) + op._set_attr(self._op_device_key, + prev_op.attr(self._op_device_key)) + elif self._is_loss_op(op): + # For loss * loss_scaling op added by AMP + offset = 1 + while (not block.ops[idx + offset].has_attr(self._op_device_key) or + not block.ops[idx + offset].attr(self._op_device_key)): + offset += 1 + device = block.ops[idx + offset].attr(self._op_device_key) + assert device, "Please put you program within device_guard scope." 
+ for i in range(offset): + block.ops[idx + i]._set_attr(self._op_device_key, device) + elif self._is_optimize_op(op) and op.type == "check_finite_and_unscale": + op_role_var = op.attr(self._op_role_var_key) param_name = op_role_var[0] device = self._param_device_map[param_name] op._set_attr(self._op_device_key, device) - - def _add_default_opdevice_attr(self, block): + elif self._is_optimize_op(op) and op.type == "cast": + # For fp16-->fp32 cast added by AMP + grad_name = op.output('Out') + assert len(grad_name) == 1 + param_name = grad_name[0].strip(core.grad_var_suffix()) + device = self._param_device_map[param_name] + op._set_attr(self._op_device_key, device) + elif self._is_gradient_clip_op(op) or self._is_regularization_op(op): + # For gradient clip and regularization ops, we set their op_device + # attribute to the device where their corresponding parameters on. + assert self._op_role_var_key in op.attr_names, "gradient_clip " \ + "and regularization ops must have op_role_var attribute." + op_role_var = op.attr(self._op_role_var_key) + assert len(op_role_var) == 2, "op_role_var for gradient_clip " \ + "regularization ops must have two elements." + param_name = op_role_var[0] + device = self._param_device_map[param_name] + # For sum op added by global gradient clip, it must be + # put on all devices + if (op.type == 'sum' or op.type == 'sqrt' or + op.type == 'fill_constant' or + op.type == 'elementwise_max' or + op.type == 'elementwise_div'): + device = "gpu:all" + op._set_attr(self._op_device_key, device) + else: + other_known_ops = [ + 'update_loss_scaling', 'reduce_any', 'concat', 'sum' + ] + assert op.type in other_known_ops, "For other ops without " \ + "op_device set, they must be one of {}, but it " \ + "is {}".format(other_known_ops, op.type) + assert self._is_optimize_op(op) + op._set_attr(self._op_device_key, "gpu:all") + + def _add_op_device_attr(self, block): """ - 1. Add default op_device attribute for lr-related ops. 
- The default value is the one that of the first place. - 2. Add default op_device attribute for sum ops added during - backward. For these ops, we set the op_device attribute - as the one of its post op, i.e, which op has the output of the - sum op as an input. + Add op_device attrribute for ops in block that have + not that attribute set. """ - first_devcie = "" - - # Get the device spec of the first place. - # device_spec: 'cpu' for cpu device and 'gpu:id' for gpu device, - # e.g. 'gpu:0', 'gpu:1', etc. - for op in block.ops: - if op.has_attr(self._op_device_key) and ( - op.attr(self._op_device_key) != ""): - first_device = op.attr(self._op_device_key) - break - assert first_device - first_device_type = first_device.split(":")[0] - assert first_device_type == "gpu" - - # set op_device attr for lr-related ops - lrsched_role = int(self._op_role.LRSched) - for op in block.ops: - if not op.has_attr(self._op_device_key) or ( - op.attr(self._op_device_key) == ""): - if op.type == "sum": - # For sum ops that compute the sum of @RENAMED@ vars - for name in op.desc.input_arg_names(): - assert '@RENAME@' in name - assert len(op.desc.output_arg_names()) == 1 - out_name = op.desc.output_arg_names()[0] - post_op = self._find_post_op(block.ops, op, out_name) - device = post_op.attr(self._op_device_key) - assert device - op._set_attr(self._op_device_key, device) - continue - - assert op.attr(self._op_role_key) == lrsched_role, ( - "Op whose op_device attr has not been set for pipeline" - " must be of the role LRSched.") - op._set_attr(self._op_device_key, first_device) + for idx, op in enumerate(list(block.ops)): + if (op.type == "create_py_reader" or op.type == "read" or + op.type == "create_double_buffer_reader"): + # Copy read related ops to all section to make them exit + # after each epoch. + # We use "gpu:all" to represent the op should be put on all + # sub-programs, such as lr-related ops. Note that: "gpu:all" + # is only used by pipeline as an indicator. 
+ op._set_attr(self._op_device_key, "gpu:all") + continue + # op_device attribute has been set + if self._get_op_device_attr(op): continue + self._add_op_device_attr_for_op(op, idx, block) def _check_validation(self, block): """ - Check whether ops in a block are all validate (i.e., the - op_device attribute has been set). - Then, return all device specifications in order. + Check whether ops in a block have both the op_device and the + op_role attributes set. + Then, return all devices in order. """ - device_specs = [] + device_list = [] + # Section worker only supports the following op_role + valid_op_role_value = [ + int(self._op_role.LRSched), + int(self._op_role.Forward), + int(self._op_role.Backward), + int(self._op_role.Loss), + int(self._op_role.Optimize), + int(self._op_role.Backward) | int(self._op_role.Loss), + ] for op in block.ops: - type = op.type - if not op._has_kernel(type): + if not op._has_kernel(op.type): assert op.type == "conditional_block" and ( op.attr(self._op_role_key) == int(self._op_role.LRSched)), ( "Now, the only supported op without kernel is " "conditional_block, and its op role must be LRSched.") + assert op.has_attr(self._op_role_key), ( + "op ({}) has no {} attribute.".format(op.type, + self._op_role_key)) + assert int(op.attr(self._op_role_key)) in valid_op_role_value, \ + "op_role {} for op {} must be one of {}".format( + op.attr(self._op_role_key), + op.type, + valid_op_role_value) assert op.has_attr(self._op_device_key), ( "op ({}) has no {} attribute.".format(op.type, self._op_device_key)) - dev_spec = op.attr(self._op_device_key) - assert dev_spec, ("op_device attribute for op " - "{} has not been set.".format(op.type)) - dev_type = dev_spec.split(':')[0] + + device = op.attr(self._op_device_key) + assert device, ("op_device attribute for op " + "{} has not been set.".format(op.type)) + if device == "gpu:all": continue + dev_type = device.split(':')[0] assert dev_type == "gpu", ("Now only gpu devices are supported " "for 
pipeline parallelism.") - if not dev_spec in device_specs: - device_specs.append(dev_spec) - return device_specs + if not device in device_list: + device_list.append(device) + return device_list def _insert_sendrecv_ops_for_boundaries(self, block): """ @@ -4201,148 +4276,267 @@ def _insert_sendrecv_ops_for_boundaries(self, block): """ extra_index = 0 - # A map from var to device spec where op takes it as input, + # A map from var to device where op takes it as input, # avoiding multiple send and recv ops. - var_devspec = dict() + var_dev_map = dict() for index, op in enumerate(list(block.ops)): - # skips lr-related ops and vars, as we will process them later. - if int(op.attr(self._op_role_key)) & int(self._op_role.LRSched): - continue - # skips update ops and vars, as we will process them later. - if self._is_update_op(op): continue - - cur_device_spec = op.attr(self._op_device_key) + cur_device = op.attr(self._op_device_key) + if cur_device == "gpu:all": continue for var_name in op.input_arg_names: # i.e., lod_tensor_blocking_queue created by DataLoader, # which only exists in startup program. 
- if not var_name in block.vars: continue var = block.var(var_name) # skip data, because we will process it later if var.is_data: continue + prev_device = None + if var_name in self._param_device_map: + prev_device = self._param_device_map[var_name] prev_op = self._find_real_prev_op(block.ops, op, var_name) - if prev_op is None: - continue - prev_device_spec = prev_op.attr(self._op_device_key) + if not prev_device: + prev_device = prev_op.attr(self._op_device_key) \ + if prev_op else None + if not prev_device or prev_device == 'gpu:all': continue - if prev_device_spec != cur_device_spec: - if var_name not in var_devspec: - var_devspec[var_name] = [] - if cur_device_spec in var_devspec[var_name]: continue - var_devspec[var_name].append(cur_device_spec) + if prev_device != cur_device: + if var_name not in var_dev_map: var_dev_map[var_name] = [] + if cur_device in var_dev_map[var_name]: continue + var_dev_map[var_name].append(cur_device) op_role = op.all_attrs()[self._op_role_key] var = block.vars[var_name] - prev_device_index = int(prev_device_spec.split(':')[1]) - cur_device_index = int(cur_device_spec.split(':')[1]) - block._insert_op( - index=index + extra_index, - type='send_v2', - inputs={'X': var}, - attrs={ - self._op_device_key: prev_device_spec, - self._op_role_key: op_role, - 'use_calc_stream': True, - 'peer': cur_device_index, - }) - extra_index += 1 - var_shape = list(var.shape) - var_shape[0] = self.micro_batch_size if var_shape[ - 0] < 0 else var_shape[0] - block._insert_op( - index=index + extra_index, - type='recv_v2', - outputs={'Out': [var]}, - attrs={ - 'out_shape': var_shape, - 'dtype': var.dtype, - self._op_device_key: cur_device_spec, - self._op_role_key: op_role, - 'use_calc_stream': True, - 'peer': prev_device_index, - }) - extra_index += 1 - - def _clear_gradients(self, main_block, dev_spec): - """ - Clear gradients at the begining of each run of a minibatch. 
- """ - for param_name in self._param_device_map: - device = self._param_device_map[param_name] - if device != dev_spec: continue - grad_name = self._append_grad_suffix(param_name) - if not main_block.has_var(grad_name): continue - grad_var = main_block.vars[grad_name] - grad_var.persistable = True - main_block._insert_op( - index=0, - type='fill_constant', - inputs={}, - outputs={'Out': [grad_var]}, - attrs={ - 'shape': grad_var.shape, - 'dtype': grad_var.dtype, - 'value': float(0), - self._op_device_key: device, - # a trick to run this op once per mini-batch - self._op_role_key: self._op_role.Optimize.LRSched, - }) + prev_device_index = int(prev_device.split(':')[1]) + cur_device_index = int(cur_device.split(':')[1]) + pair = (prev_device_index, cur_device_index) + pair_key = prev_device_index * 1000 + cur_device_index + if pair not in self._pipeline_pair: + self._pipeline_pair.append(pair) + self._pp_ring_map[pair_key] = self.ring_id + ring_id = self.ring_id + self.ring_id += 1 + else: + ring_id = self._pp_ring_map[pair_key] + if self.schedule_mode == 'F-then-B': # F-then-B + block._insert_op( + index=index + extra_index, + type='send_v2', + inputs={'X': var}, + attrs={ + self._op_device_key: prev_device, + self._op_role_key: op_role, + 'use_calc_stream': True, + 'peer': 1, + 'ring_id': ring_id + }) + extra_index += 1 + block._insert_op( + index=index + extra_index, + type='recv_v2', + outputs={'Out': [var]}, + attrs={ + 'out_shape': var.shape, + 'dtype': var.dtype, + self._op_device_key: cur_device, + self._op_role_key: op_role, + 'use_calc_stream': True, + 'peer': 0, + 'ring_id': ring_id + }) + extra_index += 1 + elif self.schedule_mode == '1F1B': # 1F1B + block._insert_op( + index=index + extra_index, + type='c_sync_calc_stream', + inputs={'X': [var]}, + outputs={'Out': [var]}, + attrs={ + self._op_device_key: prev_device, + self._op_role_key: op_role, + }) + extra_index += 1 + block._insert_op( + index=index + extra_index, + type='send_v2', + inputs={'X': 
var}, + attrs={ + self._op_device_key: prev_device, + self._op_role_key: op_role, + 'use_calc_stream': False, + 'ring_id': ring_id, + 'peer': 1, + }) + extra_index += 1 + block._insert_op( + index=index + extra_index, + type='c_sync_comm_stream', + inputs={'X': [var]}, + outputs={'Out': [var]}, + attrs={ + self._op_device_key: prev_device, + self._op_role_key: self._op_role.Backward, + 'ring_id': ring_id, + }) + extra_index += 1 + var_shape = list(var.shape) + var_shape[0] = self.micro_batch_size if var_shape[ + 0] < 0 else var_shape[0] + block._insert_op( + index=index + extra_index, + type='recv_v2', + outputs={'Out': [var]}, + attrs={ + 'out_shape': var_shape, + 'dtype': var.dtype, + self._op_device_key: cur_device, + self._op_role_key: op_role, + 'use_calc_stream': True, + 'peer': 0, + 'ring_id': ring_id + }) + extra_index += 1 + else: + raise ValueError( + "Now only 'F-then-B' and '1F1B' are supported." + "The given value is {}.".format(self.schedule_mode)) - def _accumulate_gradients(self, block): + def _insert_loss_scale(self, block): """ - Accumulate the gradients generated in microbatch to the one in mini-batch. - We also scale the loss corresponding to number of micro-batches as well. + Scale the loss corresponding to number of micro-batches. 
""" + if self._num_microbatches == 1: return for index, op in reversed(tuple(enumerate(list(block.ops)))): - offset = index - device = op.attr(self._op_device_key) - - # Backward pass if self._is_loss_grad_op(op): loss_grad_var = block.vars[op.output_arg_names[0]] - scale_factor = self._num_microbatches block._insert_op( index=index + 1, type='scale', inputs={'X': loss_grad_var}, outputs={'Out': loss_grad_var}, attrs={ - 'scale': 1.0 / scale_factor, - self._op_device_key: device, + 'scale': 1.0 / self._num_microbatches, self._op_role_key: self._op_role.Backward }) break - if self._is_backward_op(op) and ( - self._op_role_var_key in op.attr_names): - op_role_var = op.all_attrs()[self._op_role_var_key] - if len(op_role_var) == 0: + def _rename_gradient_var_name(self, block): + for index, op in enumerate(block.ops): + if not self._is_optimize_op(op): continue + input_names = op.input_arg_names + output_names = op.output_arg_names + in_out_names = input_names + output_names + if op.type == 'cast': continue + # append "MERGED" to the names of parameter gradients, + # and mofify the op_role_var attribute (by rename_arg func). + for name in in_out_names: + if not core.grad_var_suffix() in name: continue + param_name = name.strip(core.grad_var_suffix()) + new_grad_name = name + "@MERGED" + self._rename_arg(op, name, new_grad_name) + + def _accumulate_gradients(self, block, pp_allreduce_in_optimize=False): + """ + Create a new merged gradient for each parameter and accumulate the + corresponding gradient to it. 
+ """ + merged_gradient_names = [] + first_opt_op_idx = None + + for index, op in reversed(tuple(enumerate(list(block.ops)))): + # remove the cast op of fp16 grad to fp32 grad + if self._is_optimize_op(op) and op.type == 'cast': + in_name = op.input_arg_names[0] + out_name = op.output_arg_names[0] + if out_name.strip('@GRAD') in self._param_device_map: + assert in_name.replace('.cast_fp16', '') == out_name + block._remove_op(index) continue + + if self._is_backward_op(op) and not first_opt_op_idx: + first_opt_op_idx = index + 1 + # no optimize phase + if first_opt_op_idx == len(block.ops): return + if block.ops[first_opt_op_idx].type == "c_sync_comm_stream": + first_opt_op_idx += 1 + + if self._is_backward_op(op) and ( + self._op_role_var_key in op.attr_names): + op_role_var = op.attr(self._op_role_var_key) + if len(op_role_var) == 0: continue assert len(op_role_var) % 2 == 0 - offset = index for i in range(0, len(op_role_var), 2): - grad_name = op_role_var[i + 1] - grad_var = block.vars[grad_name] - new_grad_var_name = unique_name.generate(grad_name) - new_var = self._create_var(block, grad_var, - new_grad_var_name) - self._rename_arg(op, grad_name, new_grad_var_name) + offset = 0 + param_name = op_role_var[i] + if not block.has_var(param_name): continue + if '@BroadCast' in param_name: continue + param_grad_name = param_name + core.grad_var_suffix() + merged_param_grad_name = param_grad_name + '@MERGED' + if not block.has_var(merged_param_grad_name): + self._create_var(block, block.vars[param_name], + merged_param_grad_name) + assert block.has_var(merged_param_grad_name) + param_grad_var = block.var(param_grad_name) + merged_param_grad_var = block.var(merged_param_grad_name) + merged_param_grad_var.persistable = True block._insert_op( - index=offset + 1, - type='sum', - inputs={'X': [grad_var, new_var]}, - outputs={'Out': grad_var}, + index=first_opt_op_idx + offset, + type='fill_constant', + inputs={}, + outputs={'Out': [merged_param_grad_var]}, attrs={ - 
self._op_device_key: device, - self._op_role_key: self._op_role.Backward, - self._op_role_var_key: op_role_var + 'shape': merged_param_grad_var.shape, + 'dtype': merged_param_grad_var.dtype, + 'value': float(0), + # a trick to run this op once per mini-batch + self._op_role_key: self._op_role.Optimize.LRSched, }) offset += 1 + grad_name = op_role_var[i + 1] + grad_var = block.vars[grad_name] + if not 'cast_fp16' in grad_name: + block._insert_op( + index=first_opt_op_idx + offset, + type='sum', + inputs={'X': [grad_var, merged_param_grad_var]}, + outputs={'Out': merged_param_grad_var}, + attrs={ + self._op_role_key: self._op_role.Backward, + }) + offset += 1 + merged_gradient_names.append(merged_param_grad_name) + else: + # cast gradient to fp32 to accumulate to merged gradient + cast_grad_var_name = param_grad_name + '@TMP' + cast_grad_var = self._create_var(block, param_grad_var, + cast_grad_var_name) + cast_grad_var.persistable = False + block._insert_op( + index=first_opt_op_idx + offset, + type='cast', + inputs={'X': grad_var}, + outputs={'Out': cast_grad_var}, + attrs={ + 'in_dtype': grad_var.dtype, + 'out_dtype': cast_grad_var.dtype, + self._op_role_key: self._op_role.Backward, + }) + offset += 1 + block._insert_op( + index=first_opt_op_idx + offset, + type='sum', + inputs={ + 'X': [merged_param_grad_var, cast_grad_var] + }, + outputs={'Out': merged_param_grad_var}, + attrs={ + self._op_role_key: self._op_role.Backward, + }) + offset += 1 + merged_gradient_names.append(merged_param_grad_name) + return merged_gradient_names def _add_sub_blocks(self, main_block, program_list): main_program = main_block.program - for prog_info in program_list: - prog = prog_info['program'] + for prog in program_list: for op in prog.block(0).ops: if not op.has_attr('sub_block'): continue @@ -4372,8 +4566,7 @@ def _process_persistable_vars_in_multi_sections(self, main_program, # var_info = {var_name: [program1, program2...]}, # persistable var only var_info = dict() - for 
prog_info in program_list: - prog = prog_info['program'] + for prog in program_list: block = prog.block(0) for var_name in block.vars: if var_name == "double_buffer_0": continue @@ -4395,7 +4588,7 @@ def _process_persistable_vars_in_multi_sections(self, main_program, block = prog.block(0) for op in block.ops: if op.type == "recv_v2" or op.type == "create_py_reader" or \ - op.type == "read": + op.type == "read" or op.type == "update_loss_scaling": continue # We have processed lr related vars if op.attr(self._op_role_key) == int( @@ -4423,6 +4616,15 @@ def _process_persistable_vars_in_multi_sections(self, main_program, read_block = prog.block(0) read_device = self._get_device_info(read_block) read_dev_index = int(read_device.split(':')[1]) + pair = (write_dev_index, read_dev_index) + pair_key = write_dev_index * 1000 + read_dev_index + if pair not in self._pipeline_pair: + self._pipeline_pair.append(pair) + self._pp_ring_map[pair_key] = self.ring_id + ring_id = self.ring_id + self.ring_id += 1 + else: + ring_id = self._pp_ring_map[pair_key] write_block._insert_op( index=0, @@ -4430,11 +4632,12 @@ def _process_persistable_vars_in_multi_sections(self, main_program, inputs={'X': write_block.var(var_name), }, attrs={ self._op_device_key: write_device, - 'use_calc_stream': True, + 'use_calc_stream': False, # A trick to make the role LRSched to avoid copy every # microbatch self._op_role_key: self._op_role.LRSched, 'peer': read_dev_index, + 'ring_id': ring_id }) read_block._insert_op( index=0, @@ -4444,12 +4647,33 @@ def _process_persistable_vars_in_multi_sections(self, main_program, 'out_shape': read_block.var(var_name).shape, 'dtype': read_block.var(var_name).dtype, self._op_device_key: read_device, - 'use_calc_stream': True, + 'use_calc_stream': False, # A trick to make the role LRSched to avoid copy every # microbatch self._op_role_key: self._op_role.LRSched, - 'peer': write_dev_index + 'peer': write_dev_index, + 'ring_id': ring_id }) + read_block._insert_op( + 
index=1, + type='c_sync_comm_stream', + inputs={'X': [read_block.var(var_name)]}, + outputs={'Out': [read_block.var(var_name)]}, + attrs={ + self._op_device_key: read_device, + # A trick to make the role LRSched to avoid copy every + # microbatch + self._op_role_key: self._op_role.LRSched, + 'ring_id': ring_id + }) + + def _is_gradient_clip_op(self, op): + return op.desc.has_attr("op_namescope") \ + and op.desc.attr("op_namescope").startswith("/gradient_clip") + + def _is_regularization_op(self, op): + return op.desc.has_attr("op_namescope") \ + and op.desc.attr("op_namescope").startswith("/regularization") def minimize(self, loss, @@ -4457,23 +4681,34 @@ def minimize(self, parameter_list=None, no_grad_set=None): main_block = loss.block + self.origin_main_block = main_block if startup_program is None: startup_program = default_startup_program() optimize_ops, params_grads = self._optimizer.minimize( loss, startup_program, parameter_list, no_grad_set) - self._param_device_map = self._optimizer._param_device_map + self._param_device_map = self._origin_optimizer._param_device_map + assert main_block.program._pipeline_opt \ + and 'local_rank' in main_block.program._pipeline_opt, \ + 'Please use pipeline with fleet.' + local_rank = main_block.program._pipeline_opt['local_rank'] + self._global_ring_id = main_block.program._pipeline_opt[ + 'global_ring_id'] + schedule_mode = 0 + if 'schedule_mode' in main_block.program._pipeline_opt: + schedule_mode = main_block.program._pipeline_opt['schedule_mode'] + self.schedule_mode = schedule_mode + # micro batch size self.micro_batch_size = main_block.program._pipeline_opt[ 'micro_batch_size'] - # Step1: add default op_device attribute for regulization and clip ops - self._add_opdevice_attr_for_regularization_clip(main_block) - - # Step2: add default op_device attribute for ops whose op_device - # attribute have not been set yet. Then check all ops have the - # op_device attribute. 
- self._add_default_opdevice_attr(main_block) + self.use_sharding = False + if 'use_sharding' in main_block.program._pipeline_opt: + self.use_sharding = main_block.program._pipeline_opt['use_sharding'] + self.ring_id = main_block.program._pipeline_opt['ring_id'] - device_specs = self._check_validation(main_block) + # Step1: add default op_device attribute for ops. + self._add_op_device_attr(main_block) + device_list = self._check_validation(main_block) def device_cmp(device1, device2): dev1_id = int(device1.split(':')[1]) @@ -4485,70 +4720,59 @@ def device_cmp(device1, device2): else: return 0 - sorted_device_spec = sorted(device_specs, key=cmp_to_key(device_cmp)) - assert sorted_device_spec == device_specs, ( - "With pipeline " - "parallelism, you must use gpu devices one after another " - "in the order of their ids.") - - # Step3: add send and recv ops between section boundaries + sorted_device_list = sorted(device_list, key=cmp_to_key(device_cmp)) + assert sorted_device_list == device_list, ( + "With pipeline parallelism, you must use gpu devices one after " + "another in the order of their ids.") + # Step2: add send and recv ops between section boundaries self._insert_sendrecv_ops_for_boundaries(main_block) - # Step4: split program into sections and add pairs of + # Step3: split program into sections and add pairs of # send and recv ops for data var. 
main_program = main_block.program - program_list = self._split_program(main_program, device_specs) + program_list = self._split_program(main_program, device_list) for p in program_list: - self._create_vars(p["program"].block(0), - main_program.global_block()) - self._insert_sendrecv_for_data_var(main_block, program_list, - startup_program, device_specs) + self._create_vars(p.global_block(), main_block) - # Step5: Special Case: process persistable vars that exist in + # Step4: Special Case: process persistable vars that exist in # multiple sections self._process_persistable_vars_in_multi_sections( main_program, startup_program, program_list) - # Step6: Add sub blocks for section programs + # Step5: Add sub blocks for section programs self._add_sub_blocks(main_block, program_list) - assert (main_program._pipeline_opt and - isinstance(main_program._pipeline_opt, dict) and - 'local_rank' in main_program._pipeline_opt), \ - "You must use pipeline with fleet" - local_rank = main_program._pipeline_opt['local_rank'] % len( - device_specs) - self.schedule_mode = main_program._pipeline_opt['schedule_mode'] - + local_rank = main_program._pipeline_opt['local_rank'] % len(device_list) place_list = [] - for dev_spec in device_specs: - dev_index = dev_spec.split(":")[1] - place_list.append(core.CUDAPlace(local_rank)) + for dev in device_list: + dev_index = int(dev.split(":")[1]) + place_list.append(core.CUDAPlace(dev_index % 8)) - # Step7: Split startup program + # Step6: Split startup program new_startup_program = self._split_startup_program(startup_program, local_rank) - # Step8: clear gradients before each mini-batch and - # accumulate gradients during backward - self._clear_gradients( - program_list[local_rank]['program'].global_block(), - dev_spec=device_specs[local_rank]) - self._accumulate_gradients(program_list[local_rank]['program'] - .global_block()) - startup_program._pipeline_opt = { "startup_program": new_startup_program, } + real_block = 
program_list[local_rank].global_block() + self._insert_loss_scale(real_block) + if not self.use_sharding: + # Step7: clear gradients before each mini-batch and + # accumulate gradients during backward + self._rename_gradient_var_name(real_block) + real_block._sync_with_cpp() + self._accumulate_gradients(real_block) + real_block._sync_with_cpp() place_id = int(os.getenv("FLAGS_selected_gpus", "0")) main_program._pipeline_opt = { "trainer": "PipelineTrainer", "device_worker": "Section", "pipeline_stage": local_rank, - "num_pipeline_stages": len(device_specs), + "num_pipeline_stages": len(device_list), "schedule_mode": self.schedule_mode, - "inner_parallelism": len(device_specs), + "inner_parallelism": len(device_list), "section_program": program_list[local_rank], "place": place_list[local_rank], "place_id": place_id, @@ -4556,7 +4780,7 @@ def device_cmp(device1, device2): "num_microbatches": self._num_microbatches, "start_cpu_core_id": self._start_cpu_core_id, } - return optimize_ops, params_grads, program_list + return optimize_ops, params_grads, program_list, self._pipeline_pair, self._pp_ring_map class RecomputeOptimizer(Optimizer): diff --git a/python/paddle/fluid/tests/unittests/pipeline_mnist.py b/python/paddle/fluid/tests/unittests/pipeline_mnist.py index f433af24813d52..8c3a66f933f59d 100644 --- a/python/paddle/fluid/tests/unittests/pipeline_mnist.py +++ b/python/paddle/fluid/tests/unittests/pipeline_mnist.py @@ -66,12 +66,21 @@ def cnn_model(data): param_shape = [reduce(lambda a, b: a * b, input_shape[1:], 1)] + [SIZE] scale = (2.0 / (param_shape[0]**2 * SIZE))**0.5 - predict = fluid.layers.fc( - input=conv_pool_2, - size=SIZE, - act="softmax", - param_attr=fluid.param_attr.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01))) + with fluid.device_guard("gpu:1"): + predict = fluid.layers.fc( + input=conv_pool_2, + size=SIZE, + act="softmax", + param_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.Constant(value=0.01))) + # To 
cover @RENAMED@GRADIENT + predict2 = fluid.layers.fc( + input=conv_pool_1, + size=SIZE, + act="softmax", + param_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.Constant(value=0.01))) + predict += predict2 return predict @@ -108,7 +117,10 @@ def get_model(self, batch_size=2, use_dgc=False, dist_strategy=None): bd = [steps_per_pass * p for p in passes] lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)] lr_val = fluid.layers.piecewise_decay(boundaries=bd, values=lr) - opt = fluid.optimizer.Momentum(learning_rate=lr_val, momentum=0.9) + opt = fluid.optimizer.Momentum( + learning_rate=lr_val, + momentum=0.9, + grad_clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0)) acc_steps = 2 # accumulated steps for pipeline if dist_strategy: @@ -120,6 +132,7 @@ def get_model(self, batch_size=2, use_dgc=False, dist_strategy=None): fleet.init(is_collective=True) strategy = fleet.DistributedStrategy() strategy.pipeline = True + strategy.amp = True strategy.pipeline_configs = { 'micro_batch_size': batch_size, 'schedule_mode': '1F1B', From b47478efc2caf9247fd73245a3b87154ec3e81a1 Mon Sep 17 00:00:00 2001 From: cc <52520497+juncaipeng@users.noreply.github.com> Date: Fri, 26 Mar 2021 19:15:56 +0800 Subject: [PATCH 06/15] [dygraph qat] Use layer to calculate output scale (#31861) * Use layer to calculate output scale * add backward for moving_average_abs_max_scale and save output scales to op's attr --- paddle/fluid/operators/fake_quantize_op.cc | 69 +++-- paddle/fluid/operators/fake_quantize_op.cu | 4 +- paddle/fluid/operators/fake_quantize_op.h | 16 +- paddle/fluid/pybind/op_function_generator.cc | 6 +- .../slim/quantization/imperative/qat.py | 268 +++++------------- .../slim/quantization/imperative/quant_nn.py | 111 ++++---- .../slim/quantization/imperative/utils.py | 47 +-- .../slim/tests/test_imperative_out_scale.py | 25 -- .../tests/unittests/test_fake_quantize_op.py | 5 +- 9 files changed, 222 insertions(+), 329 deletions(-) diff --git 
a/paddle/fluid/operators/fake_quantize_op.cc b/paddle/fluid/operators/fake_quantize_op.cc index abfc88e5155e5a..4544386718813c 100644 --- a/paddle/fluid/operators/fake_quantize_op.cc +++ b/paddle/fluid/operators/fake_quantize_op.cc @@ -649,13 +649,18 @@ class MovingAverageAbsMaxScaleOp : public framework::OperatorWithKernel { "MovingAverageAbsMaxScale"); OP_INOUT_CHECK(ctx->HasOutput("OutScale"), "Output", "OutScale", "MovingAverageAbsMaxScale"); + if (ctx->HasOutput("OutState")) { ctx->SetOutputDim("OutState", {1}); } if (ctx->HasOutput("OutAccum")) { ctx->SetOutputDim("OutAccum", {1}); } - ctx->SetOutputDim("OutScale", {1}); + if (ctx->HasOutput("Out")) { + ctx->SetOutputDim("Out", ctx->GetInputDim("X")); + ctx->SetOutputDim("OutScale", {1}); + ctx->ShareLoD("X", /*->*/ "Out"); + } } protected: @@ -673,6 +678,9 @@ class MovingAverageAbsMaxScaleOpMaker AddInput("X", "(Tensor) Input is float data type."); AddInput("InAccum", "Last accum.").AsDispensable(); AddInput("InState", "Last state.").AsDispensable(); + AddOutput("Out", + "(Tensor) Output tensor is just equivalent to the input tensor.") + .AsDispensable(); AddOutput("OutScale", " Current scale"); AddOutput("OutState", "(Tensor) state buffer.").AsDispensable(); AddOutput("OutAccum", "(Tensor) accum buffer.").AsDispensable(); @@ -693,7 +701,7 @@ And it will not quantize the input tensor. 
} }; -class FakeQuantDequantGradOp : public framework::OperatorWithKernel { +class StrightThroughEstimatorGradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -701,9 +709,9 @@ class FakeQuantDequantGradOp : public framework::OperatorWithKernel { auto out_grad_name = framework::GradVarName("Out"); auto x_grad_name = framework::GradVarName("X"); OP_INOUT_CHECK(ctx->HasInput(out_grad_name), "Input", out_grad_name, - "FakeQuantDequantGradOp"); + "StrightThroughEstimatorGradOp"); OP_INOUT_CHECK(ctx->HasOutput(x_grad_name), "Output", x_grad_name, - "FakeQuantDequantGradOp"); + "StrightThroughEstimatorGradOp"); ctx->SetOutputDim(x_grad_name, ctx->GetInputDim(out_grad_name)); } @@ -717,13 +725,13 @@ class FakeQuantDequantGradOp : public framework::OperatorWithKernel { }; template -class FakeQuantDequantGradMaker : public framework::SingleGradOpMaker { +class StrightThroughEstimatorMaker : public framework::SingleGradOpMaker { public: using framework::SingleGradOpMaker::SingleGradOpMaker; protected: void Apply(GradOpPtr grad_op) const override { - grad_op->SetType("fake_quantize_dequantize_grad"); + grad_op->SetType("stright_throuth_estimator_grad"); grad_op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); grad_op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); grad_op->SetAttrMap(this->Attrs()); @@ -744,11 +752,11 @@ REGISTER_OPERATOR( REGISTER_OP_CPU_KERNEL(fake_quantize_abs_max, ops::FakeQuantizeAbsMaxKernel); -REGISTER_OPERATOR(fake_quantize_dequantize_abs_max, - ops::FakeQuantOrWithDequantAbsMaxOp, - ops::FakeQuantOrWithDequantAbsMaxOpMaker, - ops::FakeQuantDequantGradMaker, - ops::FakeQuantDequantGradMaker); +REGISTER_OPERATOR( + fake_quantize_dequantize_abs_max, ops::FakeQuantOrWithDequantAbsMaxOp, + ops::FakeQuantOrWithDequantAbsMaxOpMaker, + ops::StrightThroughEstimatorMaker, + ops::StrightThroughEstimatorMaker); REGISTER_OP_CPU_KERNEL(fake_quantize_dequantize_abs_max, 
ops::FakeQuantizeDequantizeAbsMaxKernel); @@ -769,11 +777,12 @@ REGISTER_OPERATOR( REGISTER_OP_CPU_KERNEL(fake_quantize_moving_average_abs_max, ops::FakeQuantizeMovingAverageAbsMaxKernel); -REGISTER_OPERATOR(fake_quantize_dequantize_moving_average_abs_max, - ops::FakeQuantOrWithDequantMovingAverageAbsMaxOp, - ops::FakeQuantOrWithDequantMovingAverageAbsMaxOpMaker, - ops::FakeQuantDequantGradMaker, - ops::FakeQuantDequantGradMaker); +REGISTER_OPERATOR( + fake_quantize_dequantize_moving_average_abs_max, + ops::FakeQuantOrWithDequantMovingAverageAbsMaxOp, + ops::FakeQuantOrWithDequantMovingAverageAbsMaxOpMaker, + ops::StrightThroughEstimatorMaker, + ops::StrightThroughEstimatorMaker); REGISTER_OP_CPU_KERNEL( fake_quantize_dequantize_moving_average_abs_max, ops::FakeQuantizeDequantizeMovingAverageAbsMaxKernel); @@ -789,20 +798,22 @@ REGISTER_OP_CPU_KERNEL(fake_channel_wise_quantize_abs_max, REGISTER_OPERATOR( moving_average_abs_max_scale, ops::MovingAverageAbsMaxScaleOp, ops::MovingAverageAbsMaxScaleOpMaker, - paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); + ops::StrightThroughEstimatorMaker, + ops::StrightThroughEstimatorMaker); REGISTER_OP_CPU_KERNEL(moving_average_abs_max_scale, ops::MovingAverageAbsMaxScaleKernel); -REGISTER_OPERATOR(fake_quantize_dequantize_grad, ops::FakeQuantDequantGradOp); -REGISTER_OP_CPU_KERNEL(fake_quantize_dequantize_grad, - ops::FakeQuantDequantGradKernel); +REGISTER_OPERATOR(stright_throuth_estimator_grad, + ops::StrightThroughEstimatorGradOp); +REGISTER_OP_CPU_KERNEL(stright_throuth_estimator_grad, + ops::StrightThroughEstimatorGradKernel); -REGISTER_OPERATOR(fake_channel_wise_quantize_dequantize_abs_max, - ops::FakeChannelWiseQuantizeDequantizeAbsMaxOp, - ops::FakeChannelWiseQuantizeDequantizeAbsMaxOpMaker, - ops::FakeQuantDequantGradMaker, - ops::FakeQuantDequantGradMaker); +REGISTER_OPERATOR( + fake_channel_wise_quantize_dequantize_abs_max, + ops::FakeChannelWiseQuantizeDequantizeAbsMaxOp, + 
ops::FakeChannelWiseQuantizeDequantizeAbsMaxOpMaker, + ops::StrightThroughEstimatorMaker, + ops::StrightThroughEstimatorMaker); REGISTER_OP_CPU_KERNEL( fake_channel_wise_quantize_dequantize_abs_max, ops::FakeChannelWiseQuantizeDequantizeAbsMaxKernel); @@ -820,4 +831,8 @@ REGISTER_OP_VERSION(moving_average_abs_max_scale) "Out", "Delete output in order to make the inference model not " "save moving_average_abs_max_scale operator. This will " - "make the quantitative model be correctly applied in inference.")); + "make the quantitative model be correctly applied in inference.")) + .AddCheckpoint( + R"ROC(Incompatible upgrade of output [Out])ROC", + paddle::framework::compatible::OpVersionDesc().NewOutput( + "Out", "In order to support dygraph qat, add output again.")); diff --git a/paddle/fluid/operators/fake_quantize_op.cu b/paddle/fluid/operators/fake_quantize_op.cu index 92127f9aebd0d5..78052179f6be72 100644 --- a/paddle/fluid/operators/fake_quantize_op.cu +++ b/paddle/fluid/operators/fake_quantize_op.cu @@ -543,8 +543,8 @@ REGISTER_OP_CUDA_KERNEL(moving_average_abs_max_scale, REGISTER_OP_CUDA_KERNEL( fake_quantize_dequantize_moving_average_abs_max, ops::FakeQuantizeDequantizeMovingAverageAbsMaxKernel); -REGISTER_OP_CUDA_KERNEL(fake_quantize_dequantize_grad, - ops::FakeQuantDequantGradKernel); +REGISTER_OP_CUDA_KERNEL(stright_throuth_estimator_grad, + ops::StrightThroughEstimatorGradKernel); REGISTER_OP_CUDA_KERNEL( fake_channel_wise_quantize_dequantize_abs_max, ops::FakeChannelWiseQuantizeDequantizeAbsMaxKernel); diff --git a/paddle/fluid/operators/fake_quantize_op.h b/paddle/fluid/operators/fake_quantize_op.h index 94a75f930bebae..11a2d2de8bcf73 100644 --- a/paddle/fluid/operators/fake_quantize_op.h +++ b/paddle/fluid/operators/fake_quantize_op.h @@ -314,6 +314,12 @@ class MovingAverageAbsMaxScaleKernel : public framework::OpKernel { auto* in = context.Input("X"); auto& dev_ctx = context.template device_context(); + if (context.HasOutput("Out")) { + auto* out = 
context.Output("Out"); + out->mutable_data(context.GetPlace()); + framework::TensorCopy(*in, context.GetPlace(), dev_ctx, out); + } + bool is_test = context.Attr("is_test"); // testing if (is_test) { @@ -344,17 +350,17 @@ class MovingAverageAbsMaxScaleKernel : public framework::OpKernel { }; template -class FakeQuantDequantGradKernel : public framework::OpKernel { +class StrightThroughEstimatorGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { auto* d_out = context.Input(framework::GradVarName("Out")); auto x_grad_name = framework::GradVarName("X"); auto* d_x = context.Output(x_grad_name); - PADDLE_ENFORCE_NOT_NULL( - d_x, platform::errors::PreconditionNotMet( - "FakeQuantDequantGradOp doesn't have the output named %s.", - x_grad_name)); + PADDLE_ENFORCE_NOT_NULL(d_x, platform::errors::PreconditionNotMet( + "StrightThroughEstimatorGradKernel " + "doesn't have the output named %s.", + x_grad_name)); // Initialize dx as same as d_out d_x->mutable_data(context.GetPlace()); diff --git a/paddle/fluid/pybind/op_function_generator.cc b/paddle/fluid/pybind/op_function_generator.cc index b1c42d91df5048..69856fa4fa142e 100644 --- a/paddle/fluid/pybind/op_function_generator.cc +++ b/paddle/fluid/pybind/op_function_generator.cc @@ -84,7 +84,8 @@ std::map> op_outs_map = { {"matrix_nms", {"Out", "Index", "RoisNum"}}, {"distribute_fpn_proposals", {"MultiFpnRois", "RestoreIndex", "MultiLevelRoIsNum"}}, - {"moving_average_abs_max_scale", {"OutScale", "OutAccum", "OutState"}}, + {"moving_average_abs_max_scale", + {"Out", "OutScale", "OutAccum", "OutState"}}, {"multiclass_nms3", {"Out", "NmsRoisNum"}}, {"generate_proposals_v2", {"RpnRois", "RpnRoiProbs", "RpnRoisNum"}}, {"momentum", {"ParamOut", "VelocityOut"}}, @@ -137,7 +138,8 @@ std::map> op_passing_outs_map = { {"check_finite_and_unscale", {"Out", "FoundInfinite"}}, {"update_loss_scaling", {"Out", "LossScaling", "OutGoodSteps", "OutBadSteps"}}, - 
{"moving_average_abs_max_scale", {"OutScale", "OutAccum", "OutState"}}, + {"moving_average_abs_max_scale", + {"Out", "OutScale", "OutAccum", "OutState"}}, {"lamb", {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut"}}, {"rnn", {"DropoutState"}}, diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py index ea2e8e073b5084..f4620ff00013c8 100644 --- a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py +++ b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py @@ -21,14 +21,14 @@ import paddle from paddle.fluid import dygraph, core, framework, unique_name -from paddle.fluid.executor import Executor +from paddle.fluid.executor import Executor, global_scope from paddle.fluid.param_attr import ParamAttr from paddle.fluid.initializer import Constant from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX from paddle.fluid.io import load_inference_model, save_inference_model from paddle.fluid.log_helper import get_logger -from . import quant_nn from .. import quantization_pass +from . import quant_nn from . import utils __all__ = ['ImperativeQuantAware'] @@ -201,7 +201,7 @@ def forward(self, inputs): self._quantize_inputs = ImperativeQuantizeInputs(**kwargs) - self._calc_output_scale = ImperativeCalcOutputScale() + self._quantize_outputs = ImperativeQuantizeOutputs() def quantize(self, model): """ @@ -219,11 +219,11 @@ def quantize(self, model): assert isinstance(model, dygraph.Layer), \ "The model must be the instance of dygraph.Layer." 
self._quantize_inputs.apply(model) - self._calc_output_scale.apply(model) + self._quantize_outputs.apply(model) def save_quantized_model(self, layer, path, input_spec=None, **config): - self._calc_output_scale.save_quantized_model(layer, path, input_spec, - **config) + self._quantize_outputs.save_quantized_model(layer, path, input_spec, + **config) class ImperativeQuantizeInputs(object): @@ -323,10 +323,10 @@ def apply(self, model): idx += 1 target = name[last_idx:idx] - quant_layer = self._get_quantized_layer(layer) + quant_layer = self._get_input_quantized_layer(layer) setattr(obj, target, quant_layer) - def _get_quantized_layer(self, layer): + def _get_input_quantized_layer(self, layer): quant_layer_name = None for key, value in utils.quant_input_layers_map.items(): if isinstance(layer, value): @@ -343,24 +343,26 @@ def _get_quantized_layer(self, layer): return quant_nn.__dict__[quant_layer_name](layer, **self._kwargs) -class ImperativeCalcOutputScale(object): +class ImperativeQuantizeOutputs(object): + """ + Calculate the output scales for some layers. + """ + def __init__(self, moving_rate=0.9): """ - Add the logic of calculating and setting output scales of some layers. + The constructor for ImperativeQuantizeOutputs. Args: moving_rate(float): The decay coefficient of moving average. The default value is 0.9. """ - super(ImperativeCalcOutputScale, self).__init__() + super(ImperativeQuantizeOutputs, self).__init__() self._moving_rate = moving_rate - self._register_hook_handle_list = [] - self._out_scale_dict = collections.OrderedDict() def apply(self, model): """ - Insert the `moving_average_abs_max_scale` op to calculate output - scale of specific layers in model. + Insert the `moving_average_abs_max_scale` layers to calculate the + output scales for specific layers in the dygraph model. 
Args: model(fluid.dygraph.Layer): The target model which would be @@ -372,14 +374,25 @@ def apply(self, model): assert isinstance(model, dygraph.Layer), \ "The model must be the instance of dygraph.Layer." - # Calculate the target ops's output scale, and don't consider - # the skip_quant attr - for _, layer in model.named_sublayers(): - if self._is_target_layer(layer): - self._init_scale_params(layer) - hook_handle = layer.register_forward_post_hook( - self._calc_output_scale_hook) - self._register_hook_handle_list.append(hook_handle) + for name, layer in model.named_sublayers(): + if not self._is_target_layer(layer): + continue + + # TODO(jc): optimize this module + last_idx = 0 + idx = 0 + obj = model + while idx < len(name): + if (name[idx] == '.'): + if hasattr(obj, name[last_idx:idx]): + obj = getattr(obj, name[last_idx:idx]) + last_idx = idx + 1 + idx += 1 + target = name[last_idx:idx] + + quant_layer = quant_nn.__dict__["QuantizedOutputLayer"]( + layer, self._moving_rate) + setattr(obj, target, quant_layer) def save_quantized_model(self, layer, path, input_spec=None, **config): """ @@ -409,33 +422,18 @@ def save_quantized_model(self, layer, path, input_spec=None, **config): Returns: None """ - assert isinstance(layer, dygraph.Layer), \ "The model must be the instance of dygraph.Layer." 
- self._gather_output_scale(layer) - - with dygraph.guard(): - layer.eval() - for handle in self._register_hook_handle_list: - handle.remove() paddle.jit.save(layer=layer, path=path, input_spec=input_spec, **config) - if len(self._out_scale_dict) == 0: - warnings.warn("Warning: No Layer of the model while to be " \ - "saved contains the out_threshold attribute, so the " \ - "generated inference model would not contain the " \ - "out_threshold.") - return - - # load static model is_dynamic_mode = False if paddle.in_dynamic_mode(): is_dynamic_mode = True paddle.enable_static() - place = core.CUDAPlace(0) if core.is_compiled_with_cuda() \ - else core.CPUPlace() + place = core.CPUPlace() + scope = global_scope() exe = Executor(place) dirname = os.path.dirname(path) @@ -450,20 +448,10 @@ def save_quantized_model(self, layer, path, input_spec=None, **config): model_filename=model_filename, params_filename=params_filename)) - # TODO(jc): analyse whether the dygraph model has - # several blocks before applying qat - assert infer_program.num_blocks == 1, \ - "Quantization aware training (QAT) requires the program " \ - "only has a block for now. When the model has if-else or " \ - "while, the program will have several blocks." 
+ self._save_output_scale(infer_program, scope) - # set output scales to the static model - self._save_output_scale(infer_program) - - # process skip quant self._set_skip_quant_attr(infer_program) - # save the final quantized model that has output scales save_inference_model( dirname=dirname, feeded_var_names=feed_target_names, @@ -476,144 +464,42 @@ def save_quantized_model(self, layer, path, input_spec=None, **config): if is_dynamic_mode: paddle.disable_static() - def _gather_output_scale(self, layer): - """ - Gather all output scales to self._out_scale_dict - """ - with dygraph.guard(): - layer.eval() - for _, sub_layer in layer.named_sublayers(): - if self._is_target_layer(sub_layer): - layer_name = sub_layer.full_name() - if hasattr(sub_layer, "_quant_out_scale"): - self._out_scale_dict[layer_name] = float( - sub_layer._quant_out_scale) - - def _save_output_scale(self, infer_program): + def _is_target_layer(self, layer): """ - Save all output scales to the corresponding ops in static - inference program. - - Because the Layer in dygraph may correspond to multiple ops - in static program after being saved. To ensure correctness, - the outscale collected for output of dygraph Layer can only - be set to the last op in the corresponding ops in static program. + Whether the layer needs to calculate output scales. """ - assert infer_program.num_blocks == 1, \ - "The inference program should only have a block." 
- - global_block = infer_program.global_block() - target_ops = global_block.ops - - scale_idx = 0 - op_idx = 0 - attr_name = "out_threshold" - - for scale_name, scale_value in self._out_scale_dict.items(): - while True: - if op_idx >= len(target_ops): - break - - op = target_ops[op_idx] - if not self._is_scale_op_matched(scale_name, op, global_block): - op_idx += 1 - else: - if op.type in utils.weight_op_types \ - and op_idx + 1 < len(target_ops) \ - and target_ops[op_idx+1].type == "elementwise_add": - target_ops[op_idx + 1]._set_attr(attr_name, scale_value) - op_idx += 2 - else: - op._set_attr(attr_name, scale_value) - op_idx += 1 - scale_idx += 1 - break - - if scale_idx != len(self._out_scale_dict): - _logger.warning("Warning: the model have %s output scales, "\ - "but it only saves %s output scales." \ - % (len(self._out_scale_dict), scale_idx)) - - def _is_target_layer(self, layer): return isinstance(layer, tuple(utils.quant_output_layers_map.values())) \ - or ('quantized_' in layer.full_name() and \ + or ('quantized' in layer.full_name() and \ 'quantized_noweight' not in layer.full_name()) - def _init_scale_params(self, layer, name=None): + def _save_output_scale(self, program, scope): """ - Init the scale params for calculating output scales and save them in the - target layer. - After the users define the dygraph model, the hooks for calculating output - scales will not execute immediately. If the users load parameters form - checkpoint and save the quantized inference model immediately, the inference - model would not be saved successfully. Beacuse the dygraph_to_static requires - that the parameters created in __init__, but the uniqueness of hook make it - impossible to create parameters in __init__. To avoid this mistake, we define - the scale parameters in the beginning instead of hook. + Save all output scales to the corresponding ops in static + inference program and delete 'moving_average_abs_max_scale' ops. 
""" + for block in program.blocks: + for op in block.ops: + if op.type == "moving_average_abs_max_scale": + in_var_name = op.input('X')[0] + out_var_name = op.output('Out')[0] + out_scale_name = op.output('OutScale')[0] - def _create_param(in_layer, first_name, last_name, dtype): - prefix = '{}.{}'.format(first_name, last_name) \ - if first_name else 'outscale.{}'.format(last_name) - attr = ParamAttr( - name=unique_name.generate(prefix), - initializer=Constant(1), - trainable=False) - param = in_layer.create_parameter(shape=[1], attr=attr, dtype=dtype) - return param - - dtype = layer._dtype if layer._dtype is not None else "float32" - if dtype not in ["float32", "float64"]: - return - - layer._quant_out_scale = _create_param(layer, name, "scale", dtype) - layer._quant_out_scale.stop_gradient = True - - layer._quant_out_state = _create_param(layer, name, "state", dtype) - layer._quant_out_state.stop_gradient = True + out_scale = utils.load_variable_data(scope, out_scale_name) + previous_op = utils.find_previous_op(block, in_var_name) + previous_op._set_attr("out_threshold", float(out_scale)) - layer._quant_out_accum = _create_param(layer, name, "accum", dtype) - layer._quant_out_accum.stop_gradient = True + next_ops = utils.find_next_ops(block, out_var_name) + for next_op in next_ops: + next_op._rename_input(out_var_name, in_var_name) - def _is_scale_op_matched(self, scale_name, op, block): + def _set_skip_quant_attr(self, program): """ - Based on the op name and attrs to judge whether the op in - program matches the scale_name. We must know the corresponding - name between dgraph and static model. + Label the skip quantized ops. 
""" - fp_type = [core.VarDesc.VarType.FP64, core.VarDesc.VarType.FP32] - if op.type in quantization_pass._op_real_in_out_name.keys(): - output_var_names = quantization_pass._get_op_output_var_names(op) - for output_var_name in output_var_names: - output_var_tensor = block.var(output_var_name) - if output_var_tensor.dtype not in fp_type: - return False - - # corresponding_map: [name, op_types, function] - # Note that, the items have priority in corresponding_map - corresponding_map = [ - ['conv2d_tranpose', ['conv2d_transpose', \ - 'depthwise_conv2d_transpose'], None], - ['conv2d', ['conv2d', 'depthwise_conv2d'], None], - ['linear', ['matmul'], None], - ['re_lu6', ['relu6'], None], - ['p_re_lu', ['prelu'], None], - ['leaky_re_lu', ['leaky_relu'], None], - ['re_lu', ['relu'], None], - ] - - for item in corresponding_map: - if item[0] in scale_name: - return (op.type in item[1]) and \ - (len(item) == 2 or item[2] is None or item[2](op)) - - return op.type in scale_name - - def _set_skip_quant_attr(self, program): - block = program.global_block() - for op in block.ops: - if self._is_skip_quant_op(block, op): - op._set_attr("skip_quant", True) + for block in program.blocks: + for op in block.ops: + if self._is_skip_quant_op(block, op): + op._set_attr("skip_quant", True) def _is_skip_quant_op(self, block, in_op): """ @@ -621,33 +507,11 @@ def _is_skip_quant_op(self, block, in_op): 1. the type of input op should be conv2d, depthwise_conv2d or matmul 2. 
the previous ops of the input op are not fake_quantize_dequantize ops """ - - def _find_previous_op(block, var_name): - for op in block.ops: - if var_name in op.output_arg_names: - return op - target_op_types = ["conv2d", "depthwise_conv2d", "matmul"] if in_op.type not in target_op_types: return False - previous_ops = [_find_previous_op(block, arg_name) \ + previous_ops = [utils.find_previous_op(block, arg_name) \ for arg_name in in_op.input_arg_names] - return any(op is not None and op.type not in utils.fake_quantize_dequantize_types \ - for op in previous_ops ) - - def _calc_output_scale_hook(self, layer, input, output): - """ - Create the MovingAverageAbsMaxScale layer for the target layer if needed. - Execute MovingAverageAbsMaxScale layer to calculate the output scale. - """ - assert isinstance(output, (core.VarBase, framework.Variable)), \ - "Multiple outputs are not currently supported in ImperativeOutScale." - - fp_types = [core.VarDesc.VarType.FP32, core.VarDesc.VarType.FP64] - if output.dtype in fp_types: - if not hasattr(layer, "_out_scale"): - self._out_scale = quant_nn.MovingAverageAbsMaxScale( - layer, output.name, self._moving_rate, output.dtype) - # TODO (jc): consider the ops that have several outputs - self._out_scale(output) + return any(op is not None and op.type not in \ + utils.fake_quantize_dequantize_types for op in previous_ops) diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/quant_nn.py b/python/paddle/fluid/contrib/slim/quantization/imperative/quant_nn.py index 3c4fb323bc505f..f6fef0689d43af 100644 --- a/python/paddle/fluid/contrib/slim/quantization/imperative/quant_nn.py +++ b/python/paddle/fluid/contrib/slim/quantization/imperative/quant_nn.py @@ -507,59 +507,42 @@ def forward(self, input): class MovingAverageAbsMaxScale(layers.Layer): - def __init__(self, layer=None, name=None, moving_rate=0.9, dtype='float32'): + def __init__(self, name=None, moving_rate=0.9, dtype='float32'): r""" - MovingAverageMaxScale layer 
is used to calculating the output quantization scale of Layer. - Its computational formula is described as below: + MovingAverageMaxScale layer is used to calculating the output quantization + scale of Layer. Its computational formula is described as below: :math:`scale = (moving\_rate*accum+max(abs(x)))/(moving\_rate*state+1)` :math:`Out = X` """ super(MovingAverageAbsMaxScale, self).__init__() self._moving_rate = moving_rate - self._dtype = dtype - self._layer = layer - if self._layer is None or not hasattr(self._layer, "_quant_out_scale"): - scale_prefix = '{}.scale'.format(name) if name else 'outscale.scale' - scale_name = unique_name.generate(scale_prefix) - scale_attr = ParamAttr( - name=scale_name, initializer=Constant(1), trainable=False) - self._scale = self.create_parameter( - shape=[1], attr=scale_attr, dtype=self._dtype) - self._scale.stop_gradient = True - if self._layer is not None: - setattr(self._layer, "_quant_out_scale", self._scale) - else: - self._scale = self._layer._quant_out_scale + scale_prefix = '{}.scale'.format(name) if name else 'outscale.scale' + scale_name = unique_name.generate(scale_prefix) + scale_attr = ParamAttr( + name=scale_name, initializer=Constant(1), trainable=False) + self._scale = self.create_parameter( + shape=[1], attr=scale_attr, dtype=dtype) + self._scale.stop_gradient = True - if self._layer is None or not hasattr(self._layer, "_quant_out_state"): - state_prefix = "{}.state".format(name) if name else 'outscale.state' - state_attr = ParamAttr( - name=unique_name.generate(state_prefix), - initializer=Constant(1), - trainable=False) - self._state = self.create_parameter( - shape=[1], attr=state_attr, dtype=self._dtype) - self._state.stop_gradient = True - if self._layer is not None: - setattr(self._layer, "_quant_out_state", self._state) - else: - self._state = self._layer._quant_out_state + state_prefix = "{}.state".format(name) if name else 'outscale.state' + state_attr = ParamAttr( + 
name=unique_name.generate(state_prefix), + initializer=Constant(1), + trainable=False) + self._state = self.create_parameter( + shape=[1], attr=state_attr, dtype=dtype) + self._state.stop_gradient = True - if self._layer is None or not hasattr(self._layer, "_quant_out_accum"): - accum_prefix = "{}.accum".format(name) if name else 'outscale.accum' - accum_attr = ParamAttr( - name=unique_name.generate(accum_prefix), - initializer=Constant(1), - trainable=False) - self._accum = self.create_parameter( - shape=[1], attr=accum_attr, dtype=self._dtype) - self._accum.stop_gradient = True - if self._layer is not None: - setattr(self._layer, "_quant_out_accum", self._accum) - else: - self._accum = self._layer._quant_out_accum + accum_prefix = "{}.accum".format(name) if name else 'outscale.accum' + accum_attr = ParamAttr( + name=unique_name.generate(accum_prefix), + initializer=Constant(1), + trainable=False) + self._accum = self.create_parameter( + shape=[1], attr=accum_attr, dtype=dtype) + self._accum.stop_gradient = True def forward(self, input): if in_dygraph_mode(): @@ -567,18 +550,30 @@ def forward(self, input): not self.training) state = self._state if self.training else None accum = self._accum if self.training else None + quant_out = _varbase_creator( + type=input.type, + name="{}.tmp".format(input.name), + shape=input.shape, + dtype=input.dtype, + persistable=False) - self._scale, _, _ = core.ops.moving_average_abs_max_scale( - input, accum, state, self._scale, state, accum, *attrs) - return self._scale + out, _, _, _ = core.ops.moving_average_abs_max_scale( + input, accum, state, quant_out, self._scale, state, accum, + *attrs) + return out check_variable_and_dtype(input, 'input', ['float32', 'float64'], 'MovingAverageAbsMaxScale') attrs = {'moving_rate': self._moving_rate, 'is_test': not self.training} - inputs = {"X": [input]} - outputs = {"OutScale": [self._scale]} + quant_out = self._helper.create_variable( + name="{}.tmp".format(input.name), + 
dtype=input.dtype, + type=core.VarDesc.VarType.LOD_TENSOR, + persistable=False, + stop_gradient=False) + outputs = {"Out": [quant_out], "OutScale": [self._scale]} if self.training: inputs['InState'] = [self._state] @@ -592,4 +587,22 @@ def forward(self, input): outputs=outputs, attrs=attrs) - return self._scale + return quant_out + + +class QuantizedOutputLayer(layers.Layer): + def __init__(self, layer=None, moving_rate=0.9, dtype='float32'): + r""" + Add MovingAverageMaxScale layer to the behind of the input layer. + """ + super(QuantizedOutputLayer, self).__init__() + self._layer = layer + self._moving_average_abs_max_scale = \ + MovingAverageAbsMaxScale(layer.full_name(), moving_rate, dtype) + + def forward(self, input): + if isinstance(input, list): + assert len(input) == 1, \ + "The QuantizedOutputLayer should only have one input." + out = self._layer(input) + return self._moving_average_abs_max_scale(out) diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py b/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py index 090f6cda389af2..f45eb8c97f419e 100644 --- a/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py +++ b/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py @@ -13,22 +13,7 @@ # limitations under the License. 
import paddle - -op_real_in_out_name = { - "conv2d": [["Input", "Filter"], ["Output"]], - "depthwise_conv2d": [["Input", "Filter"], ["Output"]], - "pool2d": [["X"], ["Out"]], - "elementwise_add": [["X", "Y"], ["Out"]], - "softmax": [["X"], ["Out"]], - "relu": [["X"], ["Out"]], - "relu6": [["X"], ["Out"]], - "leaky_relu": [["X"], ["Out"]], - "prelu": [["X"], ["Out"]], - "tanh": [["X"], ["Out"]], - "batch_norm": [["X"], ["Y"]], - "sigmoid": [["X"], ["Out"]], - "swish": [["X"], ["Out"]], -} +import numpy as np quant_input_layers_map = { 'Conv2D': paddle.nn.Conv2D, @@ -85,3 +70,33 @@ "conv2d", "depthwise_conv2d", "matmul", "conv2d_transpose", "depthwise_conv2d_transpose" ] + + +def load_variable_data(scope, var_name): + ''' + Load variable value from scope + ''' + var_node = scope.find_var(var_name) + assert var_node is not None, \ + "Can not find " + var_name + " in the scope." + return np.array(var_node.get_tensor()) + + +def find_previous_op(block, var_name): + """ + Find the previous op for the input variable. + """ + for op in block.ops: + if var_name in op.output_arg_names: + return op + + +def find_next_ops(block, var_name): + """ + Find all followed ops for the input variable. 
+ """ + res_ops = [] + for op in block.ops: + if var_name in op.input_arg_names: + res_ops.append(op) + return res_ops diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_out_scale.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_out_scale.py index 600174e503feb2..8d6ce76ef0fa5f 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_imperative_out_scale.py +++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_out_scale.py @@ -478,30 +478,5 @@ def test_save_quantized_model(self): self.assertTrue(op_count == 14) -class TestSaveQuantizedModel_Warning(unittest.TestCase): - def test_warning(self): - path = "./dynamic_outscale_infer_model_with_warnings/lenet" - imperative_out_scale = ImperativeQuantAware() - with fluid.dygraph.guard(): - lenet = ImperativeLenet() - - with warnings.catch_warnings(record=True) as w: - warnings.simplefilter("always") - imperative_out_scale.save_quantized_model( - layer=lenet, - path=path, - input_spec=[ - paddle.static.InputSpec( - shape=[None, 1, 28, 28], dtype='float32') - ]) - - warning_message = "Warning: No Layer of the model while to be " \ - "saved contains the out_threshold attribute, so the " \ - "generated inference model would not contain the " \ - "out_threshold." 
- num = get_vaild_warning_num(warning_message, w) - assert num == 1 - - if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py b/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py index 01f0abe0f217c3..1d7bfc9f6963c6 100644 --- a/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py +++ b/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py @@ -166,12 +166,14 @@ def setUp(self): accum[0] = 1 state = np.zeros(1).astype("float32") state[0] = 1 + x = np.random.random((8, 16, 7, 7)).astype("float32") self.inputs = { - 'X': np.random.random((8, 16, 7, 7)).astype("float32"), + 'X': x, 'InAccum': accum, 'InState': state, } + out = x out_accum = np.zeros(1).astype("float32") out_state = np.zeros(1).astype("float32") out_scale = np.zeros(1).astype("float32") @@ -180,6 +182,7 @@ def setUp(self): out_state[0] = self.attrs['moving_rate'] * state[0] + 1 out_scale = out_accum / out_state self.outputs = { + 'Out': out, 'OutAccum': out_accum, 'OutState': out_state, 'OutScale': out_scale, From bfb5cf5567a604fded177d90d639f7337015e3fa Mon Sep 17 00:00:00 2001 From: zlsh80826 Date: Mon, 29 Mar 2021 10:16:54 +0800 Subject: [PATCH 07/15] [Paddle-TRT] trt affine channel converter (#31628) * trt affine channel converter * add trt affine channel base test * add trt affine channel NHWC * remove asterisk for python2 compatibility * trt affine channel converter * add trt affine channel base test * add trt affine channel NHWC * remove asterisk for python2 compatibility * fix rebase * move LodTensor to Tensor * add dbg info * affine channel converter only support NCHW * scale,bias are parameters, use create_parameters api * reduce test input size to not exceed the timelimit of ci * refine affine channel unittest and add serialization/dynamic test * change super to InferencePassTest for python2 compatibility * change super to InferencePassTest for python2 compatibility * fix affine channel fp16 serialize 
setting --- .../fluid/inference/api/analysis_predictor.cc | 1 + .../inference/tensorrt/convert/CMakeLists.txt | 1 + .../tensorrt/convert/affine_channel_op.cc | 94 ++++++++++++ paddle/fluid/inference/tensorrt/op_teller.cc | 10 +- .../inference/test_trt_affine_channel_op.py | 141 ++++++++++++++++++ 5 files changed, 246 insertions(+), 1 deletion(-) create mode 100644 paddle/fluid/inference/tensorrt/convert/affine_channel_op.cc create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_trt_affine_channel_op.py diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 0007582e2c73d2..76bf5948a2b98c 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -1192,6 +1192,7 @@ USE_TRT_CONVERTER(scale); USE_TRT_CONVERTER(stack); USE_TRT_CONVERTER(clip); USE_TRT_CONVERTER(gather); +USE_TRT_CONVERTER(affine_channel); USE_TRT_CONVERTER(multiclass_nms); USE_TRT_CONVERTER(nearest_interp); #endif diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index be7fa0548d9f34..6af76bd11cd59a 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -6,6 +6,7 @@ nv_library(tensorrt_converter shuffle_channel_op.cc swish_op.cc instance_norm_op.cc stack_op.cc transpose_op.cc flatten_op.cc emb_eltwise_layernorm.cc skip_layernorm.cc scale_op.cc slice_op.cc hard_sigmoid_op.cc hard_swish_op.cc clip_op.cc gather_op.cc + affine_channel_op.cc multiclass_nms_op.cc nearest_interp_op.cc DEPS tensorrt_engine tensorrt_plugin operator scope framework_proto op_registry) diff --git a/paddle/fluid/inference/tensorrt/convert/affine_channel_op.cc b/paddle/fluid/inference/tensorrt/convert/affine_channel_op.cc new file mode 100644 index 00000000000000..813342c08483b7 --- /dev/null +++ 
b/paddle/fluid/inference/tensorrt/convert/affine_channel_op.cc @@ -0,0 +1,94 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/data_layout.h" +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" + +namespace paddle { +namespace framework { +class Scope; + +namespace proto { +class OpDesc; +} // namespace proto +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace inference { +namespace tensorrt { + +/* + * Affine Channel Op + */ +class AffineChannelOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + VLOG(3) << "convert a fluid affine_channel op to tensorrt scale nd layer"; + + framework::OpDesc op_desc(op, nullptr); + std::string input_name = op_desc.Input("X").front(); + std::string scale_name = op_desc.Input("Scale").front(); + std::string bias_name = op_desc.Input("Bias").front(); + std::string output_name = op_desc.Output("Out").front(); + + auto input_tensor = engine_->GetITensor(input_name); + auto idim = input_tensor->getDimensions(); + + auto* scale_v = scope.FindVar(scale_name); + auto* scale_t = scale_v->GetMutable(); + float* scale_ptr = engine_->GetWeightCPUData(scale_name, scale_t, false); + + auto* bias_v = scope.FindVar(bias_name); + auto* bias_t = bias_v->GetMutable(); + float* bias_ptr = 
engine_->GetWeightCPUData(bias_name, bias_t, false); + + auto data_layout = framework::StringToDataLayout( + BOOST_GET_CONST(std::string, op_desc.GetAttr("data_layout"))); + + PADDLE_ENFORCE_EQ( + data_layout, framework::DataLayout::kNCHW, + platform::errors::InvalidArgument( + "TensorRT affine channel converter can only convert NCHW format. " + "Other format should be run in fluid mode. Report a bug on github " + "issue if you see this line.")); + + // tensorrt scalend layer only support spatial dims >= 2, + // so nhwc is not availabe (spatial dims == 0) + const int channel_axis = engine_->with_dynamic_shape(); + + TensorRTEngine::Weight scale_weights{nvinfer1::DataType::kFLOAT, + static_cast(scale_ptr), + (size_t)idim.d[channel_axis]}; + TensorRTEngine::Weight bias_weights{nvinfer1::DataType::kFLOAT, + static_cast(bias_ptr), + (size_t)idim.d[channel_axis]}; + TensorRTEngine::Weight power_weights{nvinfer1::DataType::kFLOAT, nullptr, + 0}; + + auto layer = TRT_ENGINE_ADD_LAYER(engine_, ScaleNd, *input_tensor, + nvinfer1::ScaleMode::kCHANNEL, + bias_weights.get(), scale_weights.get(), + power_weights.get(), channel_axis); + + RreplenishLayerAndOutput(layer, "affine_channel", {output_name}, test_mode); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(affine_channel, AffineChannelOpConverter); diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 82f58254fe8e0d..eb429405d18aeb 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -111,6 +111,7 @@ struct SimpleOpTypeSetTeller : public Teller { "flatten2", "flatten", "gather", + "affine_channel", "multiclass_nms", "nearest_interp", }; @@ -196,6 +197,13 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, if (!with_dynamic_shape || desc.Input("Axis").size() > 0) return false; } + if (op_type == "affine_channel") { + if 
(!desc.HasAttr("data_layout")) return false; + auto data_layout = framework::StringToDataLayout( + BOOST_GET_CONST(std::string, desc.GetAttr("data_layout"))); + if (data_layout != framework::DataLayout::kNCHW) return false; + } + if (op_type == "multiclass_nms") { if (with_dynamic_shape) return false; auto* block = desc.Block(); @@ -238,6 +246,7 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, return false; } } + if (op_type == "nearest_interp") { std::vector attrs{"data_layout", "interp_method", "align_corners", "scale", @@ -254,7 +263,6 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, BOOST_GET_CONST(std::string, desc.GetAttr("interp_method")); if (interp_method != "nearest") return false; } - if ((*teller)(op_type, desc, use_no_calib_int8)) return true; } return false; diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_affine_channel_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_affine_channel_op.py new file mode 100644 index 00000000000000..8bbba7c8b55fef --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_affine_channel_op.py @@ -0,0 +1,141 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import itertools +import numpy as np +from inference_pass_test import InferencePassTest +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.core import PassVersionChecker +from paddle.fluid.core import AnalysisConfig + + +class TRTAffineChannelTest(InferencePassTest): + def setUp(self): + self.bs = 2 + self.channel = 8 + self.height = 16 + self.width = 16 + self.data_layout = 'NCHW' + self.precision = AnalysisConfig.Precision.Float32 + self.serialize = False + self.enable_trt = True + + def build(self): + # set min_graph_size to 2, + # because affine channel doesn't support nhwc format + self.trt_parameters = InferencePassTest.TensorRTParam( + 1 << 30, self.bs, 2, self.precision, self.serialize, False) + + with fluid.program_guard(self.main_program, self.startup_program): + if self.data_layout == 'NCHW': + shape = [-1, self.channel, self.height, self.width] + else: + shape = [-1, self.height, self.width, self.channel] + + data = fluid.data(name='in', shape=shape, dtype='float32') + # set scale, bias by constant + scale = fluid.layers.create_parameter( + shape=[self.channel], + dtype='float32', + default_initializer=fluid.initializer.Constant(2.)) + bias = fluid.layers.create_parameter( + shape=[self.channel], + dtype='float32', + default_initializer=fluid.initializer.Constant(.5)) + affine_channel_out = fluid.layers.affine_channel( + data, scale=scale, bias=bias, data_layout=self.data_layout) + out = fluid.layers.batch_norm(affine_channel_out, is_test=True) + + shape[0] = self.bs + self.feeds = {'in': np.random.random(shape).astype('float32'), } + self.fetch_list = [out] + + def check_output(self): + if core.is_compiled_with_cuda(): + use_gpu = True + atol = 1e-5 + if self.trt_parameters.precision == AnalysisConfig.Precision.Half: + atol = 1e-3 + self.check_output_with_option(use_gpu, atol, flatten=True) + self.assertTrue( + 
PassVersionChecker.IsCompatible('tensorrt_subgraph_pass')) + + def run_test(self): + self.build() + self.check_output() + + def run_test_all(self): + precision_opt = [ + AnalysisConfig.Precision.Float32, AnalysisConfig.Precision.Half + ] + serialize_opt = [False, True] + + if self.data_layout == 'NCHW': + min_shape = [ + self.bs, self.channel, self.height // 2, self.width // 2 + ] + max_shape = [self.bs, self.channel, self.height * 2, self.width * 2] + opt_shape = [self.bs, self.channel, self.height, self.width] + + if self.data_layout == 'NHWC': + min_shape = [ + self.bs, self.height // 2, self.width // 2, self.channel + ] + max_shape = [self.bs, self.height * 2, self.width * 2, self.channel] + opt_shape = [self.bs, self.height, self.width, self.channel] + + dynamic_shape_profile = InferencePassTest.DynamicShapeParam({ + 'in': min_shape + }, {'in': max_shape}, {'in': opt_shape}, False) + dynamic_shape_opt = [None, dynamic_shape_profile] + + for precision, serialize, dynamic_shape in itertools.product( + precision_opt, serialize_opt, dynamic_shape_opt): + self.precision = precision + self.serialize = serialize + self.dynamic_shape_params = dynamic_shape + self.run_test() + + def test_base(self): + self.run_test() + + def test_fp16(self): + self.precision = AnalysisConfig.Precision.Half + self.run_test() + + def test_serialize(self): + self.serialize = True + self.run_test() + + def test_dynamic(self): + self.dynamic_shape_params = InferencePassTest.DynamicShapeParam({ + 'in': [self.bs, self.channel, self.height // 2, self.width // 2] + }, {'in': [self.bs, self.channel, self.height * 2, self.width * 2] + }, {'in': [self.bs, self.channel, self.height, self.width]}, False) + self.run_test() + + def test_nchw_all(self): + self.run_test_all() + + def test_nhwc(self): + self.data_layout = 'NHWC' + self.run_test_all() + + +if __name__ == "__main__": + unittest.main() From e3a38d790a0f275fd9332b5ce0ad152f74257b61 Mon Sep 17 00:00:00 2001 From: zlsh80826 Date: Mon, 29 Mar 
2021 14:16:56 +0800 Subject: [PATCH 08/15] [Paddle-TRT] roi_align_plugin (#31732) * add roi_align_plugin * add roi align unit_test * add roi align serialization * remove roi align static plugin because of batch dim issue * refine roi align unittest and add fp16/serialization * add trt roi align condition to op_teller * refine error message * remove unnecessary reshape layer --- .../fluid/inference/api/analysis_predictor.cc | 1 + .../inference/tensorrt/convert/CMakeLists.txt | 1 + .../tensorrt/convert/roi_align_op.cc | 86 ++++ paddle/fluid/inference/tensorrt/op_teller.cc | 24 ++ .../inference/tensorrt/plugin/CMakeLists.txt | 1 + .../tensorrt/plugin/roi_align_op_plugin.cu | 380 ++++++++++++++++++ .../tensorrt/plugin/roi_align_op_plugin.h | 112 ++++++ .../ir/inference/test_trt_roi_align_op.py | 119 ++++++ 8 files changed, 724 insertions(+) create mode 100644 paddle/fluid/inference/tensorrt/convert/roi_align_op.cc create mode 100644 paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.cu create mode 100644 paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.h create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_trt_roi_align_op.py diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 76bf5948a2b98c..7bb092d0e3c1c0 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -1192,6 +1192,7 @@ USE_TRT_CONVERTER(scale); USE_TRT_CONVERTER(stack); USE_TRT_CONVERTER(clip); USE_TRT_CONVERTER(gather); +USE_TRT_CONVERTER(roi_align); USE_TRT_CONVERTER(affine_channel); USE_TRT_CONVERTER(multiclass_nms); USE_TRT_CONVERTER(nearest_interp); diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index 6af76bd11cd59a..bc7b7355ea1922 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt 
@@ -6,6 +6,7 @@ nv_library(tensorrt_converter shuffle_channel_op.cc swish_op.cc instance_norm_op.cc stack_op.cc transpose_op.cc flatten_op.cc emb_eltwise_layernorm.cc skip_layernorm.cc scale_op.cc slice_op.cc hard_sigmoid_op.cc hard_swish_op.cc clip_op.cc gather_op.cc + roi_align_op.cc affine_channel_op.cc multiclass_nms_op.cc nearest_interp_op.cc diff --git a/paddle/fluid/inference/tensorrt/convert/roi_align_op.cc b/paddle/fluid/inference/tensorrt/convert/roi_align_op.cc new file mode 100644 index 00000000000000..1329608aecd205 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/roi_align_op.cc @@ -0,0 +1,86 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.h" + +namespace paddle { +namespace framework { +class Scope; + +namespace proto { +class OpDesc; +} // namespace proto +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace inference { +namespace tensorrt { + +/* + * Roi Align Op + */ +class RoiAlignOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + VLOG(3) << "convert a fluid roi align op to tensorrt plugin"; + + framework::OpDesc op_desc(op, nullptr); + std::string input_name = op_desc.Input("X").front(); + std::string rois_name = op_desc.Input("ROIs").front(); + std::string output_name = op_desc.Output("Out").front(); + + const auto pooled_height = + BOOST_GET_CONST(int, op_desc.GetAttr("pooled_height")); + const auto pooled_width = + BOOST_GET_CONST(int, op_desc.GetAttr("pooled_width")); + const auto spatial_scale = + BOOST_GET_CONST(float, op_desc.GetAttr("spatial_scale")); + const auto sampling_ratio = + BOOST_GET_CONST(int, op_desc.GetAttr("sampling_ratio")); + + const auto input_tensor = engine_->GetITensor(input_name); + const auto rois_tensor = engine_->GetITensor(rois_name); + + const nvinfer1::DataType data_type_ = engine_->WithFp16() + ? 
nvinfer1::DataType::kHALF + : nvinfer1::DataType::kFLOAT; + + std::vector inputs{input_tensor, rois_tensor}; + nvinfer1::ILayer* layer = nullptr; + + PADDLE_ENFORCE_EQ( + engine_->with_dynamic_shape(), true, + platform::errors::InvalidArgument( + "TRT roi align plugin only accept the dynamic shape, because that " + "the roi_align will change the batch size.")); + + auto* roi_align_plugin = new plugin::RoiAlignPluginDynamic( + data_type_, pooled_height, pooled_width, spatial_scale, sampling_ratio); + auto roi_align_layer = engine_->network()->addPluginV2( + inputs.data(), inputs.size(), *roi_align_plugin); + layer = roi_align_layer; + + std::vector output_names{output_name}; + RreplenishLayerAndOutput(layer, "roi_align", output_names, test_mode); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(roi_align, RoiAlignOpConverter); diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index eb429405d18aeb..7c1b2e8001edbd 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -111,6 +111,7 @@ struct SimpleOpTypeSetTeller : public Teller { "flatten2", "flatten", "gather", + "roi_align", "affine_channel", "multiclass_nms", "nearest_interp", @@ -263,6 +264,29 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, BOOST_GET_CONST(std::string, desc.GetAttr("interp_method")); if (interp_method != "nearest") return false; } + + if (op_type == "roi_align") { + if (!with_dynamic_shape) return false; + + std::vector attrs{"pooled_height", "pooled_width", + "spatial_scale", "sampling_ratio"}; + for (auto const attr : attrs) { + if (!desc.HasAttr(attr)) return false; + } + + const auto pooled_height = + BOOST_GET_CONST(int, desc.GetAttr("pooled_height")); + if (pooled_height <= 0) return false; + + const auto pooled_width = + BOOST_GET_CONST(int, desc.GetAttr("pooled_width")); + if 
(pooled_width <= 0) return false; + + const auto spatial_scale = + BOOST_GET_CONST(float, desc.GetAttr("spatial_scale")); + if (spatial_scale <= 0.f) return false; + } + if ((*teller)(op_type, desc, use_no_calib_int8)) return true; } return false; diff --git a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt index 7ee16a598d2d01..4107f9ef674339 100644 --- a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt @@ -5,6 +5,7 @@ nv_library(tensorrt_plugin instance_norm_op_plugin.cu emb_eltwise_layernorm_plugin.cu qkv_to_context_plugin.cu skip_layernorm_op_plugin.cu slice_op_plugin.cu hard_swish_op_plugin.cu stack_op_plugin.cu special_slice_plugin.cu + roi_align_op_plugin.cu DEPS enforce tensorrt_engine prelu tensor bert_encoder_functor) nv_test(test_split_plugin SRCS test_split_plugin.cc DEPS diff --git a/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.cu new file mode 100644 index 00000000000000..42c0df41a1b5ef --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.cu @@ -0,0 +1,380 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include + +#include "paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.h" +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +template +__inline__ __device__ T BilinearInterpolate(const T* input_data, + const int height, const int width, + T y, T x) { + if (y < -1.f || y > height || x < -1.f || x > width) return 0; + y = y <= 0.f ? 0.f : y; + x = x <= 0.f ? 0.f : x; + int y_low = static_cast(y); + int x_low = static_cast(x); + int y_high; + int x_high; + if (y_low >= height - 1) { + y_high = y_low = height - 1; + y = static_cast(y_low); + } else { + y_high = y_low + 1; + } + if (x_low >= width - 1) { + x_high = x_low = width - 1; + x = static_cast(x_low); + } else { + x_high = x_low + 1; + } + T ly = y - y_low, lx = x - x_low; + T hy = 1.f - ly, hx = 1.f - lx; + T v1 = input_data[y_low * width + x_low]; + T v2 = input_data[y_low * width + x_high]; + T v3 = input_data[y_high * width + x_low]; + T v4 = input_data[y_high * width + x_high]; + T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; + T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + return val; +} + +template +__global__ void GPUROIAlignOpt(const int nthreads, + const T* __restrict__ input_data, + const T* __restrict__ input_rois, + const float spatial_scale, const int channels, + const int height, const int width, + const int pooled_height, const int pooled_width, + const int sampling_ratio, const int num_rois, + OutT* __restrict__ output_data) { + const int batch = blockIdx.x; + const int channel = blockIdx.y; + const T* offset_input_data = + input_data + (batch * channels + channel) * height * width; + extern __shared__ T s_input_data[]; + if (USE_SMEM) { + for (int idx = threadIdx.x; idx < height * width; idx += blockDim.x) { + s_input_data[idx] = offset_input_data[idx]; + } + __syncthreads(); + } + for (int idx = threadIdx.x; idx < num_rois * 
pooled_height * pooled_width; + idx += blockDim.x) { + const int pw = idx % pooled_width; + const int ph = (idx / pooled_width) % pooled_height; + const int roi_idx = (idx / pooled_width / pooled_height) % num_rois; + const int n = batch * num_rois + roi_idx; + const float4 rois_offset = reinterpret_cast(input_rois)[n]; + const T roi_xmin = rois_offset.x * spatial_scale; + const T roi_ymin = rois_offset.y * spatial_scale; + const T roi_xmax = rois_offset.z * spatial_scale; + const T roi_ymax = rois_offset.w * spatial_scale; + const T roi_width = max(roi_xmax - roi_xmin, static_cast(1.f)); + const T roi_height = max(roi_ymax - roi_ymin, static_cast(1.f)); + const T bin_size_h = roi_height / static_cast(pooled_height); + const T bin_size_w = roi_width / static_cast(pooled_width); + const int roi_bin_grid_h = (sampling_ratio > 0) + ? sampling_ratio + : ceil(roi_height / pooled_height); + const int roi_bin_grid_w = + (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); + const T count = roi_bin_grid_h * roi_bin_grid_w; + + T output_val = 0.f; + for (int iy = 0; iy < roi_bin_grid_h; ++iy) { + const T y = roi_ymin + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); + for (int ix = 0; ix < roi_bin_grid_w; ++ix) { + const T x = roi_xmin + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); + if (USE_SMEM) { + T val = BilinearInterpolate(s_input_data, height, width, y, x); + output_val += val; + } else { + T val = + BilinearInterpolate(offset_input_data, height, width, y, x); + output_val += val; + } + } + } + output_val /= count; + const int out_offset = + batch * num_rois * channels * pooled_height * pooled_width + + roi_idx * channels * pooled_height * pooled_width + + channel * pooled_height * pooled_width + ph * pooled_width + pw; + output_data[out_offset] = static_cast(output_val); + } +} + +#if IS_TRT_VERSION_GE(6000) +RoiAlignPluginDynamic::RoiAlignPluginDynamic(const 
nvinfer1::DataType data_type, + const int pooled_height, + const int pooled_width, + float spatial_scale, + int sampling_ratio) + : data_type_(data_type), + pooled_height_(pooled_height), + pooled_width_(pooled_width), + spatial_scale_(spatial_scale), + sampling_ratio_(sampling_ratio) { + bool data_type_is_valid = data_type_ == nvinfer1::DataType::kFLOAT || + data_type_ == nvinfer1::DataType::kHALF; + PADDLE_ENFORCE_EQ(data_type_is_valid, true, + platform::errors::InvalidArgument( + "TRT RoiAlign plugin only accepts kFLOAT(%d) or " + "kHALF(%d) data type, but the received data type = %d", + static_cast(nvinfer1::DataType::kFLOAT), + static_cast(nvinfer1::DataType::kHALF), + static_cast(data_type_))); + + PADDLE_ENFORCE_GT(pooled_height_, 0, + platform::errors::InvalidArgument( + "TRT RoiAlign plugin only accepts pooled_height " + "greater than %d, but the received pooled_height = %d", + 0, pooled_height_)); + + PADDLE_ENFORCE_GT(pooled_width_, 0, + platform::errors::InvalidArgument( + "TRT RoiAlign plugin only accepts pooled_width greater " + "than %d, but the received pooled_width = %d", + 0, pooled_height_)); + + PADDLE_ENFORCE_GT(spatial_scale_, 0.f, + platform::errors::InvalidArgument( + "TRT RoiAlign plugin only accepts spatial_scale " + "greater than %f, but the received spatial_scale = %f", + 0, spatial_scale_)); + + int smem_per_block = -1; + int device = -1; + cudaGetDevice(&device); + + PADDLE_ENFORCE_GE( + device, 0, + platform::errors::InvalidArgument( + "The cuda device ID should be greater than %d, but device ID is %d", + 0, device)); + + cudaDeviceGetAttribute(&smem_per_block, cudaDevAttrMaxSharedMemoryPerBlock, + device); + smem_per_block_ = smem_per_block; +} + +RoiAlignPluginDynamic::RoiAlignPluginDynamic(void const* data, size_t length) { + DeserializeValue(&data, &length, &data_type_); + DeserializeValue(&data, &length, &pooled_height_); + DeserializeValue(&data, &length, &pooled_width_); + DeserializeValue(&data, &length, &spatial_scale_); + 
DeserializeValue(&data, &length, &sampling_ratio_); + int smem_per_block = -1; + int device = -1; + cudaGetDevice(&device); + PADDLE_ENFORCE_GE( + device, 0, + platform::errors::InvalidArgument( + "The cuda device ID should be greater than %d, but device ID is %d", + 0, device)); + cudaDeviceGetAttribute(&smem_per_block, cudaDevAttrMaxSharedMemoryPerBlock, + device); + smem_per_block_ = smem_per_block; +} + +nvinfer1::IPluginV2DynamicExt* RoiAlignPluginDynamic::clone() const { + auto* plugin = + new RoiAlignPluginDynamic(data_type_, pooled_height_, pooled_width_, + spatial_scale_, sampling_ratio_); + plugin->setPluginNamespace(namespace_.c_str()); + return plugin; +} + +nvinfer1::DimsExprs RoiAlignPluginDynamic::getOutputDimensions( + int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs, + nvinfer1::IExprBuilder& exprBuilder) { + nvinfer1::DimsExprs ret{}; + ret.nbDims = 4; + ret.d[0] = inputs[1].d[0]; // roi + ret.d[1] = inputs[0].d[1]; // X + ret.d[2] = exprBuilder.constant(pooled_height_); + ret.d[3] = exprBuilder.constant(pooled_width_); + return ret; +} + +bool RoiAlignPluginDynamic::supportsFormatCombination( + int pos, const nvinfer1::PluginTensorDesc* inOut, int nbInputs, + int nbOutputs) { + if (inOut[pos].format != nvinfer1::TensorFormat::kLINEAR) { + return false; + } + if (pos < 2) { // input + return inOut[pos].type == nvinfer1::DataType::kFLOAT; + } + return inOut[pos].type == data_type_; +} + +void RoiAlignPluginDynamic::configurePlugin( + const nvinfer1::DynamicPluginTensorDesc* in, int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* out, int nbOutputs) {} + +size_t RoiAlignPluginDynamic::getWorkspaceSize( + const nvinfer1::PluginTensorDesc* inputs, int nbInputs, + const nvinfer1::PluginTensorDesc* outputs, int nbOutputs) const { + return 0; +} + +template +int RoiAlignPluginDynamic::enqueue_impl( + const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, const void* const* inputs, + void* const* 
outputs, void* workspace, cudaStream_t stream) { + auto in_dims = inputDesc[0].dims; + auto rois_dims = inputDesc[1].dims; + auto out_dims = outputDesc[0].dims; + + int rois_num = rois_dims.d[0]; + if (rois_num == 0) return cudaGetLastError() != cudaSuccess; + + int batch = in_dims.d[0]; + int channels = in_dims.d[1]; + int height = in_dims.d[2]; + int width = in_dims.d[3]; + + int output_size = + out_dims.d[0] * out_dims.d[1] * out_dims.d[2] * out_dims.d[3]; + + const dim3 blocks(batch, channels); + const int threads = 512; + + if (smem_per_block_ < width * height * sizeof(T)) { + GPUROIAlignOpt<<>>( + output_size, static_cast(inputs[0]), + static_cast(inputs[1]), spatial_scale_, channels, height, + width, pooled_height_, pooled_width_, sampling_ratio_, rois_num / batch, + static_cast(outputs[0])); + } else { + GPUROIAlignOpt< + T, OutT, true><<>>( + output_size, static_cast(inputs[0]), + static_cast(inputs[1]), spatial_scale_, channels, height, + width, pooled_height_, pooled_width_, sampling_ratio_, rois_num / batch, + static_cast(outputs[0])); + } + + return cudaGetLastError() != cudaSuccess; +} + +int RoiAlignPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, + void* const* outputs, void* workspace, + cudaStream_t stream) { + PADDLE_ENFORCE_EQ(outputDesc[0].type, data_type_, + platform::errors::InvalidArgument( + "TRT RoiAlignPluginDynamic expects outputDesc[0].type " + "equal to data_type_")); + + if (data_type_ == nvinfer1::DataType::kHALF) { + return enqueue_impl(inputDesc, outputDesc, inputs, outputs, + workspace, stream); + } + return enqueue_impl(inputDesc, outputDesc, inputs, outputs, + workspace, stream); +} + +nvinfer1::DataType RoiAlignPluginDynamic::getOutputDataType( + int index, const nvinfer1::DataType* inputTypes, int nbInputs) const { + return data_type_; +} + +const char* RoiAlignPluginDynamic::getPluginType() const { + return 
"roi_align_plugin_dynamic"; +} + +int RoiAlignPluginDynamic::getNbOutputs() const { return 1; } + +int RoiAlignPluginDynamic::initialize() { return 0; } + +void RoiAlignPluginDynamic::terminate() {} + +size_t RoiAlignPluginDynamic::getSerializationSize() const { + size_t serialize_size = 0; + serialize_size += SerializedSize(data_type_); + serialize_size += SerializedSize(pooled_height_); + serialize_size += SerializedSize(pooled_width_); + serialize_size += SerializedSize(spatial_scale_); + serialize_size += SerializedSize(sampling_ratio_); + return serialize_size; +} + +void RoiAlignPluginDynamic::serialize(void* buffer) const { + SerializeValue(&buffer, data_type_); + SerializeValue(&buffer, pooled_height_); + SerializeValue(&buffer, pooled_width_); + SerializeValue(&buffer, spatial_scale_); + SerializeValue(&buffer, sampling_ratio_); +} + +void RoiAlignPluginDynamic::destroy() {} + +RoiAlignPluginDynamicCreator::RoiAlignPluginDynamicCreator() {} + +void RoiAlignPluginDynamicCreator::setPluginNamespace( + const char* lib_namespace) { + namespace_ = std::string(lib_namespace); +} + +const char* RoiAlignPluginDynamicCreator::getPluginNamespace() const { + return namespace_.c_str(); +} + +const char* RoiAlignPluginDynamicCreator::getPluginName() const { + return "roi_align_plugin_dynamic"; +} + +const char* RoiAlignPluginDynamicCreator::getPluginVersion() const { + return "1"; +} + +const nvinfer1::PluginFieldCollection* +RoiAlignPluginDynamicCreator::getFieldNames() { + return &field_collection_; +} + +nvinfer1::IPluginV2Ext* RoiAlignPluginDynamicCreator::createPlugin( + const char* name, const nvinfer1::PluginFieldCollection* fc) { + const nvinfer1::PluginField* fields = fc->fields; +} + +nvinfer1::IPluginV2Ext* RoiAlignPluginDynamicCreator::deserializePlugin( + const char* name, const void* serial_data, size_t serial_length) { + auto plugin = new RoiAlignPluginDynamic(serial_data, serial_length); + plugin->setPluginNamespace(namespace_.c_str()); + return plugin; 
+} +#endif + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.h new file mode 100644 index 00000000000000..bba7d0d5a99664 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.h @@ -0,0 +1,112 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include + +#include "paddle/fluid/inference/tensorrt/engine.h" +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +#if IS_TRT_VERSION_GE(6000) +class RoiAlignPluginDynamic : public DynamicPluginTensorRT { + public: + explicit RoiAlignPluginDynamic(const nvinfer1::DataType data_type, + const int pooled_height, + const int pooled_width, float spatial_scale, + int sampling_ratio); + RoiAlignPluginDynamic(void const* data, size_t length); + ~RoiAlignPluginDynamic() = default; + nvinfer1::IPluginV2DynamicExt* clone() const override; + nvinfer1::DimsExprs getOutputDimensions( + int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs, + nvinfer1::IExprBuilder& exprBuilder) override; + bool supportsFormatCombination(int pos, + const nvinfer1::PluginTensorDesc* inOut, + int nbInputs, int nbOutputs) override; + void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, + int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* out, + int nbOutputs) override; + size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, + int nbInputs, + const nvinfer1::PluginTensorDesc* outputs, + int nbOutputs) const override; + int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, void* const* outputs, void* workspace, + cudaStream_t stream) override; + + nvinfer1::DataType getOutputDataType(int index, + const nvinfer1::DataType* inputTypes, + int nbInputs) const override; + + const char* getPluginType() const override; + int getNbOutputs() const override; + int initialize() override; + void terminate() override; + size_t getSerializationSize() const override; + void serialize(void* buffer) const override; + void destroy() override; + + private: + template + int enqueue_impl(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* 
outputDesc, + const void* const* inputs, void* const* outputs, + void* workspace, cudaStream_t stream); + + nvinfer1::DataType data_type_; + int pooled_height_; + int pooled_width_; + float spatial_scale_; + int sampling_ratio_; + int smem_per_block_; + std::string namespace_; +}; + +class RoiAlignPluginDynamicCreator : public nvinfer1::IPluginCreator { + public: + RoiAlignPluginDynamicCreator(); + ~RoiAlignPluginDynamicCreator() override = default; + + void setPluginNamespace(const char* lib_namespace) override; + const char* getPluginNamespace() const override; + const char* getPluginName() const override; + const char* getPluginVersion() const override; + const nvinfer1::PluginFieldCollection* getFieldNames() override; + + nvinfer1::IPluginV2Ext* createPlugin( + const char* name, const nvinfer1::PluginFieldCollection* fc) override; + nvinfer1::IPluginV2Ext* deserializePlugin(const char* name, + const void* serial_data, + size_t serial_length) override; + + private: + std::string namespace_; + nvinfer1::PluginFieldCollection field_collection_; +}; +REGISTER_TRT_PLUGIN_V2(RoiAlignPluginDynamicCreator); +#endif + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_roi_align_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_roi_align_op.py new file mode 100644 index 00000000000000..fa276dd342bc6b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_roi_align_op.py @@ -0,0 +1,119 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +from inference_pass_test import InferencePassTest +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.core import PassVersionChecker +from paddle.fluid.core import AnalysisConfig + + +class TRTRoiAlignTest(InferencePassTest): + def setUp(self): + self.bs = 2 + self.num_rois = 4 + self.channel = 16 + self.height = 32 + self.width = 32 + self.precision = AnalysisConfig.Precision.Float32 + self.serialize = False + self.enable_trt = True + + def build(self): + self.trt_parameters = TRTRoiAlignTest.TensorRTParam( + 1 << 30, self.bs * self.num_rois, 1, self.precision, self.serialize, + False) + with fluid.program_guard(self.main_program, self.startup_program): + data_shape = [-1, self.channel, self.height, self.width] + data = fluid.data(name='data', shape=data_shape, dtype='float32') + rois = fluid.data( + name='rois', shape=[-1, 4], dtype='float32', lod_level=1) + roi_align_out = fluid.layers.roi_align(data, rois) + out = fluid.layers.batch_norm(roi_align_out, is_test=True) + + rois_lod = fluid.create_lod_tensor( + np.random.random([self.bs * self.num_rois, 4]).astype('float32'), + [[self.num_rois, self.num_rois]], fluid.CPUPlace()) + + data_shape[0] = self.bs + self.feeds = { + 'data': np.random.random(data_shape).astype('float32'), + 'rois': rois_lod, + } + self.fetch_list = [out] + + def check_output(self): + if core.is_compiled_with_cuda(): + use_gpu = True + atol = 1e-5 + if self.trt_parameters.precision == AnalysisConfig.Precision.Half: + atol = 
1e-3 + self.check_output_with_option(use_gpu, atol, flatten=True) + self.assertTrue( + PassVersionChecker.IsCompatible('tensorrt_subgraph_pass')) + + def set_dynamic(self): + min_shape_spec = dict() + max_shape_spec = dict() + opt_shape_spec = dict() + min_shape_spec['data'] = [ + self.bs, self.channel, self.height // 2, self.width // 2 + ] + min_shape_spec['rois'] = [1, 4] + max_shape_spec[ + 'data'] = [self.bs, self.channel, self.height * 2, self.width * 2] + max_shape_spec['rois'] = [self.bs * self.num_rois, 4] + opt_shape_spec[ + 'data'] = [self.bs, self.channel, self.height, self.width] + opt_shape_spec['rois'] = [self.bs * self.num_rois, 4] + + self.dynamic_shape_params = InferencePassTest.DynamicShapeParam( + min_shape_spec, max_shape_spec, opt_shape_spec, False) + + def run_test(self): + self.build() + self.check_output() + + def test_base(self): + self.run_test() + + def test_fp16(self): + self.precision = AnalysisConfig.Precision.Half + self.run_test() + + def test_serialize(self): + self.serialize = True + self.run_test() + + def test_dynamic(self): + self.set_dynamic() + self.run_test() + + def test_dynamic_fp16(self): + self.set_dynamic() + self.precision = AnalysisConfig.Precision.Half + self.run_test() + + def test_dynamic_serialize(self): + self.set_dynamic() + self.serialize = True + self.run_test() + + +if __name__ == "__main__": + unittest.main() From 51eb29de18adcf8c20272218f105eb1c2135cc09 Mon Sep 17 00:00:00 2001 From: Jiabin Yang Date: Mon, 29 Mar 2021 14:17:54 +0800 Subject: [PATCH 09/15] [CustomOP] Add shape related constructor for Tensor (#31681) * give shape related contructor and reshape warning * change line num to fit ut * change ut to fit * remove useless code * call resize directly in constructor --- paddle/fluid/extension/include/ext_tensor.h | 3 +++ paddle/fluid/extension/src/ext_tensor.cc | 21 ++++++++++++++++++- paddle/fluid/framework/custom_tensor_utils.h | 2 +- .../fluid/tests/custom_op/custom_relu_op.cc | 3 +-- 
.../custom_op/test_custom_relu_op_jit.py | 4 ++-- 5 files changed, 27 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/extension/include/ext_tensor.h b/paddle/fluid/extension/include/ext_tensor.h index be492a6d5535d1..52606b2a7f59e0 100644 --- a/paddle/fluid/extension/include/ext_tensor.h +++ b/paddle/fluid/extension/include/ext_tensor.h @@ -52,6 +52,9 @@ class PD_DLL_DECL Tensor { /// \brief Construct a Tensor on target Place for CustomOp. /// Generally it's only used for user to create Tensor. explicit Tensor(const PlaceType& place); + /// \brief Construct a Tensor on target Place with shape for CustomOp. + /// Generally it's only used for user to create Tensor. + Tensor(const PlaceType& place, const std::vector& shape); /// \brief Reset the shape of the tensor. /// Generally it's only used for the input tensor. /// Reshape must be called before calling diff --git a/paddle/fluid/extension/src/ext_tensor.cc b/paddle/fluid/extension/src/ext_tensor.cc index 0cae8f4af7b97d..e9705e2101cc3c 100644 --- a/paddle/fluid/extension/src/ext_tensor.cc +++ b/paddle/fluid/extension/src/ext_tensor.cc @@ -102,13 +102,32 @@ void GpuCopy(T *src, T *dst, PlaceType src_plc, PlaceType dst_plc, void Tensor::reshape(const std::vector &shape) { GET_CASTED_TENSOR - tensor->Resize(framework::make_ddim(shape)); + auto new_dim = framework::make_ddim(shape); + if (tensor->numel() != framework::product(new_dim)) { + LOG(WARNING) << "Custom Op: Calling reshape to a new shape which is bigger " + "or smaller" + << "than original shape will not change your tensor's memory " + "Please call" + << "paddle::Tensor::mutable_data() after to reallocate " + "your tensor's size." 
+ << std::endl; + } + tensor->Resize(new_dim); } Tensor::Tensor(const PlaceType &place) : tensor_(std::make_shared()), place_(place), stream_(StreamWrapper()) {} + +Tensor::Tensor(const PlaceType &place, const std::vector &shape) + : tensor_(std::make_shared()), + place_(place), + stream_(StreamWrapper()) { + GET_CASTED_TENSOR + tensor->Resize(framework::make_ddim(shape)); +} + template T *Tensor::mutable_data(const PlaceType &place) { place_ = place; diff --git a/paddle/fluid/framework/custom_tensor_utils.h b/paddle/fluid/framework/custom_tensor_utils.h index fad1e3ee3496cd..809a6b965aad9b 100644 --- a/paddle/fluid/framework/custom_tensor_utils.h +++ b/paddle/fluid/framework/custom_tensor_utils.h @@ -37,7 +37,7 @@ class CustomTensorUtils { /// \brief Share data FROM another tensor. /// Use this to pass tensor from op to op /// \return void. - static void ShareDataFrom(const void* src, const Tensor& dst); + static void ShareDataFrom(const void* src, const paddle::Tensor& dst); static framework::proto::VarType::Type ConvertEnumDTypeToInnerDType( const paddle::DataType& dtype) { diff --git a/python/paddle/fluid/tests/custom_op/custom_relu_op.cc b/python/paddle/fluid/tests/custom_op/custom_relu_op.cc index c0b30a1cb5579c..c075d27f7b1763 100644 --- a/python/paddle/fluid/tests/custom_op/custom_relu_op.cc +++ b/python/paddle/fluid/tests/custom_op/custom_relu_op.cc @@ -38,9 +38,8 @@ void relu_cpu_backward_kernel(const data_t* grad_out_data, } std::vector relu_cpu_forward(const paddle::Tensor& x) { - auto out = paddle::Tensor(paddle::PlaceType::kCPU); + auto out = paddle::Tensor(paddle::PlaceType::kCPU, x.shape()); - out.reshape(x.shape()); PD_DISPATCH_FLOATING_TYPES( x.type(), "relu_cpu_forward", ([&] { relu_cpu_forward_kernel( diff --git a/python/paddle/fluid/tests/custom_op/test_custom_relu_op_jit.py b/python/paddle/fluid/tests/custom_op/test_custom_relu_op_jit.py index 23733d20841b3a..641630b0f4476a 100644 --- 
a/python/paddle/fluid/tests/custom_op/test_custom_relu_op_jit.py +++ b/python/paddle/fluid/tests/custom_op/test_custom_relu_op_jit.py @@ -103,11 +103,11 @@ def test_exception(self): in str(e)) if IS_WINDOWS: self.assertTrue( - r"python\paddle\fluid\tests\custom_op\custom_relu_op.cc:48" + r"python\paddle\fluid\tests\custom_op\custom_relu_op.cc:47" in str(e)) else: self.assertTrue( - "python/paddle/fluid/tests/custom_op/custom_relu_op.cc:48" + "python/paddle/fluid/tests/custom_op/custom_relu_op.cc:47" in str(e)) self.assertTrue(caught_exception) From 61805d8f0aa304f4226e5793b97da97552a43282 Mon Sep 17 00:00:00 2001 From: Shang Zhizhou Date: Mon, 29 Mar 2021 17:11:26 +0800 Subject: [PATCH 10/15] fix cmake model path (#31866) * fix cmake model path * update cmake * fix unittest * fix unittest --- paddle/fluid/inference/tests/api/CMakeLists.txt | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 92f9c20a369d7a..75628adbe8a859 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -530,7 +530,7 @@ if(WITH_GPU AND TENSORRT_FOUND) ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models) set(TRT_MODEL_QUANT_RESNET_DIR "${INFERENCE_DEMO_INSTALL_DIR}/small_quant_model") - if (NOT EXISTS ${TRT_MODEL_QUANT_RESNET_DIR}/small_quant_model.tgz) + if (NOT EXISTS ${INFERENCE_DEMO_INSTALL_DIR}/small_quant_model.tgz) inference_download_and_uncompress(${INFERENCE_DEMO_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "small_quant_model.tgz") endif() inference_analysis_test(trt_quant_int8_test SRCS trt_quant_int8_test.cc @@ -538,7 +538,7 @@ if(WITH_GPU AND TENSORRT_FOUND) ARGS --infer_model=${TRT_MODEL_QUANT_RESNET_DIR}) set(TRT_MODEL_QUANT_YOLOV3_DIR "${INFERENCE_DEMO_INSTALL_DIR}/yolov3_r50_quant_aware") - if (NOT EXISTS ${TRT_MODEL_QUANT_YOLOV3_DIR}/yolov3_r50_quant_aware.tgz) + if (NOT 
EXISTS ${INFERENCE_DEMO_INSTALL_DIR}/yolov3_r50_quant_aware.tgz) inference_download_and_uncompress(${INFERENCE_DEMO_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "yolov3_r50_quant_aware.tgz") endif() inference_analysis_test(trt_quant_int8_yolov3_r50_test SRCS trt_quant_int8_yolov3_r50_test.cc @@ -576,8 +576,7 @@ if(WITH_GPU AND TENSORRT_FOUND) EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${TEST_TRT_TRANSFORMER_PRUNE_MODEL}/transformer_prune) - set(TEST_TRT_ERNIE_UNSER_MODEL "${TRT_MODEL_INSTALL_DIR}/ernie_test/ernie_model_4_unserialized/") - if (NOT EXISTS ${TEST_TRT_ERNIE_UNSER_MODEL}/ernie_model_4_unserialized.tgz) + if (NOT EXISTS ${TEST_TRT_ERNIE_MODEL}/ernie_model_4_unserialized.tgz) inference_download_and_uncompress(${TEST_TRT_ERNIE_MODEL} ${INFERENCE_URL}/tensorrt_test "ernie_model_4_unserialized.tgz") endif() @@ -585,8 +584,7 @@ if(WITH_GPU AND TENSORRT_FOUND) EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${TEST_TRT_ERNIE_MODEL}/ernie_model_4_unserialized) - set(TEST_TRT_ERNIE_UNSER_FP16_MODEL "${TRT_MODEL_INSTALL_DIR}/ernie_test/ernie_model_4_fp16_unserialized/") - if (NOT EXISTS ${TEST_TRT_ERNIE_UNSER_FP16_MODEL}/ernie_model_4_unserialized.tgz) + if (NOT EXISTS ${TEST_TRT_ERNIE_MODEL}/ernie_model_4_fp16_unserialized.tgz) inference_download_and_uncompress(${TEST_TRT_ERNIE_MODEL} ${INFERENCE_URL}/tensorrt_test "ernie_model_4_fp16_unserialized.tgz") endif() From 123949eb48378262c888bf2e5aa3f2127e6bf32f Mon Sep 17 00:00:00 2001 From: ronnywang <524019753@qq.com> Date: Mon, 29 Mar 2021 17:41:31 +0800 Subject: [PATCH 11/15] [ROCM] added a cudnn switch of conv2d for rocm platform (#31836) --- paddle/fluid/platform/flags.cc | 12 +++++++ .../pybind/global_value_getter_setter.cc | 4 ++- python/paddle/fluid/__init__.py | 1 + python/paddle/fluid/layers/nn.py | 4 +++ .../fluid/tests/unittests/test_conv2d_op.py | 36 +++++++++++++++++++ python/paddle/nn/layer/conv.py | 5 +++ 6 files changed, 61 insertions(+), 1 deletion(-) diff --git 
a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc index 1a55562f2b8244..fa77c0be037df3 100644 --- a/paddle/fluid/platform/flags.cc +++ b/paddle/fluid/platform/flags.cc @@ -564,3 +564,15 @@ DEFINE_string(tracer_mkldnn_ops_on, "", */ DEFINE_string(tracer_mkldnn_ops_off, "", "List of OneDNN operation types to be turned off"); + +/** + * CUDNN related FLAG + * Name: conv2d_disable_cudnn + * Since Version: + * Value Range: bool, default=false + * Example: + * Note: Disable cudnn in conv2d. + */ +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +DEFINE_bool(conv2d_disable_cudnn, false, "Disable cudnn in conv2d"); +#endif diff --git a/paddle/fluid/pybind/global_value_getter_setter.cc b/paddle/fluid/pybind/global_value_getter_setter.cc index 6074d191ad2be0..e8ba16398d2b00 100644 --- a/paddle/fluid/pybind/global_value_getter_setter.cc +++ b/paddle/fluid/pybind/global_value_getter_setter.cc @@ -72,6 +72,7 @@ DECLARE_uint64(conv_workspace_size_limit); DECLARE_bool(cudnn_batchnorm_spatial_persistent); DECLARE_bool(cudnn_deterministic); DECLARE_bool(cudnn_exhaustive_search); +DECLARE_bool(conv2d_disable_cudnn); // data processing DECLARE_bool(enable_cublas_tensor_op_math); // device management @@ -367,7 +368,8 @@ static void RegisterGlobalVarGetterSetter() { FLAGS_fraction_of_cuda_pinned_memory_to_use, FLAGS_fraction_of_gpu_memory_to_use, FLAGS_initial_gpu_memory_in_mb, FLAGS_reallocate_gpu_memory_in_mb, FLAGS_enable_cublas_tensor_op_math, - FLAGS_selected_gpus, FLAGS_sync_nccl_allreduce); + FLAGS_selected_gpus, FLAGS_sync_nccl_allreduce, + FLAGS_conv2d_disable_cudnn); #endif #ifdef PADDLE_WITH_XPU REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_selected_xpus); diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index b24da29d0f5fdd..ae3418687853b9 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -230,6 +230,7 @@ def __bootstrap__(): 'gpu_allocator_retry_time', 'local_exe_sub_scope_limit', 
'gpu_memory_limit_mb', + 'conv2d_disable_cudnn', ] core.init_gflags(["--tryfromenv=" + ",".join(read_env_flags)]) core.init_glog(sys.argv[0]) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 00d1db19fc2f51..6bc69ffd5cd324 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -1603,6 +1603,10 @@ def _get_default_param_initializer(): pre_bias = helper.create_variable_for_type_inference(dtype) + if (core.is_compiled_with_cuda() and paddle.fluid.get_flags( + "FLAGS_conv2d_disable_cudnn")["FLAGS_conv2d_disable_cudnn"]): + use_cudnn = False + helper.append_op( type=l_type, inputs={ diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_op.py index 29c35d28d4d2e3..83bba0b0ca1c3a 100644 --- a/python/paddle/fluid/tests/unittests/test_conv2d_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv2d_op.py @@ -1465,5 +1465,41 @@ def run_7(): self.assertRaises(ValueError, run_7) +# --------- test environment variable ------ +@unittest.skipIf( + not (core.is_compiled_with_cuda() or core.is_compiled_with_rocm()), + "core is not compiled with CUDA or ROCM") +class TestConv2DEnviron(unittest.TestCase): + def run_conv2d_api(self): + inputs = fluid.layers.data( + shape=[2, 3, 5, 5], + append_batch_size=False, + name="inputs", + dtype="float32") + fluid.layers.conv2d( + input=inputs, + num_filters=4, + filter_size=[3, 3], + stride=[1, 1], + padding=0, + dilation=[1, 1], + groups=1, + data_format="NCHW") + + x_var = paddle.uniform((2, 3, 5, 5), dtype="float32", min=-1., max=1.) 
+ conv = paddle.nn.Conv2D( + in_channels=3, + out_channels=4, + kernel_size=(3, 3), + data_format="NCHW") + y_var = conv(x_var) + + def test_environ(self): + fluid.set_flags({'FLAGS_conv2d_disable_cudnn': False}) + self.run_conv2d_api() + fluid.set_flags({'FLAGS_conv2d_disable_cudnn': True}) + self.run_conv2d_api() + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/nn/layer/conv.py b/python/paddle/nn/layer/conv.py index 389920b923876d..d65b874c8badc6 100644 --- a/python/paddle/nn/layer/conv.py +++ b/python/paddle/nn/layer/conv.py @@ -25,6 +25,7 @@ import numpy as np +from ...fluid import get_flags from ...fluid import core from ...device import get_cudnn_version from ...fluid.dygraph import layers @@ -644,6 +645,10 @@ def __init__(self, bias_attr=bias_attr, data_format=data_format) + if (core.is_compiled_with_cuda() and get_flags( + "FLAGS_conv2d_disable_cudnn")["FLAGS_conv2d_disable_cudnn"]): + self._use_cudnn = False + def forward(self, x): if self._padding_mode != 'zeros': x = F.pad(x, From 525c32e33c8023472cb8178990bbc9c2ec3f1e3c Mon Sep 17 00:00:00 2001 From: liym27 <33742067+liym27@users.noreply.github.com> Date: Mon, 29 Mar 2021 19:47:55 +0800 Subject: [PATCH 12/15] =?UTF-8?q?Fix=20bug=20of=20set=5Fvalue=20op?= =?UTF-8?q?=EF=BC=9ADecerease=20axes=20to=20do=20right=20broadcast=20(#318?= =?UTF-8?q?75)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- paddle/fluid/operators/set_value_op.cc | 11 ++- paddle/fluid/operators/set_value_op.h | 72 ++++++++++++++++--- python/paddle/fluid/framework.py | 11 ++- .../tests/unittests/test_set_value_op.py | 14 ++++ 4 files changed, 95 insertions(+), 13 deletions(-) diff --git a/paddle/fluid/operators/set_value_op.cc b/paddle/fluid/operators/set_value_op.cc index 94d34c648d1748..105d61015fcb9d 100644 --- a/paddle/fluid/operators/set_value_op.cc +++ b/paddle/fluid/operators/set_value_op.cc @@ -124,6 +124,9 @@ class SetValueMaker : public 
framework::OpProtoAndCheckerMaker { AddAttr>( "steps", "(list) Stride step from the start to the end.") .SetDefault({}); + AddAttr>("decrease_axes", + "(list) The axes to decrease.") + .SetDefault({}); AddAttr>("bool_values", "Store the bool values.") .SetDefault({}); @@ -185,4 +188,10 @@ Upgrade set_value, add 3 inputs [StartsTensorList, EndsTensorList, StepsTensorLi "Ending indices of corresponding axis in `axes`.", std::vector{}) .NewAttr("steps", "Stride step from the start to the end.", - std::vector{})); + std::vector{})) + .AddCheckpoint( + R"ROC( +Upgrade set_value, add 1 attribute [decrease_axes]. + )ROC", + paddle::framework::compatible::OpVersionDesc().NewAttr( + "decrease_axes", "The axes to decrease.", std::vector{})); diff --git a/paddle/fluid/operators/set_value_op.h b/paddle/fluid/operators/set_value_op.h index 325a2b0b865e9d..eca51147f8159e 100644 --- a/paddle/fluid/operators/set_value_op.h +++ b/paddle/fluid/operators/set_value_op.h @@ -106,10 +106,10 @@ inline void CheckAndUpdateSlice(const framework::DDim in_dims, } inline framework::DDim GetSliceDims(const framework::DDim in_dims, - const std::vector axes, - const std::vector starts, - const std::vector ends, - const std::vector steps) { + const std::vector& axes, + const std::vector& starts, + const std::vector& ends, + const std::vector& steps) { framework::DDim slice_dims(in_dims); for (size_t i = 0; i < axes.size(); ++i) { @@ -127,6 +127,38 @@ inline framework::DDim GetSliceDims(const framework::DDim in_dims, return slice_dims; } +inline framework::DDim GetDecreasedDims( + const framework::DDim slice_dims, + const std::vector& decrease_axes) { + // Get dims after decreasing axes. 
+ framework::DDim decreased_dims(slice_dims); + if (decrease_axes.size() > 0) { + for (size_t i = 0; i < decrease_axes.size(); ++i) { + int64_t axis = decrease_axes[i]; + PADDLE_ENFORCE_EQ( + decreased_dims[axis], 1, + platform::errors::InvalidArgument("decrease dim should be 1")); + decreased_dims[axis] = 0; + } + + std::vector new_shape; + for (int i = 0; i < decreased_dims.size(); ++i) { + if (decreased_dims[i] != 0) { + new_shape.push_back(decreased_dims[i]); + } + } + + // NOTE(liym27): Paddle does not support that the rank of Tensor is 0, and + // uses [1] instead. + if (new_shape.size() == 0) { + new_shape.push_back(1); + } + + decreased_dims = framework::make_ddim(new_shape); + } + return decreased_dims; +} + template class SetValueKernel : public framework::OpKernel { public: @@ -179,6 +211,7 @@ class SetValueKernel : public framework::OpKernel { auto ends = ctx.Attr>("ends"); auto steps = ctx.Attr>("steps"); auto shape = ctx.Attr>("shape"); + auto decrease_axes = ctx.Attr>("decrease_axes"); auto dtype = in->type(); if (!starts_tensor_list.empty()) { @@ -194,6 +227,7 @@ class SetValueKernel : public framework::OpKernel { auto in_dims = in->dims(); CheckAndUpdateSlice(in_dims, axes, &starts, &ends, &steps); auto slice_dims = GetSliceDims(in_dims, axes, starts, ends, steps); + auto decrease_slice_dims = GetDecreasedDims(slice_dims, decrease_axes); auto place = ctx.GetPlace(); auto& eigen_place = @@ -212,13 +246,13 @@ class SetValueKernel : public framework::OpKernel { // set_value is what we want. 
TensorCopy(*in, place, out); - Tensor slice_t(dtype), pad_t(dtype); - slice_t.mutable_data(slice_dims, place); - pad_t.mutable_data(in_dims, place); + Tensor slice_tensor(dtype), pad_tensor(dtype); + slice_tensor.mutable_data(slice_dims, place); + pad_tensor.mutable_data(in_dims, place); - auto pad_e = framework::EigenTensor::From(pad_t, in_dims); + auto pad_e = framework::EigenTensor::From(pad_tensor, in_dims); auto out_e = framework::EigenTensor::From(*out); - auto slice_e = framework::EigenTensor::From(slice_t, slice_dims); + auto slice_e = framework::EigenTensor::From(slice_tensor, slice_dims); // Step 1: Set the value of out at `_index` to zero slice_e.device(eigen_place) = slice_e.constant(T(0)); @@ -244,11 +278,26 @@ class SetValueKernel : public framework::OpKernel { // Step 2: Set a tensor with the same shape as out tensor. And its data at // '_index' is the same as value_tensor, and data out of '_index' to zero + // - Step 2.1 Set slice tensor with value + + // NOTE(liym27): [ Why resize slice_tensor here? ] + // A: When we do broadcasting on slice_tensor and value_tensor, the shape of + // slice_tensor should be the decreased dims. + // e.g. + // x[:,0] = value_tensor + // x's shape = [3, 4], value_tensor's shape = [3] + // We get slice_dims = [3, 1], decrease_slice_dims = [3] + // If we do broadcasting on Tensor with shape [3, 1] and [3], the result's + // shape is [3, 3], which crosses the border; + // If we do broadcasting on Tensor with shape [3] and [3], the result's shape + // is [3], which is right. 
+ + slice_tensor.Resize(decrease_slice_dims); if (value_tensor != nullptr) { // ElementwiseComputeEx can do broadcasting ElementwiseComputeEx, DeviceContext, T>( - ctx, &slice_t, value_tensor, -1, SubFunctor(), &slice_t); + ctx, &slice_tensor, value_tensor, -1, SubFunctor(), &slice_tensor); } else { Tensor value_t(dtype); auto value_dims = framework::make_ddim(shape); @@ -257,8 +306,9 @@ class SetValueKernel : public framework::OpKernel { CopyVecotorToTensor(value_name.c_str(), &value_t, ctx); value_t.Resize(value_dims); ElementwiseComputeEx, DeviceContext, T>( - ctx, &slice_t, &value_t, -1, SubFunctor(), &slice_t); + ctx, &slice_tensor, &value_t, -1, SubFunctor(), &slice_tensor); } + slice_tensor.Resize(slice_dims); // - Step 2.2 Pad slice tensor with 0 pad_e.device(eigen_place) = pad_e.constant(T(0)); diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index db487128bbe755..18162059e998eb 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -1863,6 +1863,7 @@ def __setitem__(self, item, value): if not isinstance(item, tuple): item = [item] + decrease_axes = [] axes = [] starts = [] ends = [] @@ -1933,15 +1934,23 @@ def replace_ellipsis(item): if end is None: end = max_integer if step > 0 else (0 - max_integer) else: + decrease_axes.append(dim) start = slice_item end = slice_item + 1 if slice_item != -1 else max_integer step = 1 + axes.append(dim) starts.append(start) ends.append(end) steps.append(step) - attrs = {'axes': axes, 'starts': starts, 'ends': ends, 'steps': steps} + attrs = { + 'axes': axes, + 'starts': starts, + 'ends': ends, + 'steps': steps, + 'decrease_axes': decrease_axes + } from .layers import utils if utils._contain_var(starts): diff --git a/python/paddle/fluid/tests/unittests/test_set_value_op.py b/python/paddle/fluid/tests/unittests/test_set_value_op.py index 23dac41f64abfe..1239a2249cc43e 100644 --- a/python/paddle/fluid/tests/unittests/test_set_value_op.py +++ 
b/python/paddle/fluid/tests/unittests/test_set_value_op.py @@ -671,6 +671,20 @@ def _get_answer(self): self.data[0] = self.value +class TestSetValueValueShape5(TestSetValueApi): + def set_value(self): + self.value = np.array([3, 3, 3]).astype(self.dtype) + + def set_shape(self): + self.shape = [3, 4] + + def _call_setitem(self, x): + x[:, 0] = paddle.assign(self.value) # x is Paddle.Tensor + + def _get_answer(self): + self.data[:, 0] = self.value + + # 4. Test error class TestError(TestSetValueBase): def _value_type_error(self): From b48841ba2e7335eaa435a54436ed580d4aef001c Mon Sep 17 00:00:00 2001 From: wanghuancoder Date: Mon, 29 Mar 2021 19:53:12 +0800 Subject: [PATCH 13/15] modify API nn.Bilinear's doc (#31889) * modify API nn.Bilinear's doc, test=develop * modify API nn.Bilinear's doc, test=develop --- python/paddle/nn/layer/common.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/paddle/nn/layer/common.py b/python/paddle/nn/layer/common.py index d0f97625bcba75..60c846f9f76ec0 100644 --- a/python/paddle/nn/layer/common.py +++ b/python/paddle/nn/layer/common.py @@ -578,7 +578,7 @@ class Bilinear(layers.Layer): .. math:: - out_{i} = x1 * W_{i} * {x2^\mathrm{T}}, i=0,1,...,size-1 + out_{i} = x1 * W_{i} * {x2^\mathrm{T}}, i=0,1,...,outfeatures-1 out = out + b @@ -586,7 +586,7 @@ class Bilinear(layers.Layer): - :math:`x1`: the first input contains in1_features elements, shape is [batch_size, in1_features]. - :math:`x2`: the second input contains in2_features elements, shape is [batch_size, in2_features]. - :math:`W_{i}`: the i-th learned weight, shape is [in1_features, in2_features], and learned weight's shape is [out_features, in1_features, in2_features]. - - :math:`out_{i}`: the i-th element of out, shape is [batch_size, out_features]. + - :math:`out_{i}`: the i-th element of out, shape is [batch_size], and out's shape is [batch_size, out_features]. - :math:`b`: the learned bias, shape is [1, out_features]. 
- :math:`x2^\mathrm{T}`: the transpose of :math:`x2`. From 8829a309fe056dfecd472f19050c390fd049fead Mon Sep 17 00:00:00 2001 From: tianshuo78520a <707759223@qq.com> Date: Mon, 29 Mar 2021 20:11:26 +0800 Subject: [PATCH 14/15] Delete cudnn6 code (#31835) --- paddle/fluid/operators/conv_cudnn_op_cache.h | 5 --- paddle/fluid/operators/cudnn_lstm_cache.h | 10 +----- paddle/fluid/operators/cudnn_rnn_cache.h | 7 ---- paddle/fluid/platform/cudnn_helper.h | 38 -------------------- 4 files changed, 1 insertion(+), 59 deletions(-) diff --git a/paddle/fluid/operators/conv_cudnn_op_cache.h b/paddle/fluid/operators/conv_cudnn_op_cache.h index ddddb7f8641ba1..23a471cfa00674 100644 --- a/paddle/fluid/operators/conv_cudnn_op_cache.h +++ b/paddle/fluid/operators/conv_cudnn_op_cache.h @@ -40,11 +40,6 @@ static constexpr size_t kNUM_CUDNN_BWD_FILTER_ALGS = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT; static constexpr size_t kNUM_CUDNN_BWD_DATA_ALGS = CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT; -#else -// cuDNN v5 has no CUDNN_CONVOLUTION_FWD_ALGO_COUNT etc. -static constexpr size_t kNUM_CUDNN_FWD_ALGS = 7; -static constexpr size_t kNUM_CUDNN_BWD_FILTER_ALGS = 4; -static constexpr size_t kNUM_CUDNN_BWD_DATA_ALGS = 5; #endif } // namespace operators diff --git a/paddle/fluid/operators/cudnn_lstm_cache.h b/paddle/fluid/operators/cudnn_lstm_cache.h index 3181e4b1d990b7..b7859237e737a1 100644 --- a/paddle/fluid/operators/cudnn_lstm_cache.h +++ b/paddle/fluid/operators/cudnn_lstm_cache.h @@ -85,20 +85,12 @@ class ScopedRNNBase { dropout_desc_.descriptor(handle, place, initialized_, dropout_prob_, dropout_state, seed_, state_size); -// ------------------- cudnn rnn descriptors --------------------- -#if CUDNN_VERSION >= 6000 + // ------------------- cudnn rnn descriptors --------------------- PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetRNNDescriptor_v6( handle, rnn_desc_.desc(), hidden_size_, num_layers_, dropout_desc_.desc(), CUDNN_LINEAR_INPUT, is_bidirec_ ? 
CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM, CUDNN_RNN_ALGO_STANDARD, cudnn_type)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetRNNDescriptor( - rnn_desc_.desc(), hidden_size_, num_layers_, dropout_desc_.desc(), - CUDNN_LINEAR_INPUT, - is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM, - cudnn_type)); -#endif #if CUDNN_VERSION >= 7201 if (!sequence_length.empty()) { diff --git a/paddle/fluid/operators/cudnn_rnn_cache.h b/paddle/fluid/operators/cudnn_rnn_cache.h index 13a3e7d09b9f62..a6a23a91c76c02 100644 --- a/paddle/fluid/operators/cudnn_rnn_cache.h +++ b/paddle/fluid/operators/cudnn_rnn_cache.h @@ -168,18 +168,11 @@ struct CudnnRNNCache { PADDLE_ENFORCE_CUDA_SUCCESS( platform::dynload::cudnnCreateRNNDescriptor(&rnn_desc_)); -#if CUDNN_VERSION >= 6000 PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetRNNDescriptor_v6( handle, rnn_desc_, hidden_size_, num_layers_, dropout_desc_, CUDNN_LINEAR_INPUT, is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM, CUDNN_RNN_ALGO_STANDARD, cudnn_type)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetRNNDescriptor( - rnn_desc_, hidden_size_, num_layers_, dropout_desc_, CUDNN_LINEAR_INPUT, - is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM, - cudnn_type)); -#endif PADDLE_ENFORCE_CUDA_SUCCESS( platform::dynload::cudnnCreateFilterDescriptor(&w_desc_)); diff --git a/paddle/fluid/platform/cudnn_helper.h b/paddle/fluid/platform/cudnn_helper.h index af0df2efc5e6d5..6c3c96b68c48a1 100644 --- a/paddle/fluid/platform/cudnn_helper.h +++ b/paddle/fluid/platform/cudnn_helper.h @@ -91,30 +91,6 @@ enum class ActivationMode { kBandPass, }; -#if CUDNN_VERSION < 6000 -#pragma message "CUDNN version under 6.0 is supported at best effort." -#pragma message "We strongly encourage you to move to 6.0 and above." -#pragma message "This message is intended to annoy you enough to update." 
-#pragma message \ - "please see https://docs.nvidia.com/deeplearning/sdk/cudnn-release-notes/" - -inline cudnnPoolingMode_t GetPoolingMode(const PoolingMode& mode) { - switch (mode) { - case PoolingMode::kMaximumDeterministic: - return CUDNN_POOLING_MAX; - case PoolingMode::kAverageExclusive: - return CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING; - case PoolingMode::kAverageInclusive: - return CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING; - case PoolingMode::kMaximum: - return CUDNN_POOLING_MAX; - default: - PADDLE_THROW( - platform::errors::Unimplemented("Unexpected CUDNN pooling mode.")); - } -} -#else - inline cudnnPoolingMode_t GetPoolingMode(const PoolingMode& mode) { switch (mode) { case PoolingMode::kMaximumDeterministic: @@ -130,7 +106,6 @@ inline cudnnPoolingMode_t GetPoolingMode(const PoolingMode& mode) { platform::errors::Unimplemented("Unexpected CUDNN pooling mode.")); } } -#endif // CUDNN_VERSION < 6000 inline ActivationMode StringToActivationMode(const std::string& str) { if (str == "identity") { @@ -471,19 +446,6 @@ class ScopedConvolutionDescriptor { "of pads is %d, size of dilations is %d.", pads.size(), dilations.size())); -#if !CUDNN_VERSION_MIN(6, 0, 0) - // cudnn v5 does not support dilation conv, the argument is called upscale - // instead of dilations and it is must be one. - for (size_t i = 0; i < dilations.size(); ++i) { - PADDLE_ENFORCE_EQ(dilations[i], 1, - platform::errors::InvalidArgument( - "Dilations conv is not supported in this cuDNN " - "version(%d.%d.%d).", - CUDNN_VERSION / 1000, CUDNN_VERSION % 1000 / 100, - CUDNN_VERSION % 100)); - } -#endif - cudnnDataType_t compute_type = (type == CUDNN_DATA_DOUBLE) ? 
CUDNN_DATA_DOUBLE : CUDNN_DATA_FLOAT; PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetConvolutionNdDescriptor( From a71d72d921fc861051553c6d44b32bc9037706bc Mon Sep 17 00:00:00 2001 From: niuliling123 <51102941+niuliling123@users.noreply.github.com> Date: Mon, 29 Mar 2021 20:30:37 +0800 Subject: [PATCH 15/15] relu forward and backward with vectortype (#31869) --- paddle/fluid/operators/activation_op.cu | 286 +++++++++++++++++++++++- 1 file changed, 285 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/operators/activation_op.cu b/paddle/fluid/operators/activation_op.cu index 2033081af224a4..c6d2fbccd8e84b 100644 --- a/paddle/fluid/operators/activation_op.cu +++ b/paddle/fluid/operators/activation_op.cu @@ -10,8 +10,278 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/activation_op.h" +#include "paddle/fluid/operators/math/math_cuda_utils.h" +#include "paddle/fluid/platform/cuda_device_function.h" #include "paddle/fluid/platform/float16.h" +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using float16 = paddle::platform::float16; + +template +struct CudaVecType { + using type = T; + static constexpr int vecsize = 1; +}; + +template <> +struct CudaVecType { + using type = __half2; + static constexpr int vecsize = 2; +}; + +template <> +struct CudaVecType { + using type = float4; + static constexpr int vecsize = 4; +}; + +template +class BaseGPUFunctor { + public: + using ELEMENT_TYPE = T; +}; + +/* ========================================================================== */ + +/* =========================== relu forward ============================ */ +template +class ReluGPUFunctor : public BaseGPUFunctor { + private: + T zero_; + + public: + ReluGPUFunctor() { zero_ = static_cast(0.0f); } + + // for relu forward when T is double + __device__ __forceinline__ typename CudaVecType::type Compute( + const typename CudaVecType::type* x); + + // when 
num % vecsize != 0 this func will be used + __device__ __forceinline__ T ComputeRemainder(const T x) { + return x > zero_ ? x : zero_; + } +}; + +template <> +__device__ __forceinline__ CudaVecType::type +ReluGPUFunctor::Compute(const CudaVecType::type* x) { +// relu forward : out = max(x, 0) +#ifdef __HIPCC__ || __CUDA_ARCH__ >= 350 + return __ldg(x) > zero_ ? __ldg(x) : zero_; +#else + return (*x) > zero_ ? (*x) : zero_; +#endif +} + +template <> +__device__ __forceinline__ CudaVecType::type +ReluGPUFunctor::Compute(const CudaVecType::type* xx) { + // relu forward : out = max(xx, 0) + return make_float4((xx->x > zero_) * (xx->x), (xx->y > zero_) * (xx->y), + (xx->z > zero_) * (xx->z), (xx->w > zero_) * (xx->w)); +} + +template <> +__device__ __forceinline__ CudaVecType::type +ReluGPUFunctor::Compute(const CudaVecType::type* in) { +// relu forward : out = max(in, 0) +#ifdef __HIPCC__ || CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) + const half2 kzero = __float2half2_rn(0.0f); + return __hmul2(__hgt2(__ldg(in), kzero), __ldg(in)); +#else + const float2 xx = __half22float2(*in); + return __floats2half2_rn((xx.x > 0.0f) * static_cast(xx.x), + (xx.y > 0.0f) * static_cast(xx.y)); +#endif +} +/* ========================================================================== */ + +/* =========================== relu backward ============================ + */ + +template +class ReluGradGPUFunctor : public BaseGPUFunctor { + private: + T zero_; + + public: + ReluGradGPUFunctor() { zero_ = static_cast(0.0f); } + + // for relu backward when T is double + __device__ __forceinline__ typename CudaVecType::type Compute( + const typename CudaVecType::type* out, + const typename CudaVecType::type* dout); + + // when num % vecsize != 0 this func will be used + __device__ __forceinline__ T ComputeRemainder(const T out, const T dout) { + // relu backward : dx = out > 0 ? dout : 0 + return out > zero_ ? 
dout : zero_; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } +}; + +template <> +__device__ __forceinline__ CudaVecType::type +ReluGradGPUFunctor::Compute(const CudaVecType::type* out, + const CudaVecType::type* dout) { +// relu backward : dx = out > 0 ? dout : 0; +#ifdef __HIPCC__ || __CUDA_ARCH__ >= 350 + return __ldg(out) > zero_ ? __ldg(dout) : zero_; +#else + return (*out) > zero_ ? (*dout) : zero_; +#endif +} + +template <> +__device__ __forceinline__ CudaVecType::type +ReluGradGPUFunctor::Compute(const CudaVecType::type* out, + const CudaVecType::type* dout) { + // relu backward : dx = out > 0 ? dout : 0; + return make_float4((out->x > zero_) * (dout->x), (out->y > zero_) * (dout->y), + (out->z > zero_) * (dout->z), + (out->w > zero_) * (dout->w)); +} + +template <> +__device__ __forceinline__ CudaVecType::type +ReluGradGPUFunctor::Compute(const CudaVecType::type* out, + const CudaVecType::type* dout) { +// relu backward : dx = out > 0 ? dout : 0; +#ifdef __HIPCC__ || CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) + const half2 kzero = __float2half2_rn(0.0f); + return __hmul2(__hgt2(__ldg(out), kzero), __ldg(dout)); +#else + const float2 xx = __half22float2(*out); + const float2 yy = __half22float2(*dout); + return __floats2half2_rn((xx.x > 0.0f) * static_cast(yy.x), + (xx.y > 0.0f) * static_cast(yy.y)); +#endif +} + +/* ========================================================================== */ + +template +__global__ void ActivationGradKernelVec(const T* forward_data, const T* dout, + T* dx, int num, Functor functor) { + using VecType = typename CudaVecType::type; + constexpr int vecsize = CudaVecType::vecsize; + int idx = threadIdx.x + blockIdx.x * blockDim.x; + int stride = blockDim.x * gridDim.x; + int loop = num / vecsize; + int tail = num % vecsize; + const VecType* in_forward = reinterpret_cast(forward_data); + const VecType* in_dout = reinterpret_cast(dout); + VecType* out = reinterpret_cast(dx); + + for (int i = idx; i < loop; i 
+= stride) { + out[i] = functor.Compute((in_forward + i), (in_dout + i)); + } + + while (idx == loop && tail) { + dx[num - tail] = + functor.ComputeRemainder(forward_data[num - tail], dout[num - tail]); + --tail; + } +} + +template +__global__ void ActivationkernelVec(const T* src, T* dst, int num, + Functor functor) { + constexpr int vecsize = CudaVecType::vecsize; + using VecType = typename CudaVecType::type; + int idx = threadIdx.x + blockIdx.x * blockDim.x; + int stride = blockDim.x * gridDim.x; + int loop = num / vecsize; + int tail = num % vecsize; + const VecType* in = reinterpret_cast(src); + VecType* out = reinterpret_cast(dst); + + for (int i = idx; i < loop; i += stride) { + out[i] = functor.Compute((in + i)); + } + + while (idx == loop && tail) { + dst[num - tail] = functor.ComputeRemainder(src[num - tail]); + --tail; + } +} + +template +class ActivationGPUKernel + : public framework::OpKernel { + public: + using T = typename Functor::ELEMENT_TYPE; + void Compute(const framework::ExecutionContext& context) const override { + const framework::Tensor* in_x = nullptr; + framework::Tensor* out = nullptr; + ExtractActivationTensor(context, &in_x, &out); + auto& dev_ctx = context.template device_context(); + + int num = in_x->numel(); + const T* input_data = in_x->data(); + T* output_data = out->mutable_data(dev_ctx.GetPlace(), + static_cast(num * sizeof(T))); + + int block = 512; +#ifdef __HIPCC__ + block = 256; +#endif + Functor functor; + constexpr int vecsize = CudaVecType::vecsize; + int grid = max((num / vecsize + block - 1) / block, 1); + auto stream = context.cuda_device_context().stream(); + ActivationkernelVec<<>>( + input_data, output_data, num, functor); + } +}; + +template +class ActivationGradGPUKernel + : public framework::OpKernel { + public: + using T = typename Functor::ELEMENT_TYPE; + void Compute(const framework::ExecutionContext& context) const override { + const framework::Tensor *x, *out, *d_out; + framework::Tensor* d_x = nullptr; + x 
= out = d_out = nullptr; + ExtractActivationGradTensor(context, &x, &out, &d_out, + &d_x); + int numel = d_out->numel(); + auto& dev_ctx = context.template device_context(); + auto* dx_data = d_x->mutable_data( + dev_ctx.GetPlace(), static_cast(numel * sizeof(T))); + auto* dout_data = d_out->data(); + + auto* forward_data = dout_data; + if (static_cast(Functor::FwdDeps()) == static_cast(kDepOut)) { + // Only need forward output Out + forward_data = out->data(); + } else if (static_cast(Functor::FwdDeps()) == + static_cast(kDepX)) { + // Only need forward input X + forward_data = x->data(); + } + + int block = 512; +#ifdef __HIPCC__ + block = 256; +#endif + Functor functor; + constexpr int vecsize = CudaVecType::vecsize; + int grid = max((numel / vecsize + block - 1) / block, 1); + auto stream = context.cuda_device_context().stream(); + ActivationGradKernelVec<<>>( + forward_data, dout_data, dx_data, numel, functor); + } +}; + +} // namespace operators +} // namespace paddle + namespace ops = paddle::operators; namespace plat = paddle::platform; @@ -60,7 +330,21 @@ REGISTER_OP_CUDA_KERNEL( /* ========================================================================== */ /* =========================== relu register ============================ */ -REGISTER_ACTIVATION_CUDA_KERNEL(relu, Relu, ReluCUDAFunctor, ReluGradFunctor); +REGISTER_OP_CUDA_KERNEL( + relu, ops::ActivationGPUKernel>, + ops::ActivationGPUKernel>, + ops::ActivationGPUKernel>); + +REGISTER_OP_CUDA_KERNEL( + relu_grad, ops::ActivationGradGPUKernel>, + ops::ActivationGradGPUKernel>, + ops::ActivationGradGPUKernel>); REGISTER_OP_CUDA_KERNEL( relu_grad_grad,