diff --git a/cmake/anakin_subgraph.cmake b/cmake/anakin_subgraph.cmake index 4a7d32a63553df..b5437e776d31e4 100644 --- a/cmake/anakin_subgraph.cmake +++ b/cmake/anakin_subgraph.cmake @@ -25,8 +25,9 @@ endif() if(ANAKIN_FOUND) message(STATUS "Current ANAKIN header is ${ANAKIN_INCLUDE_DIR}/anakin_config.h. ") + include_directories(${ANAKIN_ROOT}) include_directories(${ANAKIN_ROOT}/include) - include_directories(${ANAKIN_ROOT}/include/saber) + include_directories(${ANAKIN_ROOT}/saber) link_directories(${ANAKIN_ROOT}) add_definitions(-DPADDLE_WITH_ANAKIN) endif() diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index b19d50a6ad6afa..581bf50e646510 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -13,6 +13,7 @@ paddle.fluid.name_scope (ArgSpec(args=['prefix'], varargs=None, keywords=None, d paddle.fluid.cuda_places (ArgSpec(args=['device_ids'], varargs=None, keywords=None, defaults=(None,)), ('document', '7d9a51fc9cf3c5245b5227080a8064c3')) paddle.fluid.cpu_places (ArgSpec(args=['device_count'], varargs=None, keywords=None, defaults=(None,)), ('document', '4c0cd83f0b401fc2ff84c70974e5d210')) paddle.fluid.cuda_pinned_places (ArgSpec(args=['device_count'], varargs=None, keywords=None, defaults=(None,)), ('document', 'd0c3ebd813c39958c92b78e3eef7e912')) +paddle.fluid.in_dygraph_mode (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', 'f06314a1cb30c96b5808dde2219c2dae')) paddle.fluid.Executor.__init__ (ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.Executor.close (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 'f5369953dd0c443961cf79f7a00e1a03')) paddle.fluid.Executor.infer_from_dataset (ArgSpec(args=['self', 'program', 'dataset', 'scope', 'thread', 'debug', 'fetch_list', 'fetch_info', 'print_period'], varargs=None, keywords=None, defaults=(None, None, None, 0, False, None, None, 100)), ('document', '9c7decb955b9c4f718114179c8985581')) @@ -124,7 +125,7 @@ paddle.fluid.layers.dropout (ArgSpec(args=['x', 'dropout_prob', 'is_test', 'seed paddle.fluid.layers.split (ArgSpec(args=['input', 'num_or_sections', 'dim', 'name'], varargs=None, keywords=None, defaults=(-1, None)), ('document', '652625345c2acb900029c78cc75f8aa6')) paddle.fluid.layers.ctc_greedy_decoder (ArgSpec(args=['input', 'blank', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'ebbf2adbd79683dc93db03454dfa18c2')) paddle.fluid.layers.edit_distance (ArgSpec(args=['input', 'label', 'normalized', 'ignored_tokens'], varargs=None, keywords=None, defaults=(True, None)), ('document', '97f0262f97602644c83142789d784571')) -paddle.fluid.layers.l2_normalize (ArgSpec(args=['x', 'axis', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(1e-12, None)), ('document', '6e428384ce6a77207fa2c70d9f011990')) +paddle.fluid.layers.l2_normalize (ArgSpec(args=['x', 'axis', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(1e-12, None)), ('document', '35c6a241bcc1a1fc89508860d82ad62b')) paddle.fluid.layers.matmul (ArgSpec(args=['x', 'y', 'transpose_x', 'transpose_y', 'alpha', 'name'], varargs=None, keywords=None, defaults=(False, False, 1.0, None)), ('document', 'b4cbe1ac451005df6dad12e9ffdccca9')) paddle.fluid.layers.topk (ArgSpec(args=['input', 'k', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'd3570c02f71bcd78e60b3f31dc8f5b32')) paddle.fluid.layers.warpctc (ArgSpec(args=['input', 'label', 'blank', 'norm_by_times', 'use_cudnn'], varargs=None, keywords=None, defaults=(0, False, False)), ('document', 'aaba49c038ba927f0a8e45c0c9a686ab')) @@ -235,6 +236,7 @@ paddle.fluid.layers.huber_loss (ArgSpec(args=['input', 'label', 'delta'], vararg paddle.fluid.layers.kldiv_loss (ArgSpec(args=['x', 'target', 'reduction', 'name'], varargs=None, keywords=None, defaults=('mean', None)), ('document', '776d536cac47c89073abc7ee524d5aec')) paddle.fluid.layers.tree_conv (ArgSpec(args=['nodes_vector', 'edge_set', 'output_size', 'num_filters', 'max_depth', 'act', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(1, 2, 'tanh', None, None, None)), ('document', '34ea12ac9f10a65dccbc50100d12e607')) paddle.fluid.layers.npair_loss (ArgSpec(args=['anchor', 'positive', 'labels', 'l2_reg'], varargs=None, keywords=None, defaults=(0.002,)), ('document', '46994d10276dd4cb803b4062b5d14329')) +paddle.fluid.layers.pixel_shuffle (ArgSpec(args=['x', 'upscale_factor'], varargs=None, keywords=None, defaults=None), ('document', '731b21c62a4add60a33bd76d802ffc5c')) paddle.fluid.layers.fsp_matrix (ArgSpec(args=['x', 'y'], varargs=None, keywords=None, defaults=None), ('document', 'b76ccca3735bea4a58a0dbf0d77c5393')) paddle.fluid.layers.data (ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)), ('document', '33bbd42027d872b3818b3d64ec52e139')) paddle.fluid.layers.open_files (ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)), ('document', 'b1ae2e1cc0750e58726374061ea90ecc')) @@ -360,7 +362,7 @@ paddle.fluid.layers.polynomial_decay (ArgSpec(args=['learning_rate', 'decay_step paddle.fluid.layers.piecewise_decay (ArgSpec(args=['boundaries', 'values'], varargs=None, keywords=None, defaults=None), ('document', 'c717d9d1d78a53c809d01b8bc56f3cae')) paddle.fluid.layers.noam_decay (ArgSpec(args=['d_model', 'warmup_steps'], varargs=None, keywords=None, defaults=None), ('document', 'd9a95746353fd574be36dc28d8726c28')) paddle.fluid.layers.append_LARS (ArgSpec(args=['params_grads', 'learning_rate', 'weight_decay'], varargs=None, keywords=None, defaults=None), ('document', 'd24fa1e7d62ac8a534fc6a86002f84f8')) -paddle.fluid.layers.cosine_decay (ArgSpec(args=['learning_rate', 'step_each_epoch', 'epochs'], varargs=None, keywords=None, defaults=None), ('document', '9588c64c26ffaef3c466e404a6af9d9b')) +paddle.fluid.layers.cosine_decay (ArgSpec(args=['learning_rate', 'step_each_epoch', 'epochs'], varargs=None, keywords=None, defaults=None), ('document', 'f8b2727bccf0f368c997d7cf05847e49')) paddle.fluid.layers.linear_lr_warmup (ArgSpec(args=['learning_rate', 'warmup_steps', 'start_lr', 'end_lr'], varargs=None, keywords=None, defaults=None), ('document', '2ef3f5ca5cd71ea4217c418e5a7a0565')) paddle.fluid.contrib.InitState.__init__ (ArgSpec(args=['self', 'init', 'shape', 'value', 'init_boot', 'need_reorder', 'dtype'], varargs=None, keywords=None, defaults=(None, None, 0.0, None, False, 'float32')), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.contrib.StateCell.__init__ (ArgSpec(args=['self', 'inputs', 'states', 'out_state', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index 85f328b7c40568..beb305ef5e4191 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -83,11 +83,11 @@ struct BuildStrategy { bool sync_batch_norm_{false}; - bool memory_optimize_{true}; - // TODO(dzhwinter): - // make enable_inplace, memory_optimize_ - // memory_early_delete_ true by default - bool enable_inplace_{true}; + // FIXME(liuwei1031) disable memory_optimzie and enable_inplace in 1.4 + // to open them by default, we need to solve the fetch variable issue + bool memory_optimize_{false}; + + bool enable_inplace_{false}; bool enable_sequential_execution_{false}; diff --git a/paddle/fluid/framework/ir/fc_fuse_pass.cc b/paddle/fluid/framework/ir/fc_fuse_pass.cc index ca008763bff8ff..cd8030519ccfcf 100644 --- a/paddle/fluid/framework/ir/fc_fuse_pass.cc +++ b/paddle/fluid/framework/ir/fc_fuse_pass.cc @@ -48,17 +48,37 @@ void FCFusePass::ApplyImpl(ir::Graph* graph) const { GET_IR_NODE_FROM_SUBGRAPH(elementwise_add, elementwise_add, fc_pattern); GET_IR_NODE_FROM_SUBGRAPH(mul_out, mul_out, fc_pattern); + auto base_op_desc = mul->Op(); // Create an FC Node. + // OpDesc desc(base_op_desc, nullptr); OpDesc desc; std::string fc_x_in = subgraph.at(x)->Name(); std::string fc_Y_in = w->Name(); std::string fc_bias_in = fc_bias->Name(); std::string fc_out_out = fc_out->Name(); + desc.SetInput("Input", std::vector({fc_x_in})); desc.SetInput("W", std::vector({fc_Y_in})); desc.SetInput("Bias", std::vector({fc_bias_in})); desc.SetOutput("Out", std::vector({fc_out_out})); desc.SetAttr("in_num_col_dims", mul->Op()->GetAttr("x_num_col_dims")); + + // For anakin subgraph int8 + // When in anakin subgraph int8 mode, the pattern like "fake_quant + mul + + // fake_dequant" + // can be detected by the quant_dequant_fuse_pass. This pass will add + // "input_scale", + // "weight_scale" which are extracted from fake_quant op and fake_dequant op + // to mul op, + // and then delete the fake_quant op and fake_dequant op in the graph. If + // the mul op + // has the scale info, we should add those to the fused fc. + if (base_op_desc->HasAttr("enable_int8")) { + desc.SetAttr("enable_int8", base_op_desc->GetAttr("enable_int8")); + desc.SetAttr("input_scale", base_op_desc->GetAttr("input_scale")); + desc.SetAttr("weight_scale", base_op_desc->GetAttr("weight_scale")); + } + desc.SetType("fc"); auto fc_node = g->CreateOpNode(&desc); // OpDesc will be copied. GraphSafeRemoveNodes(graph, {mul, elementwise_add, mul_out}); diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index 8468f9ccc12a01..77f50e914b668e 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -1640,7 +1640,8 @@ PDNode *patterns::FillConstantElementWiseMulFuse::operator()( void patterns::QuantDequantOpFuse::operator()(PDNode *quant_op_input, const std::string &op_type, const std::string &weight_name, - int times) { + int times, + const std::string &quant_type) { const int kNumFields = 5; const int kQuantizedWeightOffset = 0; const int kQuantizedOpOffset = 1; @@ -1648,24 +1649,22 @@ void patterns::QuantDequantOpFuse::operator()(PDNode *quant_op_input, const int kDequantOpOffset = 3; const int kDequantOpOutOffset = 4; // the quant op always be one. - auto quant_op_in_scale = - pattern->NewNode(GetNodeName("quant_op_in_scale")) - ->assert_is_op_input("fake_quantize_range_abs_max", "InScale") - ->AsInput(); - auto quant_op = pattern->NewNode(GetNodeName("quant_op")) - ->assert_is_op("fake_quantize_range_abs_max"); + auto quant_op_in_scale = pattern->NewNode(GetNodeName("quant_op_in_scale")) + ->assert_is_op_input(quant_type, "InScale") + ->AsInput(); + auto quant_op = + pattern->NewNode(GetNodeName("quant_op"))->assert_is_op(quant_type); auto quant_op_out_scale = pattern->NewNode(GetNodeName("quant_op_out_scale")) - ->assert_is_op_output("fake_quantize_range_abs_max", "OutScale") + ->assert_is_op_output(quant_type, "OutScale") ->assert_is_op_input("fake_dequantize_max_abs", "Scale") ->AsIntermediate(); - auto quant_op_out = - pattern->NewNode(GetNodeName("quant_op_out")) - ->assert_is_op_output("fake_quantize_range_abs_max", "Out") - ->assert_is_op_input(op_type) - ->AsIntermediate(); + auto quant_op_out = pattern->NewNode(GetNodeName("quant_op_out")) + ->assert_is_op_output(quant_type, "Out") + ->assert_is_op_input(op_type) + ->AsIntermediate(); // there are 'times' quantized and dequant op std::vector nodes; diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index a5ac3a0c3733cf..525987e0072cb0 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -880,7 +880,8 @@ struct QuantDequantOpFuse : public PatternBase { : PatternBase(pattern, name_scope, "quant_dequant_fuse") {} void operator()(PDNode* quant_op_input, const std::string& op_name, - const std::string& weight_name, int times = 1); + const std::string& weight_name, int times, + const std::string& quant_type); std::string GetNodeName(const std::string& op_type) { return PDNodeName(name_scope_, repr_, id_, op_type); diff --git a/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc b/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc index 7cab9c353d35cb..017e3ef234c95d 100644 --- a/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc +++ b/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc @@ -25,7 +25,8 @@ namespace framework { namespace ir { void RunQuantDequant(ir::Graph* graph, Scope* scope, int times, - std::string op_type) { + const std::string& op_type, + const std::string& quant_type) { const std::string pattern_name = "quant_dequant_fuse"; // FusePassBase::Init(pattern_name, graph); const int kNumFields = 5; @@ -38,7 +39,7 @@ void RunQuantDequant(ir::Graph* graph, Scope* scope, int times, GraphPatternDetector gpd; auto* x = gpd.mutable_pattern() ->NewNode("x") - ->assert_is_op_input("fake_quantize_range_abs_max", "X") + ->assert_is_op_input(quant_type, "X") ->AsInput(); std::string quantized_op_type = ""; @@ -46,6 +47,9 @@ void RunQuantDequant(ir::Graph* graph, Scope* scope, int times, if (op_type == "conv2d") { quantized_op_type = "conv2d"; weight_name = "Filter"; + } else if (op_type == "depthwise_conv2d") { + quantized_op_type = "depthwise_conv2d"; + weight_name = "Filter"; } else if (op_type == "conv2d_fusion") { quantized_op_type = "conv2d_fusion"; weight_name = "Filter"; @@ -62,7 +66,7 @@ void RunQuantDequant(ir::Graph* graph, Scope* scope, int times, } patterns::QuantDequantOpFuse pattern(gpd.mutable_pattern(), pattern_name); - pattern(x, quantized_op_type, weight_name, times); + pattern(x, quantized_op_type, weight_name, times, quant_type); auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { @@ -103,7 +107,6 @@ void RunQuantDequant(ir::Graph* graph, Scope* scope, int times, std::unordered_set delete_nodes; for (int i = 0; i < times; i++) { - // max_range = (range * range) / weight_scale float max_range = boost::get( nodes[i * kNumFields + kDequantOpOffset]->Op()->GetAttr("max_range")); float weight_scale = (range * range) / max_range; @@ -118,7 +121,8 @@ void RunQuantDequant(ir::Graph* graph, Scope* scope, int times, new_op_desc.SetType(quantized_op_type); if (quantized_op_type == "conv2d" || - quantized_op_type == "conv2d_fusion") { + quantized_op_type == "conv2d_fusion" || + quantized_op_type == "depthwise_conv2d") { new_op_desc.SetInput("Input", {new_input}); new_op_desc.SetOutput("Output", {new_output}); } else if (quantized_op_type == "fc") { @@ -156,11 +160,17 @@ void QuantDequantFusePass::ApplyImpl(ir::Graph* graph) const { const std::string pattern_name = "quant_dequant_fuse"; FusePassBase::Init(pattern_name, graph); - std::unordered_set quantized_op_types = {"conv2d", "mul"}; + std::unordered_set quant_types = { + "fake_quantize_range_abs_max", "fake_quantize_moving_average_abs_max"}; + + std::unordered_set quantized_op_types = {"conv2d", "mul", + "depthwise_conv2d"}; auto* scope = param_scope(); - for (auto& op_type : quantized_op_types) { - for (int i = 1; i <= 6; i++) { - RunQuantDequant(graph, scope, i, op_type); + for (auto& quant_type : quant_types) { + for (auto& op_type : quantized_op_types) { + for (int i = 6; i >= 1; i--) { + RunQuantDequant(graph, scope, i, op_type, quant_type); + } } } } diff --git a/paddle/fluid/inference/anakin/convert/CMakeLists.txt b/paddle/fluid/inference/anakin/convert/CMakeLists.txt index d3d1522dccf0d8..6546d3b855fbc1 100644 --- a/paddle/fluid/inference/anakin/convert/CMakeLists.txt +++ b/paddle/fluid/inference/anakin/convert/CMakeLists.txt @@ -1,4 +1,9 @@ -cc_library(anakin_op_converter SRCS fc.cc conv2d.cc conv2d_fusion.cc elementwise.cc activation.cc pool2d.cc concat.cc split.cc relu.cc softmax.cc batch_norm.cc reshape.cc flatten.cc transpose.cc density_prior_box.cc detection_out.cc scale.cc dropout.cc im2sequence.cc sum.cc DEPS anakin_engine framework_proto scope op_registry) +cc_library(anakin_op_converter SRCS fc.cc conv2d.cc conv2d_fusion.cc +elementwise.cc activation.cc pool2d.cc concat.cc split.cc relu.cc softmax.cc +batch_norm.cc reshape.cc flatten.cc transpose.cc density_prior_box.cc +detection_out.cc scale.cc dropout.cc im2sequence.cc sum.cc affine_channel.cc +roi_align.cc helper.cc DEPS anakin_engine framework_proto scope op_registry +gtest) cc_test(test_anakin_fc SRCS test_fc_op.cc DEPS anakin_op_converter mul_op SERIAL) cc_test(test_anakin_conv2d SRCS test_conv2d_op.cc DEPS anakin_op_converter conv_op im2col vol2col depthwise_conv SERIAL) @@ -14,5 +19,5 @@ cc_test(test_anakin_flatten SRCS test_flatten_op.cc DEPS anakin_op_converter fla cc_test(test_anakin_transpose SRCS test_transpose_op.cc DEPS anakin_op_converter transpose_op SERIAL) cc_test(test_anakin_batch_norm SRCS test_batch_norm_op.cc DEPS anakin_op_converter batch_norm_op SERIAL) cc_test(test_anakin_dropout SRCS test_dropout_op.cc DEPS anakin_op_converter dropout_op SERIAL) -#cc_test(test_anakin_im2sequence SRCS test_im2sequence_op.cc DEPS anakin_op_converter im2sequence_op im2col) cc_test(test_anakin_sum SRCS test_sum_op.cc DEPS anakin_op_converter sum_op selected_rows_functor SERIAL) +cc_test(test_anakin_affine_channel SRCS test_affine_channel_op.cc DEPS anakin_op_converter affine_channel_op SERIAL) diff --git a/paddle/fluid/inference/anakin/convert/activation.cc b/paddle/fluid/inference/anakin/convert/activation.cc index a9aeb19ffd5f04..523571f1aa8b5a 100644 --- a/paddle/fluid/inference/anakin/convert/activation.cc +++ b/paddle/fluid/inference/anakin/convert/activation.cc @@ -16,16 +16,13 @@ #include #include -using anakin::graph::GraphGlobalMem; -using anakin::AK_FLOAT; -using anakin::saber::NV; -using anakin::saber::Shape; - namespace paddle { namespace inference { namespace anakin { -ActivationOpConverter::ActivationOpConverter(const std::string &op_type) +template +ActivationOpConverter::ActivationOpConverter( + const std::string &op_type) : op_type_(op_type) { auto it = anakin_op_types_.find(op_type_); PADDLE_ENFORCE(it != anakin_op_types_.end(), @@ -33,10 +30,10 @@ ActivationOpConverter::ActivationOpConverter(const std::string &op_type) anakin_op_type_ = it->second; } -void ActivationOpConverter::operator()(const framework::proto::OpDesc &op, - const framework::BlockDesc &block_desc, - const framework::Scope &scope, - bool test_mode) { +template +void ActivationOpConverter::operator()( + const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc, + const framework::Scope &scope, bool test_mode) { framework::OpDesc op_desc(op, nullptr); PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1); PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1); @@ -44,8 +41,17 @@ void ActivationOpConverter::operator()(const framework::proto::OpDesc &op, auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front(); auto input_name = op_desc.Input("X").front(); auto output_name = op_desc.Output("Out").front(); - engine_->AddOp(op_name, "Activation", {input_name}, {output_name}); - engine_->AddOpAttr(op_name, "type", anakin_op_type_); + this->engine_->AddOp(op_name, "Activation", {input_name}, {output_name}); + this->engine_->AddOpAttr(op_name, "type", anakin_op_type_); + + if (op_type_ == "swish") { + float beta = boost::get(op_desc.GetAttr("beta")); + this->engine_->AddOpAttr(op_name, "clip_relu_num", beta); + } + if (op_type_ == "relu6") { + float threshold = boost::get(op_desc.GetAttr("threshold")); + this->engine_->AddOpAttr(op_name, "clip_relu_num", threshold); + } } } // namespace anakin @@ -54,3 +60,5 @@ void ActivationOpConverter::operator()(const framework::proto::OpDesc &op, REGISTER_ANAKIN_OP_CONVERTER(sigmoid, SigmoidOpConverter); REGISTER_ANAKIN_OP_CONVERTER(tanh, TanhOpConverter); +REGISTER_ANAKIN_OP_CONVERTER(swish, SwishOpConverter); +REGISTER_ANAKIN_OP_CONVERTER(relu6, Relu6OpConverter); diff --git a/paddle/fluid/inference/anakin/convert/activation.h b/paddle/fluid/inference/anakin/convert/activation.h index 592a3d5bd9d127..a2475e492c4080 100644 --- a/paddle/fluid/inference/anakin/convert/activation.h +++ b/paddle/fluid/inference/anakin/convert/activation.h @@ -22,7 +22,8 @@ namespace paddle { namespace inference { namespace anakin { -class ActivationOpConverter : public AnakinOpConverter { +template +class ActivationOpConverter : public AnakinOpConverter { public: explicit ActivationOpConverter(const std::string &op_type); @@ -36,18 +37,36 @@ class ActivationOpConverter : public AnakinOpConverter { std::string op_type_; std::string anakin_op_type_; std::map anakin_op_types_{{"tanh", "TanH"}, - {"sigmoid", "Sigmoid"}}; + {"sigmoid", "Sigmoid"}, + {"relu6", "ClippedRelu"}, + {"swish", "Swish"}}; }; -class TanhOpConverter : public ActivationOpConverter { +template +class TanhOpConverter : public ActivationOpConverter { public: - TanhOpConverter() : ActivationOpConverter("tanh") {} + TanhOpConverter() : ActivationOpConverter("tanh") {} }; -class SigmoidOpConverter : public ActivationOpConverter { +template +class SigmoidOpConverter : public ActivationOpConverter { public: - SigmoidOpConverter() : ActivationOpConverter("sigmoid") {} + SigmoidOpConverter() + : ActivationOpConverter("sigmoid") {} }; + +template +class Relu6OpConverter : public ActivationOpConverter { + public: + Relu6OpConverter() : ActivationOpConverter("relu6") {} +}; + +template +class SwishOpConverter : public ActivationOpConverter { + public: + SwishOpConverter() : ActivationOpConverter("swish") {} +}; + } // namespace anakin } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/anakin/convert/affine_channel.cc b/paddle/fluid/inference/anakin/convert/affine_channel.cc new file mode 100644 index 00000000000000..534e7dca81db95 --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/affine_channel.cc @@ -0,0 +1,55 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/anakin/convert/affine_channel.h" +#include +#include +#include +#include "paddle/fluid/inference/anakin/convert/helper.h" + +namespace paddle { +namespace inference { +namespace anakin { + +template +void AffineChannelOpConverter::operator()( + const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc, + const framework::Scope &scope, bool test_mode) { + framework::OpDesc op_desc(op, nullptr); + PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1); + PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1); + + auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front(); + auto input_name = op_desc.Input("X").front(); + auto output_name = op_desc.Output("Out").front(); + this->engine_->AddOp(op_name, "AffineChannel", {input_name}, {output_name}); + + // Copy the Scale to CPUPlace and get the pointer. + auto *scale_v = scope.FindVar(op_desc.Input("Scale").front()); + PADDLE_ENFORCE_NOT_NULL(scale_v); + auto weight1 = pblock_from_var(*scale_v, this->engine_); + this->engine_->AddOpAttr(op_name, "weight_1", *weight1); + + // Copy the Bias to CPUPlace and get the pointer. + auto *bias_v = scope.FindVar(op_desc.Input("Bias").front()); + PADDLE_ENFORCE_NOT_NULL(bias_v); + auto weight2 = pblock_from_var(*bias_v, this->engine_); + this->engine_->AddOpAttr(op_name, "weight_2", *weight2); +} + +} // namespace anakin +} // namespace inference +} // namespace paddle + +REGISTER_ANAKIN_OP_CONVERTER(affine_channel, AffineChannelOpConverter); diff --git a/paddle/fluid/inference/anakin/convert/affine_channel.h b/paddle/fluid/inference/anakin/convert/affine_channel.h new file mode 100644 index 00000000000000..443f6101288af4 --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/affine_channel.h @@ -0,0 +1,40 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/fluid/inference/anakin/convert/op_converter.h" + +namespace paddle { +namespace inference { +namespace anakin { + +template +class AffineChannelOpConverter : public AnakinOpConverter { + public: + AffineChannelOpConverter() = default; + + virtual void operator()(const framework::proto::OpDesc &op, + const framework::BlockDesc &block_desc, + const framework::Scope &scope, + bool test_mode) override; + virtual ~AffineChannelOpConverter() {} + + private: +}; + +} // namespace anakin +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/anakin/convert/batch_norm.cc b/paddle/fluid/inference/anakin/convert/batch_norm.cc index 38cf6172027b3b..b41f5dc925208d 100644 --- a/paddle/fluid/inference/anakin/convert/batch_norm.cc +++ b/paddle/fluid/inference/anakin/convert/batch_norm.cc @@ -18,107 +18,64 @@ #include #include #include - -using anakin::graph::GraphGlobalMem; -using anakin::AK_FLOAT; -using anakin::saber::NV; -using anakin::saber::Shape; +#include "paddle/fluid/inference/anakin/convert/helper.h" namespace paddle { namespace inference { namespace anakin { -void BatchNormOpConverter::operator()(const framework::proto::OpDesc &op, - const framework::BlockDesc &block_desc, - const framework::Scope &scope, - bool test_mode) { +template +void BatchNormOpConverter::operator()( + const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc, + const framework::Scope &scope, bool test_mode) { framework::OpDesc op_desc(op, nullptr); PADDLE_ENFORCE_EQ(op_desc.Output("Y").size(), 1); std::map inputs; for (auto k : {"X", "Scale", "Bias", "Mean", "Variance"}) { PADDLE_ENFORCE_EQ(op_desc.Input(k).size(), 1UL); - auto v = op_desc.Input(k).front(); - inputs.insert({k, v}); } + auto input = op_desc.Input("X").front(); auto output = op_desc.Output("Y").front(); auto op_name = op_desc.Type() + ":" + op_desc.Output("Y").front(); auto epsilon = boost::get(op_desc.GetAttr("epsilon")); - // auto momentum = boost::get(op_desc.GetAttr("momentum")); auto bn_op_name = op_name + ":bn"; auto bn_output = bn_op_name + "_output"; - engine_->AddOp(bn_op_name, "BatchNorm", {inputs["X"]}, {bn_output}); - engine_->AddOpAttr(bn_op_name, "epsilon", epsilon); - engine_->AddOpAttr(bn_op_name, "momentum", static_cast(1.0)); + this->engine_->AddOp(bn_op_name, "BatchNorm", {input}, {bn_output}); + this->engine_->AddOpAttr(bn_op_name, "epsilon", epsilon); + this->engine_->AddOpAttr(bn_op_name, "momentum", static_cast(1.0)); auto scale_op_name = op_name + ":scale"; - auto get_lod_tensor = [this, &scope, &op_name](const std::string &var_name, - framework::LoDTensor *tensor) { - auto *v = scope.FindVar(var_name); - PADDLE_ENFORCE_NOT_NULL(v); - auto *t = v->GetMutable(); - tensor->Resize(t->dims()); - TensorCopySync(*t, platform::CPUPlace(), tensor); - }; - - framework::LoDTensor bias_t; - framework::LoDTensor mean_t; - framework::LoDTensor scale_t; - framework::LoDTensor variance_t; - get_lod_tensor(inputs["Bias"], &bias_t); - get_lod_tensor(inputs["Mean"], &mean_t); - get_lod_tensor(inputs["Scale"], &scale_t); - get_lod_tensor(inputs["Variance"], &variance_t); - - auto fill_shape = [](size_t n, std::vector shape) { - shape.insert(shape.begin(), 1); - if (shape.size() < n) { - shape.insert(shape.end(), n - shape.size(), 1); - } - return shape; - }; - Shape shape1(fill_shape(4, framework::vectorize2int(mean_t.dims()))); - Shape shape2(fill_shape(4, framework::vectorize2int(variance_t.dims()))); - auto *weight1 = - GraphGlobalMem::Global().template new_block(shape1); - auto *mean_data = static_cast(weight1->h_tensor().mutable_data()); - std::copy_n(mean_t.data(), mean_t.numel(), mean_data); - engine_->AddOpAttr(bn_op_name, "weight_1", *weight1); - - auto *weight2 = - GraphGlobalMem::Global().template new_block(shape2); - auto *variance_data = - static_cast(weight2->h_tensor().mutable_data()); - std::copy_n(variance_t.data(), variance_t.numel(), variance_data); - engine_->AddOpAttr(bn_op_name, "weight_2", *weight2); - - Shape shape3(std::vector({1, 1, 1, 1})); - auto *weight3 = - GraphGlobalMem::Global().template new_block(shape3); - auto *alpha_data = static_cast(weight3->h_tensor().mutable_data()); - float weight3_data[] = {1}; - std::copy(std::begin(weight3_data), std::end(weight3_data), alpha_data); - engine_->AddOpAttr(bn_op_name, "weight_3", *weight3); - - Shape scale_shape(fill_shape(4, framework::vectorize2int(scale_t.dims()))); - auto *scale = - GraphGlobalMem::Global().template new_block(scale_shape); - auto *scale_data = static_cast(scale->h_tensor().mutable_data()); - std::copy_n(scale_t.data(), scale_t.numel(), scale_data); - - Shape bias_shape(fill_shape(4, framework::vectorize2int(bias_t.dims()))); - auto *bias = - GraphGlobalMem::Global().template new_block(bias_shape); - auto *bias_data = static_cast(bias->h_tensor().mutable_data()); - std::copy_n(bias_t.data(), bias_t.numel(), bias_data); - - engine_->AddOp(scale_op_name, "Scale", {bn_output}, {output}); - engine_->AddOpAttr(scale_op_name, "axis", 1); - engine_->AddOpAttr(scale_op_name, "num_axes", 1); - engine_->AddOpAttr(scale_op_name, "bias_term", true); - engine_->AddOpAttr(scale_op_name, "weight_1", *scale); - engine_->AddOpAttr(scale_op_name, "weight_2", *bias); + this->engine_->AddOp(scale_op_name, "Scale", {bn_output}, {output}); + this->engine_->AddOpAttr(scale_op_name, "axis", 1); + this->engine_->AddOpAttr(scale_op_name, "num_axes", 1); + this->engine_->AddOpAttr(scale_op_name, "bias_term", true); + + auto *mean_v = scope.FindVar(op_desc.Input("Mean").front()); + PADDLE_ENFORCE_NOT_NULL(mean_v); + auto weight1 = pblock_from_var(*mean_v, this->engine_); + this->engine_->AddOpAttr(bn_op_name, "weight_1", *weight1); + + auto *variance_v = scope.FindVar(op_desc.Input("Variance").front()); + PADDLE_ENFORCE_NOT_NULL(variance_v); + auto weight2 = + pblock_from_var(*variance_v, this->engine_); + this->engine_->AddOpAttr(bn_op_name, "weight_2", *weight2); + + auto *weight3 = pblock_from_vector( + std::vector({1}), this->engine_); + this->engine_->AddOpAttr(bn_op_name, "weight_3", *weight3); + + auto *scale_v = scope.FindVar(op_desc.Input("Scale").front()); + PADDLE_ENFORCE_NOT_NULL(scale_v); + auto scale = pblock_from_var(*scale_v, this->engine_); + this->engine_->AddOpAttr(scale_op_name, "weight_1", *scale); + + auto *bias_v = scope.FindVar(op_desc.Input("Bias").front()); + PADDLE_ENFORCE_NOT_NULL(bias_v); + auto bias = pblock_from_var(*bias_v, this->engine_); + this->engine_->AddOpAttr(scale_op_name, "weight_2", *bias); } } // namespace anakin diff --git a/paddle/fluid/inference/anakin/convert/batch_norm.h b/paddle/fluid/inference/anakin/convert/batch_norm.h index c56735f15b435b..52156aeb0283af 100644 --- a/paddle/fluid/inference/anakin/convert/batch_norm.h +++ b/paddle/fluid/inference/anakin/convert/batch_norm.h @@ -20,7 +20,8 @@ namespace paddle { namespace inference { namespace anakin { -class BatchNormOpConverter : public AnakinOpConverter { +template +class BatchNormOpConverter : public AnakinOpConverter { public: BatchNormOpConverter() = default; diff --git a/paddle/fluid/inference/anakin/convert/concat.cc b/paddle/fluid/inference/anakin/convert/concat.cc index ae90c083690da6..584a82ead43fa7 100644 --- a/paddle/fluid/inference/anakin/convert/concat.cc +++ b/paddle/fluid/inference/anakin/convert/concat.cc @@ -15,34 +15,23 @@ #include "paddle/fluid/inference/anakin/convert/concat.h" #include -using anakin::graph::GraphGlobalMem; -using anakin::AK_FLOAT; -using anakin::Precision; -using anakin::saber::NV; -using anakin::saber::X86; -using anakin::saber::Shape; -using anakin::PBlock; -using anakin::PTuple; - namespace paddle { namespace inference { namespace anakin { -void ConcatOpConverter::operator()(const framework::proto::OpDesc &op, - const framework::BlockDesc &block_desc, - const framework::Scope &scope, - bool test_mode) { +template +void ConcatOpConverter::operator()( + const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc, + const framework::Scope &scope, bool test_mode) { framework::OpDesc op_desc(op, nullptr); int axis = boost::get(op_desc.GetAttr("axis")); auto input_names = op_desc.Input("X"); - // PADDLE_ENFORCE(axis > 0, - // "The axis attr of Concat op should be large than 0 for trt"); auto y_name = op_desc.Output("Out").front(); auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front(); - engine_->AddOp(op_name, "Concat", input_names, {y_name}); - engine_->AddOpAttr(op_name, "axis", axis); + this->engine_->AddOp(op_name, "Concat", input_names, {y_name}); + this->engine_->AddOpAttr(op_name, "axis", axis); } } // namespace anakin diff --git a/paddle/fluid/inference/anakin/convert/concat.h b/paddle/fluid/inference/anakin/convert/concat.h index 974ff689bfef68..fb5514affa78d2 100644 --- a/paddle/fluid/inference/anakin/convert/concat.h +++ b/paddle/fluid/inference/anakin/convert/concat.h @@ -20,7 +20,8 @@ namespace paddle { namespace inference { namespace anakin { -class ConcatOpConverter : public AnakinOpConverter { +template +class ConcatOpConverter : public AnakinOpConverter { public: ConcatOpConverter() = default; diff --git a/paddle/fluid/inference/anakin/convert/conv2d.cc b/paddle/fluid/inference/anakin/convert/conv2d.cc index 308f14604b9c83..70e0adf5ead45d 100644 --- a/paddle/fluid/inference/anakin/convert/conv2d.cc +++ b/paddle/fluid/inference/anakin/convert/conv2d.cc @@ -16,21 +16,18 @@ #include #include #include +#include "paddle/fluid/inference/anakin/convert/helper.h" -using anakin::graph::GraphGlobalMem; -using anakin::AK_FLOAT; -using anakin::saber::NV; -using anakin::saber::Shape; using anakin::PTuple; namespace paddle { namespace inference { namespace anakin { -void Conv2dOpConverter::operator()(const framework::proto::OpDesc &op, - const framework::BlockDesc &block_desc, - const framework::Scope &scope, - bool test_mode) { +template +void Conv2dOpConverter::operator()( + const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc, + const framework::Scope &scope, bool test_mode) { framework::OpDesc op_desc(op, nullptr); PADDLE_ENFORCE_EQ(op_desc.Input("Input").size(), 1UL); PADDLE_ENFORCE_EQ(op_desc.Input("Filter").size(), 1UL); @@ -39,46 +36,69 @@ void Conv2dOpConverter::operator()(const framework::proto::OpDesc &op, auto input_name = op_desc.Input("Input").front(); auto output_name = op_desc.Output("Output").front(); auto op_name = op_desc.Type() + ":" + op_desc.Output("Output").front(); - engine_->AddOp(op_name, "Convolution", {input_name}, {output_name}); + this->engine_->AddOp(op_name, "Convolution", {input_name}, {output_name}); auto *filter_v = scope.FindVar(op_desc.Input("Filter").front()); PADDLE_ENFORCE_NOT_NULL(filter_v); - auto *filter_t = filter_v->GetMutable(); - std::unique_ptr weight_tensor( - new framework::LoDTensor()); - weight_tensor->Resize(filter_t->dims()); - TensorCopySync((*filter_t), platform::CPUPlace(), weight_tensor.get()); + auto weight_tensor = tensor_from_var(*filter_v, platform::CPUPlace()); + auto weight_shape = framework::vectorize2int(weight_tensor->dims()); PADDLE_ENFORCE_EQ(weight_tensor->dims().size(), 4UL); - // const int n_output = weight_tensor->dims()[0]; - // const int n_input = weight_tensor->dims()[1]; const int filter_h = weight_tensor->dims()[2]; const int filter_w = weight_tensor->dims()[3]; - // auto filter_num = n_input * filter_h * filter_w ; + auto filter_num = weight_tensor->dims()[0]; - engine_->AddOpAttr(op_name, "filter_num", filter_num); - engine_->AddOpAttr>(op_name, "kernel_size", {filter_h, filter_w}); + this->engine_->template AddOpAttr(op_name, "filter_num", filter_num); + this->engine_->template AddOpAttr>(op_name, "kernel_size", + {filter_h, filter_w}); auto strides = boost::get>(op_desc.GetAttr("strides")); - engine_->AddOpAttr>(op_name, "strides", strides); + this->engine_->template AddOpAttr>(op_name, "strides", strides); auto paddings = boost::get>(op_desc.GetAttr("paddings")); - engine_->AddOpAttr>(op_name, "padding", paddings); + this->engine_->template AddOpAttr>(op_name, "padding", paddings); auto dilations = boost::get>(op_desc.GetAttr("dilations")); - engine_->AddOpAttr>(op_name, "dilation_rate", dilations); + this->engine_->template AddOpAttr>(op_name, "dilation_rate", + dilations); const int groups = boost::get(op_desc.GetAttr("groups")); - engine_->AddOpAttr(op_name, "group", groups); - engine_->AddOpAttr(op_name, "axis", 1); - engine_->AddOpAttr(op_name, "bias_term", false); + this->engine_->AddOpAttr(op_name, "group", groups); + this->engine_->AddOpAttr(op_name, "axis", 1); + this->engine_->AddOpAttr(op_name, "bias_term", false); + + ::anakin::saber::Shape anakin_shape(weight_shape); + bool enable_int8 = boost::get(op_desc.HasAttr("enable_int8")); - auto weight_shape = framework::vectorize2int(filter_t->dims()); - Shape anakin_shape(weight_shape); - auto *weight1 = - GraphGlobalMem::Global().template new_block(anakin_shape); - float *cpu_data = static_cast(weight1->h_tensor().mutable_data()); - std::copy_n(weight_tensor->data(), weight_tensor->numel(), cpu_data); - weight1->d_tensor().set_shape(anakin_shape); - weight1->d_tensor().copy_from(weight1->h_tensor()); - engine_->AddOpAttr(op_name, "weight_1", *weight1); + if (enable_int8) { + const float int8_range = 127.; + float in_scale = boost::get(op_desc.GetAttr("input_scale")); + float weight_scale = boost::get(op_desc.GetAttr("weight_scale")); + PBlock *weight1 = + new PBlock(anakin_shape, ::anakin::AK_INT8); + this->engine_->RegistBlock(weight1); + float *weight_data = weight_tensor->data(); + std::vector weight_int8; + int weight_num = weight_tensor->numel(); + for (int i = 0; i < weight_tensor->numel(); i++) { + bool is_valid_int8 = + ((weight_data[i] >= -128) && (weight_data[i] <= 127)); + PADDLE_ENFORCE(is_valid_int8, + "We are in anakin subgraph int8 mode, the weight of conv " + "should be in range [-128, 127]"); + weight_int8.push_back(static_cast(weight_data[i])); + } + memcpy(static_cast(weight1->h_tensor().mutable_data()), + static_cast(weight_int8.data()), sizeof(char) * weight_num); + weight1->d_tensor().set_shape(anakin_shape); + weight1->d_tensor().copy_from(weight1->h_tensor()); + this->engine_->AddOpAttr(op_name, "weight_1", *weight1); + this->engine_->Graph()->SetOpPrec(op_name, ::anakin::AK_INT8); + this->engine_->Graph()->SetWeightsScale(op_name, + {weight_scale / int8_range}, false); + this->engine_->AddTensorScale(input_name, in_scale / int8_range); + } else { + auto *weight1 = pblock_from_tensor( + *weight_tensor, weight_shape, this->engine_); + this->engine_->AddOpAttr(op_name, "weight_1", *weight1); + } } } // namespace anakin diff --git a/paddle/fluid/inference/anakin/convert/conv2d.h b/paddle/fluid/inference/anakin/convert/conv2d.h index dca5d19f468ac6..b22cb8ea9318cf 100644 --- a/paddle/fluid/inference/anakin/convert/conv2d.h +++ b/paddle/fluid/inference/anakin/convert/conv2d.h @@ -20,7 +20,8 @@ namespace paddle { namespace inference { namespace anakin { -class Conv2dOpConverter : public AnakinOpConverter { +template +class Conv2dOpConverter : public AnakinOpConverter { public: Conv2dOpConverter() = default; diff --git a/paddle/fluid/inference/anakin/convert/conv2d_fusion.cc b/paddle/fluid/inference/anakin/convert/conv2d_fusion.cc index fa1ab0efeeb5ca..a1568b8bdeeb93 100644 --- a/paddle/fluid/inference/anakin/convert/conv2d_fusion.cc +++ b/paddle/fluid/inference/anakin/convert/conv2d_fusion.cc @@ -16,21 +16,18 @@ #include #include #include +#include "paddle/fluid/inference/anakin/convert/helper.h" -using anakin::graph::GraphGlobalMem; -using anakin::AK_FLOAT; -using anakin::saber::NV; -using anakin::saber::Shape; using anakin::PTuple; namespace paddle { namespace inference { namespace anakin { -void Conv2dFusionOpConverter::operator()(const framework::proto::OpDesc &op, - const framework::BlockDesc &block_desc, - const framework::Scope &scope, - bool test_mode) { +template +void Conv2dFusionOpConverter::operator()( + const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc, + const framework::Scope &scope, bool test_mode) { framework::OpDesc op_desc(op, nullptr); PADDLE_ENFORCE_EQ(op_desc.Input("Input").size(), 1UL); PADDLE_ENFORCE_EQ(op_desc.Input("Filter").size(), 1UL); @@ -40,71 +37,74 @@ void Conv2dFusionOpConverter::operator()(const framework::proto::OpDesc &op, auto input_name = op_desc.Input("Input").front(); auto output_name = op_desc.Output("Output").front(); auto op_name = op_desc.Type() + ":" + op_desc.Output("Output").front(); - engine_->AddOp(op_name, "Convolution", {input_name}, {output_name}); + this->engine_->AddOp(op_name, "Convolution", {input_name}, {output_name}); auto *filter_v = scope.FindVar(op_desc.Input("Filter").front()); PADDLE_ENFORCE_NOT_NULL(filter_v); - auto *filter_t = filter_v->GetMutable(); + + auto weight_tensor = tensor_from_var(*filter_v, platform::CPUPlace()); + auto weight_shape = framework::vectorize2int(weight_tensor->dims()); auto *b_v = scope.FindVar(op_desc.Input("Bias").front()); PADDLE_ENFORCE_NOT_NULL(b_v); - auto *b_t = b_v->GetMutable(); - - std::unique_ptr weight_tensor( - new framework::LoDTensor()); - weight_tensor->Resize(filter_t->dims()); - TensorCopySync((*filter_t), platform::CPUPlace(), weight_tensor.get()); PADDLE_ENFORCE_EQ(weight_tensor->dims().size(), 4UL); - - // const int n_output = weight_tensor->dims()[0]; - // const int n_input = weight_tensor->dims()[1]; const int filter_h = weight_tensor->dims()[2]; const int filter_w = weight_tensor->dims()[3]; - // auto filter_num = n_input * filter_h * filter_w ; auto filter_num = weight_tensor->dims()[0]; - engine_->AddOpAttr(op_name, "filter_num", filter_num); - engine_->AddOpAttr>(op_name, "kernel_size", {filter_h, filter_w}); + this->engine_->template AddOpAttr(op_name, "filter_num", filter_num); + this->engine_->template AddOpAttr>(op_name, "kernel_size", + {filter_h, filter_w}); auto strides = boost::get>(op_desc.GetAttr("strides")); - engine_->AddOpAttr>(op_name, "strides", strides); + this->engine_->template AddOpAttr>(op_name, "strides", strides); auto paddings = boost::get>(op_desc.GetAttr("paddings")); - engine_->AddOpAttr>(op_name, "padding", paddings); + this->engine_->template AddOpAttr>(op_name, "padding", paddings); auto dilations = boost::get>(op_desc.GetAttr("dilations")); - engine_->AddOpAttr>(op_name, "dilation_rate", dilations); + this->engine_->template AddOpAttr>(op_name, "dilation_rate", + dilations); const int groups = boost::get(op_desc.GetAttr("groups")); - engine_->AddOpAttr(op_name, "group", groups); - engine_->AddOpAttr(op_name, "axis", 1); - engine_->AddOpAttr(op_name, "bias_term", true); - - auto weight_shape = framework::vectorize2int(filter_t->dims()); - Shape anakin_shape(weight_shape); - auto *weight1 = - GraphGlobalMem::Global().template new_block(anakin_shape); - float *cpu_data = static_cast(weight1->h_tensor().mutable_data()); - std::copy_n(weight_tensor->data(), weight_tensor->numel(), cpu_data); - weight1->d_tensor().set_shape(anakin_shape); - weight1->d_tensor().copy_from(weight1->h_tensor()); - engine_->AddOpAttr(op_name, "weight_1", *weight1); - - auto bias_shape = framework::vectorize2int(b_t->dims()); - framework::LoDTensor bias_tensor; - bias_tensor.Resize(b_t->dims()); - TensorCopySync((*b_t), platform::CPUPlace(), &bias_tensor); - auto *bias_data = bias_tensor.data(); - bias_shape.insert(bias_shape.begin(), 1); - bias_shape.insert(bias_shape.begin(), 1); - bias_shape.insert(bias_shape.begin(), 1); - // bias_shape.push_back(1); - // bias_shape.push_back(1); - Shape anakin_bias_shape(bias_shape); + this->engine_->AddOpAttr(op_name, "group", groups); + this->engine_->AddOpAttr(op_name, "axis", 1); + this->engine_->AddOpAttr(op_name, "bias_term", true); - auto *weight2 = GraphGlobalMem::Global().template new_block( - anakin_bias_shape); - float *cpu_data2 = static_cast(weight2->h_tensor().mutable_data()); - std::copy_n(bias_data, bias_tensor.numel(), cpu_data2); - weight2->d_tensor().set_shape(anakin_bias_shape); - weight2->d_tensor().copy_from(weight2->h_tensor()); - engine_->AddOpAttr(op_name, "weight_2", *weight2); + ::anakin::saber::Shape anakin_shape(weight_shape); + bool enable_int8 = boost::get(op_desc.HasAttr("enable_int8")); + if (enable_int8) { + const float int8_range = 127.; + float in_scale = boost::get(op_desc.GetAttr("input_scale")); + float weight_scale = boost::get(op_desc.GetAttr("weight_scale")); + PBlock *weight1 = + new PBlock(anakin_shape, ::anakin::AK_INT8); + this->engine_->RegistBlock(weight1); + float *weight_data = weight_tensor->data(); + std::vector weight_int8; + int weight_num = weight_tensor->numel(); + for (int i = 0; i < weight_tensor->numel(); i++) { + bool is_valid_int8 = + ((weight_data[i] >= -128) && (weight_data[i] <= 127)); + PADDLE_ENFORCE(is_valid_int8, + "We are in anakin subgraph int8 mode, the weight of conv " + "should be in range [-128, 127]"); + weight_int8.push_back(static_cast(weight_data[i])); + } + memcpy(static_cast(weight1->h_tensor().mutable_data()), + static_cast(weight_int8.data()), sizeof(char) * weight_num); + weight1->d_tensor().set_shape(anakin_shape); + weight1->d_tensor().copy_from(weight1->h_tensor()); + this->engine_->AddOpAttr(op_name, "weight_1", *weight1); + this->engine_->Graph()->SetOpPrec(op_name, ::anakin::AK_INT8); + this->engine_->Graph()->SetWeightsScale(op_name, + {weight_scale / int8_range}, false); + this->engine_->AddTensorScale(input_name, in_scale / int8_range); + } else { + auto weight_tensor = tensor_from_var(*filter_v, platform::CPUPlace()); + auto weight_shape = framework::vectorize2int(weight_tensor->dims()); + auto *weight1 = pblock_from_tensor( + *weight_tensor, weight_shape, this->engine_); + this->engine_->AddOpAttr(op_name, "weight_1", *weight1); + auto weight2 = pblock_from_var(*b_v, this->engine_); + this->engine_->AddOpAttr(op_name, "weight_2", *weight2); + } } } // namespace anakin diff --git a/paddle/fluid/inference/anakin/convert/conv2d_fusion.h b/paddle/fluid/inference/anakin/convert/conv2d_fusion.h index 0d9ef28183b309..768814d3f996dd 100644 --- a/paddle/fluid/inference/anakin/convert/conv2d_fusion.h +++ b/paddle/fluid/inference/anakin/convert/conv2d_fusion.h @@ -20,7 +20,8 @@ namespace paddle { namespace inference { namespace anakin { -class Conv2dFusionOpConverter : public AnakinOpConverter { +template +class Conv2dFusionOpConverter : public AnakinOpConverter { public: Conv2dFusionOpConverter() = default; diff --git a/paddle/fluid/inference/anakin/convert/density_prior_box.cc b/paddle/fluid/inference/anakin/convert/density_prior_box.cc index 30796f75924271..5bbaeb57a7da46 100644 --- a/paddle/fluid/inference/anakin/convert/density_prior_box.cc +++ b/paddle/fluid/inference/anakin/convert/density_prior_box.cc @@ -17,17 +17,14 @@ #include #include -using anakin::graph::GraphGlobalMem; -using anakin::AK_FLOAT; -using anakin::saber::NV; -using anakin::saber::Shape; using anakin::PTuple; namespace paddle { namespace inference { namespace anakin { -void DensityPriorBoxOpConverter::operator()( +template +void DensityPriorBoxOpConverter::operator()( const framework::proto::OpDesc& op, const framework::BlockDesc& block_desc, const framework::Scope& scope, bool test_mode) { framework::OpDesc op_desc(op, nullptr); @@ -81,22 +78,30 @@ void DensityPriorBoxOpConverter::operator()( std::vector temp_v = {}; - engine_->AddOp(op_name, "PriorBox", {input_name, image_name}, {output_name}); - engine_->AddOpAttr>(op_name, "min_size", min_sizes); - engine_->AddOpAttr>(op_name, "max_size", max_sizes); - engine_->AddOpAttr>(op_name, "aspect_ratio", aspect_ratios); - engine_->AddOpAttr>(op_name, "fixed_size", fixed_sizes); - engine_->AddOpAttr>(op_name, "fixed_ratio", fixed_ratios); - engine_->AddOpAttr>(op_name, "density", dens); - engine_->AddOpAttr(op_name, "is_flip", is_flip); - engine_->AddOpAttr(op_name, "is_clip", is_clip); - engine_->AddOpAttr>(op_name, "variance", variances); - engine_->AddOpAttr(op_name, "img_h", static_cast(0)); - engine_->AddOpAttr(op_name, "img_w", static_cast(0)); - engine_->AddOpAttr(op_name, "step_h", step_h); - engine_->AddOpAttr(op_name, "step_w", step_w); - engine_->AddOpAttr(op_name, "offset", offset); - engine_->AddOpAttr>(op_name, "order", t_order); + this->engine_->AddOp(op_name, "PriorBox", {input_name, image_name}, + {output_name}); + this->engine_->template AddOpAttr>(op_name, "min_size", + min_sizes); + this->engine_->template AddOpAttr>(op_name, "max_size", + max_sizes); + this->engine_->template AddOpAttr>(op_name, "aspect_ratio", + aspect_ratios); + this->engine_->template AddOpAttr>(op_name, "fixed_size", + fixed_sizes); + this->engine_->template AddOpAttr>(op_name, "fixed_ratio", + fixed_ratios); + this->engine_->template AddOpAttr>(op_name, "density", dens); + this->engine_->AddOpAttr(op_name, "is_flip", is_flip); + this->engine_->AddOpAttr(op_name, "is_clip", is_clip); + this->engine_->template AddOpAttr>(op_name, "variance", + variances); + this->engine_->AddOpAttr(op_name, "img_h", static_cast(0)); + this->engine_->AddOpAttr(op_name, "img_w", static_cast(0)); + this->engine_->AddOpAttr(op_name, "step_h", step_h); + this->engine_->AddOpAttr(op_name, "step_w", step_w); + this->engine_->AddOpAttr(op_name, "offset", offset); + this->engine_->template AddOpAttr>(op_name, "order", + t_order); } } // namespace anakin diff --git a/paddle/fluid/inference/anakin/convert/density_prior_box.h b/paddle/fluid/inference/anakin/convert/density_prior_box.h index bf9210711a0f69..5714f57a04b7b3 100644 --- a/paddle/fluid/inference/anakin/convert/density_prior_box.h +++ b/paddle/fluid/inference/anakin/convert/density_prior_box.h @@ -22,7 +22,9 @@ namespace paddle { namespace inference { namespace anakin { -class DensityPriorBoxOpConverter : public AnakinOpConverter { +template +class DensityPriorBoxOpConverter + : public AnakinOpConverter { public: DensityPriorBoxOpConverter() = default; diff --git a/paddle/fluid/inference/anakin/convert/detection_out.cc b/paddle/fluid/inference/anakin/convert/detection_out.cc index 262ad28a654609..73dd6f2832541e 100644 --- a/paddle/fluid/inference/anakin/convert/detection_out.cc +++ b/paddle/fluid/inference/anakin/convert/detection_out.cc @@ -16,19 +16,14 @@ #include #include -using anakin::graph::GraphGlobalMem; -using anakin::AK_FLOAT; -using anakin::saber::NV; -using anakin::saber::Shape; - namespace paddle { namespace inference { namespace anakin { -void DetectionOutOpConverter::operator()(const framework::proto::OpDesc &op, - const framework::BlockDesc &block_desc, - const framework::Scope &scope, - bool test_mode) { +template +void DetectionOutOpConverter::operator()( + const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc, + const framework::Scope &scope, bool test_mode) { framework::OpDesc op_desc(op, nullptr); auto target_name = op_desc.Input("TargetBox").front(); auto prior_box_name = op_desc.Input("PriorBox").front(); @@ -52,18 +47,19 @@ void DetectionOutOpConverter::operator()(const framework::proto::OpDesc &op, "Not support encode_center_size code_type in DetectionOut of anakin"); } - engine_->AddOp(op_name, "DetectionOutput", - {target_name, scores_name, prior_box_name}, {output_name}); - engine_->AddOpAttr(op_name, "share_location", true); - engine_->AddOpAttr(op_name, "variance_encode_in_target", false); - engine_->AddOpAttr(op_name, "class_num", static_cast(0)); - engine_->AddOpAttr(op_name, "background_id", background_label); - engine_->AddOpAttr(op_name, "keep_top_k", keep_top_k); - engine_->AddOpAttr(op_name, "code_type", anakin_code_type); - engine_->AddOpAttr(op_name, "conf_thresh", score_threshold); - engine_->AddOpAttr(op_name, "nms_top_k", nms_top_k); - engine_->AddOpAttr(op_name, "nms_thresh", nms_threshold); - engine_->AddOpAttr(op_name, "nms_eta", nms_eta); + this->engine_->AddOp(op_name, "DetectionOutput", + {target_name, scores_name, prior_box_name}, + {output_name}); + this->engine_->AddOpAttr(op_name, "share_location", true); + this->engine_->AddOpAttr(op_name, "variance_encode_in_target", false); + this->engine_->AddOpAttr(op_name, "class_num", static_cast(0)); + this->engine_->AddOpAttr(op_name, "background_id", background_label); + this->engine_->AddOpAttr(op_name, "keep_top_k", keep_top_k); + this->engine_->AddOpAttr(op_name, "code_type", anakin_code_type); + this->engine_->AddOpAttr(op_name, "conf_thresh", score_threshold); + this->engine_->AddOpAttr(op_name, "nms_top_k", nms_top_k); + this->engine_->AddOpAttr(op_name, "nms_thresh", nms_threshold); + this->engine_->AddOpAttr(op_name, "nms_eta", nms_eta); } } // namespace anakin diff --git a/paddle/fluid/inference/anakin/convert/detection_out.h b/paddle/fluid/inference/anakin/convert/detection_out.h index ca78f10fdc2a7c..c34342a66c1c6c 100644 --- a/paddle/fluid/inference/anakin/convert/detection_out.h +++ b/paddle/fluid/inference/anakin/convert/detection_out.h @@ -22,7 +22,8 @@ namespace paddle { namespace inference { namespace anakin { -class DetectionOutOpConverter : public AnakinOpConverter { +template +class DetectionOutOpConverter : public AnakinOpConverter { public: DetectionOutOpConverter() = default; diff --git a/paddle/fluid/inference/anakin/convert/dropout.cc b/paddle/fluid/inference/anakin/convert/dropout.cc index bc9b26dcf27333..6c5f80b5f8e07f 100644 --- a/paddle/fluid/inference/anakin/convert/dropout.cc +++ b/paddle/fluid/inference/anakin/convert/dropout.cc @@ -16,24 +16,16 @@ #include #include #include - -using anakin::graph::GraphGlobalMem; -using anakin::AK_FLOAT; -using anakin::Precision; -using anakin::saber::NV; -using anakin::saber::X86; -using anakin::saber::Shape; -using anakin::PBlock; -using anakin::PTuple; +#include "paddle/fluid/inference/anakin/convert/helper.h" namespace paddle { namespace inference { namespace anakin { -void DropoutOpConverter::operator()(const framework::proto::OpDesc &op, - const framework::BlockDesc &block_desc, - const framework::Scope &scope, - bool test_mode) { +template +void DropoutOpConverter::operator()( + const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc, + const framework::Scope &scope, bool test_mode) { framework::OpDesc op_desc(op, nullptr); PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1); PADDLE_ENFORCE_EQ(op_desc.Output("Mask").size(), 1); @@ -43,21 +35,17 @@ void DropoutOpConverter::operator()(const framework::proto::OpDesc &op, auto out_name = op_desc.Output("Out").front(); auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front(); - engine_->AddOp(op_name, "Scale", {x_name}, {out_name}); + this->engine_->AddOp(op_name, "Scale", {x_name}, {out_name}); auto dropout_prob = boost::get(op_desc.GetAttr("dropout_prob")); auto factor = 1 - dropout_prob; - Shape shape1(std::vector({1, 1, 1, 1})); - auto *weight1 = - GraphGlobalMem::Global().template new_block(shape1); - auto *factor_data = static_cast(weight1->h_tensor().mutable_data()); - float weight1_data[] = {factor}; - std::copy(std::begin(weight1_data), std::end(weight1_data), factor_data); + auto *weight1 = pblock_from_vector( + std::vector({factor}), this->engine_); - engine_->AddOpAttr(op_name, "weight_1", *weight1); - engine_->AddOpAttr(op_name, "axis", 0); - engine_->AddOpAttr(op_name, "num_axes", 0); - engine_->AddOpAttr(op_name, "bias_term", false); + this->engine_->AddOpAttr(op_name, "weight_1", *weight1); + this->engine_->AddOpAttr(op_name, "axis", 0); + this->engine_->AddOpAttr(op_name, "num_axes", 0); + this->engine_->AddOpAttr(op_name, "bias_term", false); } } // namespace anakin diff --git a/paddle/fluid/inference/anakin/convert/dropout.h b/paddle/fluid/inference/anakin/convert/dropout.h index 11412e217ef5fa..801aa3dd16f850 100644 --- a/paddle/fluid/inference/anakin/convert/dropout.h +++ b/paddle/fluid/inference/anakin/convert/dropout.h @@ -20,7 +20,8 @@ namespace paddle { namespace inference { namespace anakin { -class DropoutOpConverter : public AnakinOpConverter { +template +class DropoutOpConverter : public AnakinOpConverter { public: DropoutOpConverter() = default; diff --git a/paddle/fluid/inference/anakin/convert/elementwise.cc b/paddle/fluid/inference/anakin/convert/elementwise.cc index fe9a896d8266e0..dd32baa0b90018 100644 --- a/paddle/fluid/inference/anakin/convert/elementwise.cc +++ b/paddle/fluid/inference/anakin/convert/elementwise.cc @@ -17,20 +17,14 @@ #include #include -using anakin::graph::GraphGlobalMem; -using anakin::AK_FLOAT; -using anakin::Precision; -using anakin::saber::NV; -using anakin::saber::X86; -using anakin::saber::Shape; -using anakin::PBlock; using anakin::PTuple; namespace paddle { namespace inference { namespace anakin { -void ElementwiseAddOpConverter::operator()( +template +void ElementwiseAddOpConverter::operator()( const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc, const framework::Scope &scope, bool test_mode) { framework::OpDesc op_desc(op, nullptr); @@ -43,14 +37,16 @@ void ElementwiseAddOpConverter::operator()( auto out_name = op_desc.Output("Out").front(); auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front(); - engine_->AddOp(op_name, "Eltwise", {x_name, y_name}, {out_name}); + this->engine_->AddOp(op_name, "Eltwise", {x_name, y_name}, {out_name}); std::string elementwise_type = "Add"; - engine_->AddOpAttr(op_name, "type", elementwise_type); + this->engine_->template AddOpAttr(op_name, "type", + elementwise_type); std::vector coeff = {1.0, 1.0}; - engine_->AddOpAttr>(op_name, "coeff", coeff); + this->engine_->template AddOpAttr>(op_name, "coeff", coeff); } -void ElementwiseMulOpConverter::operator()( +template +void ElementwiseMulOpConverter::operator()( const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc, const framework::Scope &scope, bool test_mode) { framework::OpDesc op_desc(op, nullptr); @@ -63,21 +59,12 @@ void ElementwiseMulOpConverter::operator()( auto out_name = op_desc.Output("Out").front(); auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front(); - engine_->AddOp(op_name, "Scale", {x_name, y_name}, {out_name}); - // Fill a number to weight_1 as a placeholder. - Shape shape1(std::vector({1, 1, 1, 1})); - auto *weight1 = - GraphGlobalMem::Global().template new_block(shape1); - auto *placeholder_data = - static_cast(weight1->h_tensor().mutable_data()); - float weight1_data[] = {1}; - std::copy(std::begin(weight1_data), std::end(weight1_data), placeholder_data); - engine_->AddOpAttr(op_name, "weight_1", *weight1); - - auto axis = boost::get(op_desc.GetAttr("axis")); - engine_->AddOpAttr(op_name, "axis", axis); - engine_->AddOpAttr(op_name, "num_axes", 1); - engine_->AddOpAttr(op_name, "bias_term", false); + this->engine_->AddOp(op_name, "Eltwise", {x_name, y_name}, {out_name}); + std::string elementwise_type = "Prod"; + this->engine_->template AddOpAttr(op_name, "type", + elementwise_type); + std::vector coeff = {1.0, 1.0}; + this->engine_->template AddOpAttr>(op_name, "coeff", coeff); } } // namespace anakin diff --git a/paddle/fluid/inference/anakin/convert/elementwise.h b/paddle/fluid/inference/anakin/convert/elementwise.h index e4664493a9d3ce..190a8b55f0e3c2 100644 --- a/paddle/fluid/inference/anakin/convert/elementwise.h +++ b/paddle/fluid/inference/anakin/convert/elementwise.h @@ -20,7 +20,9 @@ namespace paddle { namespace inference { namespace anakin { -class ElementwiseAddOpConverter : public AnakinOpConverter { +template +class ElementwiseAddOpConverter + : public AnakinOpConverter { public: ElementwiseAddOpConverter() = default; @@ -33,7 +35,9 @@ class ElementwiseAddOpConverter : public AnakinOpConverter { private: }; -class ElementwiseMulOpConverter : public AnakinOpConverter { +template +class ElementwiseMulOpConverter + : public AnakinOpConverter { public: ElementwiseMulOpConverter() = default; diff --git a/paddle/fluid/inference/anakin/convert/fc.cc b/paddle/fluid/inference/anakin/convert/fc.cc index a80a1a47e91aa0..0621e3377b3466 100644 --- a/paddle/fluid/inference/anakin/convert/fc.cc +++ b/paddle/fluid/inference/anakin/convert/fc.cc @@ -16,23 +16,19 @@ #include #include #include - -using anakin::graph::GraphGlobalMem; -using anakin::AK_FLOAT; -using anakin::saber::NV; -using anakin::saber::Shape; +#include "paddle/fluid/inference/anakin/convert/helper.h" namespace paddle { namespace inference { namespace anakin { -void FcBaseOpConverter::operator()(const framework::proto::OpDesc &op, - const framework::BlockDesc &block_desc, - const framework::Scope &scope, - bool test_mode) { +template +void FcBaseOpConverter::operator()( + const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc, + const framework::Scope &scope, bool test_mode) { framework::OpDesc op_desc(op, nullptr); auto input_names = op_desc.InputNames(); - bool with_bias = input_names.size() == 3; + bool with_bias = input_names.size() >= 3; std::string w_name = "Y"; std::string i_name = "X"; @@ -46,71 +42,74 @@ void FcBaseOpConverter::operator()(const framework::proto::OpDesc &op, // get weights auto *y_v = scope.FindVar(op_desc.Input(w_name).front()); PADDLE_ENFORCE_NOT_NULL(y_v); - auto *y_t = y_v->GetMutable(); - - auto input_name = op_desc.Input(i_name).front(); - auto output_name = op_desc.Output("Out").front(); + auto weight_tensor = tensor_from_var(*y_v, platform::CPUPlace()); + auto weight_shape = framework::vectorize2int(weight_tensor->dims()); - engine_->AddOp(op_name, "Dense", {input_name}, {output_name}); - engine_->AddOpAttr(op_name, "bias_term", with_bias); - engine_->AddOpAttr(op_name, "axis", 1); - - auto weight_shape = framework::vectorize2int(y_t->dims()); int out_dim = weight_shape[1]; - engine_->AddOpAttr(op_name, "out_dim", out_dim); const int w_m = weight_shape[0]; const int w_k = weight_shape[1]; - if (weight_shape.size() < 4UL) { - weight_shape.insert(weight_shape.begin(), 4UL - weight_shape.size(), 1); - } - Shape anakin_shape(weight_shape); + auto input_name = op_desc.Input(i_name).front(); + auto output_name = op_desc.Output("Out").front(); - framework::LoDTensor weight_tensor; - weight_tensor.Resize(y_t->dims()); - TensorCopySync((*y_t), platform::CPUPlace(), &weight_tensor); - auto *weight_data = weight_tensor.data(); - PADDLE_ENFORCE(w_m * w_k == weight_tensor.numel()); + this->engine_->AddOp(op_name, "Dense", {input_name}, {output_name}); + this->engine_->AddOpAttr(op_name, "bias_term", with_bias); + this->engine_->AddOpAttr(op_name, "axis", 1); + this->engine_->AddOpAttr(op_name, "out_dim", out_dim); - std::vector trans_weight_data(weight_tensor.numel()); + auto *weight_data = weight_tensor->data(); + PADDLE_ENFORCE(w_m * w_k == weight_tensor->numel()); + + std::vector trans_weight_data(weight_tensor->numel()); for (int i = 0; i < w_m; i++) { for (int j = 0; j < w_k; j++) { trans_weight_data[i + j * w_m] = weight_data[i * w_k + j]; } } - auto *weight1 = - GraphGlobalMem::Global().template new_block(anakin_shape); - float *cpu_data = static_cast(weight1->h_tensor().mutable_data()); - std::copy_n(trans_weight_data.data(), weight_tensor.numel(), cpu_data); - weight1->d_tensor().set_shape(anakin_shape); - weight1->d_tensor().copy_from(weight1->h_tensor()); - engine_->AddOpAttr(op_name, "weight_1", *weight1); + + int weight_num = weight_tensor->numel(); + bool enable_int8 = boost::get(op_desc.HasAttr("enable_int8")); + if (enable_int8) { + if (weight_shape.size() < 4UL) { + weight_shape.insert(weight_shape.begin(), 4UL - weight_shape.size(), 1); + } + ::anakin::saber::Shape anakin_shape(weight_shape); + const float int8_range = 127.; + float in_scale = boost::get(op_desc.GetAttr("input_scale")); + float weight_scale = boost::get(op_desc.GetAttr("weight_scale")); + PBlock *weight1 = + new PBlock(anakin_shape, ::anakin::AK_INT8); + this->engine_->RegistBlock(weight1); + std::vector weight_int8; + for (int i = 0; i < weight_num; i++) { + bool is_valid_int8 = + ((trans_weight_data[i] >= -128) && (trans_weight_data[i] <= 127)); + PADDLE_ENFORCE(is_valid_int8, + "We are in anakin subgraph int8 mode, the weight of fc " + "should be in range [-128, 127]"); + weight_int8.push_back(static_cast(trans_weight_data[i])); + } + memcpy(static_cast(weight1->h_tensor().mutable_data()), + static_cast(weight_int8.data()), sizeof(char) * weight_num); + weight1->d_tensor().set_shape(anakin_shape); + weight1->d_tensor().copy_from(weight1->h_tensor()); + this->engine_->AddOpAttr(op_name, "weight_1", *weight1); + this->engine_->Graph()->SetOpPrec(op_name, ::anakin::AK_INT8); + this->engine_->Graph()->SetWeightsScale(op_name, + {weight_scale / int8_range}, false); + this->engine_->AddTensorScale(input_name, in_scale / int8_range); + } else { + auto *weight1 = pblock_from_vector(trans_weight_data, + this->engine_); + this->engine_->AddOpAttr(op_name, "weight_1", *weight1); + } // get bias if (with_bias) { auto *b_v = scope.FindVar(op_desc.Input("Bias").front()); PADDLE_ENFORCE_NOT_NULL(b_v); - auto *b_t = b_v->GetMutable(); - - auto bias_shape = framework::vectorize2int(b_t->dims()); - framework::LoDTensor bias_tensor; - bias_tensor.Resize(b_t->dims()); - TensorCopySync((*b_t), platform::CPUPlace(), &bias_tensor); - auto *bias_data = bias_tensor.data(); - bias_shape.insert(bias_shape.begin(), 1); - bias_shape.insert(bias_shape.begin(), 1); - bias_shape.insert(bias_shape.begin(), 1); - // bias_shape.push_back(1); - // bias_shape.push_back(1); - Shape anakin_bias_shape(bias_shape); - - auto *weight2 = GraphGlobalMem::Global().template new_block( - anakin_bias_shape); - float *cpu_data2 = static_cast(weight2->h_tensor().mutable_data()); - std::copy_n(bias_data, bias_tensor.numel(), cpu_data2); - weight2->d_tensor().set_shape(anakin_bias_shape); - weight2->d_tensor().copy_from(weight2->h_tensor()); - engine_->AddOpAttr(op_name, "weight_2", *weight2); + auto weight2 = pblock_from_var(*b_v, this->engine_); + this->engine_->AddOpAttr(op_name, "weight_2", *weight2); } } diff --git a/paddle/fluid/inference/anakin/convert/fc.h b/paddle/fluid/inference/anakin/convert/fc.h index fb461908b35e01..6fe65e3ecd4ec4 100644 --- a/paddle/fluid/inference/anakin/convert/fc.h +++ b/paddle/fluid/inference/anakin/convert/fc.h @@ -20,7 +20,8 @@ namespace paddle { namespace inference { namespace anakin { -class FcBaseOpConverter : public AnakinOpConverter { +template +class FcBaseOpConverter : public AnakinOpConverter { public: FcBaseOpConverter() = default; @@ -32,13 +33,15 @@ class FcBaseOpConverter : public AnakinOpConverter { }; // with bias -class FcOpConverter : public FcBaseOpConverter { +template +class FcOpConverter : public FcBaseOpConverter { public: FcOpConverter() = default; }; // without bias -class MulOpConverter : public FcBaseOpConverter { +template +class MulOpConverter : public FcBaseOpConverter { public: MulOpConverter() = default; }; diff --git a/paddle/fluid/inference/anakin/convert/flatten.cc b/paddle/fluid/inference/anakin/convert/flatten.cc index 7f5c1510960d10..7ce519a4de36c9 100644 --- a/paddle/fluid/inference/anakin/convert/flatten.cc +++ b/paddle/fluid/inference/anakin/convert/flatten.cc @@ -15,20 +15,16 @@ #include "paddle/fluid/inference/anakin/convert/flatten.h" #include -using anakin::graph::GraphGlobalMem; -using anakin::AK_FLOAT; -using anakin::saber::NV; -using anakin::saber::Shape; using anakin::PTuple; namespace paddle { namespace inference { namespace anakin { -void FlattenOpConverter::operator()(const framework::proto::OpDesc &op, - const framework::BlockDesc &block_desc, - const framework::Scope &scope, - bool test_mode) { +template +void FlattenOpConverter::operator()( + const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc, + const framework::Scope &scope, bool test_mode) { framework::OpDesc op_desc(op, nullptr); PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1UL); PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1UL); @@ -41,8 +37,8 @@ void FlattenOpConverter::operator()(const framework::proto::OpDesc &op, std::vector out_dims = {0, -1, 1, 1}; auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front(); - engine_->AddOp(op_name, "Reshape", {input}, {output}); - engine_->AddOpAttr>(op_name, "dims", out_dims); + this->engine_->AddOp(op_name, "Reshape", {input}, {output}); + this->engine_->template AddOpAttr>(op_name, "dims", out_dims); } } // namespace anakin diff --git a/paddle/fluid/inference/anakin/convert/flatten.h b/paddle/fluid/inference/anakin/convert/flatten.h index c9cc0006eb2448..6e5e059927d4d3 100644 --- a/paddle/fluid/inference/anakin/convert/flatten.h +++ b/paddle/fluid/inference/anakin/convert/flatten.h @@ -20,7 +20,8 @@ namespace paddle { namespace inference { namespace anakin { -class FlattenOpConverter : public AnakinOpConverter { +template +class FlattenOpConverter : public AnakinOpConverter { public: FlattenOpConverter() = default; diff --git a/paddle/fluid/inference/anakin/convert/helper.cc b/paddle/fluid/inference/anakin/convert/helper.cc new file mode 100644 index 00000000000000..7804619bf836d9 --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/helper.cc @@ -0,0 +1,32 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/anakin/convert/helper.h" + +namespace paddle { +namespace inference { +namespace anakin { + +std::unique_ptr tensor_from_var( + const framework::Variable& var, const platform::Place& place) { + auto& src = var.Get(); + std::unique_ptr dst(new framework::LoDTensor()); + dst->Resize(src.dims()); + TensorCopySync((src), place, dst.get()); + return dst; +} + +} // namespace anakin +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/anakin/convert/helper.h b/paddle/fluid/inference/anakin/convert/helper.h new file mode 100644 index 00000000000000..7b0fb211dcd8aa --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/helper.h @@ -0,0 +1,95 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include + +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/inference/anakin/engine.h" + +#include "framework/core/net/net.h" +#include "framework/core/types.h" +#include "framework/graph/graph.h" +#include "framework/graph/graph_global_mem.h" +#include "saber/saber_types.h" + +using anakin::saber::Shape; +using anakin::AK_FLOAT; +using anakin::AK_INT8; +using anakin::PBlock; + +namespace paddle { +namespace inference { +namespace anakin { + +std::unique_ptr tensor_from_var( + const framework::Variable& var, const platform::Place& place); + +template +PBlock* pblock_from_tensor(const framework::LoDTensor& tensor, + std::vector shape_vec, + AnakinEngine* engine) { + while (shape_vec.size() < 4) { + shape_vec.insert(shape_vec.begin(), 1); + } + Shape shape(shape_vec); + PBlock* weight = new PBlock(shape, AK_FLOAT); + engine->RegistBlock(weight); + float* cpu_data = static_cast(weight->h_tensor().mutable_data()); + std::copy_n(tensor.data(), tensor.numel(), cpu_data); + weight->d_tensor().set_shape(shape); + weight->d_tensor().copy_from(weight->h_tensor()); + return weight; +} + +template +PBlock* pblock_from_vector(const std::vector& vec, + std::vector shape_vec, + AnakinEngine* engine) { + while (shape_vec.size() < 4) { + shape_vec.insert(shape_vec.begin(), 1); + } + Shape shape(shape_vec); + PBlock* weight = new PBlock(shape, AK_FLOAT); + engine->RegistBlock(weight); + auto* weight_data = static_cast(weight->h_tensor().mutable_data()); + std::copy(std::begin(vec), std::end(vec), weight_data); + weight->d_tensor().set_shape(shape); + weight->d_tensor().copy_from(weight->h_tensor()); + return weight; +} + +template +PBlock* pblock_from_vector(const std::vector& vec, + AnakinEngine* engine) { + int size = vec.size(); + return pblock_from_vector( + vec, std::vector({1, 1, 1, size}), engine); +} + +template +PBlock* pblock_from_var(const framework::Variable& var, + AnakinEngine* engine) { + auto tensor = tensor_from_var(var, platform::CPUPlace()); + auto shape = framework::vectorize2int(tensor->dims()); + return pblock_from_tensor(*tensor, shape, engine); +} + +} // namespace anakin +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/anakin/convert/im2sequence.cc b/paddle/fluid/inference/anakin/convert/im2sequence.cc index 2cc330c3829f60..5a4e3e61c5e4e4 100644 --- a/paddle/fluid/inference/anakin/convert/im2sequence.cc +++ b/paddle/fluid/inference/anakin/convert/im2sequence.cc @@ -17,23 +17,16 @@ #include #include -using anakin::graph::GraphGlobalMem; -using anakin::AK_FLOAT; -using anakin::Precision; -using anakin::saber::NV; -using anakin::saber::X86; -using anakin::saber::Shape; -using anakin::PBlock; using anakin::PTuple; namespace paddle { namespace inference { namespace anakin { -void Im2SequenceConverter::operator()(const framework::proto::OpDesc &op, - const framework::BlockDesc &block_desc, - const framework::Scope &scope, - bool test_mode) { +template +void Im2SequenceConverter::operator()( + const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc, + const framework::Scope &scope, bool test_mode) { framework::OpDesc op_desc(op, nullptr); PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1); PADDLE_ENFORCE_EQ(op_desc.Output("Y").size(), 0); @@ -43,17 +36,19 @@ void Im2SequenceConverter::operator()(const framework::proto::OpDesc &op, auto out_name = op_desc.Output("Out").front(); auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front(); - engine_->AddOp(op_name, "Im2Sequence", {x_name}, {out_name}); + this->engine_->AddOp(op_name, "Im2Sequence", {x_name}, {out_name}); std::vector dilations = {1, 1}; auto paddings = boost::get>(op_desc.GetAttr("paddings")); auto strides = boost::get>(op_desc.GetAttr("strides")); auto kernels = boost::get>(op_desc.GetAttr("kernels")); - engine_->AddOpAttr>(op_name, "paddings", paddings); - engine_->AddOpAttr>(op_name, "strides", strides); - engine_->AddOpAttr>(op_name, "window_size", kernels); - engine_->AddOpAttr>(op_name, "dilations", dilations); + this->engine_->template AddOpAttr>(op_name, "paddings", paddings); + this->engine_->template AddOpAttr>(op_name, "strides", strides); + this->engine_->template AddOpAttr>(op_name, "window_size", + kernels); + this->engine_->template AddOpAttr>(op_name, "dilations", + dilations); } } // namespace anakin diff --git a/paddle/fluid/inference/anakin/convert/im2sequence.h b/paddle/fluid/inference/anakin/convert/im2sequence.h index 714679c1d96011..8241d4d6f9ce78 100644 --- a/paddle/fluid/inference/anakin/convert/im2sequence.h +++ b/paddle/fluid/inference/anakin/convert/im2sequence.h @@ -20,7 +20,8 @@ namespace paddle { namespace inference { namespace anakin { -class Im2SequenceConverter : public AnakinOpConverter { +template +class Im2SequenceConverter : public AnakinOpConverter { public: Im2SequenceConverter() = default; diff --git a/paddle/fluid/inference/anakin/convert/op_converter.h b/paddle/fluid/inference/anakin/convert/op_converter.h index 1ca62658ef26ff..a6ae51bd4b1c67 100644 --- a/paddle/fluid/inference/anakin/convert/op_converter.h +++ b/paddle/fluid/inference/anakin/convert/op_converter.h @@ -32,10 +32,10 @@ namespace paddle { namespace inference { namespace anakin { -using AnakinNvEngine = - AnakinEngine<::anakin::saber::NV, ::anakin::Precision::FP32>; - +template class AnakinOpConverter { + using AnakinEngineT = AnakinEngine; + public: AnakinOpConverter() = default; @@ -45,7 +45,7 @@ class AnakinOpConverter { void ConvertOp(const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc, const std::unordered_set ¶meters, - const framework::Scope &scope, AnakinNvEngine *engine, + const framework::Scope &scope, AnakinEngineT *engine, bool test_mode = false) { framework::OpDesc op_desc(op, nullptr); std::string op_type = op_desc.Type(); @@ -65,7 +65,7 @@ class AnakinOpConverter { void ConvertBlock(framework::BlockDesc *block_desc, const std::unordered_set ¶meters, - const framework::Scope &scope, AnakinNvEngine *engine) { + const framework::Scope &scope, AnakinEngineT *engine) { std::unique_lock lock(mutex_); framework::proto::BlockDesc *block = block_desc->Proto(); for (auto i = 0; i < block->ops_size(); i++) { @@ -79,9 +79,8 @@ class AnakinOpConverter { framework::BlockDesc *block_desc, framework::Scope *scope, const std::vector &inputs, const std::unordered_set ¶meters, - const std::vector &outputs, AnakinNvEngine *engine) { + const std::vector &outputs, AnakinEngineT *engine) { ConvertBlock(block_desc, parameters, *scope, engine); - engine->Freeze(); // if the max_batch size int max_batch_size = engine->GetMaxBatchSize(); PADDLE_ENFORCE(max_batch_size > 0, @@ -91,6 +90,18 @@ class AnakinOpConverter { // the block_desc. auto max_input_shape = engine->GetMaxInputShape(); std::map> temp_max_input_shape; + // Register outputs with anakin using the RegistVar interface before Freeze. + // Note that RegistVar's parameters can only be outputs, not inputs. + for (auto &output : outputs) { + engine->Graph()->RegistVar(output); + } + engine->Freeze(); + // Add scale for tensor in int8 mode. + auto tensor_scales = engine->GetTensorScales(); + + for (auto &item : tensor_scales) { + engine->Graph()->SetVarScale(item.first, item.second); + } for (auto &input : inputs) { if (parameters.count(input)) continue; @@ -99,7 +110,7 @@ class AnakinOpConverter { input_shape[0] = max_batch_size; if (max_input_shape.count(input)) { PADDLE_ENFORCE(max_input_shape[input].size() == 4, - "the dimensions of max_input_shape setted from " + "the dimensions of max_input_shape setted from " "config->EnableAnakinEngine must be 4"); for (int i = 1; i < 4; i++) { input_shape[i] = max_input_shape[input][i]; @@ -118,50 +129,104 @@ class AnakinOpConverter { } temp_max_input_shape[input] = input_shape; engine->SetInputShape(input, input_shape); - engine->Graph()->RegistVar(input); // For share from data. } engine->SetMaxInputShape(temp_max_input_shape); engine->Optimize(); - - // For anakin share with fluid tensor. - engine->AllocTmpMem(); - engine->InitGraph(); + engine->InitNet(); } - void SetEngine(AnakinNvEngine *engine) { engine_ = engine; } + void SetEngine(AnakinEngineT *engine) { engine_ = engine; } virtual ~AnakinOpConverter() {} protected: bool test_mode_; - AnakinNvEngine *engine_{nullptr}; + AnakinEngineT *engine_{nullptr}; private: - std::unordered_map converters_; + std::unordered_map *> + converters_; framework::Scope *scope_{nullptr}; std::mutex mutex_; }; +template class AnakinOpConverter<::anakin::saber::NV, + ::anakin::Precision::FP32>; +template class AnakinOpConverter<::anakin::saber::NV, + ::anakin::Precision::INT8>; + +template class AnakinOpConverter<::anakin::saber::X86, + ::anakin::Precision::FP32>; +template class AnakinOpConverter<::anakin::saber::X86, + ::anakin::Precision::INT8>; } // namespace anakin } // namespace inference } // namespace paddle -#define REGISTER_ANAKIN_OP_CONVERTER(op_type__, Converter__) \ - struct anakin_##op_type__##_converter \ - : public ::paddle::framework::Registrar { \ - anakin_##op_type__##_converter() { \ - LOG(INFO) << "register convert " << #op_type__; \ - ::paddle::inference::Registry< \ - ::paddle::inference::anakin::AnakinOpConverter>::Global() \ - .Register<::paddle::inference::anakin::Converter__>(#op_type__); \ - } \ - }; \ - anakin_##op_type__##_converter anakin_##op_type__##_converter__; \ - int TouchConverterRegister_anakin_##op_type__() { \ - anakin_##op_type__##_converter__.Touch(); \ - return 0; \ +#define REGISTER_ANAKIN_OP_CONVERTER_BASE(op_type__, Converter__, \ + place_type__, place_class__, \ + precision_type__, precision_class__) \ + struct anakin_##op_type__##_##place_type__##_##precision_type__##_converter \ + : public ::paddle::framework::Registrar { \ + anakin_##op_type__##_##place_type__##_##precision_type__##_converter() { \ + LOG(INFO) << "register convert " << #op_type__ << " "; \ + ::paddle::inference::Registry< \ + ::paddle::inference::anakin::AnakinOpConverter< \ + place_class__, precision_class__>>::Global() \ + .Register(#op_type__); \ + } \ + }; \ + anakin_##op_type__##_##place_type__##_##precision_type__##_converter \ + anakin_##op_type__##_##place_type__##_##precision_type__##_converter__; \ + int Touch_anakin_##op_type__##_##place_type__##_##precision_type__() { \ + anakin_##op_type__##_##place_type__##_##precision_type__##_converter__ \ + .Touch(); \ + return 0; \ } -#define USE_ANAKIN_CONVERTER(op_type__) \ - extern int TouchConverterRegister_anakin_##op_type__(); \ - int use_op_converter_anakin_##op_type__ __attribute__((unused)) = \ - TouchConverterRegister_anakin_##op_type__(); +#define WRAP(...) __VA_ARGS__ + +#define REGISTER_CUDA_ANAKIN_OP_CONVERTER(op_type__, Converter__, \ + precision_type__) \ + REGISTER_ANAKIN_OP_CONVERTER_BASE( \ + op_type__, \ + ::paddle::inference::anakin::Converter__, \ + CUDA, ::anakin::saber::NV, precision_type__, \ + ::anakin::Precision::precision_type__) + +#define REGISTER_CPU_ANAKIN_OP_CONVERTER(op_type__, Converter__, \ + precision_type__) \ + REGISTER_ANAKIN_OP_CONVERTER_BASE( \ + op_type__, \ + ::paddle::inference::anakin::Converter__, \ + CPU, ::anakin::saber::X86, precision_type__, \ + ::anakin::Precision::precision_type__) + +#ifdef PADDLE_WITH_CUDA +#define REGISTER_ANAKIN_OP_CONVERTER(op_type__, Converter__) \ + REGISTER_CUDA_ANAKIN_OP_CONVERTER(op_type__, Converter__, FP32); \ + REGISTER_CUDA_ANAKIN_OP_CONVERTER(op_type__, Converter__, INT8); \ + REGISTER_CPU_ANAKIN_OP_CONVERTER(op_type__, Converter__, FP32); \ + REGISTER_CPU_ANAKIN_OP_CONVERTER(op_type__, Converter__, INT8) +#else +#define REGISTER_ANAKIN_OP_CONVERTER(op_type__, Converter__) \ + REGISTER_CPU_ANAKIN_OP_CONVERTER(op_type__, Converter__, FP32); \ + REGISTER_CPU_ANAKIN_OP_CONVERTER(op_type__, Converter__, INT8) +#endif + +#define USE_ANAKIN_CONVERTER_BASE(op_type__, place_type__, precision_type__) \ + extern int Touch_anakin_##op_type__##_##place_type__##_##precision_type__(); \ + int use_converter_anakin_##op_type__##_##place_type__##_##precision_type__ \ + __attribute__((unused)) = \ + Touch_anakin_##op_type__##_##place_type__##_##precision_type__(); + +#define USE_ANAKIN_CONVERTER(op_type__) \ + USE_ANAKIN_CONVERTER_BASE(op_type__, CUDA, FP32) +#define USE_INT8_ANAKIN_CONVERTER(op_type__) \ + USE_ANAKIN_CONVERTER_BASE(op_type__, CUDA, INT8) + +#define USE_CPU_ANAKIN_CONVERTER(op_type__) \ + USE_ANAKIN_CONVERTER_BASE(op_type__, CPU, FP32) +#define USE_CPU_INT8_ANAKIN_CONVERTER(op_type__) \ + USE_ANAKIN_CONVERTER_BASE(op_type__, CPU, INT8) diff --git a/paddle/fluid/inference/anakin/convert/pool2d.cc b/paddle/fluid/inference/anakin/convert/pool2d.cc index 87eefe712a5ad2..11e7c717fd689b 100644 --- a/paddle/fluid/inference/anakin/convert/pool2d.cc +++ b/paddle/fluid/inference/anakin/convert/pool2d.cc @@ -17,23 +17,16 @@ #include #include -using anakin::graph::GraphGlobalMem; -using anakin::AK_FLOAT; -using anakin::Precision; -using anakin::saber::NV; -using anakin::saber::X86; -using anakin::saber::Shape; -using anakin::PBlock; using anakin::PTuple; namespace paddle { namespace inference { namespace anakin { -void Pool2dOpConverter::operator()(const framework::proto::OpDesc &op, - const framework::BlockDesc &block_desc, - const framework::Scope &scope, - bool test_mode) { +template +void Pool2dOpConverter::operator()( + const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc, + const framework::Scope &scope, bool test_mode) { framework::OpDesc op_desc(op, nullptr); PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1); PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1); @@ -65,13 +58,13 @@ void Pool2dOpConverter::operator()(const framework::proto::OpDesc &op, PADDLE_THROW("TensorRT unsupported pooling type!"); } - engine_->AddOp(op_name, "Pooling", {x_name}, {y_name}); - engine_->AddOpAttr>(op_name, "pool_size", ksize); - engine_->AddOpAttr>(op_name, "strides", strides); - engine_->AddOpAttr>(op_name, "padding", paddings); - engine_->AddOpAttr(op_name, "method", anakin_pool_type); - engine_->AddOpAttr(op_name, "global_pooling", global_pooling); - engine_->AddOpAttr(op_name, "cmp_out_shape_floor_as_conv", !ceil_mode); + this->engine_->AddOp(op_name, "Pooling", {x_name}, {y_name}); + this->engine_->template AddOpAttr>(op_name, "pool_size", ksize); + this->engine_->template AddOpAttr>(op_name, "strides", strides); + this->engine_->template AddOpAttr>(op_name, "padding", paddings); + this->engine_->AddOpAttr(op_name, "method", anakin_pool_type); + this->engine_->AddOpAttr(op_name, "global_pooling", global_pooling); + this->engine_->AddOpAttr(op_name, "cmp_out_shape_floor_as_conv", !ceil_mode); } } // namespace anakin diff --git a/paddle/fluid/inference/anakin/convert/pool2d.h b/paddle/fluid/inference/anakin/convert/pool2d.h index ec28e48ac848ef..7a06ff1b660a4c 100644 --- a/paddle/fluid/inference/anakin/convert/pool2d.h +++ b/paddle/fluid/inference/anakin/convert/pool2d.h @@ -20,7 +20,8 @@ namespace paddle { namespace inference { namespace anakin { -class Pool2dOpConverter : public AnakinOpConverter { +template +class Pool2dOpConverter : public AnakinOpConverter { public: Pool2dOpConverter() = default; diff --git a/paddle/fluid/inference/anakin/convert/relu.cc b/paddle/fluid/inference/anakin/convert/relu.cc index 993437d014b1f9..00853406634bdf 100644 --- a/paddle/fluid/inference/anakin/convert/relu.cc +++ b/paddle/fluid/inference/anakin/convert/relu.cc @@ -16,19 +16,30 @@ #include #include -using anakin::graph::GraphGlobalMem; -using anakin::AK_FLOAT; -using anakin::saber::NV; -using anakin::saber::Shape; - namespace paddle { namespace inference { namespace anakin { -void ReluOpConverter::operator()(const framework::proto::OpDesc &op, - const framework::BlockDesc &block_desc, - const framework::Scope &scope, - bool test_mode) { +template +void ReluOpConverter::operator()( + const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc, + const framework::Scope &scope, bool test_mode) { + framework::OpDesc op_desc(op, nullptr); + PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1); + PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1); + + auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front(); + auto input_name = op_desc.Input("X").front(); + auto output_name = op_desc.Output("Out").front(); + + this->engine_->AddOp(op_name, "ReLU", {input_name}, {output_name}); + this->engine_->AddOpAttr(op_name, "alpha", 0); +} + +template +void LeakyReluOpConverter::operator()( + const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc, + const framework::Scope &scope, bool test_mode) { framework::OpDesc op_desc(op, nullptr); PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1); PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1); @@ -37,8 +48,9 @@ void ReluOpConverter::operator()(const framework::proto::OpDesc &op, auto input_name = op_desc.Input("X").front(); auto output_name = op_desc.Output("Out").front(); - engine_->AddOp(op_name, "ReLU", {input_name}, {output_name}); - engine_->AddOpAttr(op_name, "alpha", 0); + float alpha = boost::get(op_desc.GetAttr("alpha")); + this->engine_->AddOp(op_name, "ReLU", {input_name}, {output_name}); + this->engine_->AddOpAttr(op_name, "alpha", alpha); } } // namespace anakin @@ -46,3 +58,4 @@ void ReluOpConverter::operator()(const framework::proto::OpDesc &op, } // namespace paddle REGISTER_ANAKIN_OP_CONVERTER(relu, ReluOpConverter); +REGISTER_ANAKIN_OP_CONVERTER(leaky_relu, LeakyReluOpConverter); diff --git a/paddle/fluid/inference/anakin/convert/relu.h b/paddle/fluid/inference/anakin/convert/relu.h index 6ede506511917c..f366f05a94ae93 100644 --- a/paddle/fluid/inference/anakin/convert/relu.h +++ b/paddle/fluid/inference/anakin/convert/relu.h @@ -22,7 +22,8 @@ namespace paddle { namespace inference { namespace anakin { -class ReluOpConverter : public AnakinOpConverter { +template +class ReluOpConverter : public AnakinOpConverter { public: ReluOpConverter() = default; @@ -33,6 +34,18 @@ class ReluOpConverter : public AnakinOpConverter { virtual ~ReluOpConverter() {} }; +template +class LeakyReluOpConverter : public AnakinOpConverter { + public: + LeakyReluOpConverter() = default; + + virtual void operator()(const framework::proto::OpDesc &op, + const framework::BlockDesc &block_desc, + const framework::Scope &scope, + bool test_mode) override; + virtual ~LeakyReluOpConverter() {} +}; + } // namespace anakin } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/anakin/convert/reshape.cc b/paddle/fluid/inference/anakin/convert/reshape.cc index 17e0a1acb5f4e0..d73736b7fecc75 100644 --- a/paddle/fluid/inference/anakin/convert/reshape.cc +++ b/paddle/fluid/inference/anakin/convert/reshape.cc @@ -15,20 +15,16 @@ #include "paddle/fluid/inference/anakin/convert/reshape.h" #include -using anakin::graph::GraphGlobalMem; -using anakin::AK_FLOAT; -using anakin::saber::NV; -using anakin::saber::Shape; using anakin::PTuple; namespace paddle { namespace inference { namespace anakin { -void ReshapeOpConverter::operator()(const framework::proto::OpDesc &op, - const framework::BlockDesc &block_desc, - const framework::Scope &scope, - bool test_mode) { +template +void ReshapeOpConverter::operator()( + const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc, + const framework::Scope &scope, bool test_mode) { framework::OpDesc op_desc(op, nullptr); PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1UL); PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1UL); @@ -37,13 +33,13 @@ void ReshapeOpConverter::operator()(const framework::proto::OpDesc &op, auto output = op_desc.Output("Out").front(); auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front(); - engine_->AddOp(op_name, "Reshape", {input}, {output}); + this->engine_->AddOp(op_name, "Reshape", {input}, {output}); auto shape = boost::get>(op_desc.GetAttr("shape")); if (shape.size() < 4) { shape.insert(shape.end(), 4 - shape.size(), 1); } - engine_->AddOpAttr>(op_name, "dims", shape); + this->engine_->template AddOpAttr>(op_name, "dims", shape); } } // namespace anakin diff --git a/paddle/fluid/inference/anakin/convert/reshape.h b/paddle/fluid/inference/anakin/convert/reshape.h index 9ce2ea2a4f3f88..88de2641e60f1a 100644 --- a/paddle/fluid/inference/anakin/convert/reshape.h +++ b/paddle/fluid/inference/anakin/convert/reshape.h @@ -20,7 +20,8 @@ namespace paddle { namespace inference { namespace anakin { -class ReshapeOpConverter : public AnakinOpConverter { +template +class ReshapeOpConverter : public AnakinOpConverter { public: ReshapeOpConverter() = default; diff --git a/paddle/fluid/inference/anakin/convert/roi_align.cc b/paddle/fluid/inference/anakin/convert/roi_align.cc new file mode 100644 index 00000000000000..8702f638e10bbf --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/roi_align.cc @@ -0,0 +1,54 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/anakin/convert/roi_align.h" +#include +#include + +namespace paddle { +namespace inference { +namespace anakin { + +template +void RoiAlignOpConverter::operator()( + const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc, + const framework::Scope &scope, bool test_mode) { + framework::OpDesc op_desc(op, nullptr); + PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1); + PADDLE_ENFORCE_EQ(op_desc.Input("ROIs").size(), 1); + PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1); + + auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front(); + auto input_x_name = op_desc.Input("X").front(); + auto input_rois_name = op_desc.Input("ROIs").front(); + auto output_name = op_desc.Output("Out").front(); + + auto spatial_scale = boost::get(op_desc.GetAttr("spatial_scale")); + auto pooled_height = boost::get(op_desc.GetAttr("pooled_height")); + auto pooled_width = boost::get(op_desc.GetAttr("pooled_width")); + auto sampling_ratio = boost::get(op_desc.GetAttr("sampling_ratio")); + + this->engine_->AddOp(op_name, "RoiAlign", {input_x_name, input_rois_name}, + {output_name}); + this->engine_->AddOpAttr(op_name, "spatial_scale", spatial_scale); + this->engine_->AddOpAttr(op_name, "pooled_height", pooled_height); + this->engine_->AddOpAttr(op_name, "pooled_width", pooled_width); + this->engine_->AddOpAttr(op_name, "sampling_ratio", sampling_ratio); +} + +} // namespace anakin +} // namespace inference +} // namespace paddle + +REGISTER_ANAKIN_OP_CONVERTER(roi_align, RoiAlignOpConverter); diff --git a/paddle/fluid/inference/anakin/convert/roi_align.h b/paddle/fluid/inference/anakin/convert/roi_align.h new file mode 100644 index 00000000000000..8b5d23a01676f0 --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/roi_align.h @@ -0,0 +1,39 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include "paddle/fluid/inference/anakin/convert/op_converter.h" + +namespace paddle { +namespace inference { +namespace anakin { + +template +class RoiAlignOpConverter : public AnakinOpConverter { + public: + RoiAlignOpConverter() = default; + + virtual void operator()(const framework::proto::OpDesc &op, + const framework::BlockDesc &block_desc, + const framework::Scope &scope, + bool test_mode) override; + virtual ~RoiAlignOpConverter() {} +}; + +} // namespace anakin +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/anakin/convert/scale.cc b/paddle/fluid/inference/anakin/convert/scale.cc index dd68af4f79a6d1..2559ec498c8ba4 100644 --- a/paddle/fluid/inference/anakin/convert/scale.cc +++ b/paddle/fluid/inference/anakin/convert/scale.cc @@ -16,19 +16,14 @@ #include #include -using anakin::graph::GraphGlobalMem; -using anakin::AK_FLOAT; -using anakin::saber::NV; -using anakin::saber::Shape; - namespace paddle { namespace inference { namespace anakin { -void ScaleOpConverter::operator()(const framework::proto::OpDesc &op, - const framework::BlockDesc &block_desc, - const framework::Scope &scope, - bool test_mode) { +template +void ScaleOpConverter::operator()( + const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc, + const framework::Scope &scope, bool test_mode) { framework::OpDesc op_desc(op, nullptr); PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1); PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1); @@ -44,10 +39,10 @@ void ScaleOpConverter::operator()(const framework::proto::OpDesc &op, PADDLE_ENFORCE(bias_after_scale, "The anakin scale layer only support bias after scale now."); - engine_->AddOp(op_name, "Power", {input_name}, {output_name}); - engine_->AddOpAttr(op_name, "shift", bias); - engine_->AddOpAttr(op_name, "scale", scale); - engine_->AddOpAttr(op_name, "power", static_cast(1.0)); + this->engine_->AddOp(op_name, "Power", {input_name}, {output_name}); + this->engine_->AddOpAttr(op_name, "shift", bias); + this->engine_->AddOpAttr(op_name, "scale", scale); + this->engine_->AddOpAttr(op_name, "power", static_cast(1.0)); } } // namespace anakin diff --git a/paddle/fluid/inference/anakin/convert/scale.h b/paddle/fluid/inference/anakin/convert/scale.h index ba3bcdd21494a4..f19a9201934971 100644 --- a/paddle/fluid/inference/anakin/convert/scale.h +++ b/paddle/fluid/inference/anakin/convert/scale.h @@ -22,7 +22,8 @@ namespace paddle { namespace inference { namespace anakin { -class ScaleOpConverter : public AnakinOpConverter { +template +class ScaleOpConverter : public AnakinOpConverter { public: ScaleOpConverter() = default; diff --git a/paddle/fluid/inference/anakin/convert/softmax.cc b/paddle/fluid/inference/anakin/convert/softmax.cc index a6c1e971b16fa7..a4dc5a9156b8f5 100644 --- a/paddle/fluid/inference/anakin/convert/softmax.cc +++ b/paddle/fluid/inference/anakin/convert/softmax.cc @@ -14,19 +14,14 @@ #include "paddle/fluid/inference/anakin/convert/softmax.h" -using anakin::graph::GraphGlobalMem; -using anakin::AK_FLOAT; -using anakin::saber::NV; -using anakin::saber::Shape; - namespace paddle { namespace inference { namespace anakin { -void SoftMaxOpConverter::operator()(const framework::proto::OpDesc &op, - const framework::BlockDesc &block_desc, - const framework::Scope &scope, - bool test_mode) { +template +void SoftMaxOpConverter::operator()( + const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc, + const framework::Scope &scope, bool test_mode) { framework::OpDesc op_desc(op, nullptr); PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1UL); @@ -41,8 +36,8 @@ void SoftMaxOpConverter::operator()(const framework::proto::OpDesc &op, auto input_shape_in_fluid = input_var_desc->GetShape(); size_t input_dims = input_shape_in_fluid.size(); - engine_->AddOp(op_name, "Softmax", {input}, {output}); - engine_->AddOpAttr(op_name, "axis", static_cast(input_dims - 1)); + this->engine_->AddOp(op_name, "Softmax", {input}, {output}); + this->engine_->AddOpAttr(op_name, "axis", static_cast(input_dims - 1)); } } // namespace anakin diff --git a/paddle/fluid/inference/anakin/convert/softmax.h b/paddle/fluid/inference/anakin/convert/softmax.h index a16356d5bb61ac..dc431b5b867a26 100644 --- a/paddle/fluid/inference/anakin/convert/softmax.h +++ b/paddle/fluid/inference/anakin/convert/softmax.h @@ -20,7 +20,8 @@ namespace paddle { namespace inference { namespace anakin { -class SoftMaxOpConverter : public AnakinOpConverter { +template +class SoftMaxOpConverter : public AnakinOpConverter { public: SoftMaxOpConverter() = default; diff --git a/paddle/fluid/inference/anakin/convert/split.cc b/paddle/fluid/inference/anakin/convert/split.cc index ec582c1812623c..e63edea94ae010 100644 --- a/paddle/fluid/inference/anakin/convert/split.cc +++ b/paddle/fluid/inference/anakin/convert/split.cc @@ -16,23 +16,16 @@ #include #include -using anakin::graph::GraphGlobalMem; -using anakin::AK_FLOAT; -using anakin::Precision; -using anakin::saber::NV; -using anakin::saber::X86; -using anakin::saber::Shape; -using anakin::PBlock; using anakin::PTuple; namespace paddle { namespace inference { namespace anakin { -void SplitOpConverter::operator()(const framework::proto::OpDesc &op, - const framework::BlockDesc &block_desc, - const framework::Scope &scope, - bool test_mode) { +template +void SplitOpConverter::operator()( + const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc, + const framework::Scope &scope, bool test_mode) { framework::OpDesc op_desc(op, nullptr); auto input_name = op_desc.Input("X").front(); auto y_names = op_desc.Output("Out"); @@ -51,14 +44,16 @@ void SplitOpConverter::operator()(const framework::proto::OpDesc &op, num_sum += output_lengths[i]; slice_point.push_back(num_sum); } - engine_->AddOp(op_name, "Slice", {input_name}, y_names); - engine_->AddOpAttr(op_name, "axis", axis); - engine_->AddOpAttr>(op_name, "slice_point", slice_point); + this->engine_->AddOp(op_name, "Slice", {input_name}, y_names); + this->engine_->AddOpAttr(op_name, "axis", axis); + this->engine_->template AddOpAttr>(op_name, "slice_point", + slice_point); // slice_dim is useless in anakin - engine_->AddOpAttr(op_name, "slice_dim", 4); + this->engine_->AddOpAttr(op_name, "slice_dim", 4); } } // namespace anakin } // namespace inference } // namespace paddle + REGISTER_ANAKIN_OP_CONVERTER(split, SplitOpConverter); diff --git a/paddle/fluid/inference/anakin/convert/split.h b/paddle/fluid/inference/anakin/convert/split.h index 184112e589e2bb..819915315d90a5 100644 --- a/paddle/fluid/inference/anakin/convert/split.h +++ b/paddle/fluid/inference/anakin/convert/split.h @@ -20,7 +20,8 @@ namespace paddle { namespace inference { namespace anakin { -class SplitOpConverter : public AnakinOpConverter { +template +class SplitOpConverter : public AnakinOpConverter { public: SplitOpConverter() = default; diff --git a/paddle/fluid/inference/anakin/convert/sum.cc b/paddle/fluid/inference/anakin/convert/sum.cc index 2a4178e2371389..870c0793409037 100644 --- a/paddle/fluid/inference/anakin/convert/sum.cc +++ b/paddle/fluid/inference/anakin/convert/sum.cc @@ -17,22 +17,16 @@ #include #include -using anakin::graph::GraphGlobalMem; -using anakin::AK_FLOAT; -using anakin::Precision; -using anakin::saber::NV; -using anakin::saber::X86; -using anakin::saber::Shape; -using anakin::PBlock; using anakin::PTuple; namespace paddle { namespace inference { namespace anakin { -void SumOpConverter::operator()(const framework::proto::OpDesc &op, - const framework::BlockDesc &block_desc, - const framework::Scope &scope, bool test_mode) { +template +void SumOpConverter::operator()( + const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc, + const framework::Scope &scope, bool test_mode) { framework::OpDesc op_desc(op, nullptr); PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 2); PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1); @@ -43,9 +37,10 @@ void SumOpConverter::operator()(const framework::proto::OpDesc &op, std::vector coeff = {1, 1}; std::string elementwise_type = "Add"; - engine_->AddOp(op_name, "Eltwise", input_names, {out_name}); - engine_->AddOpAttr>(op_name, "coeff", coeff); - engine_->AddOpAttr(op_name, "type", elementwise_type); + this->engine_->AddOp(op_name, "Eltwise", input_names, {out_name}); + this->engine_->template AddOpAttr>(op_name, "coeff", coeff); + this->engine_->template AddOpAttr(op_name, "type", + elementwise_type); } } // namespace anakin diff --git a/paddle/fluid/inference/anakin/convert/sum.h b/paddle/fluid/inference/anakin/convert/sum.h index b5d402b77fcf55..aefc64c623e916 100644 --- a/paddle/fluid/inference/anakin/convert/sum.h +++ b/paddle/fluid/inference/anakin/convert/sum.h @@ -20,7 +20,8 @@ namespace paddle { namespace inference { namespace anakin { -class SumOpConverter : public AnakinOpConverter { +template +class SumOpConverter : public AnakinOpConverter { public: SumOpConverter() = default; diff --git a/paddle/fluid/inference/anakin/convert/test_activation_op.cc b/paddle/fluid/inference/anakin/convert/test_activation_op.cc index 8bedd4a749a645..4f898252d27980 100644 --- a/paddle/fluid/inference/anakin/convert/test_activation_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_activation_op.cc @@ -21,12 +21,14 @@ namespace paddle { namespace inference { namespace anakin { -static void test_activation_op(const std::string &op_type) { - auto *converter = Registry::Global().Lookup(op_type); - PADDLE_ENFORCE(converter != nullptr); +template +static void test_activation_op(const std::string& op_type, + const platform::DeviceContext& context, + bool use_gpu) { std::unordered_set parameters; framework::Scope scope; - AnakinConvertValidation validator(parameters, &scope); + AnakinConvertValidation validator( + parameters, &scope, context, use_gpu); validator.DeclInputVar("act-X", {10, 6, 1, 1}); validator.DeclOutputVar("act-Out", {10, 6, 1, 1}); framework::OpDesc desc; @@ -34,6 +36,14 @@ static void test_activation_op(const std::string &op_type) { desc.SetInput("X", {"act-X"}); desc.SetOutput("Out", {"act-Out"}); + if (op_type == "swish") { + desc.SetAttr("beta", 1.0f); + } + + if (op_type == "relu6") { + desc.SetAttr("threshold", 6.0f); + } + LOG(INFO) << "set OP"; validator.SetOp(*desc.Proto()); LOG(INFO) << "execute"; @@ -41,13 +51,74 @@ static void test_activation_op(const std::string &op_type) { validator.Execute(5); } -TEST(sigm_op, test) { test_activation_op("sigmoid"); } -TEST(tanh_op, test) { test_activation_op("tanh"); } +#ifdef PADDLE_WITH_CUDA +TEST(sigm_op, gpu) { + platform::CUDAPlace gpu_place(0); + platform::CUDADeviceContext ctx(gpu_place); + test_activation_op<::anakin::saber::NV>("sigmoid", ctx, true); +} + +TEST(tanh_op, gpu) { + platform::CUDAPlace gpu_place(0); + platform::CUDADeviceContext ctx(gpu_place); + test_activation_op<::anakin::saber::NV>("tanh", ctx, true); +} + +TEST(relu6_op, gpu) { + platform::CUDAPlace gpu_place(0); + platform::CUDADeviceContext ctx(gpu_place); + test_activation_op<::anakin::saber::NV>("relu6", ctx, true); +} + +TEST(swish_op, gpu) { + platform::CUDAPlace gpu_place(0); + platform::CUDADeviceContext ctx(gpu_place); + test_activation_op<::anakin::saber::NV>("swish", ctx, true); +} +#endif + +/* +TEST(sigm_op, cpu) { + platform::CPUPlace cpu_place; + platform::CPUDeviceContext ctx(cpu_place); + test_activation_op<::anakin::saber::X86>("sigmoid", ctx, false); +} + +TEST(tanh_op, cpu) { + platform::CPUPlace cpu_place; + platform::CPUDeviceContext ctx(cpu_place); + test_activation_op<::anakin::saber::X86>("tanh", ctx, false); +} + +TEST(relu6_op, cpu) { + platform::CPUPlace cpu_place; + platform::CPUDeviceContext ctx(cpu_place); + test_activation_op<::anakin::saber::X86>("relu6", ctx, false); +} + +TEST(swish_op, cpu) { + platform::CPUPlace cpu_place; + platform::CPUDeviceContext ctx(cpu_place); + test_activation_op<::anakin::saber::X86>("swish", ctx, false); +} +*/ + } // namespace anakin } // namespace inference } // namespace paddle USE_OP(sigmoid); USE_OP(tanh); +USE_OP(relu6); +USE_OP(swish); + +USE_CPU_ANAKIN_CONVERTER(sigmoid); +USE_CPU_ANAKIN_CONVERTER(tanh); +USE_CPU_ANAKIN_CONVERTER(relu6); +USE_CPU_ANAKIN_CONVERTER(swish); +#ifdef PADDLE_WITH_CUDA USE_ANAKIN_CONVERTER(sigmoid); USE_ANAKIN_CONVERTER(tanh); +USE_ANAKIN_CONVERTER(relu6); +USE_ANAKIN_CONVERTER(swish); +#endif diff --git a/paddle/fluid/inference/anakin/convert/test_affine_channel_op.cc b/paddle/fluid/inference/anakin/convert/test_affine_channel_op.cc new file mode 100644 index 00000000000000..f6399387aa264d --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/test_affine_channel_op.cc @@ -0,0 +1,75 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/inference/anakin/convert/affine_channel.h" +#include "paddle/fluid/inference/anakin/convert/op_converter.h" +#include "paddle/fluid/inference/anakin/convert/ut_helper.h" + +namespace paddle { +namespace inference { +namespace anakin { + +template +void test_affine_channel_op(const platform::DeviceContext& context, + bool use_gpu) { + // Declare the difference between the inputs. + std::unordered_set parameters({"scale", "bias"}); + + framework::Scope scope; + AnakinConvertValidation validator( + parameters, &scope, context, use_gpu); + validator.DeclInputVar("x", {1, 3, 5, 2}); + validator.DeclOutputVar("out", {1, 3, 5, 2}); + validator.DeclParamVar("scale", {3}); + validator.DeclParamVar("bias", {3}); + + // Prepare Op descriptions. + framework::OpDesc desc; + desc.SetType("affine_channel"); + desc.SetInput("X", {"x"}); + desc.SetInput("Bias", {"bias"}); + desc.SetInput("Scale", {"scale"}); + desc.SetOutput("Out", {"out"}); + + // Layout must be explicitly specified here as NCHW. + desc.SetAttr("data_layout", std::string("NCHW")); + + validator.SetOp(*desc.Proto()); + validator.Execute(1); +} + +#ifdef PADDLE_WITH_CUDA +TEST(affine_channel_op, gpu) { + platform::CUDAPlace gpu_place(0); + platform::CUDADeviceContext ctx(gpu_place); + test_affine_channel_op<::anakin::saber::NV>(ctx, true); +} +#endif + +TEST(affine_channel_op, cpu) { + platform::CPUPlace cpu_place; + platform::CPUDeviceContext ctx(cpu_place); + test_affine_channel_op<::anakin::saber::X86>(ctx, false); +} + +} // namespace anakin +} // namespace inference +} // namespace paddle + +USE_OP(affine_channel); +USE_CPU_ANAKIN_CONVERTER(affine_channel); +#ifdef PADDLE_WITH_CUDA +USE_ANAKIN_CONVERTER(affine_channel); +#endif diff --git a/paddle/fluid/inference/anakin/convert/test_batch_norm_op.cc b/paddle/fluid/inference/anakin/convert/test_batch_norm_op.cc index 2832e1c8d167c6..c008ef1bd5ee25 100644 --- a/paddle/fluid/inference/anakin/convert/test_batch_norm_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_batch_norm_op.cc @@ -19,12 +19,14 @@ namespace paddle { namespace inference { namespace anakin { -TEST(batch_norm_op, test) { +template +void test_batchnorm_op(const platform::DeviceContext& context, bool use_gpu) { std::unordered_set parameters( {"batch_norm_scale", "batch_norm_bias", "batch_norm_mean", "batch_norm_variance"}); framework::Scope scope; - AnakinConvertValidation validator(parameters, &scope); + AnakinConvertValidation validator( + parameters, &scope, context, use_gpu); std::vector param_shape{2}; validator.DeclInputVar("batch_norm_X", {1, 2, 5, 5}); @@ -64,8 +66,26 @@ TEST(batch_norm_op, test) { validator.Execute(1, neglected_output); } +#ifdef PADDLE_WITH_CUDA +TEST(batch_norm_op, gpu) { + platform::CUDAPlace gpu_place(0); + platform::CUDADeviceContext ctx(gpu_place); + test_batchnorm_op<::anakin::saber::NV>(ctx, true); +} +#endif + +TEST(batch_norm_op, cpu) { + platform::CPUPlace cpu_place; + platform::CPUDeviceContext ctx(cpu_place); + test_batchnorm_op<::anakin::saber::X86>(ctx, false); +} + } // namespace anakin } // namespace inference } // namespace paddle USE_OP(batch_norm); +USE_CPU_ANAKIN_CONVERTER(batch_norm); + +#ifdef PADDLE_WITH_CUDA USE_ANAKIN_CONVERTER(batch_norm); +#endif diff --git a/paddle/fluid/inference/anakin/convert/test_concat_op.cc b/paddle/fluid/inference/anakin/convert/test_concat_op.cc index ecf44def5a2429..42dfbeb5cdc406 100644 --- a/paddle/fluid/inference/anakin/convert/test_concat_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_concat_op.cc @@ -21,10 +21,12 @@ namespace paddle { namespace inference { namespace anakin { -TEST(concat_op, test) { +template +void test_concat_op(const platform::DeviceContext& context, bool use_gpu) { std::unordered_set parameters({""}); framework::Scope scope; - AnakinConvertValidation validator(parameters, &scope); + AnakinConvertValidation validator( + parameters, &scope, context, use_gpu); validator.DeclInputVar("concat_x1", {1, 2, 1, 1}); validator.DeclInputVar("concat_x2", {1, 3, 1, 1}); validator.DeclInputVar("concat_x3", {1, 1, 1, 1}); @@ -44,31 +46,26 @@ TEST(concat_op, test) { validator.Execute(1); } -TEST(concat_op, test2) { - std::unordered_set parameters({""}); - framework::Scope scope; - AnakinConvertValidation validator(parameters, &scope); - validator.DeclInputVar("concat_x1", {1, 4}); - validator.DeclInputVar("concat_x2", {3, 4}); - validator.DeclInputVar("concat_x3", {2, 4}); - validator.DeclOutputVar("concat_out", {6, 4}); - - // Prepare Op description - framework::OpDesc desc; - desc.SetType("concat"); - desc.SetInput("X", {"concat_x1", "concat_x2", "concat_x3"}); - desc.SetOutput("Out", {"concat_out"}); - - int axis = 0; - desc.SetAttr("axis", axis); - - validator.SetOp(*desc.Proto()); +#ifdef PADDLE_WITH_CUDA +TEST(concat_op, gpu) { + platform::CUDAPlace gpu_place(0); + platform::CUDADeviceContext ctx(gpu_place); + test_concat_op<::anakin::saber::NV>(ctx, true); +} +#endif - validator.Execute(1); +TEST(concat_op, cpu) { + platform::CPUPlace cpu_place; + platform::CPUDeviceContext ctx(cpu_place); + test_concat_op<::anakin::saber::X86>(ctx, false); } } // namespace anakin } // namespace inference } // namespace paddle USE_OP(concat); +USE_CPU_ANAKIN_CONVERTER(concat); + +#ifdef PADDLE_WITH_CUDA USE_ANAKIN_CONVERTER(concat); +#endif diff --git a/paddle/fluid/inference/anakin/convert/test_conv2d_op.cc b/paddle/fluid/inference/anakin/convert/test_conv2d_op.cc index 6d93e50bc96b08..e95e11c4f96881 100644 --- a/paddle/fluid/inference/anakin/convert/test_conv2d_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_conv2d_op.cc @@ -21,13 +21,12 @@ namespace paddle { namespace inference { namespace anakin { -TEST(conv2d_op, test) { - auto* conv2d_converter = - Registry::Global().Lookup("conv2d"); - ASSERT_TRUE(conv2d_converter != nullptr); +template +void test_conv2d_op(const platform::DeviceContext& context, bool use_gpu) { std::unordered_set parameters({"conv2d-Y"}); framework::Scope scope; - AnakinConvertValidation validator(parameters, &scope); + AnakinConvertValidation validator( + parameters, &scope, context, use_gpu); validator.DeclInputVar("conv2d-X", {1, 3, 3, 3}); validator.DeclParamVar("conv2d-Y", {4, 3, 1, 1}); validator.DeclOutputVar("conv2d-Out", {1, 4, 3, 3}); @@ -54,9 +53,27 @@ TEST(conv2d_op, test) { validator.Execute(3); } +#ifdef PADDLE_WITH_CUDA +TEST(conv2d_op, gpu) { + platform::CUDAPlace gpu_place(0); + platform::CUDADeviceContext ctx(gpu_place); + test_conv2d_op<::anakin::saber::NV>(ctx, true); +} +#endif + +TEST(conv2d_op, cpu) { + platform::CPUPlace cpu_place; + platform::CPUDeviceContext ctx(cpu_place); + test_conv2d_op<::anakin::saber::X86>(ctx, false); +} + } // namespace anakin } // namespace inference } // namespace paddle USE_OP(conv2d); +USE_CPU_ANAKIN_CONVERTER(conv2d); + +#ifdef PADDLE_WITH_CUDA USE_ANAKIN_CONVERTER(conv2d); +#endif diff --git a/paddle/fluid/inference/anakin/convert/test_dropout_op.cc b/paddle/fluid/inference/anakin/convert/test_dropout_op.cc index b2de5ae0a6e58e..ae27e27ded5d92 100644 --- a/paddle/fluid/inference/anakin/convert/test_dropout_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_dropout_op.cc @@ -21,10 +21,12 @@ namespace paddle { namespace inference { namespace anakin { -TEST(dropout_op, native) { +template +void test_dropout_op(const platform::DeviceContext& context, bool use_gpu) { std::unordered_set parameters; framework::Scope scope; - AnakinConvertValidation validator(parameters, &scope); + AnakinConvertValidation validator( + parameters, &scope, context, use_gpu); validator.DeclInputVar("x", {1, 1, 2, 2}); validator.DeclOutputVar("out", {1, 1, 2, 2}); validator.DeclOutputVar("mask", {1, 1, 2, 2}); @@ -45,9 +47,26 @@ TEST(dropout_op, native) { validator.Execute(1, neglected_output); } +#ifdef PADDLE_WITH_CUDA +TEST(dropout_op, gpu) { + platform::CUDAPlace gpu_place(0); + platform::CUDADeviceContext ctx(gpu_place); + test_dropout_op<::anakin::saber::NV>(ctx, true); +} +#endif + +TEST(dropout_op, cpu) { + platform::CPUPlace cpu_place; + platform::CPUDeviceContext ctx(cpu_place); + test_dropout_op<::anakin::saber::X86>(ctx, false); +} + } // namespace anakin } // namespace inference } // namespace paddle USE_OP(dropout); +USE_CPU_ANAKIN_CONVERTER(dropout); +#ifdef PADDLE_WITH_CUDA USE_ANAKIN_CONVERTER(dropout); +#endif diff --git a/paddle/fluid/inference/anakin/convert/test_elementwise_op.cc b/paddle/fluid/inference/anakin/convert/test_elementwise_op.cc index 3a437f5fdb5656..bff75294908aab 100644 --- a/paddle/fluid/inference/anakin/convert/test_elementwise_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_elementwise_op.cc @@ -21,10 +21,14 @@ namespace paddle { namespace inference { namespace anakin { -static void test_elementwise_op(const std::string &op_type) { +template +static void test_elementwise_op(const std::string& op_type, + const platform::DeviceContext& context, + bool use_gpu) { std::unordered_set parameters; framework::Scope scope; - AnakinConvertValidation validator(parameters, &scope); + AnakinConvertValidation validator( + parameters, &scope, context, use_gpu); validator.DeclInputVar("x", {1, 1, 2, 2}); validator.DeclInputVar("y", {1, 1, 2, 2}); validator.DeclOutputVar("out", {1, 1, 2, 2}); @@ -43,14 +47,41 @@ static void test_elementwise_op(const std::string &op_type) { validator.Execute(1); } -TEST(elementwise_op, native_add) { test_elementwise_op("elementwise_add"); } -TEST(elementwise_op, native_mul) { test_elementwise_op("elementwise_mul"); } +#ifdef PADDLE_WITH_CUDA +TEST(elementwise_op, native_add_gpu) { + platform::CUDAPlace gpu_place(0); + platform::CUDADeviceContext ctx(gpu_place); + test_elementwise_op<::anakin::saber::NV>("elementwise_add", ctx, true); +} +TEST(elementwise_op, native_mul_gpu) { + platform::CUDAPlace gpu_place(0); + platform::CUDADeviceContext ctx(gpu_place); + test_elementwise_op<::anakin::saber::NV>("elementwise_mul", ctx, true); +} +#endif + +TEST(elementwise_op, native_add_cpu) { + platform::CPUPlace cpu_place; + platform::CPUDeviceContext ctx(cpu_place); + test_elementwise_op<::anakin::saber::X86>("elementwise_add", ctx, false); +} + +TEST(elementwise_op, native_mul_cpu) { + platform::CPUPlace cpu_place; + platform::CPUDeviceContext ctx(cpu_place); + test_elementwise_op<::anakin::saber::X86>("elementwise_mul", ctx, false); +} } // namespace anakin } // namespace inference } // namespace paddle USE_OP(elementwise_add); -USE_ANAKIN_CONVERTER(elementwise_add); USE_OP(elementwise_mul); +#ifdef PADDLE_WITH_CUDA +USE_ANAKIN_CONVERTER(elementwise_add); USE_ANAKIN_CONVERTER(elementwise_mul); +#endif + +USE_CPU_ANAKIN_CONVERTER(elementwise_add); +USE_CPU_ANAKIN_CONVERTER(elementwise_mul); diff --git a/paddle/fluid/inference/anakin/convert/test_fc_op.cc b/paddle/fluid/inference/anakin/convert/test_fc_op.cc index ee6d1dc291fe37..a24c809c022132 100644 --- a/paddle/fluid/inference/anakin/convert/test_fc_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_fc_op.cc @@ -20,13 +20,13 @@ namespace paddle { namespace inference { namespace anakin { -TEST(fc_op, test) { - auto* fc_converter = Registry::Global().Lookup("fc"); - ASSERT_TRUE(fc_converter); - +template +void test_mul_op(const platform::DeviceContext& context, bool use_gpu) { std::unordered_set parameters({"mul_y"}); framework::Scope scope; - AnakinConvertValidation validator(parameters, &scope); + + AnakinConvertValidation validator( + parameters, &scope, context, use_gpu); validator.DeclInputVar("mul_x", {1, 1, 2, 2}); validator.DeclParamVar("mul_y", {4, 2}); validator.DeclOutputVar("mul_out", {1, 2}); @@ -42,9 +42,26 @@ TEST(fc_op, test) { validator.Execute(10); } +#ifdef PADDLE_WITH_CUDA +TEST(mul_op, gpu) { + platform::CUDAPlace gpu_place(0); + platform::CUDADeviceContext ctx(gpu_place); + test_mul_op<::anakin::saber::NV>(ctx, true); +} +#endif + +TEST(mul_op, cpu) { + platform::CPUPlace cpu_place; + platform::CPUDeviceContext ctx(cpu_place); + test_mul_op<::anakin::saber::X86>(ctx, false); +} + } // namespace anakin } // namespace inference } // namespace paddle USE_OP(mul); +USE_CPU_ANAKIN_CONVERTER(fc); +#ifdef PADDLE_WITH_CUDA USE_ANAKIN_CONVERTER(fc); +#endif diff --git a/paddle/fluid/inference/anakin/convert/test_flatten_op.cc b/paddle/fluid/inference/anakin/convert/test_flatten_op.cc index d13281f11f03fd..5765f5ebd1f2a0 100644 --- a/paddle/fluid/inference/anakin/convert/test_flatten_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_flatten_op.cc @@ -20,13 +20,12 @@ namespace paddle { namespace inference { namespace anakin { -TEST(flatten_op, test) { - auto *converter = Registry::Global().Lookup("flatten"); - ASSERT_TRUE(converter); - +template +void test_flatten_op(const platform::DeviceContext& context, bool use_gpu) { std::unordered_set parameters; framework::Scope scope; - AnakinConvertValidation validator(parameters, &scope); + AnakinConvertValidation validator( + parameters, &scope, context, use_gpu); validator.DeclInputVar("flatten-X", {3, 10, 10, 4}); validator.DeclOutputVar("flatten-Out", {3, 400, 1, 1}); framework::OpDesc desc; @@ -42,10 +41,27 @@ TEST(flatten_op, test) { validator.Execute(5); } +#ifdef PADDLE_WITH_CUDA +TEST(flatten_op, gpu) { + platform::CUDAPlace gpu_place(0); + platform::CUDADeviceContext ctx(gpu_place); + test_flatten_op<::anakin::saber::NV>(ctx, true); +} +#endif + +TEST(flatten_op, cpu) { + platform::CPUPlace cpu_place; + platform::CPUDeviceContext ctx(cpu_place); + test_flatten_op<::anakin::saber::X86>(ctx, false); +} + } // namespace anakin } // namespace inference } // namespace paddle USE_OP(reshape); USE_OP_ITSELF(flatten); +USE_CPU_ANAKIN_CONVERTER(flatten); +#ifdef PADDLE_WITH_CUDA USE_ANAKIN_CONVERTER(flatten); +#endif diff --git a/paddle/fluid/inference/anakin/convert/test_pool2d_op.cc b/paddle/fluid/inference/anakin/convert/test_pool2d_op.cc index 1ac01946772160..90503b1fbba81e 100644 --- a/paddle/fluid/inference/anakin/convert/test_pool2d_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_pool2d_op.cc @@ -19,15 +19,14 @@ namespace paddle { namespace inference { namespace anakin { -void test_pool2d(bool global_pooling, bool ceil_mode, +template +void test_pool2d(const platform::DeviceContext& context, bool use_gpu, + bool global_pooling, bool ceil_mode, std::string pool_type = "max") { - auto* pool2d_converter = - Registry::Global().Lookup("pool2d"); - ASSERT_TRUE(pool2d_converter); - framework::Scope scope; std::unordered_set parameters; - AnakinConvertValidation validator(parameters, &scope); + AnakinConvertValidation validator( + parameters, &scope, context, use_gpu); // The ITensor's Dims should not contain the batch size. // So, the ITensor's Dims of input and output should be C * H * W. @@ -64,56 +63,61 @@ void test_pool2d(bool global_pooling, bool ceil_mode, validator.Execute(1); } -void test_pool2d2(bool global_pooling, bool ceil_mode, - std::string pool_type = "max") { - auto* pool2d_converter = - Registry::Global().Lookup("pool2d"); - ASSERT_TRUE(pool2d_converter); - - framework::Scope scope; - std::unordered_set parameters; - AnakinConvertValidation validator(parameters, &scope); - - // The ITensor's Dims should not contain the batch size. - // So, the ITensor's Dims of input and output should be C * H * W. - validator.DeclInputVar("pool2d_x", {1, 1, 17, 17}); - validator.DeclOutputVar("pool2d_out", {1, 1, 17, 17}); - - // Prepare Op description - framework::OpDesc desc; - desc.SetType("pool2d"); - desc.SetInput("X", {"pool2d_x"}); - desc.SetOutput("Out", {"pool2d_out"}); - - std::vector ksize({3, 3}); - std::vector strides({1, 1}); - std::vector paddings({1, 1}); - std::string pooling_t = pool_type; +#ifdef PADDLE_WITH_CUDA +TEST(Pool2dOpConverter, normal) { + platform::CUDAPlace gpu_place(0); + platform::CUDADeviceContext ctx(gpu_place); + test_pool2d<::anakin::saber::NV>(ctx, true, false, false); +} +TEST(Pool2dOpConverter, test_global_pooling) { + platform::CUDAPlace gpu_place(0); + platform::CUDADeviceContext ctx(gpu_place); + test_pool2d<::anakin::saber::NV>(ctx, true, true, false); +} - desc.SetAttr("pooling_type", pooling_t); - desc.SetAttr("ksize", ksize); - desc.SetAttr("strides", strides); - desc.SetAttr("paddings", paddings); - desc.SetAttr("global_pooling", global_pooling); - desc.SetAttr("ceil_mode", true); +TEST(Pool2dOpConverter, max_ceil_test) { + platform::CUDAPlace gpu_place(0); + platform::CUDADeviceContext ctx(gpu_place); + test_pool2d<::anakin::saber::NV>(ctx, true, false, true); +} - LOG(INFO) << "set OP"; - validator.SetOp(*desc.Proto()); - LOG(INFO) << "execute"; +TEST(Pool2dOpConverter, avg_ceil_test) { + platform::CUDAPlace gpu_place(0); + platform::CUDADeviceContext ctx(gpu_place); + test_pool2d<::anakin::saber::NV>(ctx, true, false, true, "avg"); +} +#endif - validator.Execute(1); +TEST(Pool2dOpConverter, normal_cpu) { + platform::CPUPlace cpu_place; + platform::CPUDeviceContext ctx(cpu_place); + test_pool2d<::anakin::saber::X86>(ctx, false, false, false); +} +TEST(Pool2dOpConverter, test_global_pooling_cpu) { + platform::CPUPlace cpu_place; + platform::CPUDeviceContext ctx(cpu_place); + test_pool2d<::anakin::saber::X86>(ctx, false, true, false); } -TEST(Pool2dOpConverter, normal) { test_pool2d(false, false); } -TEST(Pool2dOpConverter, test_global_pooling) { test_pool2d(true, false); } +TEST(Pool2dOpConverter, max_ceil_test_cpu) { + platform::CPUPlace cpu_place; + platform::CPUDeviceContext ctx(cpu_place); + test_pool2d<::anakin::saber::X86>(ctx, false, false, true); +} -TEST(Pool2dOpConverter, max_ceil_test) { test_pool2d(false, true); } -TEST(Pool2dOpConverter, avg_ceil_test) { test_pool2d(false, true, "avg"); } -TEST(Pool2dOpConverter, avg_ceil_test2) { test_pool2d2(false, true, "avg"); } +TEST(Pool2dOpConverter, avg_ceil_test_cpu) { + platform::CPUPlace cpu_place; + platform::CPUDeviceContext ctx(cpu_place); + test_pool2d<::anakin::saber::X86>(ctx, false, false, true, "avg"); +} } // namespace anakin } // namespace inference } // namespace paddle USE_OP(pool2d); +USE_CPU_ANAKIN_CONVERTER(pool2d); + +#ifdef PADDLE_WITH_CUDA USE_ANAKIN_CONVERTER(pool2d); +#endif diff --git a/paddle/fluid/inference/anakin/convert/test_relu_op.cc b/paddle/fluid/inference/anakin/convert/test_relu_op.cc index 04e624518a5a44..3f224796519650 100644 --- a/paddle/fluid/inference/anakin/convert/test_relu_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_relu_op.cc @@ -21,18 +21,23 @@ namespace paddle { namespace inference { namespace anakin { -static void test_activation_op(const std::string &op_type) { - auto *converter = Registry::Global().Lookup(op_type); - PADDLE_ENFORCE(converter != nullptr); +template +static void test_activation_op(const std::string& op_type, + const platform::DeviceContext& context, + bool use_gpu) { std::unordered_set parameters; framework::Scope scope; - AnakinConvertValidation validator(parameters, &scope); + AnakinConvertValidation validator( + parameters, &scope, context, use_gpu); validator.DeclInputVar("act-X", {10, 6, 1, 1}); validator.DeclOutputVar("act-Out", {10, 6, 1, 1}); framework::OpDesc desc; desc.SetType(op_type); desc.SetInput("X", {"act-X"}); desc.SetOutput("Out", {"act-Out"}); + if (op_type == "leaky_relu") { + desc.SetAttr("alpha", 0.1f); + } LOG(INFO) << "set OP"; validator.SetOp(*desc.Proto()); @@ -41,10 +46,30 @@ static void test_activation_op(const std::string &op_type) { validator.Execute(5); } -TEST(sigm_op, test) { test_activation_op("relu"); } +#ifdef PADDLE_WITH_CUDA +TEST(relu_op, gpu) { + platform::CUDAPlace gpu_place(0); + platform::CUDADeviceContext ctx(gpu_place); + test_activation_op<::anakin::saber::NV>("relu", ctx, true); +} + +TEST(leaky_relu_op, gpu) { + platform::CUDAPlace gpu_place(0); + platform::CUDADeviceContext ctx(gpu_place); + test_activation_op<::anakin::saber::NV>("leaky_relu", ctx, true); +} +#endif + } // namespace anakin } // namespace inference } // namespace paddle USE_OP(relu); +USE_OP(leaky_relu); +USE_CPU_ANAKIN_CONVERTER(relu); +USE_CPU_ANAKIN_CONVERTER(leaky_relu); + +#ifdef PADDLE_WITH_CUDA USE_ANAKIN_CONVERTER(relu); +USE_ANAKIN_CONVERTER(leaky_relu); +#endif diff --git a/paddle/fluid/inference/anakin/convert/test_reshape_op.cc b/paddle/fluid/inference/anakin/convert/test_reshape_op.cc index 306ebf510f29a8..e102bd3ac3ea0d 100644 --- a/paddle/fluid/inference/anakin/convert/test_reshape_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_reshape_op.cc @@ -20,12 +20,12 @@ namespace paddle { namespace inference { namespace anakin { -TEST(reshape, test) { - auto* converter = Registry::Global().Lookup("reshape"); - ASSERT_TRUE(converter); +template +void test_reshape1_op(const platform::DeviceContext& context, bool use_gpu) { framework::Scope scope; std::unordered_set parameters; - AnakinConvertValidation validator(parameters, &scope); + AnakinConvertValidation validator( + parameters, &scope, context, use_gpu); // validator.DeclInputVar("reshape-X", {2, 3, 3, 1}); // validator.DeclOutputVar("reshape-Out", {3, 2, 1, 3}); @@ -45,10 +45,12 @@ TEST(reshape, test) { validator.Execute(1); } -TEST(reshape, test2) { +template +void test_reshape2_op(const platform::DeviceContext& context, bool use_gpu) { framework::Scope scope; std::unordered_set parameters; - AnakinConvertValidation validator(parameters, &scope); + AnakinConvertValidation validator( + parameters, &scope, context, use_gpu); validator.DeclInputVar("reshape-X", {1, 2, 4}); validator.DeclOutputVar("reshape-Out", {1, 4, 2}); @@ -66,9 +68,39 @@ TEST(reshape, test2) { validator.Execute(1); } +#ifdef PADDLE_WITH_CUDA +TEST(reshape1_op, gpu) { + platform::CUDAPlace gpu_place(0); + platform::CUDADeviceContext ctx(gpu_place); + test_reshape1_op<::anakin::saber::NV>(ctx, true); +} + +TEST(reshape2_op, gpu) { + platform::CUDAPlace gpu_place(0); + platform::CUDADeviceContext ctx(gpu_place); + test_reshape2_op<::anakin::saber::NV>(ctx, true); +} +#endif + +TEST(reshape1_op, cpu) { + platform::CPUPlace cpu_place; + platform::CPUDeviceContext ctx(cpu_place); + test_reshape2_op<::anakin::saber::X86>(ctx, false); +} + +TEST(reshape2_op, cpu) { + platform::CPUPlace cpu_place; + platform::CPUDeviceContext ctx(cpu_place); + test_reshape2_op<::anakin::saber::X86>(ctx, false); +} + } // namespace anakin } // namespace inference } // namespace paddle USE_OP(reshape); +USE_CPU_ANAKIN_CONVERTER(reshape); + +#ifdef PADDLE_WITH_CUDA USE_ANAKIN_CONVERTER(reshape); +#endif diff --git a/paddle/fluid/inference/anakin/convert/test_softmax_op.cc b/paddle/fluid/inference/anakin/convert/test_softmax_op.cc index 8c14fae0a67b9e..de0b18fdbfd5f7 100644 --- a/paddle/fluid/inference/anakin/convert/test_softmax_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_softmax_op.cc @@ -20,12 +20,12 @@ namespace paddle { namespace inference { namespace anakin { -TEST(softmax, test) { - auto* converter = Registry::Global().Lookup("softmax"); - ASSERT_TRUE(converter); +template +void test_softmax_op(const platform::DeviceContext& context, bool use_gpu) { framework::Scope scope; std::unordered_set parameters; - AnakinConvertValidation validator(parameters, &scope); + AnakinConvertValidation validator( + parameters, &scope, context, use_gpu); validator.DeclInputVar("softmax-X", {1, 10, 2}); validator.DeclOutputVar("softmax-Out", {1, 10, 2}); @@ -41,9 +41,27 @@ TEST(softmax, test) { validator.Execute(1); } +#ifdef PADDLE_WITH_CUDA +TEST(softmax_op, gpu) { + platform::CUDAPlace gpu_place(0); + platform::CUDADeviceContext ctx(gpu_place); + test_softmax_op<::anakin::saber::NV>(ctx, true); +} +#endif + +TEST(relu_op, cpu) { + platform::CPUPlace cpu_place; + platform::CPUDeviceContext ctx(cpu_place); + test_softmax_op<::anakin::saber::X86>(ctx, false); +} + } // namespace anakin } // namespace inference } // namespace paddle USE_OP(softmax); +USE_CPU_ANAKIN_CONVERTER(softmax); + +#ifdef PADDLE_WITH_CUDA USE_ANAKIN_CONVERTER(softmax); +#endif diff --git a/paddle/fluid/inference/anakin/convert/test_split_op.cc b/paddle/fluid/inference/anakin/convert/test_split_op.cc index aa61c01a511c23..9a42ffd853bb07 100644 --- a/paddle/fluid/inference/anakin/convert/test_split_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_split_op.cc @@ -21,12 +21,14 @@ namespace paddle { namespace inference { namespace anakin { -template -void AnakinSliceTest(const std::vector &in_shape, +template +void AnakinSliceTest(const platform::DeviceContext &context, bool use_gpu, + const std::vector &in_shape, const std::vector §ions) { std::unordered_set parameters({""}); framework::Scope scope; - AnakinConvertValidation validator(parameters, &scope); + AnakinConvertValidation validator( + parameters, &scope, context, use_gpu); validator.DeclInputVar("split_input", in_shape); std::vector output_vars; @@ -55,51 +57,58 @@ void AnakinSliceTest(const std::vector &in_shape, // batch = 0, axis = 1, same shape TEST(split_op, test_same_shape_axis1_batch1) { - AnakinSliceTest<1>({1, 4, 2, 2}, {2, 2}); + platform::CUDAPlace gpu_place(0); + platform::CUDADeviceContext ctx(gpu_place); + AnakinSliceTest<::anakin::saber::NV, 1>(ctx, true, {1, 4, 2, 2}, {2, 2}); } // batch = 0, axis = 1, different shape TEST(split_op, test_different_shape_axis1_batch1) { - AnakinSliceTest<1>({1, 3, 2, 2}, {2, 1}); -} -// batch = 10, axis = 1, same shape -TEST(split_op, test_same_shape_axis1_batch10) { - AnakinSliceTest<1>({1, 4, 2, 2}, {2, 2}); -} -// batch = 10, axis = 1, different shape -TEST(split_op, test_different_shape_axis1_batch10) { - AnakinSliceTest<1>({1, 3, 2, 2}, {2, 1}); + platform::CUDAPlace gpu_place(0); + platform::CUDADeviceContext ctx(gpu_place); + AnakinSliceTest<::anakin::saber::NV, 1>(ctx, true, {1, 3, 2, 2}, {2, 1}); } // batch = 0, axis = 2, same shape TEST(split_op, test_same_shape_axis2_batch1) { - AnakinSliceTest<2>({1, 3, 4, 2}, {2, 2}); + platform::CUDAPlace gpu_place(0); + platform::CUDADeviceContext ctx(gpu_place); + AnakinSliceTest<::anakin::saber::NV, 2>(ctx, true, {1, 3, 4, 2}, {2, 2}); } // batch = 0, axis = 2, different shape TEST(split_op, test_different_shape_axis2_batch1) { - AnakinSliceTest<2>({1, 3, 3, 2}, {2, 1}); -} -// batch = 10, axis = 2, same shape -TEST(split_op, test_same_shape_axis2_batch10) { - AnakinSliceTest<2>({1, 3, 4, 2}, {2, 2}); -} -// batch = 10, axis = 2, different shape -TEST(split_op, test_different_shape_axis2_batch10) { - AnakinSliceTest<2>({1, 3, 3, 2}, {2, 1}); + platform::CUDAPlace gpu_place(0); + platform::CUDADeviceContext ctx(gpu_place); + AnakinSliceTest<::anakin::saber::NV, 2>(ctx, true, {1, 3, 3, 2}, {2, 1}); } + // batch = 0, axis = 3, same shape TEST(split_op, test_same_shape_axis3_batch1) { - AnakinSliceTest<3>({1, 3, 2, 4}, {2, 2}); + platform::CUDAPlace gpu_place(0); + platform::CUDADeviceContext ctx(gpu_place); + AnakinSliceTest<::anakin::saber::NV, 3>(ctx, true, {1, 3, 2, 4}, {2, 2}); } // batch = 0, axis = 3, different shape TEST(split_op, test_different_shape_axis3_batch1) { - AnakinSliceTest<3>({1, 3, 2, 3}, {2, 1}); + platform::CUDAPlace gpu_place(0); + platform::CUDADeviceContext ctx(gpu_place); + AnakinSliceTest<::anakin::saber::NV, 3>(ctx, true, {1, 3, 2, 3}, {2, 1}); } -// batch = 10, axis = 3, same shape -TEST(split_op, test_same_shape_axis3_batch10) { - AnakinSliceTest<3>({1, 3, 2, 4}, {2, 2}); + +TEST(split_op, test_different_shape_axis1_batch1_cpu) { + platform::CPUPlace cpu_place; + platform::CPUDeviceContext ctx(cpu_place); + AnakinSliceTest<::anakin::saber::X86, 1>(ctx, false, {1, 3, 2, 3}, {2, 1}); +} + +TEST(split_op, test_different_shape_axis2_batch1_cpu) { + platform::CPUPlace cpu_place; + platform::CPUDeviceContext ctx(cpu_place); + AnakinSliceTest<::anakin::saber::X86, 2>(ctx, false, {1, 3, 4, 2}, {2, 2}); } -// batch = 10, axis = 3, different shape -TEST(split_op, test_different_shape_axis3_batch10) { - AnakinSliceTest<3>({1, 3, 2, 3}, {2, 1}); + +TEST(split_op, test_different_shape_axis3_batch1_cpu) { + platform::CPUPlace cpu_place; + platform::CPUDeviceContext ctx(cpu_place); + AnakinSliceTest<::anakin::saber::X86, 3>(ctx, false, {1, 3, 2, 4}, {2, 2}); } } // namespace anakin @@ -107,4 +116,7 @@ TEST(split_op, test_different_shape_axis3_batch10) { } // namespace paddle USE_OP(split); +USE_CPU_ANAKIN_CONVERTER(split); +#ifdef PADDLE_WITH_CUDA USE_ANAKIN_CONVERTER(split); +#endif diff --git a/paddle/fluid/inference/anakin/convert/test_sum_op.cc b/paddle/fluid/inference/anakin/convert/test_sum_op.cc index d6a59a0166be92..65f67ebd129893 100644 --- a/paddle/fluid/inference/anakin/convert/test_sum_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_sum_op.cc @@ -22,10 +22,12 @@ namespace paddle { namespace inference { namespace anakin { -TEST(sum, native) { +template +static void test_sum_op(const platform::DeviceContext& context, bool use_gpu) { std::unordered_set parameters; framework::Scope scope; - AnakinConvertValidation validator(parameters, &scope); + AnakinConvertValidation validator( + parameters, &scope, context, use_gpu); validator.DeclInputVar("sum_x1", {1, 2, 1, 2}); validator.DeclInputVar("sum_x2", {1, 2, 1, 2}); validator.DeclOutputVar("sum_out", {1, 2, 1, 2}); @@ -40,9 +42,26 @@ TEST(sum, native) { validator.Execute(1); } +#ifdef PADDLE_WITH_CUDA +TEST(sum_op, gpu) { + platform::CUDAPlace gpu_place(0); + platform::CUDADeviceContext ctx(gpu_place); + test_sum_op<::anakin::saber::NV>(ctx, true); +} +#endif + +TEST(sum_op, cpu) { + platform::CPUPlace cpu_place; + platform::CPUDeviceContext ctx(cpu_place); + test_sum_op<::anakin::saber::X86>(ctx, false); +} + } // namespace anakin } // namespace inference } // namespace paddle USE_OP(sum); +USE_CPU_ANAKIN_CONVERTER(sum); +#ifdef PADDLE_WITH_CUDA USE_ANAKIN_CONVERTER(sum); +#endif diff --git a/paddle/fluid/inference/anakin/convert/test_transpose_op.cc b/paddle/fluid/inference/anakin/convert/test_transpose_op.cc index 016ed26f02f782..51b69dfbb08b73 100644 --- a/paddle/fluid/inference/anakin/convert/test_transpose_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_transpose_op.cc @@ -20,12 +20,12 @@ namespace paddle { namespace inference { namespace anakin { -TEST(transpose_op, test) { - auto* converter = Registry::Global().Lookup("transpose"); - ASSERT_TRUE(converter != nullptr); +template +void test_transpose1_op(const platform::DeviceContext& context, bool use_gpu) { std::unordered_set parameters; framework::Scope scope; - AnakinConvertValidation validator(parameters, &scope); + AnakinConvertValidation validator( + parameters, &scope, context, use_gpu); validator.DeclInputVar("transpose-X", {2, 3, 4, 5}); validator.DeclOutputVar("transpose-Out", {4, 2, 5, 3}); @@ -43,11 +43,12 @@ TEST(transpose_op, test) { validator.Execute(3); } -// test input shape's dims < 4 -TEST(transpose_op, test2) { +template +void test_transpose2_op(const platform::DeviceContext& context, bool use_gpu) { std::unordered_set parameters; framework::Scope scope; - AnakinConvertValidation validator(parameters, &scope); + AnakinConvertValidation validator( + parameters, &scope, context, use_gpu); validator.DeclInputVar("transpose-X", {3, 4, 5}); validator.DeclOutputVar("transpose-Out", {3, 5, 4}); @@ -65,9 +66,38 @@ TEST(transpose_op, test2) { validator.Execute(1); } +#ifdef PADDLE_WITH_CUDA +TEST(transpose1_op, gpu) { + platform::CUDAPlace gpu_place(0); + platform::CUDADeviceContext ctx(gpu_place); + test_transpose1_op<::anakin::saber::NV>(ctx, true); +} + +TEST(transpose2_op, gpu) { + platform::CUDAPlace gpu_place(0); + platform::CUDADeviceContext ctx(gpu_place); + test_transpose2_op<::anakin::saber::NV>(ctx, true); +} +#endif + +TEST(transpose1_op, cpu) { + platform::CPUPlace cpu_place; + platform::CPUDeviceContext ctx(cpu_place); + test_transpose2_op<::anakin::saber::X86>(ctx, false); +} + +TEST(transpose2_op, cpu) { + platform::CPUPlace cpu_place; + platform::CPUDeviceContext ctx(cpu_place); + test_transpose2_op<::anakin::saber::X86>(ctx, false); +} + } // namespace anakin } // namespace inference } // namespace paddle USE_OP(transpose); +USE_CPU_ANAKIN_CONVERTER(transpose); +#ifdef PADDLE_WITH_CUDA USE_ANAKIN_CONVERTER(transpose); +#endif diff --git a/paddle/fluid/inference/anakin/convert/transpose.cc b/paddle/fluid/inference/anakin/convert/transpose.cc index f35372fe5c315e..28071ca8449cdd 100644 --- a/paddle/fluid/inference/anakin/convert/transpose.cc +++ b/paddle/fluid/inference/anakin/convert/transpose.cc @@ -17,20 +17,16 @@ #include #include -using anakin::graph::GraphGlobalMem; -using anakin::AK_FLOAT; -using anakin::saber::NV; -using anakin::saber::Shape; using anakin::PTuple; namespace paddle { namespace inference { namespace anakin { -void TransposeOpConverter::operator()(const framework::proto::OpDesc &op, - const framework::BlockDesc &block_desc, - const framework::Scope &scope, - bool test_mode) { +template +void TransposeOpConverter::operator()( + const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc, + const framework::Scope &scope, bool test_mode) { framework::OpDesc op_desc(op, nullptr); PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1); PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1); @@ -38,7 +34,7 @@ void TransposeOpConverter::operator()(const framework::proto::OpDesc &op, auto input = op_desc.Input("X").front(); auto output = op_desc.Output("Out").front(); auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front(); - engine_->AddOp(op_name, "Permute", {input}, {output}); + this->engine_->AddOp(op_name, "Permute", {input}, {output}); auto axis = boost::get>(op_desc.GetAttr("axis")); size_t axis_size = axis.size(); @@ -46,7 +42,7 @@ void TransposeOpConverter::operator()(const framework::proto::OpDesc &op, axis.push_back(axis_size); axis_size += 1; } - engine_->AddOpAttr>(op_name, "dims", axis); + this->engine_->template AddOpAttr>(op_name, "dims", axis); } } // namespace anakin diff --git a/paddle/fluid/inference/anakin/convert/transpose.h b/paddle/fluid/inference/anakin/convert/transpose.h index bacbf152bc1231..b7b0a0f209e7d6 100644 --- a/paddle/fluid/inference/anakin/convert/transpose.h +++ b/paddle/fluid/inference/anakin/convert/transpose.h @@ -20,7 +20,8 @@ namespace paddle { namespace inference { namespace anakin { -class TransposeOpConverter : public AnakinOpConverter { +template +class TransposeOpConverter : public AnakinOpConverter { public: TransposeOpConverter() = default; diff --git a/paddle/fluid/inference/anakin/convert/ut_helper.h b/paddle/fluid/inference/anakin/convert/ut_helper.h index 029aff6704ff10..2f8f953892c390 100644 --- a/paddle/fluid/inference/anakin/convert/ut_helper.h +++ b/paddle/fluid/inference/anakin/convert/ut_helper.h @@ -32,14 +32,8 @@ limitations under the License. */ #include "paddle/fluid/inference/utils/singleton.h" #include "paddle/fluid/platform/enforce.h" -using anakin::graph::GraphGlobalMem; -using anakin::AK_FLOAT; using anakin::Precision; -using anakin::saber::NV; using anakin::saber::X86; -using anakin::saber::Shape; -using anakin::PBlock; -using anakin::PTuple; namespace paddle { namespace inference { @@ -55,8 +49,8 @@ float random(float low, float high) { return dist(mt); } -void RandomizeTensor(framework::LoDTensor* tensor, const platform::Place& place, - const platform::DeviceContext& ctx) { +void RandomizeTensor(framework::LoDTensor* tensor, + const platform::Place& place) { auto dims = tensor->dims(); size_t num_elements = analysis::AccuDims(dims, dims.size()); PADDLE_ENFORCE_GT(num_elements, 0); @@ -78,17 +72,19 @@ void RandomizeTensor(framework::LoDTensor* tensor, const platform::Place& place, * anakin * layer. */ +template class AnakinConvertValidation { - using AnakinNvEngineT = AnakinEngine; + using AnakinNvEngineT = AnakinEngine; public: AnakinConvertValidation() = delete; AnakinConvertValidation(const std::unordered_set& parameters, - framework::Scope* scope) - : parameters_(parameters), scope_(scope), place_(0) { - PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0); - engine_.reset(new AnakinEngine(true)); + framework::Scope* scope, + const platform::DeviceContext& ctx, + bool use_gpu = true) + : parameters_(parameters), scope_(scope), ctx_(ctx), use_gpu_(use_gpu) { + engine_.reset(new AnakinEngine(true)); } // Declare a Variable as input with random initialization. @@ -108,11 +104,10 @@ class AnakinConvertValidation { } void DeclVar(const std::string& name, const std::vector dim_vec) { - platform::CUDADeviceContext ctx(place_); auto* x = scope_->Var(name); auto* x_tensor = x->GetMutable(); x_tensor->Resize(framework::make_ddim(dim_vec)); - RandomizeTensor(x_tensor, place_, ctx); + RandomizeTensor(x_tensor, ctx_.GetPlace()); std::vector dim_vec_int64; for (auto& ele : dim_vec) { @@ -132,7 +127,7 @@ class AnakinConvertValidation { // should init anakin engine here. auto& block_desc = program_desc_.Block(framework::kRootBlockIndex); - Singleton::Global().ConvertOp( + Singleton>::Global().ConvertOp( desc, block_desc, parameters_, *scope_, engine_.get(), true /*test_mode*/); engine_->Freeze(); @@ -151,7 +146,7 @@ class AnakinConvertValidation { } engine_->SetMaxInputShape(temp_max_input_shape); engine_->Optimize(); - engine_->InitGraph(); + engine_->InitNet(); } // We use the set 'neglected_output' here, because some Ops like batch norm, @@ -160,11 +155,8 @@ class AnakinConvertValidation { void Execute(int batch_size, std::unordered_set neglected_output = {}) { // Execute Fluid Op - platform::CUDADeviceContext ctx(place_); - op_->Run(*scope_, place_); + op_->Run(*scope_, ctx_.GetPlace()); - // std::vector input_vector; - // std::vector output_vector; std::map inputs; for (const auto& input : op_desc_->InputArgumentNames()) { if (parameters_.count(input)) continue; @@ -180,20 +172,27 @@ class AnakinConvertValidation { std::vector fluid_out; auto* var = scope_->FindVar(output); auto tensor = var->GetMutable(); - framework::TensorToVector(*tensor, ctx, &fluid_out); + framework::TensorToVector(*tensor, ctx_, &fluid_out); fluid_outputs.push_back(fluid_out); outputs.insert({output, tensor}); } - engine_->Execute(inputs, outputs, stream_); + if (!use_gpu_) { + engine_->Execute(inputs, outputs); + } else { + cudaStream_t stream; + PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream), 0); + engine_->Execute(inputs, outputs, stream); + } + int i_output = 0; for (const auto& output : op_desc_->OutputArgumentNames()) { if (neglected_output.count(output)) continue; std::vector anakin_out; auto* var = scope_->FindVar(output); auto tensor = var->GetMutable(); - framework::TensorToVector(*tensor, ctx, &anakin_out); + framework::TensorToVector(*tensor, ctx_, &anakin_out); size_t anakin_out_size = anakin_out.size(); auto fluid_out = fluid_outputs[i_output++]; @@ -205,15 +204,24 @@ class AnakinConvertValidation { private: std::unique_ptr engine_{nullptr}; - cudaStream_t stream_; std::unique_ptr op_; std::unique_ptr op_desc_; framework::ProgramDesc program_desc_; const std::unordered_set& parameters_; framework::Scope* scope_; - platform::CUDAPlace place_; + const platform::DeviceContext& ctx_; + bool use_gpu_{true}; }; +template class AnakinConvertValidation<::anakin::saber::NV, + ::anakin::Precision::FP32>; +template class AnakinConvertValidation<::anakin::saber::X86, + ::anakin::Precision::FP32>; + +template class AnakinConvertValidation<::anakin::saber::NV, + ::anakin::Precision::INT8>; +template class AnakinConvertValidation<::anakin::saber::X86, + ::anakin::Precision::INT8>; } // namespace anakin } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/anakin/engine.cc b/paddle/fluid/inference/anakin/engine.cc index ba044c9401a5f0..529a859458a988 100644 --- a/paddle/fluid/inference/anakin/engine.cc +++ b/paddle/fluid/inference/anakin/engine.cc @@ -35,12 +35,15 @@ namespace anakin { template AnakinEngine::AnakinEngine( bool need_summary, int device, int max_batch_size, - std::map> max_input_shape) + std::map> max_input_shape, + std::vector program_inputs, bool auto_config_layout) : graph_(new AnakinGraphT()), net_(new AnakinNetT(need_summary)) { device_ = device; max_batch_size_ = max_batch_size; max_input_shape_ = max_input_shape; + program_inputs_ = program_inputs; + auto_config_layout_ = auto_config_layout; } template @@ -54,8 +57,8 @@ void AnakinEngine::SetInputShape( } template -void AnakinEngine::InitGraph() { - net_->init(*graph_); +void AnakinEngine::InitNet() { + net_->init(*graph_, auto_config_layout_); } template @@ -67,11 +70,11 @@ void AnakinEngine::AddOp( } template -void AnakinEngine::Execute( - const std::map &inputs, - const std::map &outputs, - cudaStream_t stream) { +void AnakinEngine::BindInput( + const std::map &inputs) { +#ifdef PADDLE_WITH_CUDA cudaDeviceSynchronize(); +#endif for (const auto &input : inputs) { auto *tensor = input.second; auto *data = tensor->data(); @@ -85,16 +88,53 @@ void AnakinEngine::Execute( int max_shape_sum = std::accumulate(max_input_shape.begin(), max_input_shape.end(), 1, std::multiplies()); - - PADDLE_ENFORCE(max_shape_sum >= tensor->numel(), - "The anakin input max shape should be greater than" - " or equal to the real input shape, Please set the max " - "input shape using EnableAnakinEngine"); + if (tensor->numel() > max_shape_sum) { + PADDLE_ENFORCE(std::find(program_inputs_.begin(), program_inputs_.end(), + input.first) == program_inputs_.end(), + "The anakin input max shape should be greater than" + " or equal to the real input shape, Please set the max " + "input shape using EnableAnakinEngine"); + VLOG(3) << "Anakin Net will be reset because of the inputs out of range: " + << input.first; + graph_->Reshape(input.first, fluid_input_shape); + net_.reset(new AnakinNetT(true)); + net_->init(*graph_); + anakin_input = net_->get_in(input.first); + } anakin_input->reshape(fluid_input_shape); ::anakin::saber::Tensor tmp_anakin_tensor(data, TargetT(), 0, fluid_input_shape); anakin_input->copy_from(tmp_anakin_tensor); } +} + +template +void AnakinEngine::Execute( + const std::map &inputs, + const std::map &outputs) { + BindInput(inputs); + net_->prediction(); + for (const auto &output : outputs) { + platform::CPUPlace cpu_place; + auto *tensor = output.second; + auto *anakin_output = net_->get_out(output.first); + auto *anakin_data = anakin_output->data(); + auto anakin_output_shape = anakin_output->valid_shape(); + tensor->Resize(framework::make_ddim(anakin_output_shape)); + auto *fluid_data = tensor->mutable_data(cpu_place); + memory::Copy(cpu_place, static_cast(fluid_data), cpu_place, + static_cast(anakin_data), + tensor->numel() * sizeof(float)); + } +} + +#ifdef PADDLE_WITH_CUDA +template +void AnakinEngine::Execute( + const std::map &inputs, + const std::map &outputs, + cudaStream_t stream) { + BindInput(inputs); net_->prediction(); cudaDeviceSynchronize(); for (const auto &output : outputs) { @@ -111,10 +151,11 @@ void AnakinEngine::Execute( } cudaDeviceSynchronize(); } +#endif template void AnakinEngine::Freeze() { - PADDLE_ENFORCE(graph_->Freeze_v3(), "Freeze anakin subgraph."); + PADDLE_ENFORCE(graph_->Freeze(), "Freeze anakin subgraph."); } template @@ -122,6 +163,12 @@ void AnakinEngine::Optimize() { PADDLE_ENFORCE(graph_->Optimize(), "Graph optimization."); } +template +void AnakinEngine::RegistBlock( + ::anakin::PBlock *block_p) { + PADDLE_ENFORCE(graph_->RegistBlock(block_p), "Block register."); +} + template std::unique_ptr> AnakinEngine::Clone() { @@ -130,7 +177,24 @@ AnakinEngine::Clone() { return std::unique_ptr(engine); } +#ifdef PADDLE_WITH_CUDA template class AnakinEngine<::anakin::saber::NV, ::anakin::Precision::FP32>; +template class AnakinEngineManager<::anakin::saber::NV, + ::anakin::Precision::FP32>; + +template class AnakinEngine<::anakin::saber::NV, ::anakin::Precision::INT8>; +template class AnakinEngineManager<::anakin::saber::NV, + ::anakin::Precision::INT8>; +#endif + +template class AnakinEngine<::anakin::saber::X86, ::anakin::Precision::FP32>; +template class AnakinEngineManager<::anakin::saber::X86, + ::anakin::Precision::FP32>; +template class AnakinEngine<::anakin::saber::X86, ::anakin::Precision::INT8>; +template class AnakinEngineManager<::anakin::saber::X86, + ::anakin::Precision::INT8>; + +// template class AnakinEngine<::anakin::saber::X86, ::anakin::Precision::FP32>; } // namespace anakin } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/anakin/engine.h b/paddle/fluid/inference/anakin/engine.h index 4845ffdf5b9dcf..fb40f56511ba25 100644 --- a/paddle/fluid/inference/anakin/engine.h +++ b/paddle/fluid/inference/anakin/engine.h @@ -32,7 +32,6 @@ #include "saber/saber_types.h" using anakin::Precision; -using anakin::saber::NV; namespace anakin { @@ -58,9 +57,11 @@ class AnakinEngine { public: explicit AnakinEngine( bool need_summary = false, int device = 0, int max_batch_size = 1, - std::map> max_input_shape = {}); + std::map> max_input_shape = {}, + std::vector program_inputs = {}, + bool auto_config_layout = false); ~AnakinEngine(); - void InitGraph(); + void InitNet(); void SetInputShape(const std::string &name, std::vector shape); void AddOp(const std::string &name, const std::string &type, const std::vector &inputs, @@ -81,20 +82,35 @@ class AnakinEngine { void SetMaxInputShape(std::map> shape) { max_input_shape_ = shape; } + const std::vector &GetScalableInputs() { + return program_inputs_; + } + void SetScalableInputs(std::vector program_inputs) { + program_inputs_ = program_inputs; + } int GetMaxBatchSize() { return max_batch_size_; } void Freeze(); void Optimize(); - void AllocTmpMem() { - PADDLE_ENFORCE(net_->alloc_memory_first(*graph_), - "anakin alloc temp memory first failed"); - } + void RegistBlock(::anakin::PBlock *block_p); void Save(std::string path) { graph_->save(path); } - bool IsInit() { return initialized_; } int GetDevice() { return device_; } + void AddTensorScale(const std::string &tensor_name, float scale) { + tensor_scales_[tensor_name] = scale; + } + std::unordered_map GetTensorScales() { + return tensor_scales_; + } + void Execute(const std::map &inputs, + const std::map &outputs); +#ifdef PADDLE_WITH_CUDA void Execute(const std::map &inputs, const std::map &outputs, cudaStream_t stream); +#endif + + private: + void BindInput(const std::map &inputs); private: bool initialized_{false}; @@ -103,27 +119,33 @@ class AnakinEngine { int device_; std::unique_ptr graph_; std::unique_ptr net_; + std::vector program_inputs_; + std::unordered_map tensor_scales_; + // Always be false in gpu mode but true in most cpu cases. + bool auto_config_layout_; }; +template class AnakinEngineManager { - using AnakinNvEngineT = AnakinEngine; + using AnakinEngineT = AnakinEngine; public: bool HasEngine(const std::string &name) const { if (engines_.count(name) == 0) return false; return engines_.at(name).get() != nullptr; } - AnakinNvEngineT *Get(const std::string &name) const { + AnakinEngineT *Get(const std::string &name) const { return engines_.at(name).get(); } - AnakinNvEngineT *Create( - bool need_summary, int device, int max_batch_size, - std::map> max_input_shape, - std::string engine_name) { + AnakinEngineT *Create(bool need_summary, int device, int max_batch_size, + std::map> max_input_shape, + std::vector program_inputs, + bool auto_config_layout, std::string engine_name) { std::unique_lock lk(mut_); - auto *p = new AnakinEngine( - need_summary, device, max_batch_size, max_input_shape); + auto *p = new AnakinEngine( + need_summary, device, max_batch_size, max_input_shape, program_inputs, + auto_config_layout); engines_[engine_name].reset(p); return p; } @@ -135,7 +157,7 @@ class AnakinEngineManager { } private: - std::unordered_map> engines_; + std::unordered_map> engines_; std::mutex mut_; }; } // namespace anakin diff --git a/paddle/fluid/inference/anakin/op_teller.cc b/paddle/fluid/inference/anakin/op_teller.cc index 2042fb18ea41f8..6cad00f8ecfe87 100644 --- a/paddle/fluid/inference/anakin/op_teller.cc +++ b/paddle/fluid/inference/anakin/op_teller.cc @@ -44,6 +44,10 @@ struct SimpleOpTypeSetTeller : public Teller { teller_set.insert("sum"); teller_set.insert("depthwise_conv2d"); teller_set.insert("prior_box"); + teller_set.insert("leaky_relu"); + teller_set.insert("affine_channel"); + teller_set.insert("relu6"); + teller_set.insert("swish"); } bool operator()(const std::string& op_type, diff --git a/paddle/fluid/inference/anakin/test_anakin_engine.cc b/paddle/fluid/inference/anakin/test_anakin_engine.cc index 8fd6b8bec9ada6..422f415a5db62d 100644 --- a/paddle/fluid/inference/anakin/test_anakin_engine.cc +++ b/paddle/fluid/inference/anakin/test_anakin_engine.cc @@ -19,7 +19,6 @@ limitations under the License. */ #include "paddle/fluid/inference/anakin/engine.h" -using anakin::graph::GraphGlobalMem; using anakin::AK_FLOAT; using anakin::Precision; using anakin::saber::NV; @@ -52,11 +51,9 @@ TEST_F(TestAnakinEngine, Execute) { engine_->AddOpAttr("op1", "axis", 1); std::vector shape = {1, 1, 1, 2}; Shape tmp_shape(shape); - // PBlock weight1(tmp_shape); - auto *weight1 = - GraphGlobalMem::Global().template new_block(tmp_shape); - // auto *weight1 = new PBlock(tmp_shape, AK_FLOAT); + PBlock *weight1 = new PBlock(tmp_shape, AK_FLOAT); + engine_->RegistBlock(weight1); float *cpu_data = static_cast(weight1->h_tensor().mutable_data()); cpu_data[0] = 2.; weight1->d_tensor().set_shape(tmp_shape); @@ -68,7 +65,7 @@ TEST_F(TestAnakinEngine, Execute) { // engine_->AddOpAttr("x", "input_shape", input_shape); engine_->SetInputShape("x", {1, 1, 1, 1}); engine_->Optimize(); - engine_->InitGraph(); + engine_->InitNet(); framework::LoDTensor x; framework::LoDTensor y; x.Resize({1, 1, 1, 1}); diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index a736ca393ccb71..66e8d8b5287178 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -64,20 +64,20 @@ struct Argument { bool Has(const std::string& key) const { return valid_fields_.count(key); } -#define DECL_ARGUMENT_FIELD(field__, Field, type__) \ - public: \ - type__& field__() { \ - PADDLE_ENFORCE(Has(#field__)); \ - return field__##_; \ - } \ - void Set##Field(const type__& x) { \ - field__##_ = x; \ - valid_fields_.insert(#field__); \ - } \ - DECL_ARGUMENT_FIELD_VALID(field__); \ - type__* field__##_ptr() { return &field__##_; } \ - \ - private: \ +#define DECL_ARGUMENT_FIELD(field__, Field, type__) \ + public: \ + type__& field__() { \ + PADDLE_ENFORCE(Has(#field__), "There is no such field"); \ + return field__##_; \ + } \ + void Set##Field(const type__& x) { \ + field__##_ = x; \ + valid_fields_.insert(#field__); \ + } \ + DECL_ARGUMENT_FIELD_VALID(field__); \ + type__* field__##_ptr() { return &field__##_; } \ + \ + private: \ type__ field__##_; #define DECL_ARGUMENT_FIELD_VALID(field__) \ @@ -169,7 +169,14 @@ struct Argument { anakin_max_shape_t); DECL_ARGUMENT_FIELD(anakin_max_batch_size, AnakinMaxBatchSize, int); DECL_ARGUMENT_FIELD(anakin_min_subgraph_size, AnakinMinSubgraphSize, int); + DECL_ARGUMENT_FIELD(anakin_precision_mode, AnakinPrecisionMode, + AnalysisConfig::Precision); + DECL_ARGUMENT_FIELD(anakin_auto_config_layout, AnakinAutoConfigLayout, bool); DECL_ARGUMENT_FIELD(use_anakin, UseAnakin, bool); + DECL_ARGUMENT_FIELD(anakin_passes_filter, AnakinPassesFilter, + std::vector); + DECL_ARGUMENT_FIELD(anakin_ops_filter, AnakinOpsFilter, + std::vector); // Memory optimized related. DECL_ARGUMENT_FIELD(enable_memory_optim, EnableMemoryOptim, bool); diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index 78e502c670f0eb..4714c30507c4c3 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -114,6 +114,7 @@ void IRPassManager::CreatePasses(Argument *argument, if (pass_name == "anakin_subgraph_pass") { pass->Set("program", new framework::ProgramDesc *(&argument->main_program())); + pass->Set("use_gpu", new bool(argument->use_gpu())); pass->Set("gpu_device_id", new int(argument->gpu_device_id())); pass->Set("model_from_memory", new bool(argument->model_from_memory())); pass->Set("engine_opt_info", new std::map( @@ -122,6 +123,13 @@ void IRPassManager::CreatePasses(Argument *argument, pass->Set("max_input_shape", new std::map>( argument->anakin_max_input_shape())); pass->Set("max_batch_size", new int(argument->anakin_max_batch_size())); + bool enable_int8 = + argument->anakin_precision_mode() == AnalysisConfig::Precision::kInt8; + pass->Set("enable_int8", new bool(enable_int8)); + pass->Set("anakin_ops_filter", + new std::vector(argument->anakin_ops_filter())); + pass->Set("auto_config_layout", + new bool(argument->anakin_auto_config_layout())); } pre_pass = pass_name; diff --git a/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc index b8d8b6fed8ca23..9586ce3e6b0142 100644 --- a/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc @@ -39,8 +39,14 @@ void analysis::AnakinSubgraphPass::ApplyImpl( framework::ir::Graph *graph) const { framework::ir::FusePassBase::Init("anakin_subgraph_pass", graph); - auto teller = [](const framework::ir::Node *node) { - if (!node->IsOp() || !node->Op()) return false; + auto &anakin_ops_filter = Get>("anakin_ops_filter"); + + auto teller = [&anakin_ops_filter](const framework::ir::Node *node) { + if (!node->IsOp() || !node->Op()) + return false; + else if (std::find(anakin_ops_filter.begin(), anakin_ops_filter.end(), + node->Op()->Type()) != anakin_ops_filter.end()) + return false; return anakin::OpTeller::Global().Tell(node->Op()->Type(), *node->Op()); }; @@ -191,22 +197,78 @@ void AnakinSubgraphPass::CreateAnakinOp( SetAttr(op_desc->Proto(), "engine_key", engine_key); auto max_input_shape = Get>>("max_input_shape"); - auto max_batch_size = Get("max_batch_size"); + auto program_inputs = program_desc->GetFeedTargetNames(); - auto *anakin_engine = - inference::Singleton::Global().Create( - true, Get("gpu_device_id"), max_batch_size, max_input_shape, - engine_key); + bool use_gpu = Get("use_gpu"); + SetAttr(op_desc->Proto(), "use_gpu", use_gpu); + bool enable_int8 = Get("enable_int8"); + SetAttr(op_desc->Proto(), "enable_int8", enable_int8); + if (enable_int8) { + CreateAnakinEngine<::anakin::Precision::INT8>(&block_desc, params, + input_names, output_mapping, + program_inputs, engine_key); + } else { + CreateAnakinEngine<::anakin::Precision::FP32>(&block_desc, params, + input_names, output_mapping, + program_inputs, engine_key); + } +} + +template <::anakin::Precision PrecisionT> +void AnakinSubgraphPass::CreateAnakinEngine( + framework::BlockDesc *block_desc, const std::vector ¶ms, + const std::set &input_names, + const std::vector &output_mapping, + const std::vector &program_inputs, + const std::string &engine_key) const { + framework::BlockDesc block_desc_temp(nullptr, block_desc->Proto()); + bool use_gpu = Get("use_gpu"); + auto max_batch_size = Get("max_batch_size"); + auto max_input_shape = + Get>>("max_input_shape"); + bool auto_config_layout = Get("auto_config_layout"); + if (use_gpu) { +#ifdef PADDLE_WITH_CUDA + inference::Singleton< + anakin::AnakinEngineManager<::anakin::saber::NV, PrecisionT>>::Global() + .Create(true, Get("gpu_device_id"), max_batch_size, + max_input_shape, program_inputs, false, engine_key); +#endif + } else { + inference::Singleton< + anakin::AnakinEngineManager<::anakin::saber::X86, PrecisionT>>::Global() + .Create(true, Get("gpu_device_id"), max_batch_size, + max_input_shape, program_inputs, auto_config_layout, + engine_key); + } auto *scope = param_scope(); std::unordered_set param_set(params.begin(), params.end()); - framework::BlockDesc block_desc_temp(nullptr, block_desc.Proto()); - - inference::Singleton::Global() - .ConvertBlockToAnakinEngine( - &block_desc_temp, scope, - std::vector(input_names.begin(), input_names.end()), - param_set, output_mapping, anakin_engine); + if (use_gpu) { +#ifdef PADDLE_WITH_CUDA + auto *anakin_engine = + inference::Singleton>::Global() + .Get(engine_key); + inference::Singleton>::Global() + .ConvertBlockToAnakinEngine( + &block_desc_temp, scope, + std::vector(input_names.begin(), input_names.end()), + param_set, output_mapping, anakin_engine); +#endif + } else { + auto *anakin_engine = + inference::Singleton>::Global() + .Get(engine_key); + inference::Singleton>::Global() + .ConvertBlockToAnakinEngine( + &block_desc_temp, scope, + std::vector(input_names.begin(), input_names.end()), + param_set, output_mapping, anakin_engine); + } } } // namespace analysis diff --git a/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.h b/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.h index e80b8bb612096a..4ab2297b2d4887 100644 --- a/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.h +++ b/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.h @@ -15,6 +15,7 @@ #pragma once #include #include +#include #include #include #include "paddle/fluid/framework/ir/pass.h" @@ -36,6 +37,13 @@ class AnakinSubgraphPass : public framework::ir::FusePassBase { const std::vector &graph_params, std::vector *repetitive_params) const; void CleanIntermediateOutputs(framework::ir::Node *node); + template <::anakin::Precision PrecisionT> + void CreateAnakinEngine(framework::BlockDesc *block_desc, + const std::vector ¶ms, + const std::set &input_names, + const std::vector &output_mapping, + const std::vector &program_inputs, + const std::string &engine_key) const; }; } // namespace analysis diff --git a/paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc b/paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc index 7c4aab06a1d2b3..8f7c6ac7553676 100644 --- a/paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc +++ b/paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc @@ -100,7 +100,6 @@ void RenameAndGetOutputs( const std::string arg_value = in_var->arguments(k); const std::string arg_value_with_id = arg_value + std::to_string(var2id[arg_value]); - if (input_names_with_id.count(arg_value_with_id)) { replaced_names.push_back(arg_value); if (graph_var_map.count(arg_value)) { @@ -149,7 +148,6 @@ void RenameAndGetOutputs( const std::string arg_value = out_var->arguments(k); const std::string arg_value_with_id = arg_value + std::to_string(var2id[arg_value]); - if (graph_var_map.count(arg_value)) { add_block_var(arg_value, arg_value_with_id); } diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index 0109b4a4fa7617..4fe0c48d8f31d2 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -116,6 +116,10 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { CP_MEMBER(anakin_max_batchsize_); CP_MEMBER(anakin_max_input_shape_); CP_MEMBER(anakin_min_subgraph_size_); + CP_MEMBER(anakin_precision_mode_); + CP_MEMBER(anakin_auto_config_layout_); + CP_MEMBER(anakin_passes_filter_); + CP_MEMBER(anakin_ops_filter_); // Ir related. CP_MEMBER(enable_ir_optim_); @@ -268,13 +272,18 @@ void AnalysisConfig::Update() { PADDLE_ENFORCE(!use_tensorrt_, "Anakin sub-graph and TensorRT sub-graph are not allowed to " "run at the same time!"); - PADDLE_ENFORCE( - use_gpu_, - "Anakin sub-graph engine need gpu, please use the EnableGpu API."); + if (use_gpu_) { + LOG(INFO) << "Run Anakin GPU mode"; + } else { + LOG(INFO) << "Run Anakin CPU mode"; + } pass_builder()->ClearPasses(); for (const auto &pass : kAnakinSubgraphPasses) { - pass_builder()->AppendPass(pass); + if (std::find(anakin_passes_filter_.begin(), anakin_passes_filter_.end(), + pass) == anakin_passes_filter_.end()) { + pass_builder()->AppendPass(pass); + } } } @@ -389,11 +398,17 @@ void AnalysisConfig::SwitchIrDebug(int x) { } void AnalysisConfig::EnableAnakinEngine( int max_batch_size, std::map> max_input_shape, - int min_subgraph_size) { + int min_subgraph_size, AnalysisConfig::Precision precision_mode, + bool auto_config_layout, std::vector passes_filter, + std::vector ops_filter) { anakin_max_batchsize_ = max_batch_size; anakin_max_input_shape_ = max_input_shape; anakin_min_subgraph_size_ = min_subgraph_size; + anakin_passes_filter_ = passes_filter; + anakin_ops_filter_ = ops_filter; use_anakin_ = true; + anakin_precision_mode_ = precision_mode; + anakin_auto_config_layout_ = auto_config_layout; Update(); } } // namespace paddle diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 6942604b0723f8..677f5bf130f124 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -382,10 +382,14 @@ void AnalysisPredictor::PrepareArgument() { argument_.SetTensorRtUseStaticEngine(config_.trt_use_static_engine_); } - if (config_.use_gpu() && config_.anakin_engine_enabled()) { + if (config_.anakin_engine_enabled()) { argument_.SetAnakinMaxBatchSize(config_.anakin_max_batchsize_); argument_.SetAnakinMaxInputShape(config_.anakin_max_input_shape_); argument_.SetAnakinMinSubgraphSize(config_.anakin_min_subgraph_size_); + argument_.SetAnakinPrecisionMode(config_.anakin_precision_mode_); + argument_.SetAnakinAutoConfigLayout(config_.anakin_auto_config_layout_); + argument_.SetAnakinPassesFilter(config_.anakin_passes_filter_); + argument_.SetAnakinOpsFilter(config_.anakin_ops_filter_); LOG(INFO) << "Anakin subgraph engine is enabled"; } @@ -888,4 +892,8 @@ USE_ANAKIN_CONVERTER(density_prior_box); USE_ANAKIN_CONVERTER(dropout); USE_ANAKIN_CONVERTER(sum); USE_ANAKIN_CONVERTER(prior_box); +USE_ANAKIN_CONVERTER(leaky_relu); +USE_ANAKIN_CONVERTER(affine_channel); +USE_ANAKIN_CONVERTER(relu6); +USE_ANAKIN_CONVERTER(swish); #endif diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index c67c4b5bd0bfee..ebe289322bdd32 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -152,7 +152,10 @@ struct AnalysisConfig { void EnableAnakinEngine( int max_batch_size = 1, std::map> max_input_shape = {}, - int min_subgraph_size = 6); + int min_subgraph_size = 6, Precision precision = Precision::kFloat32, + bool auto_config_layout = false, + std::vector passes_filter = {}, + std::vector ops_filter = {}); /** A boolean state indicating whether the Anakin sub-graph engine is used. */ @@ -291,6 +294,10 @@ struct AnalysisConfig { int anakin_max_batchsize_; int anakin_min_subgraph_size_{6}; std::map> anakin_max_input_shape_; + Precision anakin_precision_mode_; + bool anakin_auto_config_layout_{false}; + std::vector anakin_passes_filter_; + std::vector anakin_ops_filter_; std::map engine_opt_info_; bool use_mkldnn_quantizer_{false}; diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index 3d72295be4b779..fea291c5528a11 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -73,15 +73,14 @@ void PaddlePassBuilder::ClearPasses() { passes_.clear(); } // The following passes works for Anakin sub-graph engine. const std::vector kAnakinSubgraphPasses({ "infer_clean_graph_pass", // + "quant_conv2d_dequant_fuse_pass", // "simplify_anakin_priorbox_detection_out_pass", // "fillconstant_elementwisemul_fuse", // "fc_fuse_pass", // "conv_elementwise_add_fuse_pass", // - "conv_bn_fuse_pass", // - "conv_elementwise_add_fuse_pass", // "fc_gru_fuse_pass", // - "quant_conv2d_dequant_fuse_pass", // - "anakin_subgraph_pass", + "anakin_subgraph_pass", // + "fc_gru_fuse_pass", // }); GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) { diff --git a/paddle/fluid/operators/anakin/anakin_engine_op.h b/paddle/fluid/operators/anakin/anakin_engine_op.h index e4feb14b2271a5..11c394c76cd982 100644 --- a/paddle/fluid/operators/anakin/anakin_engine_op.h +++ b/paddle/fluid/operators/anakin/anakin_engine_op.h @@ -34,28 +34,17 @@ limitations under the License. */ namespace paddle { namespace operators { -using FluidDT = framework::proto::VarType_Type; using inference::Singleton; - -using anakin::graph::GraphGlobalMem; -using anakin::AK_FLOAT; -using anakin::Precision; -using anakin::saber::NV; -using anakin::saber::X86; -using anakin::saber::Shape; -using anakin::PBlock; -using anakin::PTuple; using inference::anakin::AnakinEngine; class AnakinEngineOp : public framework::OperatorBase { - using AnakinNvEngineT = AnakinEngine; - private: std::vector input_names_; std::unordered_set param_names_; - mutable AnakinNvEngineT *anakin_engine_; std::string engine_key_; std::string engine_serialized_data_; + bool use_gpu_; + bool enable_int8_; public: AnakinEngineOp(const std::string &type, @@ -66,10 +55,11 @@ class AnakinEngineOp : public framework::OperatorBase { input_names_ = Inputs("Xs"); engine_key_ = Attr("engine_key"); auto params = Attr>("parameters"); + use_gpu_ = Attr("use_gpu"); + enable_int8_ = Attr("enable_int8"); for (const auto ¶m : params) { param_names_.insert(param); } - anakin_engine_ = nullptr; } protected: @@ -80,19 +70,12 @@ class AnakinEngineOp : public framework::OperatorBase { void RunAnakin(const framework::Scope &scope, const platform::Place &dev_place) const { - auto *engine = GetEngine(scope, dev_place); - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &dev_ctx = *pool.Get(dev_place); - auto stream = - reinterpret_cast(dev_ctx).stream(); - PADDLE_ENFORCE(!input_names_.empty(), "should pass more than one inputs"); std::vector output_maps = Attr>("output_name_mapping"); std::map inputs; - // Convert input tensor from fluid to engine. for (const auto &x : Inputs("Xs")) { if (param_names_.count(x)) continue; auto &t = @@ -110,17 +93,38 @@ class AnakinEngineOp : public framework::OperatorBase { outputs.insert({output_maps[output_index], fluid_t}); output_index += 1; } - engine->Execute(inputs, outputs, stream); + if (enable_int8_) { + Execute<::anakin::Precision::INT8>(inputs, outputs, dev_place); + } else { + Execute<::anakin::Precision::FP32>(inputs, outputs, dev_place); + } } - AnakinNvEngineT *GetEngine(const framework::Scope &scope, - const platform::Place &dev_place) const { - if (anakin_engine_ == nullptr) { - anakin_engine_ = - inference::Singleton::Global() + template <::anakin::Precision PrecisionT> + void Execute(const std::map &inputs, + const std::map &outputs, + const platform::Place &dev_place) const { + if (use_gpu_) { +#ifdef PADDLE_WITH_CUDA + platform::DeviceContextPool &pool = + platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(dev_place); + auto stream = + reinterpret_cast(dev_ctx) + .stream(); + auto *engine = + inference::Singleton>::Global() + .Get(engine_key_); + engine->Execute(inputs, outputs, stream); +#endif + } else { + auto *engine = + inference::Singleton>::Global() .Get(engine_key_); + engine->Execute(inputs, outputs); } - return anakin_engine_; } }; diff --git a/paddle/fluid/operators/fake_quantize_op.cu b/paddle/fluid/operators/fake_quantize_op.cu index 33bd275e5cc507..7d551106756070 100644 --- a/paddle/fluid/operators/fake_quantize_op.cu +++ b/paddle/fluid/operators/fake_quantize_op.cu @@ -235,11 +235,13 @@ struct FindRangeAbsMaxFunctor { int g_find_max; memory::Copy(platform::CPUPlace(), &g_find_max, gpu_place, find_max, - sizeof(int), 0); + sizeof(int), ctx.stream()); + ctx.Wait(); if (g_find_max) { int len; memory::Copy(platform::CPUPlace(), &len, gpu_place, out_size_data, - sizeof(int), 0); + sizeof(int), ctx.stream()); + ctx.Wait(); FindAbsMaxFunctor()(ctx, scale_arr, len, out_scale_data); } @@ -258,25 +260,26 @@ struct FindMovingAverageAbsMaxFunctor { const auto gpu_place = boost::get(ctx.GetPlace()); T accum; - memory::Copy(platform::CPUPlace(), &accum, gpu_place, in_accum.data(), - sizeof(T), 0); T state; - memory::Copy(platform::CPUPlace(), &state, gpu_place, in_state.data(), - sizeof(T), 0); T scale; + memory::Copy(platform::CPUPlace(), &accum, gpu_place, in_accum.data(), + sizeof(T), ctx.stream()); + memory::Copy(platform::CPUPlace(), &state, gpu_place, in_state.data(), + sizeof(T), ctx.stream()); memory::Copy(platform::CPUPlace(), &scale, gpu_place, cur_scale, sizeof(T), - 0); - + ctx.stream()); + ctx.Wait(); state = rate * state + 1; accum = rate * accum + scale; scale = accum / state; memory::Copy(gpu_place, out_accum->mutable_data(gpu_place), - platform::CPUPlace(), &accum, sizeof(T), 0); + platform::CPUPlace(), &accum, sizeof(T), ctx.stream()); memory::Copy(gpu_place, out_state->mutable_data(gpu_place), - platform::CPUPlace(), &state, sizeof(T), 0); + platform::CPUPlace(), &state, sizeof(T), ctx.stream()); memory::Copy(gpu_place, out_scale->mutable_data(gpu_place), - platform::CPUPlace(), &scale, sizeof(T), 0); + platform::CPUPlace(), &scale, sizeof(T), ctx.stream()); + ctx.Wait(); } }; diff --git a/paddle/fluid/operators/pixel_shuffle_op.cc b/paddle/fluid/operators/pixel_shuffle_op.cc new file mode 100644 index 00000000000000..59ba660af79bff --- /dev/null +++ b/paddle/fluid/operators/pixel_shuffle_op.cc @@ -0,0 +1,135 @@ +/*Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/pixel_shuffle_op.h" +#include + +namespace paddle { +namespace operators { + +class PixelShuffleOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of PixelShuffleOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of PixelShuffleOp should not be null."); + + auto input_dims = ctx->GetInputDim("X"); + PADDLE_ENFORCE(input_dims.size() == 4, "The layout of input is NCHW."); + auto upscale_factor = ctx->Attrs().Get("upscale_factor"); + + PADDLE_ENFORCE(input_dims[1] % (upscale_factor * upscale_factor) == 0, + "Upscale_factor should devide the number of channel"); + + auto output_dims = input_dims; + output_dims[0] = input_dims[0]; + output_dims[1] = input_dims[1] / (upscale_factor * upscale_factor); + output_dims[2] = input_dims[2] * upscale_factor; + output_dims[3] = input_dims[3] * upscale_factor; + ctx->SetOutputDim("Out", output_dims); + } +}; + +class PixelShuffleOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput( + "X", + "(Tensor, default Tensor), " + "the input feature data of PixelShuffleOp, the layout is [N C H W]."); + AddOutput( + "Out", + "(Tensor, default Tensor), the output of " + "PixelShuffleOp. The layout is [N,C/factor^2,H*factor,W*factor]."); + AddAttr("upscale_factor", + "the factor to increase spatial resolution by.") + .SetDefault(1) + .AddCustomChecker([](const int& upscale_factor) { + PADDLE_ENFORCE_GE(upscale_factor, 1, + "upscale_factor should be larger than 0."); + }); + + AddComment(R"DOC( + Pixel Shuffle operator + This operator rearranges elements in a tensor of shape :math:`(*, C \times r^2, H, W)` + to a tensor of shape :math:`(C, H \times r, W \times r)`. + + This is useful for implementing efficient sub-pixel convolution + with a stride of :math:`1/r`. + + Please refer to the paper: + `Real-Time Single Image and Video Super-Resolution Using an Efficient + Sub-Pixel Convolutional Neural Network `_ + by Shi et. al (2016) for more details. + + )DOC"); + } +}; + +class PixelShuffleGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + std::unique_ptr Apply() const override { + auto* op = new framework::OpDesc(); + op->SetType("pixel_shuffle_grad"); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetAttrMap(Attrs()); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + return std::unique_ptr(op); + } +}; + +class PixelShuffleGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@Grad) should not be null"); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), + "Output(X@Grad) should not be null"); + + auto do_dims = ctx->GetInputDim(framework::GradVarName("Out")); + PADDLE_ENFORCE(do_dims.size() == 4, "The layout of input is NCHW."); + + auto upscale_factor = ctx->Attrs().Get("upscale_factor"); + + auto dx_dims = do_dims; + dx_dims[0] = do_dims[0]; + dx_dims[1] = do_dims[1] * (upscale_factor * upscale_factor); + dx_dims[2] = do_dims[2] / upscale_factor; + dx_dims[3] = do_dims[3] / upscale_factor; + ctx->SetOutputDim(framework::GradVarName("X"), dx_dims); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(pixel_shuffle, ops::PixelShuffleOp, ops::PixelShuffleOpMaker, + ops::PixelShuffleGradMaker); + +REGISTER_OPERATOR(pixel_shuffle_grad, ops::PixelShuffleGradOp); + +REGISTER_OP_CPU_KERNEL( + pixel_shuffle, + ops::PixelShuffleOpKernel, + ops::PixelShuffleOpKernel); + +REGISTER_OP_CPU_KERNEL( + pixel_shuffle_grad, + ops::PixelShuffleGradOpKernel, + ops::PixelShuffleGradOpKernel); diff --git a/paddle/fluid/operators/pixel_shuffle_op.cu b/paddle/fluid/operators/pixel_shuffle_op.cu new file mode 100644 index 00000000000000..6faf91079e1dac --- /dev/null +++ b/paddle/fluid/operators/pixel_shuffle_op.cu @@ -0,0 +1,26 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/pixel_shuffle_op.h" + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL( + pixel_shuffle, ops::PixelShuffleOpKernel, + ops::PixelShuffleOpKernel); +REGISTER_OP_CUDA_KERNEL( + pixel_shuffle_grad, + ops::PixelShuffleGradOpKernel, + ops::PixelShuffleGradOpKernel); diff --git a/paddle/fluid/operators/pixel_shuffle_op.h b/paddle/fluid/operators/pixel_shuffle_op.h new file mode 100644 index 00000000000000..1ae1c7e9d50cb9 --- /dev/null +++ b/paddle/fluid/operators/pixel_shuffle_op.h @@ -0,0 +1,82 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +template +class PixelShuffleOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in = ctx.Input("X"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + int factor = ctx.Attr("upscale_factor"); + + auto in_dims = in->dims(); + auto o_dims = out->dims(); + + framework::Tensor t; + t.ShareDataWith(*in); + t.Resize({in_dims[0], o_dims[1], factor, factor, in_dims[2], in_dims[3]}); + + std::vector axis = {0, 1, 4, 2, 5, 3}; + + framework::Tensor o; + o.ShareDataWith(*out); + o.Resize({in_dims[0], o_dims[1], in_dims[2], factor, in_dims[3], factor}); + + math::Transpose trans; + auto& dev_ctx = ctx.template device_context(); + trans(dev_ctx, t, &o, axis); + out->Resize(o_dims); + } +}; + +template +class PixelShuffleGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + dx->mutable_data(ctx.GetPlace()); + + int factor = ctx.Attr("upscale_factor"); + + auto do_dims = dout->dims(); + auto dx_dims = dx->dims(); + + framework::Tensor t; + t.ShareDataWith(*dout); + t.Resize({do_dims[0], do_dims[1], dx_dims[2], factor, dx_dims[3], factor}); + + std::vector axis = {0, 1, 3, 5, 2, 4}; + + framework::Tensor o; + o.ShareDataWith(*dx); + o.Resize({do_dims[0], do_dims[1], factor, factor, dx_dims[2], dx_dims[3]}); + + math::Transpose trans; + auto& dev_ctx = ctx.template device_context(); + trans(dev_ctx, t, &o, axis); + dx->Resize(dx_dims); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc index 236afc77f708c3..b650225c64a9a3 100644 --- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include "paddle/fluid/inference/api/analysis_predictor.h" @@ -229,6 +230,15 @@ void BindAnalysisConfig(py::module *m) { py::arg("min_subgraph_size") = 3, py::arg("precision_mode") = AnalysisConfig::Precision::kFloat32, py::arg("use_static") = true) + .def("enable_anakin_engine", &AnalysisConfig::EnableAnakinEngine, + py::arg("max_batch_size") = 1, + py::arg("max_input_shape") = + std::map>(), + py::arg("min_subgraph_size") = 6, + py::arg("precision_mode") = AnalysisConfig::Precision::kFloat32, + py::arg("auto_config_layout") = false, + py::arg("passes_filter") = std::vector(), + py::arg("ops_filter") = std::vector()) .def("tensorrt_engine_enabled", &AnalysisConfig::tensorrt_engine_enabled) .def("switch_ir_debug", &AnalysisConfig::SwitchIrDebug, py::arg("x") = true) diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index eb6895f2a69ade..811eec90720d40 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -66,6 +66,8 @@ from .compiler import * from paddle.fluid.layers.math_op_patch import monkey_patch_variable from . import install_check +from .dygraph.nn import * +from .dygraph.layers import * Tensor = LoDTensor diff --git a/python/paddle/fluid/dygraph/base.py b/python/paddle/fluid/dygraph/base.py index d55dbbb9c72cb8..bf484b35c7bf9a 100644 --- a/python/paddle/fluid/dygraph/base.py +++ b/python/paddle/fluid/dygraph/base.py @@ -22,7 +22,7 @@ def enabled(): - return framework._in_dygraph_mode() + return framework.in_dygraph_mode() @signature_safe_contextmanager diff --git a/python/paddle/fluid/dygraph/checkpoint.py b/python/paddle/fluid/dygraph/checkpoint.py index f992ae0576c81e..f2b01aece7bf86 100644 --- a/python/paddle/fluid/dygraph/checkpoint.py +++ b/python/paddle/fluid/dygraph/checkpoint.py @@ -97,20 +97,12 @@ def load_persistables(vardict, dirname, filename=None): Examples: .. code-block:: python - my_layer = layer(fluid.dygraph.Layer) + my_layer = layer(fluid.Layer) param_path = "./my_paddle_model" param_dict = fluid.dygraph.load_persistables(my_layer.parameters(), param_path) param_1 = param_dict['PtbModel_0.w_1'] - or: - my_layer = layer(fluid.dygraph.Layer) - param_path = "./my_paddle_model" - filename = "model.file" - param_dict = fluid.dygraph.load_persistables(my_layer.state_dict(), param_path, - filename=filename) - param_1 = param_dict['PtbModel_0.w_1'] - """ if isinstance(vardict, collections.OrderedDict): return _load_var_from_file(vardict, dirname, filename) diff --git a/python/paddle/fluid/dygraph/layer_object_helper.py b/python/paddle/fluid/dygraph/layer_object_helper.py index f8e607aab8491a..b757f8fff24fce 100644 --- a/python/paddle/fluid/dygraph/layer_object_helper.py +++ b/python/paddle/fluid/dygraph/layer_object_helper.py @@ -16,7 +16,7 @@ import copy import six -from ..framework import Parameter, _in_dygraph_mode +from ..framework import Parameter, in_dygraph_mode from ..param_attr import ParamAttr from .. import core from six.moves import zip diff --git a/python/paddle/fluid/dygraph/layers.py b/python/paddle/fluid/dygraph/layers.py index 014ee41f4c5aa2..39e06e3486cd54 100644 --- a/python/paddle/fluid/dygraph/layers.py +++ b/python/paddle/fluid/dygraph/layers.py @@ -139,14 +139,14 @@ def sublayers(self, include_sublayers=True): def clear_gradients(self): for p in self.parameters(): - p._clear_gradient() + p.clear_gradient() - def _build_once(self, *args): + def build_once(self, *args): pass def __call__(self, *inputs): if not self._built: - self._build_once(*inputs) + self.build_once(*inputs) outputs = self.forward(*inputs) self._built = True diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py index 527c37cb2c4f15..6384e5678837b9 100644 --- a/python/paddle/fluid/dygraph/nn.py +++ b/python/paddle/fluid/dygraph/nn.py @@ -19,7 +19,7 @@ from .. import core from ..layers import utils from . import layers -from ..framework import Variable, _in_dygraph_mode, OpProtoHolder, Parameter +from ..framework import Variable, in_dygraph_mode, OpProtoHolder, Parameter from ..param_attr import ParamAttr from ..initializer import Normal, Constant, NumpyArrayInitializer import numpy as np @@ -33,6 +33,109 @@ class Conv2D(layers.Layer): + """ + The convolution2D layer calculates the output based on the input, filter + and strides, paddings, dilations, groups parameters. Input and + Output are in NCHW format, where N is batch size, C is the number of + channels, H is the height of the feature, and W is the width of the feature. + Filter is in MCHW format, where M is the number of output image channels, + C is the number of input image channels, H is the height of the filter, + and W is the width of the filter. If the groups is greater than 1, + C will equal the number of input image channels divided by the groups. + Please refer to UFLDL's `convolution + `_ + for more detials. + If bias attribution and activation type are provided, bias is added to the + output of the convolution, and the corresponding activation function is + applied to the final result. + + For each input :math:`X`, the equation is: + + .. math:: + + Out = \sigma (W \\ast X + b) + + Where: + + * :math:`X`: Input value, a tensor with NCHW format. + * :math:`W`: Filter value, a tensor with MCHW format. + * :math:`\\ast`: Convolution operation. + * :math:`b`: Bias value, a 2-D tensor with shape [M, 1]. + * :math:`\\sigma`: Activation function. + * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different. + + Example: + + - Input: + + Input shape: :math:`(N, C_{in}, H_{in}, W_{in})` + + Filter shape: :math:`(C_{out}, C_{in}, H_f, W_f)` + + - Output: + + Output shape: :math:`(N, C_{out}, H_{out}, W_{out})` + + Where + + .. math:: + + H_{out}&= \\frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]} + 1 \\\\ + W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]} + 1 + + Args: + input (Variable): The input image with [N, C, H, W] format. + num_filters(int): The number of filter. It is as same as the output + image channel. + filter_size (int|tuple|None): The filter size. If filter_size is a tuple, + it must contain two integers, (filter_size_H, filter_size_W). + Otherwise, the filter will be a square. + stride (int|tuple): The stride size. If stride is a tuple, it must + contain two integers, (stride_H, stride_W). Otherwise, the + stride_H = stride_W = stride. Default: stride = 1. + padding (int|tuple): The padding size. If padding is a tuple, it must + contain two integers, (padding_H, padding_W). Otherwise, the + padding_H = padding_W = padding. Default: padding = 0. + dilation (int|tuple): The dilation size. If dilation is a tuple, it must + contain two integers, (dilation_H, dilation_W). Otherwise, the + dilation_H = dilation_W = dilation. Default: dilation = 1. + groups (int): The groups number of the Conv2d Layer. According to grouped + convolution in Alex Krizhevsky's Deep CNN paper: when group=2, + the first half of the filters is only connected to the first half + of the input channels, while the second half of the filters is only + connected to the second half of the input channels. Default: groups=1. + param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights + of conv2d. If it is set to None or one attribute of ParamAttr, conv2d + will create ParamAttr as param_attr. If the Initializer of the param_attr + is not set, the parameter is initialized with :math:`Normal(0.0, std)`, + and the :math:`std` is :math:`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`. Default: None. + bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of conv2d. + If it is set to False, no bias will be added to the output units. + If it is set to None or one attribute of ParamAttr, conv2d + will create ParamAttr as bias_attr. If the Initializer of the bias_attr + is not set, the bias is initialized zero. Default: None. + use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn + library is installed. Default: True + act (str): Activation type, if it is set to None, activation is not appended. + Default: None + name (str|None): A name for this layer(optional). If set None, the layer + will be named automatically. Default: None + + Returns: + Variable: The tensor variable storing the convolution and \ + non-linearity activation result. + + Raises: + ValueError: If the shapes of input, filter_size, stride, padding and + groups mismatch. + + Examples: + .. code-block:: python + + data = fluid.layers.data(name='data', shape=[3, 32, 32], dtype='float32') + conv2d = fluid.layers.conv2d(input=data, num_filters=2, filter_size=3, act="relu") + """ + def __init__(self, name_scope, num_channels, @@ -265,7 +368,7 @@ def __init__(self, self._param_attr = param_attr self._bias_attr = bias_attr - def _build_once(self, input): + def build_once(self, input): num_channels = input.shape[1] self._dtype = self._helper.input_dtype(input) @@ -332,6 +435,116 @@ def forward(self, input): class Conv3DTranspose(layers.Layer): + """ + **Convlution3D transpose layer** + + The convolution3D transpose layer calculates the output based on the input, + filter, and dilations, strides, paddings. Input(Input) and output(Output) + are in NCDHW format. Where N is batch size, C is the number of channels, + D is the depth of the feature, H is the height of the feature, and W + is the width of the feature. Parameters(dilations, strides, paddings) are + two elements. These two elements represent height and width, respectively. + The details of convolution transpose layer, please refer to the following + explanation and references `therein `_. + If bias attribution and activation type are provided, bias is added to + the output of the convolution, and the corresponding activation function + is applied to the final result. + + For each input :math:`X`, the equation is: + + .. math:: + + Out = \sigma (W \\ast X + b) + + In the above equation: + + * :math:`X`: Input value, a tensor with NCDHW format. + * :math:`W`: Filter value, a tensor with MCDHW format. + * :math:`\\ast`: Convolution operation. + * :math:`b`: Bias value, a 2-D tensor with shape [M, 1]. + * :math:`\\sigma`: Activation function. + * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different. + + Example: + + - Input: + + Input shape: :math:`(N, C_{in}, D_{in}, H_{in}, W_{in})` + + Filter shape: :math:`(C_{in}, C_{out}, D_f, H_f, W_f)` + + - Output: + + Output shape: :math:`(N, C_{out}, D_{out}, H_{out}, W_{out})` + + Where + + .. math:: + + D_{out} &= (D_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (D_f - 1) + 1 \\\\ + H_{out} &= (H_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (H_f - 1) + 1 \\\\ + W_{out} &= (W_{in} - 1) * strides[2] - 2 * paddings[2] + dilations[2] * (W_f - 1) + 1 + + Args: + input(Variable): The input image with [N, C, D, H, W] format. + num_filters(int): The number of the filter. It is as same as the output + image channel. + output_size(int|tuple|None): The output image size. If output size is a + tuple, it must contain three integers, (image_D, image_H, image_W). This + parameter only works when filter_size is None. + filter_size(int|tuple|None): The filter size. If filter_size is a tuple, + it must contain three integers, (filter_size_D, filter_size_H, filter_size_W). + Otherwise, the filter will be a square. None if use output size to + calculate filter_size. + padding(int|tuple): The padding size. If padding is a tuple, it must + contain three integers, (padding_D, padding_H, padding_W). Otherwise, the + padding_D = padding_H = padding_W = padding. Default: padding = 0. + stride(int|tuple): The stride size. If stride is a tuple, it must + contain three integers, (stride_D, stride_H, stride_W). Otherwise, the + stride_D = stride_H = stride_W = stride. Default: stride = 1. + dilation(int|tuple): The dilation size. If dilation is a tuple, it must + contain three integers, (dilation_D, dilation_H, dilation_W). Otherwise, the + dilation_D = dilation_H = dilation_W = dilation. Default: dilation = 1. + groups(int): The groups number of the Conv3d transpose layer. Inspired by + grouped convolution in Alex Krizhevsky's Deep CNN paper, in which + when group=2, the first half of the filters is only connected to the + first half of the input channels, while the second half of the + filters is only connected to the second half of the input channels. + Default: groups=1 + param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights + of conv3d_transpose. If it is set to None or one attribute of ParamAttr, conv3d_transpose + will create ParamAttr as param_attr. If the Initializer of the param_attr + is not set, the parameter is initialized with Xavier. Default: None. + bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of conv3d_transpose. + If it is set to False, no bias will be added to the output units. + If it is set to None or one attribute of ParamAttr, conv3d_transpose + will create ParamAttr as bias_attr. If the Initializer of the bias_attr + is not set, the bias is initialized zero. Default: None. + use_cudnn(bool): Use cudnn kernel or not, it is valid only when the cudnn + library is installed. Default: True + act (str): Activation type, if it is set to None, activation is not appended. + Default: None. + name(str|None): A name for this layer(optional). If set None, the layer + will be named automatically. + + Returns: + Variable: The tensor variable storing the convolution transpose result. + + Raises: + ValueError: If the shapes of input, filter_size, stride, padding and + groups mismatch. + + Examples: + .. code-block:: python + + conv3d_transpose = nn.Conv3DTranspose( + 'Conv3DTranspose', + num_filters=12, + filter_size=12, + use_cudnn=False) + transpose_res = conv3d_transpose(base.to_variable(input_array)) + """ + def __init__(self, name_scope, num_filters, @@ -362,7 +575,7 @@ def __init__(self, self._bias_attr = bias_attr self._act = act - def _build_once(self, input): + def build_once(self, input): self._dtype = self._helper.input_dtype(input) self._input_channel = input.shape[1] @@ -436,6 +649,54 @@ def forward(self, input): class Pool2D(layers.Layer): + """ + ${comment} + + Args: + input (Variable): The input tensor of pooling operator. The format of + input tensor is NCHW, where N is batch size, C is + the number of channels, H is the height of the + feature, and W is the width of the feature. + pool_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, + it must contain two integers, (pool_size_Height, pool_size_Width). + Otherwise, the pool kernel size will be a square of an int. + pool_type: ${pooling_type_comment} + pool_stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list, + it must contain two integers, (pool_stride_Height, pool_stride_Width). + Otherwise, the pool stride size will be a square of an int. + pool_padding (int|list|tuple): The pool padding size. If pool padding size is a tuple, + it must contain two integers, (pool_padding_on_Height, pool_padding_on_Width). + Otherwise, the pool padding size will be a square of an int. + global_pooling (bool): ${global_pooling_comment} + use_cudnn (bool): ${use_cudnn_comment} + ceil_mode (bool): ${ceil_mode_comment} + name (str|None): A name for this layer(optional). If set None, the + layer will be named automatically. + exclusive (bool): Whether to exclude padding points in average pooling + mode, default is true + + Returns: + Variable: The pooling result. + + Raises: + ValueError: If 'pool_type' is not "max" nor "avg" + ValueError: If 'global_pooling' is False and 'pool_size' is -1 + ValueError: If 'use_cudnn' is not a bool value. + + Examples: + + .. code-block:: python + + data = fluid.layers.data( + name='data', shape=[3, 32, 32], dtype='float32') + pool2d = fluid.Pool2D("pool2d",pool_size=2, + pool_type='max', + pool_stride=1, + global_pooling=False) + + pool2d_res = pool2d(data) + """ + def __init__(self, name_scope, pool_size=-1, @@ -495,6 +756,102 @@ def forward(self, input): class FC(layers.Layer): + """ + **Fully Connected Layer** + + This function creates a fully connected layer in the network. It can take + one or multiple tensors as its inputs(input can be a list of Variable, see + Args in detail). It creates a variable called weights for each input tensor, + which represents a fully connected weight matrix from each input unit to + each output unit. The fully connected layer multiplies each input tensor + with its corresponding weight to produce an output Tensor with shape [M, `size`], + where M is batch size. If multiple input tensors are given, the results of + multiple output tensors with shape [M, `size`] will be summed up. If bias_attr + is not None, a bias variable will be created and added to the output. + Finally, if activation is not None, it will be applied to the output as well. + + When the input is single tensor: + + .. math:: + + Out = Act({XW + b}) + + When the input are multiple tensors: + + .. math:: + + Out = Act({\sum_{i=0}^{N-1}X_iW_i + b}) + + In the above equation: + + * :math:`N`: Number of the input. N equals to len(input) if input is list of Variable. + * :math:`X_i`: The i-th input tensor. + * :math:`W_i`: The i-th weights matrix corresponding i-th input tensor. + * :math:`b`: The bias parameter created by this layer (if needed). + * :math:`Act`: The activation function. + * :math:`Out`: The output tensor. + + See below for an example. + + .. code-block:: text + + Given: + data_1.data = [[[0.1, 0.2], + [0.3, 0.4]]] + data_1.shape = (1, 2, 2) # 1 is batch_size + + data_2 = [[[0.1, 0.2, 0.3]]] + data_2.shape = (1, 1, 3) + + out = fluid.layers.fc(input=[data_1, data_2], size=2) + + Then: + out.data = [[0.18669507, 0.1893476]] + out.shape = (1, 2) + + Args: + input (Variable|list of Variable): The input tensor(s) of this layer, and the dimension of + the input tensor(s) is at least 2. + size(int): The number of output units in this layer. + num_flatten_dims (int, default 1): The fc layer can accept an input tensor with more than + two dimensions. If this happens, the multidimensional tensor will first be flattened + into a 2-dimensional matrix. The parameter `num_flatten_dims` determines how the input + tensor is flattened: the first `num_flatten_dims` (inclusive, index starts from 1) + dimensions will be flatten to form the first dimension of the final matrix (height of + the matrix), and the rest `rank(X) - num_flatten_dims` dimensions are flattened to + form the second dimension of the final matrix (width of the matrix). For example, suppose + `X` is a 5-dimensional tensor with a shape [2, 3, 4, 5, 6], and `num_flatten_dims` = 3. + Then, the flattened matrix will have a shape [2 x 3 x 4, 5 x 6] = [24, 30]. + param_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for learnable + parameters/weights of this layer. + bias_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for the bias + of this layer. If it is set to False, no bias will be added to the output units. + If it is set to None, the bias is initialized zero. Default: None. + act (str, default None): Activation to be applied to the output of this layer. + is_test(bool): A flag indicating whether execution is in test phase. + name (str, default None): The name of this layer. + + Returns: + Variable: The transformation result. + + Raises: + ValueError: If rank of the input tensor is less than 2. + + Examples: + .. code-block:: python + + # when input is single tensor + data = fluid.layers.data(name="data", shape=[32, 32], dtype="float32") + fc = fluid.FC("fc", size=1000, act="tanh") + fc_res = fc(data) + + # when input are multiple tensors + data_1 = fluid.layers.data(name="data_1", shape=[32, 32], dtype="float32") + data_2 = fluid.layers.data(name="data_2", shape=[24, 36], dtype="float32") + fc = fluid.FC("fc", size=1000, act="tanh") + fc_res = fc([data_1, data_2]) + """ + def __init__(self, name_scope, size, @@ -522,7 +879,7 @@ def _w(self, value, i=0): assert isinstance(value, Parameter) self.__w[i] = value - def _build_once(self, input): + def build_once(self, input): i = 0 for inp, param in self._helper.iter_inputs_and_params(input, self._param_attr): @@ -591,6 +948,91 @@ def forward(self, input): class BatchNorm(layers.Layer): + """ + **Batch Normalization Layer** + + Can be used as a normalizer function for conv2d and fully_connected operations. + The required data format for this layer is one of the following: + + 1. NHWC `[batch, in_height, in_width, in_channels]` + + 2. NCHW `[batch, in_channels, in_height, in_width]` + + Refer to `Batch Normalization: Accelerating Deep Network Training by Reducing + Internal Covariate Shift `_ + for more details. + + :math:`input` is the input features over a mini-batch. + + .. math:: + + \\mu_{\\beta} &\\gets \\frac{1}{m} \\sum_{i=1}^{m} x_i \\qquad &//\\ + \ mini-batch\ mean \\\\ + \\sigma_{\\beta}^{2} &\\gets \\frac{1}{m} \\sum_{i=1}^{m}(x_i - \\ + \\mu_{\\beta})^2 \\qquad &//\ mini-batch\ variance \\\\ + \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\ + \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\ + y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift + + + When use_global_stats = True, the :math:`\\mu_{\\beta}` + and :math:`\\sigma_{\\beta}^{2}` are not the statistics of one mini-batch. + They are global (or running) statistics. (It usually got from the + pre-trained model.) + The training and testing (or inference) have the same behavior: + + .. math:: + + \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\ + \\sigma_{\\beta}^{2} + \\epsilon}} \\\\ + y_i &\\gets \\gamma \\hat{x_i} + \\beta + + Args: + input(variable): The rank of input variable can be 2, 3, 4, 5. + act(string, Default None): Activation type, linear|relu|prelu|... + is_test (bool, Default False): A flag indicating whether it is in + test phrase or not. + momentum(float, Default 0.9): The value used for the moving_mean and + moving_var computation. The updated formula is: + :math:`moving\_mean = moving\_mean * momentum + new\_mean * (1. - momentum)` + :math:`moving\_var = moving\_var * momentum + new\_var * (1. - momentum)` + Default is 0.9. + epsilon(float, Default 1e-05): A value added to the denominator for + numerical stability. Default is 1e-5. + param_attr(ParamAttr|None): The parameter attribute for Parameter `scale` + of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm + will create ParamAttr as param_attr. If the Initializer of the param_attr + is not set, the parameter is initialized with Xavier. Default: None. + bias_attr(ParamAttr|None): The parameter attribute for the bias of batch_norm. + If it is set to None or one attribute of ParamAttr, batch_norm + will create ParamAttr as bias_attr. If the Initializer of the bias_attr + is not set, the bias is initialized zero. Default: None. + data_layout(string, default NCHW): NCHW|NHWC + in_place(bool, Default False): Make the input and output of batch norm reuse memory. + name(string, Default None): A name for this layer(optional). If set None, the layer + will be named automatically. + moving_mean_name(string, Default None): The name of moving_mean which store the global Mean. + moving_variance_name(string, Default None): The name of the moving_variance which store the global Variance. + do_model_average_for_mean_and_var(bool, Default False): Do model average for mean and variance or not. + fuse_with_relu (bool): if True, this OP performs relu after batch norm. + use_global_stats(bool, Default False): Whether to use global mean and + variance. In inference or test mode, set use_global_stats to true + or is_test to true, and the behavior is equivalent. + In train mode, when setting use_global_stats True, the global mean + and variance are also used during train period. + + Returns: + Variable: A tensor variable which is the result after applying batch normalization on the input. + + Examples: + + .. code-block:: python + fc = fluid.FC('fc', size=200, param_attr='fc1.w') + hidden1 = fc(x) + batch_norm = fluid.BatchNorm("batch_norm", 10) + hidden2 = batch_norm(hidden1) + """ + def __init__(self, name_scope, num_channels, @@ -629,7 +1071,7 @@ def __init__(self, dtype=self._dtype, default_initializer=Constant(1.0)) if use_global_stats and self._param_attr.learning_rate == 0.: - self._scale._stop_gradient = True + self._scale.stop_gradient = True self._bias = self.create_parameter( attr=self._param_attr, @@ -637,7 +1079,7 @@ def __init__(self, dtype=self._dtype, is_bias=True) if use_global_stats and self._param_attr.learning_rate == 0.: - self._bias._stop_gradient = True + self._bias.stop_gradient = True self._mean = self.create_parameter( attr=ParamAttr( @@ -647,7 +1089,7 @@ def __init__(self, do_model_average=do_model_average_for_mean_and_var), shape=param_shape, dtype=self._dtype) - self._mean._stop_gradient = True + self._mean.stop_gradient = True self._variance = self.create_parameter( attr=ParamAttr( @@ -657,7 +1099,7 @@ def __init__(self, do_model_average=do_model_average_for_mean_and_var), shape=param_shape, dtype=self._dtype) - self._variance._stop_gradient = True + self._variance.stop_gradient = True self._in_place = in_place self._momentum = momentum @@ -666,7 +1108,7 @@ def __init__(self, self._fuse_with_relu = fuse_with_relu self._use_global_stats = use_global_stats - def _build_once(self, input): + def build_once(self, input): pass def forward(self, input): @@ -747,7 +1189,7 @@ class Embedding(layers.Layer): dict_size = len(dataset.ids) input = fluid.layers.data(name='ids', shape=[32, 32], dtype='float32') - embedding = fluid.dygraph.Embedding(size=[dict_size, 16]) + embedding = fluid.Embedding(size=[dict_size, 16]) fc = embedding(input) """ @@ -797,70 +1239,70 @@ def forward(self, input): class LayerNorm(layers.Layer): - def __init__(self, - name_scope, - scale=True, - shift=True, - begin_norm_axis=1, - epsilon=1e-05, - param_attr=None, - bias_attr=None, - act=None): - """ - ${comment} + """ + ${comment} - The formula is as follows: + The formula is as follows: - .. math:: + .. math:: - \\mu & = \\frac{1}{H}\\sum_{i=1}^{H} a_i + \\mu & = \\frac{1}{H}\\sum_{i=1}^{H} a_i - \\sigma & = \\sqrt{\\frac{1}{H}\sum_{i=1}^{H}(a_i - \\mu)^2} + \\sigma & = \\sqrt{\\frac{1}{H}\sum_{i=1}^{H}(a_i - \\mu)^2} - h & = f(\\frac{g}{\\sigma}(a - \\mu) + b) + h & = f(\\frac{g}{\\sigma}(a - \\mu) + b) - * :math:`a`: the vector representation of the summed inputs to the neurons - in that layer. + * :math:`a`: the vector representation of the summed inputs to the neurons + in that layer. - * :math:`H`: the number of hidden units in a layers + * :math:`H`: the number of hidden units in a layers - * :math:`g`: the trainable scale parameter. + * :math:`g`: the trainable scale parameter. - * :math:`b`: the trainable bias parameter. + * :math:`b`: the trainable bias parameter. - Args: - input(Variable): The input tensor variable. - scale(bool): Whether to learn the adaptive gain :math:`g` after - normalization. Default True. - shift(bool): Whether to learn the adaptive bias :math:`b` after - normalization. Default True. - begin_norm_axis(int): The normalization will be performed along - dimensions from :attr:`begin_norm_axis` to :attr:`rank(input)`. - Default 1. - epsilon(float): The small value added to the variance to prevent - division by zero. Default 1e-05. - param_attr(ParamAttr|None): The parameter attribute for the learnable - gain :math:`g`. If :attr:`scale` is False, :attr:`param_attr` is - omitted. If :attr:`scale` is True and :attr:`param_attr` is None, - a default :code:`ParamAttr` would be added as scale. The - :attr:`param_attr` is initialized as 1 if it is added. Default None. - bias_attr(ParamAttr|None): The parameter attribute for the learnable - bias :math:`b`. If :attr:`shift` is False, :attr:`bias_attr` is - omitted. If :attr:`shift` is True and :attr:`param_attr` is None, - a default :code:`ParamAttr` would be added as bias. The - :attr:`bias_attr` is initialized as 0 if it is added. Default None. - act(str): Activation to be applied to the output of layer normalizaiton. - Default None. - Returns: - ${y_comment} + Args: + input(Variable): The input tensor variable. + scale(bool): Whether to learn the adaptive gain :math:`g` after + normalization. Default True. + shift(bool): Whether to learn the adaptive bias :math:`b` after + normalization. Default True. + begin_norm_axis(int): The normalization will be performed along + dimensions from :attr:`begin_norm_axis` to :attr:`rank(input)`. + Default 1. + epsilon(float): The small value added to the variance to prevent + division by zero. Default 1e-05. + param_attr(ParamAttr|None): The parameter attribute for the learnable + gain :math:`g`. If :attr:`scale` is False, :attr:`param_attr` is + omitted. If :attr:`scale` is True and :attr:`param_attr` is None, + a default :code:`ParamAttr` would be added as scale. The + :attr:`param_attr` is initialized as 1 if it is added. Default None. + bias_attr(ParamAttr|None): The parameter attribute for the learnable + bias :math:`b`. If :attr:`shift` is False, :attr:`bias_attr` is + omitted. If :attr:`shift` is True and :attr:`param_attr` is None, + a default :code:`ParamAttr` would be added as bias. The + :attr:`bias_attr` is initialized as 0 if it is added. Default None. + act(str): Activation to be applied to the output of layer normalizaiton. + Default None. + Returns: + ${y_comment} - Examples: + Examples: - >>> data = fluid.layers.data(name='data', shape=[3, 32, 32], - >>> dtype='float32') - >>> x = fluid.layers.layer_norm(input=data, begin_norm_axis=1) - """ + >>> data = fluid.layers.data(name='data', shape=[3, 32, 32], + >>> dtype='float32') + >>> x = fluid.layers.layer_norm(input=data, begin_norm_axis=1) + """ + def __init__(self, + name_scope, + scale=True, + shift=True, + begin_norm_axis=1, + epsilon=1e-05, + param_attr=None, + bias_attr=None, + act=None): super(LayerNorm, self).__init__(name_scope) self._scale = scale self._shift = shift @@ -870,7 +1312,7 @@ def __init__(self, self._bias_attr = bias_attr self._act = act - def _build_once(self, input): + def build_once(self, input): self._dtype = self._helper.input_dtype(input) input_shape = input.shape param_shape = [ @@ -1232,7 +1674,7 @@ def _init_by_numpy_array(numpy_array): 'remote_prefetch': remote_prefetch } - def _build_once(self, input, label, sample_weight=None): + def build_once(self, input, label, sample_weight=None): assert isinstance(input, Variable) assert isinstance(label, Variable) @@ -1318,7 +1760,7 @@ def __init__(self, name_scope, mode, param_attr=None): raise ValueError('mode should be one of all, channel, element.') self._alpha_shape = [1] - def _build_once(self, input): + def build_once(self, input): if self._mode == 'channel': self._alpha_shape = [1, input.shape[1], 1, 1] elif self._mode == 'element': @@ -1396,7 +1838,7 @@ def __init__(self, self._name = name self._inputs = dict() - def _build_once(self, x, y): + def build_once(self, x, y): self._dtype = self._helper.input_dtype(x) param_shape = [self._size, x.shape[1], y.shape[1]] @@ -1572,7 +2014,7 @@ def __init__(self, self._output_size = output_size self._op_type = 'conv2d_transpose' - def _build_once(self, input): + def build_once(self, input): input_channel = input.shape[1] if (input_channel == self._groups and self._num_filters == input_channel and not self._use_cudnn): @@ -1686,7 +2128,7 @@ def __init__(self, bias_attr=None, param_attr=None, act=None): - assert not _in_dygraph_mode( + assert not in_dygraph_mode( ), "SequenceConv is not supported by dynamic graph mode yet!" super(SequenceConv, self).__init__(name_scope) self._num_filters = num_filters @@ -1696,7 +2138,7 @@ def __init__(self, self._bias_attr = bias_attr self._param_attr = param_attr - def _build_once(self, input): + def build_once(self, input): self._dtype = self._helper.input_dtype(input) filter_shape = [self._filter_size * input.shape[1], self._num_filters] self._filter_param = self.create_parameter( @@ -1726,14 +2168,14 @@ def __init__(self, future_context_size, param_attr=None, act=None): - assert not _in_dygraph_mode( + assert not in_dygraph_mode( ), "RowConv is not supported by dynamic graph mode yet!" super(RowConv, self).__init__(name_scope) self._act = act self._param_attr = param_attr self._future_context_size = future_context_size - def _build_once(self, input): + def build_once(self, input): self._dtype = self._helper.input_dtype(input) filter_shape = [self._future_context_size + 1, input.shape[1]] self._filter_param = self.create_parameter( @@ -1796,7 +2238,7 @@ def __init__(self, if data_layout != 'NCHW': raise ValueError("unsupported data layout:" + data_layout) - def _build_once(self, input): + def build_once(self, input): self._dtype = self._helper.input_dtype(input) param_shape = [input.shape[1]] if self._bias_attr: @@ -1849,7 +2291,7 @@ def __init__(self, name_scope, dim=0, power_iters=1, eps=1e-12, name=None): self._eps = eps self._dim = dim - def _build_once(self, weight): + def build_once(self, weight): self._dtype = self._helper.input_dtype(weight) input_shape = weight.shape h = input_shape[self._dim] @@ -1904,7 +2346,7 @@ def __init__(self, self._bias_attr = bias_attr self._param_attr = param_attr - def _build_once(self, nodes_vector, edge_set): + def build_once(self, nodes_vector, edge_set): assert isinstance(nodes_vector, Variable) assert isinstance(edge_set, Variable) self._dtype = self._helper.input_dtype(nodes_vector) diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 7953d98bcbb826..c05e5fb9e3a46e 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -67,6 +67,7 @@ 'cuda_places', 'cpu_places', 'cuda_pinned_places', + 'in_dygraph_mode', ] EMPTY_VAR_NAME = core.kEmptyVarName() @@ -79,7 +80,10 @@ _dygraph_current_expected_place_ = None -def _in_dygraph_mode(): +def in_dygraph_mode(): + ''' + Returns(bool): True if the program is running in dynamic graph mode + ''' return _dygraph_tracer_ is not None @@ -396,7 +400,7 @@ def __init__(self, if not isinstance(dtype, core.VarDesc.VarType): dtype = convert_np_dtype_to_dtype_(dtype) - if _in_dygraph_mode(): + if in_dygraph_mode(): # record vars in tracer rather than blocks self._ivar = kwargs.get("ivar", None) if not self._ivar: @@ -482,21 +486,21 @@ def __init__(self, self.block.vars[name] = self self.op = None - self.stop_gradient = stop_gradient + self._stop_gradient = stop_gradient self.is_data = is_data - def _numpy(self): + def numpy(self): new_ivar = self._ivar._copy_to(core.CPUPlace(), True) return np.array(new_ivar.value().get_tensor()) - def _backward(self): + def backward(self): self._ivar._run_backward() - def _gradient(self): + def gradient(self): new_ivar = self._ivar._grad_ivar()._copy_to(core.CPUPlace(), True) return np.array(new_ivar.value().get_tensor()) - def _clear_gradient(self): + def clear_gradient(self): self._ivar._clear_gradient() def __str__(self): @@ -516,7 +520,7 @@ def to_string(self, throw_on_error, with_details=False): Returns: str: The debug string. """ - if _in_dygraph_mode(): + if in_dygraph_mode(): # TODO(panyx0718): add more dygraph debug info. return 'name %s, dtype: %s shape: %s' % (self.name, self.dtype, self.shape) @@ -535,7 +539,7 @@ def to_string(self, throw_on_error, with_details=False): __repr__ = __str__ - def _set_desc(self, input): + def set_desc(self, input): """ Set the variable description. @@ -548,43 +552,43 @@ def _set_desc(self, input): self.desc = input @property - def _stop_gradient(self): - if _in_dygraph_mode(): + def stop_gradient(self): + if in_dygraph_mode(): return self._ivar.stop_gradient else: - return self.stop_gradient + return self._stop_gradient - @_stop_gradient.setter - def _stop_gradient(self, s): - if _in_dygraph_mode(): + @stop_gradient.setter + def stop_gradient(self, s): + if in_dygraph_mode(): self._ivar.stop_gradient = s else: - self.stop_gradient = s + self._stop_gradient = s @property def persistable(self): - if _in_dygraph_mode(): + if in_dygraph_mode(): return self._ivar.persistable else: return self.desc.persistable() @persistable.setter def persistable(self, p): - if _in_dygraph_mode(): + if in_dygraph_mode(): return self._ivar.persistable else: self.desc.set_persistable(p) @property def name(self): - if _in_dygraph_mode(): + if in_dygraph_mode(): return self._ivar.name else: return cpt.to_text(self.desc.name()) @name.setter def name(self, new_name): - if _in_dygraph_mode(): + if in_dygraph_mode(): self._ivar.name = new_name else: self.desc.set_name(new_name) @@ -592,14 +596,14 @@ def name(self, new_name): @property def shape(self): # convert to tuple, make it as same as numpy API. - if _in_dygraph_mode(): + if in_dygraph_mode(): return self._ivar.shape else: return tuple(self.desc.shape()) @property def dtype(self): - if _in_dygraph_mode(): + if in_dygraph_mode(): return self._ivar.dtype else: return self.desc.dtype() @@ -611,7 +615,7 @@ def lod_level(self): @property def type(self): - if _in_dygraph_mode(): + if in_dygraph_mode(): return self._ivar.dtype else: return self.desc.type() @@ -721,7 +725,7 @@ def _cloneVar(self, copy=False): name=unique_name.generate(".".join(self.name)), dtype=self.dtype, persistable=self.persistable, - stop_gradient=self._stop_gradient, ) + stop_gradient=self.stop_gradient, ) else: return self @@ -930,7 +934,7 @@ def __init__(self, inputs=None, outputs=None, attrs=None): - if _in_dygraph_mode(): + if in_dygraph_mode(): if type is None: raise ValueError( "`type` to initialized an Operator can not be None.") @@ -1049,7 +1053,7 @@ def find_name(var_list, name): for arg in out_args: out_arg_names.append(cpt.to_text(arg.name)) # TODO(minqiyang): could we remove variable's op in static mode? - if not _in_dygraph_mode(): + if not in_dygraph_mode(): arg.op = self self.desc.set_output(out_proto.name, out_arg_names) @@ -1095,7 +1099,7 @@ def __str__(self): @property def type(self): - if _in_dygraph_mode(): + if in_dygraph_mode(): return self.iop.type else: return self.desc.type() @@ -1638,7 +1642,7 @@ def append_op(self, *args, **kwargs): Returns: Operator: the append Operator. """ - if _in_dygraph_mode(): + if in_dygraph_mode(): op = Operator( block=self, desc=None, @@ -1710,7 +1714,7 @@ def _slice_ops(self, start, end): return self.ops[start:end] def _prepend_op(self, *args, **kwargs): - if _in_dygraph_mode(): + if in_dygraph_mode(): op = Operator( self, None, diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py index 6aff93dceaf5cf..da2591b98058a2 100644 --- a/python/paddle/fluid/initializer.py +++ b/python/paddle/fluid/initializer.py @@ -165,7 +165,7 @@ def __call__(self, var, block): 'force_cpu': self._force_cpu or force_init_on_cpu() }, stop_gradient=True) - if not framework._in_dygraph_mode(): + if not framework.in_dygraph_mode(): var.op = op return op @@ -245,7 +245,7 @@ def __call__(self, var, block): attrs={"in_dtype": out_var.dtype, "out_dtype": var.dtype}) - if not framework._in_dygraph_mode(): + if not framework.in_dygraph_mode(): var.op = op return op @@ -324,7 +324,7 @@ def __call__(self, var, block): outputs={"Out": var}, attrs={"in_dtype": out_var.dtype, "out_dtype": var.dtype}) - if not framework._in_dygraph_mode(): + if not framework.in_dygraph_mode(): var.op = op return op @@ -403,7 +403,7 @@ def __call__(self, var, block): outputs={"Out": var}, attrs={"in_dtype": out_var.dtype, "out_dtype": var.dtype}) - if not framework._in_dygraph_mode(): + if not framework.in_dygraph_mode(): var.op = op return op @@ -509,7 +509,7 @@ def __call__(self, var, block): "seed": self._seed }, stop_gradient=True) - if not framework._in_dygraph_mode(): + if not framework.in_dygraph_mode(): var.op = op return op @@ -610,7 +610,7 @@ def __call__(self, var, block): "seed": self._seed }, stop_gradient=True) - if not framework._in_dygraph_mode(): + if not framework.in_dygraph_mode(): var.op = op return op @@ -709,7 +709,7 @@ def __call__(self, var, block): 'shape': list(shape), value_name: values }) - if not framework._in_dygraph_mode(): + if not framework.in_dygraph_mode(): var.op = op return op @@ -768,7 +768,7 @@ def __call__(self, var, block): value_name: values }, stop_gradient=True) - if not framework._in_dygraph_mode(): + if not framework.in_dygraph_mode(): var.op = op return op diff --git a/python/paddle/fluid/layer_helper.py b/python/paddle/fluid/layer_helper.py index 7eb912645e5077..11e3c4938bef4a 100644 --- a/python/paddle/fluid/layer_helper.py +++ b/python/paddle/fluid/layer_helper.py @@ -17,7 +17,7 @@ import copy import six -from .framework import Parameter, dtype_is_floating, _in_dygraph_mode +from .framework import Parameter, dtype_is_floating, in_dygraph_mode from . import unique_name from paddle.fluid.initializer import Constant, Xavier from .param_attr import ParamAttr diff --git a/python/paddle/fluid/layer_helper_base.py b/python/paddle/fluid/layer_helper_base.py index 869a5f54e9cdf5..9eed00b16185d0 100644 --- a/python/paddle/fluid/layer_helper_base.py +++ b/python/paddle/fluid/layer_helper_base.py @@ -17,7 +17,7 @@ import copy import numpy as np -from .framework import Variable, default_main_program, default_startup_program, _in_dygraph_mode, _current_expected_place +from .framework import Variable, default_main_program, default_startup_program, in_dygraph_mode, _current_expected_place from . import unique_name from .param_attr import ParamAttr, WeightNormParamAttr from . import core @@ -54,7 +54,7 @@ def to_variable(self, value, block=None): Return Variable construct from value """ if isinstance(value, np.ndarray): - assert _in_dygraph_mode( + assert in_dygraph_mode( ), "to_variable could only be called in dygraph mode" if not block: @@ -302,7 +302,7 @@ def create_parameter(self, param = self._create_weight_normalize(attr, shape, dtype) WeightNormParamAttr.params_with_weight_norm.append(param) return param - if _in_dygraph_mode(): + if in_dygraph_mode(): # In dygraph mode, we want the returned parameter to be # initialized so that it can be used imperatively. return self.main_program.global_block().create_parameter( @@ -370,7 +370,7 @@ def set_variable_initializer(self, var, initializer): initializer: initializer to use """ assert isinstance(var, Variable) - if _in_dygraph_mode(): + if in_dygraph_mode(): initializer(var, var.block) else: self.startup_program.global_block().create_var( diff --git a/python/paddle/fluid/layers/learning_rate_scheduler.py b/python/paddle/fluid/layers/learning_rate_scheduler.py index b7d1eeba80d93d..72a2c845b9e8b2 100644 --- a/python/paddle/fluid/layers/learning_rate_scheduler.py +++ b/python/paddle/fluid/layers/learning_rate_scheduler.py @@ -349,24 +349,26 @@ def cosine_decay(learning_rate, step_each_epoch, epochs): training progresses. By using this function, the learning rate will be decayed by following cosine decay strategy. - decayed_lr = learning_rate * 0.5 * (math.cos(epoch * math.pi / epochs) + 1) + .. math:: + + decayed\_lr = learning\_rate * 0.5 * (math.cos * (epoch * \\frac{math.pi}{epochs} ) + 1) Args: learning_rate(Variable|float): The initial learning rate. step_each_epoch(int): the number of steps in an epoch. epochs(int): the number of epochs. - Returns: - Variable: The decayed learning rate. - - Examples: + Returns: + Variable: The decayed learning rate. - ..code-block:: python + Examples: + .. code-block:: python - base_lr = 0.1 - lr = fluid.layers.cosine_decay( - learning_rate = base_lr, step_each_epoch=10000, epochs=120) + base_lr = 0.1 + lr = fluid.layers.cosine_decay( + learning_rate = base_lr, step_each_epoch=10000, epochs=120) """ + with default_main_program()._lr_schedule_guard(): if imperative_base.enabled(): decay = imperate_lr.CosineDecay(learning_rate, step_each_epoch, diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 91414fdeb20778..7dbd26cb9a24df 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -23,7 +23,7 @@ import inspect from ..layer_helper import LayerHelper from ..initializer import Normal, Constant, NumpyArrayInitializer -from ..framework import Variable, OpProtoHolder, _in_dygraph_mode +from ..framework import Variable, OpProtoHolder, in_dygraph_mode from ..dygraph import base from ..param_attr import ParamAttr from .layer_function_generator import autodoc, templatedoc, _generate_doc_string_ @@ -191,6 +191,7 @@ 'kldiv_loss', 'tree_conv', 'npair_loss', + 'pixel_shuffle', 'fsp_matrix', ] @@ -3288,7 +3289,7 @@ def layer_norm(input, >>> dtype='float32') >>> x = fluid.layers.layer_norm(input=data, begin_norm_axis=1) """ - assert _in_dygraph_mode( + assert in_dygraph_mode( ) is not True, "please use FC instead of fc in dygraph mode!" helper = LayerHelper('layer_norm', **locals()) dtype = helper.input_dtype() @@ -4792,7 +4793,7 @@ def l2_normalize(x, axis, epsilon=1e-12, name=None): the dimension to normalization is rank(X) + axis. -1 is the last dimension. epsilon(float): The epsilon value is used to avoid division by zero, \ - the defalut value is 1e-10. + the defalut value is 1e-12. name(str|None): A name for this layer(optional). If set None, the layer \ will be named automatically. @@ -6454,7 +6455,7 @@ def squeeze(input, axes, name=None): x = layers.data(name='x', shape=[5, 1, 10]) y = layers.sequeeze(input=x, axes=[1]) """ - assert not _in_dygraph_mode(), ( + assert not in_dygraph_mode(), ( "squeeze layer is not supported in dygraph mode yet.") helper = LayerHelper("squeeze", **locals()) out = helper.create_variable_for_type_inference(dtype=input.dtype) @@ -9193,7 +9194,7 @@ def _elementwise_op(helper): op_type = helper.layer_type x = helper.kwargs.get('x', None) y = helper.kwargs.get('y', None) - if _in_dygraph_mode(): + if in_dygraph_mode(): x = base.to_variable(x) y = base.to_variable(y) @@ -10923,6 +10924,65 @@ def npair_loss(anchor, positive, labels, l2_reg=0.002): return l2loss + celoss +def pixel_shuffle(x, upscale_factor): + """ + + **Pixel Shuffle Layer** + + This layer rearranges elements in a tensor of shape [N, C, H, W] + to a tensor of shape [N, C/r**2, H*r, W*r]. + This is useful for implementing efficient sub-pixel convolution + with a stride of 1/r. + Please refer to the paper: `Real-Time Single Image and Video Super-Resolution + Using an Efficient Sub-Pixel Convolutional Neural Network `_ . + by Shi et. al (2016) for more details. + + .. code-block:: text + + Given a 4-D tensor with the shape: + x.shape = [1, 9, 4, 4] + Given upscale_factor: + upscale_factor= 3 + output shape is: + [1, 1, 12, 12] + + Args: + + x(Variable): The input tensor variable. + upscale_factor(int): factor to increase spatial resolution + + Returns: + + Out(Variable): Reshaped tensor according to the new dimension. + + Raises: + + ValueError: If the square of upscale_factor cannot divide the channels of input. + + Examples: + + .. code-block:: python + + input = fluid.layers.data(shape=[9,4,4]) + output = fluid.layers.pixel_shuffle(x=input, upscale_factor=3) + + """ + + helper = LayerHelper("pixel_shuffle", **locals()) + + out = helper.create_variable_for_type_inference(dtype=x.dtype) + + if not isinstance(upscale_factor, int): + raise TypeError("upscale factor must be int type") + + helper.append_op( + type="pixel_shuffle", + inputs={"X": x}, + outputs={"Out": out}, + attrs={"upscale_factor": upscale_factor}) + return out + + def fsp_matrix(x, y): """ diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 94bc3d0854d5b1..c3b7aee2b4d242 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -55,7 +55,7 @@ class Optimizer(object): """ def __init__(self, learning_rate, regularization=None, name=None): - if framework._in_dygraph_mode(): + if framework.in_dygraph_mode(): if not isinstance(learning_rate, float) and \ not isinstance(learning_rate, LearningRateDecay): raise TypeError( @@ -205,7 +205,7 @@ def _add_accumulator(self, name = self._name + "_" + name if (name in self._accumulators and param.name in self._accumulators[name]): - if framework._in_dygraph_mode(): + if framework.in_dygraph_mode(): return self._accumulators[name][param.name] raise Exception("Accumulator {} already exists for parameter {}". format(name, param.name)) @@ -275,15 +275,26 @@ def _create_optimization_pass(self, parameters_and_grads): self._create_global_learning_rate() optimize_ops = [] - for param_and_grad in parameters_and_grads: - if param_and_grad[1] is None: - continue - with param_and_grad[0].block.program._optimized_guard( - param_and_grad), name_scope("optimizer"): - if param_and_grad[0].trainable is True: - optimize_op = self._append_optimize_op(global_block, - param_and_grad) - optimize_ops.append(optimize_op) + if framework.in_dygraph_mode(): + for param_and_grad in parameters_and_grads: + if param_and_grad[1] is None: + continue + with param_and_grad[0].block.program._optimized_guard( + param_and_grad): + if param_and_grad[0].trainable is True: + optimize_op = self._append_optimize_op(global_block, + param_and_grad) + optimize_ops.append(optimize_op) + else: + for param_and_grad in parameters_and_grads: + if param_and_grad[1] is None: + continue + with param_and_grad[0].block.program._optimized_guard( + param_and_grad), name_scope("optimizer"): + if param_and_grad[0].trainable is True: + optimize_op = self._append_optimize_op(global_block, + param_and_grad) + optimize_ops.append(optimize_op) # Get custom finish ops for subclasses # FIXME: Need to fix this once we figure out how to handle dependencies @@ -363,7 +374,7 @@ def backward(self, See examples in `apply_gradients`. """ self._dtype = loss.dtype - if framework._in_dygraph_mode(): + if framework.in_dygraph_mode(): if parameter_list is not None: parameters = parameter_list else: @@ -448,7 +459,7 @@ def apply_optimize(self, loss, startup_program, params_grads): Returns: list: A list of operators appended to the current program. """ - if framework._in_dygraph_mode(): + if framework.in_dygraph_mode(): with program_guard(framework.default_main_program(), framework.default_startup_program()): optimize_ops = self._create_optimization_pass(params_grads) @@ -628,16 +639,16 @@ class DGCMomentumOptimizer(MomentumOptimizer): Original paper is https://arxiv.org/abs/1712.01887 - DGC reduce the communication bandwidth by sending only the important gradients (sparse update):\ + DGC reduces the communication bandwidth by sending only the important gradients (sparse update):\ only gradients larger than a threshold are transmitted. - To avoid losing information, DGC accumulate the rest of the gradients locally. + To avoid losing information, DGC accumulates the rest of the gradients locally. Eventually, these gradients become large enough to be transmitted. - Thus, DGC send the large gradients immediately but eventually send all of the gradients over time. + Thus, DGC sends the large gradients immediately but eventually send all of the gradients over time. - To ensure no loss of accuracy, DGC employs momentum correc-tionandlocal gradient clipping on top of the gradient sparsification to maintain model performance. + To ensure no loss of accuracy, DGC employs momentum correction and local gradient clipping on top of the gradient sparsification to maintain model performance. DGC also uses momentum factor masking and warmup training to overcome the staleness problem caused by reduced communication. @@ -652,7 +663,7 @@ class DGCMomentumOptimizer(MomentumOptimizer): learning_rate (float|Variable): the learning rate used to update parameters. \ Can be a float value or a Variable with one float value as data element. momentum (float): Momentum factor. - rampup_begin_step (int): The begining step from which gradient compression is implemented. + rampup_begin_step (int): The beginning step from which gradient compression is implemented. rampup_step (int): How long it use the sparsity periods. Default is 1. for example: If the sparsity is [0.75, 0.9375, 0.984375, 0.996, 0.999], and the rampup_step is 5, \ it will use 0.75 at 0 step, and 0.9375 at 1 step, and so on. And when reach sparsity array ends, \ @@ -660,9 +671,9 @@ class DGCMomentumOptimizer(MomentumOptimizer): sparsity (list[float]): Get top important element from gradient tensor, the ratio is (1 - current sparsity). use_nesterov (bool): Enables Nesterov momentum. True means use nesterov. local_grad_clip_norm (float): Clip norm value if needed. - num_trainers: The number of training node. + num_trainers: The number of training nodes. regularization: A Regularizer, such as fluid.regularizer.L2DecayRegularizer. - name: A optional name prefix. + name: An optional name prefix. Examples: .. code-block:: python diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index f99759cdaaf374..3fa54f1ed09ac2 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -79,6 +79,7 @@ list(REMOVE_ITEM TEST_OPS test_bilinear_interp_op) list(REMOVE_ITEM TEST_OPS test_nearest_interp_op) list(REMOVE_ITEM TEST_OPS test_imperative_resnet) list(REMOVE_ITEM TEST_OPS test_imperative_mnist) +list(REMOVE_ITEM TEST_OPS test_imperative_se_resnext) list(REMOVE_ITEM TEST_OPS test_ir_memory_optimize_transformer) list(REMOVE_ITEM TEST_OPS test_layers) foreach(TEST_OP ${TEST_OPS}) @@ -92,6 +93,8 @@ py_test_modules(test_imperative_resnet MODULES test_imperative_resnet ENVS FLAGS_cudnn_deterministic=1) py_test_modules(test_imperative_mnist MODULES test_imperative_mnist ENVS FLAGS_cudnn_deterministic=1) +py_test_modules(test_imperative_se_resnext MODULES test_imperative_se_resnext SERIAL ENVS + FLAGS_cudnn_deterministic=1) if(WITH_DISTRIBUTE) py_test_modules(test_dist_train MODULES test_dist_train SERIAL) set_tests_properties(test_listen_and_serv_op PROPERTIES TIMEOUT 20) diff --git a/python/paddle/fluid/tests/unittests/test_base_layer.py b/python/paddle/fluid/tests/unittests/test_base_layer.py index 9cb88d4a8553f3..04a36f7cafe7b4 100644 --- a/python/paddle/fluid/tests/unittests/test_base_layer.py +++ b/python/paddle/fluid/tests/unittests/test_base_layer.py @@ -18,7 +18,7 @@ import paddle.fluid as fluid -class L1(fluid.dygraph.Layer): +class L1(fluid.Layer): def __init__(self, prefix): super(L1, self).__init__(prefix) self._param_attr = fluid.ParamAttr( @@ -32,7 +32,7 @@ def forward(self): return self.w1 + self.w2 -class L2(fluid.dygraph.Layer): +class L2(fluid.Layer): def __init__(self, prefix): super(L2, self).__init__(prefix) self.layer1 = L1(self.full_name()) @@ -42,7 +42,7 @@ def forward(self): return self.layer1() + self.layer2() -class L3(fluid.dygraph.Layer): +class L3(fluid.Layer): def __init__(self, prefix): super(L3, self).__init__(prefix) self.layer1 = L2(self.full_name()) @@ -59,7 +59,7 @@ def test_one_level(self): ret = l() self.assertEqual(l.w1.name, "test_one_level/L1_0.w_0") self.assertEqual(l.w2.name, "test_one_level/L1_0.w_1") - self.assertTrue(np.allclose(ret._numpy(), 0.2 * np.ones([2, 2]))) + self.assertTrue(np.allclose(ret.numpy(), 0.2 * np.ones([2, 2]))) def test_three_level(self): with fluid.dygraph.guard(): @@ -72,7 +72,7 @@ def test_three_level(self): self.assertEqual(names[3], "test_three_level/L3_0/L2_0/L1_1.w_1") self.assertEqual(names[4], "test_three_level/L3_0/L2_1/L1_0.w_0") self.assertEqual(names[5], "test_three_level/L3_0/L2_1/L1_0.w_1") - self.assertTrue(np.allclose(ret._numpy(), 0.8 * np.ones([2, 2]))) + self.assertTrue(np.allclose(ret.numpy(), 0.8 * np.ones([2, 2]))) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_imperative_basic.py b/python/paddle/fluid/tests/unittests/test_imperative_basic.py index 13f2d662178c7e..bc95b90ce4cc1b 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_basic.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_basic.py @@ -18,11 +18,11 @@ import paddle.fluid as fluid from paddle.fluid import core -from paddle.fluid.dygraph.nn import FC +from paddle.fluid import FC from test_imperative_base import new_program_scope -class MyLayer(fluid.dygraph.Layer): +class MyLayer(fluid.Layer): def __init__(self, name_scope): super(MyLayer, self).__init__(name_scope) @@ -34,7 +34,7 @@ def forward(self, inputs): return [x] -class MyPyLayer(fluid.dygraph.PyLayer): +class MyPyLayer(fluid.PyLayer): def __init__(self): super(MyPyLayer, self).__init__() @@ -48,7 +48,7 @@ def backward(inputs): return np.array(dout) * (1 - np.square(np.array(out))) -class MLP(fluid.dygraph.Layer): +class MLP(fluid.Layer): def __init__(self, name_scope): super(MLP, self).__init__(name_scope) self._fc1 = FC(self.full_name(), @@ -71,7 +71,7 @@ def forward(self, inputs): return x -class SimpleRNNCell(fluid.dygraph.Layer): +class SimpleRNNCell(fluid.Layer): def __init__(self, name_scope, step_input_size, hidden_size, output_size, param_attr): super(SimpleRNNCell, self).__init__(name_scope) @@ -81,7 +81,7 @@ def __init__(self, name_scope, step_input_size, hidden_size, output_size, self._dtype = core.VarDesc.VarType.FP32 self.param_attr = param_attr - def _build_once(self, inputs, pre_hidden): + def build_once(self, inputs, pre_hidden): i2h_param_shape = [self.step_input_size, self.hidden_size] h2h_param_shape = [self.hidden_size, self.hidden_size] h2o_param_shape = [self.output_size, self.hidden_size] @@ -159,7 +159,7 @@ def forward(self, input, pre_hidden): return reduce_out, hidden -class SimpleRNN(fluid.dygraph.Layer): +class SimpleRNN(fluid.Layer): def __init__(self, name_scope): super(SimpleRNN, self).__init__(name_scope) self.seq_len = 4 @@ -200,22 +200,22 @@ def test_sum_op(self): inputs.append(fluid.dygraph.base.to_variable(x)) ret = fluid.layers.sums(inputs) loss = fluid.layers.reduce_sum(ret) - loss._backward() - self.assertTrue(np.allclose(ret._numpy(), x * 10)) - self.assertTrue(np.allclose(inputs[0]._gradient(), x)) + loss.backward() + self.assertTrue(np.allclose(ret.numpy(), x * 10)) + self.assertTrue(np.allclose(inputs[0].gradient(), x)) def test_layer(self): with fluid.dygraph.guard(): cl = core.Layer() cl.forward([]) - l = fluid.dygraph.Layer("l") + l = fluid.Layer("l") self.assertRaises(NotImplementedError, l.forward, []) def test_pylayer_func_id(self): with fluid.dygraph.guard(): - class PyLayer1(fluid.dygraph.PyLayer): + class PyLayer1(fluid.PyLayer): def __init__(self): super(PyLayer1, self).__init__() @@ -257,9 +257,9 @@ def test_pylayer(self): my_py_layer = MyPyLayer() var_inp = fluid.dygraph.base.to_variable(np_inp) outs = my_py_layer(var_inp) - dy_out = np.sum(outs[0]._numpy()) - outs[0]._backward() - dy_grad = var_inp._gradient() + dy_out = np.sum(outs[0].numpy()) + outs[0].backward() + dy_grad = var_inp.gradient() with new_program_scope(): inp = fluid.layers.data( @@ -287,9 +287,9 @@ def test_layer_in_out(self): l = MyLayer("my_layer") x = l(var_inp)[0] self.assertIsNotNone(x) - dy_out = x._numpy() - x._backward() - dy_grad = l._x_for_debug._gradient() + dy_out = x.numpy() + x.backward() + dy_grad = l._x_for_debug.gradient() with new_program_scope(): inp = fluid.layers.data( @@ -314,9 +314,9 @@ def test_mlp(self): var_inp = fluid.dygraph.base.to_variable(np_inp) mlp = MLP("mlp") out = mlp(var_inp) - dy_out = out._numpy() - out._backward() - dy_grad = mlp._fc1._w._gradient() + dy_out = out.numpy() + out.backward() + dy_grad = mlp._fc1._w.gradient() with new_program_scope(): inp = fluid.layers.data( @@ -358,11 +358,11 @@ def test_rnn(self): var_inp = fluid.layers.reshape(var_inp, shape=[1, 4, 3]) simple_rnn = SimpleRNN("simple_rnn") outs, pre_hiddens = simple_rnn.forward(var_inp) - dy_out = outs[3]._numpy() - outs[3]._backward() - dy_grad_h2o = simple_rnn._cell._h2o_w._gradient() - dy_grad_h2h = simple_rnn._cell._h2h_w._gradient() - dy_grad_i2h = simple_rnn._cell._i2h_w._gradient() + dy_out = outs[3].numpy() + outs[3].backward() + dy_grad_h2o = simple_rnn._cell._h2o_w.gradient() + dy_grad_h2h = simple_rnn._cell._h2h_w.gradient() + dy_grad_i2h = simple_rnn._cell._i2h_w.gradient() with new_program_scope(): inp = fluid.layers.data( diff --git a/python/paddle/fluid/tests/unittests/test_imperative_checkpoint.py b/python/paddle/fluid/tests/unittests/test_imperative_checkpoint.py index a92b7d62fa598a..c28058100a43eb 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_checkpoint.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_checkpoint.py @@ -18,11 +18,11 @@ import paddle import paddle.fluid as fluid from paddle.fluid.optimizer import SGDOptimizer -from paddle.fluid.dygraph.nn import Conv2D, Pool2D, FC +from paddle.fluid import Conv2D, Pool2D, FC from paddle.fluid.dygraph.base import to_variable -class SimpleImgConvPool(fluid.dygraph.Layer): +class SimpleImgConvPool(fluid.Layer): def __init__(self, name_scope, num_channels, @@ -71,7 +71,7 @@ def forward(self, inputs): return x -class MNIST(fluid.dygraph.Layer): +class MNIST(fluid.Layer): def __init__(self, name_scope): super(MNIST, self).__init__(name_scope) @@ -125,21 +125,21 @@ def save_load_persistables(self): img = to_variable(dy_x_data) label = to_variable(y_data) - label._stop_gradient = True + label.stop_gradient = True cost = mnist(img) loss = fluid.layers.cross_entropy(cost, label) avg_loss = fluid.layers.mean(loss) - dy_out = avg_loss._numpy() + dy_out = avg_loss.numpy() - avg_loss._backward() + avg_loss.backward() sgd.minimize(avg_loss) fluid.dygraph.save_persistables(mnist, "save_dir") mnist.clear_gradients() for param in mnist.parameters(): - dy_param_init_value[param.name] = param._numpy() + dy_param_init_value[param.name] = param.numpy() mnist.load_dict( fluid.dygraph.load_persistables(mnist, "save_dir")) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py b/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py index ccebd4a54727f3..ca2cffa9c75cc8 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py @@ -32,11 +32,11 @@ NUM_EPOCHES = int(os.environ.get('NUM_EPOCHES', 1)) -class DMF(fluid.dygraph.Layer): +class DMF(fluid.Layer): def __init__(self, name_scope): super(DMF, self).__init__(name_scope) - self._user_latent = fluid.dygraph.FC(self.full_name(), 256) - self._item_latent = fluid.dygraph.FC(self.full_name(), 256) + self._user_latent = fluid.FC(self.full_name(), 256) + self._item_latent = fluid.FC(self.full_name(), 256) self._user_layers = [] self._item_layers = [] @@ -45,13 +45,11 @@ def __init__(self, name_scope): self._user_layers.append( self.add_sublayer( 'user_layer_%d' % i, - fluid.dygraph.FC( - self.full_name(), self._hid_sizes[i], act='relu'))) + fluid.FC(self.full_name(), self._hid_sizes[i], act='relu'))) self._item_layers.append( self.add_sublayer( 'item_layer_%d' % i, - fluid.dygraph.FC( - self.full_name(), self._hid_sizes[i], act='relu'))) + fluid.FC(self.full_name(), self._hid_sizes[i], act='relu'))) def forward(self, users, items): users = self._user_latent(users) @@ -63,19 +61,18 @@ def forward(self, users, items): return fluid.layers.elementwise_mul(users, items) -class MLP(fluid.dygraph.Layer): +class MLP(fluid.Layer): def __init__(self, name_scope): super(MLP, self).__init__(name_scope) - self._user_latent = fluid.dygraph.FC(self.full_name(), 256) - self._item_latent = fluid.dygraph.FC(self.full_name(), 256) + self._user_latent = fluid.FC(self.full_name(), 256) + self._item_latent = fluid.FC(self.full_name(), 256) self._match_layers = [] self._hid_sizes = [128, 64] for i in range(len(self._hid_sizes)): self._match_layers.append( self.add_sublayer( 'match_layer_%d' % i, - fluid.dygraph.FC( - self.full_name(), self._hid_sizes[i], act='relu'))) + fluid.FC(self.full_name(), self._hid_sizes[i], act='relu'))) self._mat def forward(self, users, items): @@ -88,7 +85,7 @@ def forward(self, users, items): return match_vec -class DeepCF(fluid.dygraph.Layer): +class DeepCF(fluid.Layer): def __init__(self, name_scope, num_users, num_items, matrix): super(DeepCF, self).__init__(name_scope) self._num_users = num_users @@ -99,11 +96,11 @@ def __init__(self, name_scope, num_users, num_items, matrix): matrix.dtype, is_bias=False, default_initializer=fluid.initializer.NumpyArrayInitializer(matrix)) - self._rating_matrix._stop_gradient = True + self._rating_matrix.stop_gradient = True self._mlp = MLP(self.full_name()) self._dmf = DMF(self.full_name()) - self._match_fc = fluid.dygraph.FC(self.full_name(), 1, act='sigmoid') + self._match_fc = fluid.FC(self.full_name(), 1, act='sigmoid') def forward(self, users, items): # users_emb = self._user_emb(users) @@ -255,10 +252,10 @@ def test_deefcf(self): fluid.layers.log_loss(prediction, to_variable(labels_np[ slice:slice + BATCH_SIZE]))) - loss._backward() + loss.backward() adam.minimize(loss) deepcf.clear_gradients() - dy_loss = loss._numpy() + dy_loss = loss.numpy() sys.stderr.write('dynamic loss: %s %s\n' % (slice, dy_loss)) self.assertEqual(static_loss, dy_loss) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_gan.py b/python/paddle/fluid/tests/unittests/test_imperative_gan.py index 58faa1cb85af9c..5d773ec1c9db16 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_gan.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_gan.py @@ -22,12 +22,12 @@ import paddle.fluid as fluid import paddle.fluid.core as core from paddle.fluid.optimizer import SGDOptimizer -from paddle.fluid.dygraph.nn import Conv2D, Pool2D, FC +from paddle.fluid import Conv2D, Pool2D, FC from test_imperative_base import new_program_scope from paddle.fluid.dygraph.base import to_variable -class Discriminator(fluid.dygraph.Layer): +class Discriminator(fluid.Layer): def __init__(self, name_scope): super(Discriminator, self).__init__(name_scope) self._fc1 = FC(self.full_name(), size=32, act='elu') @@ -38,7 +38,7 @@ def forward(self, inputs): return self._fc2(x) -class Generator(fluid.dygraph.Layer): +class Generator(fluid.Layer): def __init__(self, name_scope): super(Generator, self).__init__(name_scope) self._fc1 = FC(self.full_name(), size=64, act='elu') @@ -150,7 +150,7 @@ def test_gan_float32(self): x=d_fake, label=to_variable(np.zeros([2, 1], np.float32)))) d_loss = d_loss_real + d_loss_fake - d_loss._backward() + d_loss.backward() sgd.minimize(d_loss) discriminator.clear_gradients() generator.clear_gradients() @@ -160,15 +160,15 @@ def test_gan_float32(self): g_loss = fluid.layers.reduce_mean( fluid.layers.sigmoid_cross_entropy_with_logits( x=d_fake, label=to_variable(np.ones([2, 1], np.float32)))) - g_loss._backward() + g_loss.backward() sgd.minimize(g_loss) for p in discriminator.parameters(): - dy_params[p.name] = p._numpy() + dy_params[p.name] = p.numpy() for p in generator.parameters(): - dy_params[p.name] = p._numpy() + dy_params[p.name] = p.numpy() - dy_g_loss = g_loss._numpy() - dy_d_loss = d_loss._numpy() + dy_g_loss = g_loss.numpy() + dy_d_loss = d_loss.numpy() self.assertEqual(dy_g_loss, static_g_loss) self.assertEqual(dy_d_loss, static_d_loss) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_gnn.py b/python/paddle/fluid/tests/unittests/test_imperative_gnn.py index a8fb9ecfe4be16..234fcd60404286 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_gnn.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_gnn.py @@ -15,14 +15,12 @@ import contextlib import unittest import numpy as np -import six import sys import paddle import paddle.fluid as fluid import paddle.fluid.core as core from paddle.fluid.optimizer import AdamOptimizer -from paddle.fluid.dygraph.nn import Conv2D, Pool2D, FC from test_imperative_base import new_program_scope from paddle.fluid.dygraph.base import to_variable @@ -31,7 +29,7 @@ def gen_data(): pass -class GraphConv(fluid.dygraph.Layer): +class GraphConv(fluid.Layer): def __init__(self, name_scope, in_features, out_features): super(GraphConv, self).__init__(name_scope) @@ -50,7 +48,7 @@ def forward(self, features, adj): return fluid.layers.matmul(adj, support) + self.bias -class GCN(fluid.dygraph.Layer): +class GCN(fluid.Layer): def __init__(self, name_scope, num_hidden): super(GCN, self).__init__(name_scope) self.gc = GraphConv(self.full_name(), num_hidden, 32) @@ -134,10 +132,9 @@ def test_gnn_float32(self): loss = fluid.layers.reduce_sum(loss) adam = AdamOptimizer(learning_rate=1e-3) adam.minimize(loss) - self.assertEqual(static_loss, loss._numpy()) - self.assertTrue( - np.allclose(static_weight, model.gc.weight._numpy())) - sys.stderr.write('%s %s\n' % (static_loss, loss._numpy())) + self.assertEqual(static_loss, loss.numpy()) + self.assertTrue(np.allclose(static_weight, model.gc.weight.numpy())) + sys.stderr.write('%s %s\n' % (static_loss, loss.numpy())) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_imperative_mnist.py b/python/paddle/fluid/tests/unittests/test_imperative_mnist.py index 5ab01839fbc20b..76b8d3aa3943e4 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_mnist.py @@ -128,25 +128,25 @@ def test_mnist_float32(self): img = to_variable(dy_x_data) label = to_variable(y_data) - label._stop_gradient = True + label.stop_gradient = True cost = mnist(img) loss = fluid.layers.cross_entropy(cost, label) avg_loss = fluid.layers.mean(loss) - dy_out = avg_loss._numpy() + dy_out = avg_loss.numpy() if epoch == 0 and batch_id == 0: for param in mnist.parameters(): - dy_param_init_value[param.name] = param._numpy() + dy_param_init_value[param.name] = param.numpy() - avg_loss._backward() + avg_loss.backward() sgd.minimize(avg_loss) mnist.clear_gradients() dy_param_value = {} for param in mnist.parameters(): - dy_param_value[param.name] = param._numpy() + dy_param_value[param.name] = param.numpy() with new_program_scope(): fluid.default_startup_program().random_seed = seed diff --git a/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py b/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py index 8b659a3e08e381..b9f93119e83159 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py @@ -28,7 +28,7 @@ from test_imperative_base import new_program_scope -class MLP(fluid.dygraph.Layer): +class MLP(fluid.Layer): def __init__(self, name_scope, param_attr=None, bias_attr=None): super(MLP, self).__init__(name_scope) @@ -75,18 +75,18 @@ def _check_mlp(self): cost = mlp(img) avg_loss = fluid.layers.reduce_mean(cost) - dy_out = avg_loss._numpy() + dy_out = avg_loss.numpy() if batch_id == 0: for param in mlp.parameters(): - dy_param_init_value[param.name] = param._numpy() + dy_param_init_value[param.name] = param.numpy() - avg_loss._backward() + avg_loss.backward() optimizer.minimize(avg_loss) mlp.clear_gradients() dy_param_value = {} for param in mlp.parameters(): - dy_param_value[param.name] = param._numpy() + dy_param_value[param.name] = param.numpy() with new_program_scope(): fluid.default_startup_program().random_seed = seed diff --git a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py index eb8a82430f0620..06ee5f75145778 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py @@ -24,10 +24,9 @@ from test_imperative_base import new_program_scope import numpy as np import six -from paddle.fluid.backward import append_backward -class SimpleLSTMRNN(fluid.dygraph.Layer): +class SimpleLSTMRNN(fluid.Layer): def __init__(self, name_scope, hidden_size, @@ -45,7 +44,7 @@ def __init__(self, self.cell_array = [] self.hidden_array = [] - def _build_once(self, input_embedding, init_hidden=None, init_cell=None): + def build_once(self, input_embedding, init_hidden=None, init_cell=None): self.weight_1_arr = [] self.weight_2_arr = [] self.bias_arr = [] @@ -132,7 +131,7 @@ def forward(self, input_embedding, init_hidden=None, init_cell=None): return real_res, last_hidden, last_cell -class PtbModel(fluid.dygraph.Layer): +class PtbModel(fluid.Layer): def __init__(self, name_scope, hidden_size, @@ -177,7 +176,7 @@ def __init__(self, default_initializer=fluid.initializer.UniformInitializer( low=-self.init_scale, high=self.init_scale)) - def _build_once(self, input, label, init_hidden, init_cell): + def build_once(self, input, label, init_hidden, init_cell): pass def forward(self, input, label, init_hidden, init_cell): @@ -260,13 +259,13 @@ def test_ptb_rnn_cpu_float32(self): init_cell) if i == 0: for param in ptb_model.parameters(): - dy_param_init[param.name] = param._numpy() - dy_loss._backward() + dy_param_init[param.name] = param.numpy() + dy_loss.backward() sgd.minimize(dy_loss) ptb_model.clear_gradients() if i == batch_num - 1: for param in ptb_model.parameters(): - dy_param_updated[param.name] = param._numpy() + dy_param_updated[param.name] = param.numpy() with new_program_scope(): fluid.default_startup_program().random_seed = seed @@ -333,10 +332,10 @@ def test_ptb_rnn_cpu_float32(self): for k in range(3, len(out)): static_param_updated[static_param_name_list[k - 3]] = out[k] - self.assertTrue(np.allclose(static_loss_value, dy_loss._numpy())) - self.assertTrue(np.allclose(static_last_cell_value, last_cell._numpy())) + self.assertTrue(np.allclose(static_loss_value, dy_loss.numpy())) + self.assertTrue(np.allclose(static_last_cell_value, last_cell.numpy())) self.assertTrue( - np.allclose(static_last_hidden_value, last_hidden._numpy())) + np.allclose(static_last_hidden_value, last_hidden.numpy())) for key, value in six.iteritems(static_param_init): # print("static_init name: {}, value {}".format(key, value)) # print("dy_init name: {}, value {}".format(key, dy_param_init[key])) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_resnet.py b/python/paddle/fluid/tests/unittests/test_imperative_resnet.py index 1d786d58463276..d9ef08b3c491b2 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_resnet.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_resnet.py @@ -21,7 +21,7 @@ import paddle.fluid as fluid from paddle.fluid import core from paddle.fluid.layer_helper import LayerHelper -from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, FC +from paddle.fluid import Conv2D, Pool2D, BatchNorm, FC from paddle.fluid.dygraph.base import to_variable from test_imperative_base import new_program_scope @@ -68,7 +68,7 @@ def optimizer_setting(params): return optimizer -class ConvBNLayer(fluid.dygraph.Layer): +class ConvBNLayer(fluid.Layer): def __init__(self, name_scope, num_channels, @@ -99,7 +99,7 @@ def forward(self, inputs): return y -class BottleneckBlock(fluid.dygraph.Layer): +class BottleneckBlock(fluid.Layer): def __init__(self, name_scope, num_channels, @@ -156,7 +156,7 @@ def forward(self, inputs): return layer_helper.append_activation(y) -class ResNet(fluid.dygraph.Layer): +class ResNet(fluid.Layer): def __init__(self, name_scope, layers=50, class_dim=102): super(ResNet, self).__init__(name_scope) @@ -247,7 +247,7 @@ def test_resnet_float32(self): dy_param_init_value = {} for param in resnet.parameters(): - dy_param_init_value[param.name] = param._numpy() + dy_param_init_value[param.name] = param.numpy() for batch_id, data in enumerate(train_reader()): if batch_id >= batch_num: @@ -260,20 +260,20 @@ def test_resnet_float32(self): img = to_variable(dy_x_data) label = to_variable(y_data) - label._stop_gradient = True + label.stop_gradient = True out = resnet(img) loss = fluid.layers.cross_entropy(input=out, label=label) avg_loss = fluid.layers.mean(x=loss) - dy_out = avg_loss._numpy() + dy_out = avg_loss.numpy() if batch_id == 0: for param in resnet.parameters(): if param.name not in dy_param_init_value: - dy_param_init_value[param.name] = param._numpy() + dy_param_init_value[param.name] = param.numpy() - avg_loss._backward() + avg_loss.backward() dy_grad_value = {} for param in resnet.parameters(): @@ -288,7 +288,7 @@ def test_resnet_float32(self): dy_param_value = {} for param in resnet.parameters(): - dy_param_value[param.name] = param._numpy() + dy_param_value[param.name] = param.numpy() with new_program_scope(): fluid.default_startup_program().random_seed = seed diff --git a/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py b/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py new file mode 100644 index 00000000000000..3f3f92cde57c80 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py @@ -0,0 +1,481 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import contextlib +import unittest +import numpy as np +import six + +import paddle +import paddle.fluid as fluid +from paddle.fluid import core +from paddle.fluid.layer_helper import LayerHelper +from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, FC +from paddle.fluid.dygraph.base import to_variable +from test_imperative_base import new_program_scope + +batch_size = 8 +train_parameters = { + "input_size": [3, 224, 224], + "input_mean": [0.485, 0.456, 0.406], + "input_std": [0.229, 0.224, 0.225], + "learning_strategy": { + "name": "piecewise_decay", + "batch_size": batch_size, + "epochs": [30, 60, 90], + "steps": [0.1, 0.01, 0.001, 0.0001] + }, + "batch_size": batch_size, + "lr": 0.1, + "total_images": 6149, +} + + +def optimizer_setting(params): + ls = params["learning_strategy"] + if ls["name"] == "piecewise_decay": + if "total_images" not in params: + total_images = 6149 + else: + total_images = params["total_images"] + # TODO(Yancey1989): using lr decay if it is ready. + #batch_size = ls["batch_size"] + #step = int(total_images / batch_size + 1) + + #bd = [step * e for e in ls["epochs"]] + #base_lr = params["lr"] + #lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)] + optimizer = fluid.optimizer.SGD(learning_rate=0.01) + + return optimizer + + +class ConvBNLayer(fluid.dygraph.Layer): + def __init__(self, + name_scope, + num_channels, + num_filters, + filter_size, + stride=1, + groups=1, + act=None): + super(ConvBNLayer, self).__init__(name_scope) + + self._conv = Conv2D( + self.full_name(), + num_channels=num_channels, + num_filters=num_filters, + filter_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + act=None, + bias_attr=None) + + self._batch_norm = BatchNorm(self.full_name(), num_filters, act=act) + + def forward(self, inputs): + y = self._conv(inputs) + y = self._batch_norm(y) + + return y + + +class SqueezeExcitation(fluid.dygraph.Layer): + def __init__(self, name_scope, num_channels, reduction_ratio): + + super(SqueezeExcitation, self).__init__(name_scope) + self._pool = Pool2D( + self.full_name(), pool_size=0, pool_type='avg', global_pooling=True) + self._squeeze = FC( + self.full_name(), + size=num_channels // reduction_ratio, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.05)), + act='relu') + self._excitation = FC( + self.full_name(), + size=num_channels, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.05)), + act='sigmoid') + + def forward(self, input): + y = self._pool(input) + y = self._squeeze(y) + y = self._excitation(y) + y = fluid.layers.elementwise_mul(x=input, y=y, axis=0) + return y + + +class BottleneckBlock(fluid.dygraph.Layer): + def __init__(self, + name_scope, + num_channels, + num_filters, + stride, + cardinality, + reduction_ratio, + shortcut=True): + super(BottleneckBlock, self).__init__(name_scope) + + self.conv0 = ConvBNLayer( + self.full_name(), + num_channels=num_channels, + num_filters=num_filters, + filter_size=1) + self.conv1 = ConvBNLayer( + self.full_name(), + num_channels=num_filters, + num_filters=num_filters, + filter_size=3, + stride=stride, + groups=cardinality) + self.conv2 = ConvBNLayer( + self.full_name(), + num_channels=num_filters, + num_filters=num_filters * 4, + filter_size=1, + act='relu') + + self.scale = SqueezeExcitation( + self.full_name(), + num_channels=num_filters * 4, + reduction_ratio=reduction_ratio) + + if not shortcut: + self.short = ConvBNLayer( + self.full_name(), + num_channels=num_channels, + num_filters=num_filters * 4, + filter_size=1, + stride=stride) + + self.shortcut = shortcut + + self._num_channels_out = num_filters * 4 + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + conv2 = self.conv2(conv1) + scale = self.scale(conv2) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + + y = fluid.layers.elementwise_add(x=short, y=scale) + + layer_helper = LayerHelper(self.full_name(), act='relu') + y = layer_helper.append_activation(y) + return y + + +class SeResNeXt(fluid.dygraph.Layer): + def __init__(self, name_scope, layers=50, class_dim=102): + super(SeResNeXt, self).__init__(name_scope) + + self.layers = layers + supported_layers = [50, 101, 152] + assert layers in supported_layers, \ + "supported layers are {} but input layer is {}".format(supported_layers, layers) + + if layers == 50: + cardinality = 32 + reduction_ratio = 16 + depth = [3, 4, 6, 3] + num_filters = [128, 256, 512, 1024] + self.conv0 = ConvBNLayer( + self.full_name(), + num_channels=3, + num_filters=64, + filter_size=7, + stride=2, + act='relu') + self.pool = Pool2D( + self.full_name(), + pool_size=3, + pool_stride=2, + pool_padding=1, + pool_type='max') + elif layers == 101: + cardinality = 32 + reduction_ratio = 16 + depth = [3, 4, 23, 3] + num_filters = [128, 256, 512, 1024] + self.conv0 = ConvBNLayer( + self.full_name(), + num_channels=3, + num_filters=3, + filter_size=7, + stride=2, + act='relu') + self.pool = Pool2D( + self.full_name(), + pool_size=3, + pool_stride=2, + pool_padding=1, + pool_type='max') + elif layers == 152: + cardinality = 64 + reduction_ratio = 16 + depth = [3, 8, 36, 3] + num_filters = [128, 256, 512, 1024] + self.conv0 = ConvBNLayer( + self.full_name(), + num_channels=3, + num_filters=3, + filter_size=7, + stride=2, + act='relu') + self.conv1 = ConvBNLayer( + self.full_name(), + num_channels=64, + num_filters=3, + filter_size=7, + stride=2, + act='relu') + self.conv2 = ConvBNLayer( + self.full_name(), + num_channels=64, + num_filters=3, + filter_size=7, + stride=2, + act='relu') + self.pool = Pool2D( + self.full_name(), + pool_size=3, + pool_stride=2, + pool_padding=1, + pool_type='max') + + self.bottleneck_block_list = [] + num_channels = 64 + for block in range(len(depth)): + shortcut = False + for i in range(depth[block]): + bottleneck_block = self.add_sublayer( + 'bb_%d_%d' % (block, i), + BottleneckBlock( + self.full_name(), + num_channels=num_channels, + num_filters=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + cardinality=cardinality, + reduction_ratio=reduction_ratio, + shortcut=shortcut)) + num_channels = bottleneck_block._num_channels_out + self.bottleneck_block_list.append(bottleneck_block) + shortcut = True + + self.pool2d_avg = Pool2D( + self.full_name(), pool_size=7, pool_type='avg', global_pooling=True) + import math + stdv = 1.0 / math.sqrt(2048 * 1.0) + + self.out = FC(self.full_name(), + size=class_dim, + act='softmax', + param_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.Uniform(-stdv, stdv))) + + def forward(self, inputs): + if self.layers == 50 or self.layers == 101: + y = self.conv0(inputs) + y = self.pool(y) + elif self.layers == 152: + y = self.conv0(inputs) + y = self.conv1(inputs) + y = self.conv2(inputs) + y = self.pool(y) + + for bottleneck_block in self.bottleneck_block_list: + y = bottleneck_block(y) + y = self.pool2d_avg(y) + y = fluid.layers.dropout(y, dropout_prob=0.2) + y = self.out(y) + return y + + +class TestImperativeResneXt(unittest.TestCase): + def test_se_resnext_float32(self): + seed = 90 + + batch_size = train_parameters["batch_size"] + batch_num = 2 + epoch_num = 1 + with fluid.dygraph.guard(): + fluid.default_startup_program().random_seed = seed + fluid.default_main_program().random_seed = seed + + se_resnext = SeResNeXt("se_resnext") + optimizer = optimizer_setting(train_parameters) + np.random.seed(seed) + import random + random.seed = seed + train_reader = paddle.batch( + paddle.dataset.flowers.train(use_xmap=False), + batch_size=batch_size, + drop_last=True) + + dy_param_init_value = {} + for param in se_resnext.parameters(): + dy_param_init_value[param.name] = param.numpy() + for epoch_id in range(epoch_num): + for batch_id, data in enumerate(train_reader()): + + if batch_id >= batch_num and batch_num != -1: + break + + dy_x_data = np.array( + [x[0].reshape(3, 224, 224) + for x in data]).astype('float32') + y_data = np.array( + [x[1] for x in data]).astype('int64').reshape( + batch_size, 1) + + img = to_variable(dy_x_data) + label = to_variable(y_data) + label.stop_gradient = True + + out = se_resnext(img) + loss = fluid.layers.cross_entropy(input=out, label=label) + avg_loss = fluid.layers.mean(x=loss) + + dy_out = avg_loss.numpy() + + if batch_id == 0: + for param in se_resnext.parameters(): + if param.name not in dy_param_init_value: + dy_param_init_value[param.name] = param.numpy() + avg_loss.backward() + + #dy_grad_value = {} + #for param in se_resnext.parameters(): + # if param.trainable: + # np_array = np.array(param._ivar._grad_ivar().value() + # .get_tensor()) + # dy_grad_value[param.name + core.grad_var_suffix()] = np_array + + optimizer.minimize(avg_loss) + se_resnext.clear_gradients() + + dy_param_value = {} + for param in se_resnext.parameters(): + dy_param_value[param.name] = param.numpy() + + with new_program_scope(): + fluid.default_startup_program().random_seed = seed + fluid.default_main_program().random_seed = seed + + exe = fluid.Executor(fluid.CPUPlace( + ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)) + + se_resnext = SeResNeXt("se_resnext") + optimizer = optimizer_setting(train_parameters) + + np.random.seed(seed) + import random + random.seed = seed + train_reader = paddle.batch( + paddle.dataset.flowers.train(use_xmap=False), + batch_size=batch_size, + drop_last=True) + + img = fluid.layers.data( + name='pixel', shape=[3, 224, 224], dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + out = se_resnext(img) + loss = fluid.layers.cross_entropy(input=out, label=label) + avg_loss = fluid.layers.mean(x=loss) + optimizer.minimize(avg_loss) + + # initialize params and fetch them + static_param_init_value = {} + static_param_name_list = [] + static_grad_name_list = [] + for param in se_resnext.parameters(): + static_param_name_list.append(param.name) + for param in se_resnext.parameters(): + if param.trainable: + static_grad_name_list.append(param.name + + core.grad_var_suffix()) + + out = exe.run(fluid.default_startup_program(), + fetch_list=static_param_name_list) + + for i in range(len(static_param_name_list)): + static_param_init_value[static_param_name_list[i]] = out[i] + for epoch_id in range(epoch_num): + for batch_id, data in enumerate(train_reader()): + if batch_id >= batch_num and batch_num != -1: + break + + static_x_data = np.array( + [x[0].reshape(3, 224, 224) + for x in data]).astype('float32') + y_data = np.array( + [x[1] for x in data]).astype('int64').reshape( + [batch_size, 1]) + + fetch_list = [avg_loss.name] + fetch_list.extend(static_param_name_list) + fetch_list.extend(static_grad_name_list) + out = exe.run( + fluid.default_main_program(), + feed={"pixel": static_x_data, + "label": y_data}, + fetch_list=fetch_list) + + static_param_value = {} + static_grad_value = {} + static_out = out[0] + param_start_pos = 1 + grad_start_pos = len( + static_param_name_list) + param_start_pos + for i in range( + param_start_pos, + len(static_param_name_list) + param_start_pos): + static_param_value[static_param_name_list[ + i - param_start_pos]] = out[i] + for i in range(grad_start_pos, + len(static_grad_name_list) + grad_start_pos): + static_grad_value[static_grad_name_list[ + i - grad_start_pos]] = out[i] + self.assertTrue(np.allclose(static_out, dy_out)) + + self.assertEqual(len(dy_param_init_value), len(static_param_init_value)) + + for key, value in six.iteritems(static_param_init_value): + self.assertTrue(np.allclose(value, dy_param_init_value[key])) + self.assertTrue(np.isfinite(value.all())) + self.assertFalse(np.isnan(value.any())) + # FIXME(Yancey1989): np.array(_ivar.value().get_tensor()) leads to memory lake + #self.assertEqual(len(dy_grad_value), len(static_grad_value)) + #for key, value in six.iteritems(static_grad_value): + # self.assertTrue(np.allclose(value, dy_grad_value[key])) + # self.assertTrue(np.isfinite(value.all())) + # self.assertFalse(np.isnan(value.any())) + + self.assertEqual(len(dy_param_value), len(static_param_value)) + for key, value in six.iteritems(static_param_value): + self.assertTrue(np.allclose(value, dy_param_value[key])) + self.assertTrue(np.isfinite(value.all())) + self.assertFalse(np.isnan(value.any())) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_transformer.py b/python/paddle/fluid/tests/unittests/test_imperative_transformer.py index ef9d3ffca2dfce..90457cc664c0fb 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_transformer.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_transformer.py @@ -16,7 +16,8 @@ import unittest import paddle.fluid as fluid -from paddle.fluid.dygraph import Embedding, LayerNorm, FC, to_variable, Layer, guard +from paddle.fluid import Embedding, LayerNorm, FC, Layer +from paddle.fluid.dygraph import to_variable, guard from test_imperative_base import new_program_scope from paddle.fluid import core import numpy as np @@ -116,7 +117,7 @@ class ModelHyperParams(object): # to process after each sub-layer postprocess_cmd = "da" # dropout + residual connection # random seed used in dropout for CE. - dropout_seed = 1 + dropout_seed = None # the flag indicating whether to share embedding and softmax weights. # vocabularies in source and target should be same for weight sharing. weight_sharing = True @@ -166,15 +167,21 @@ def create_data(is_static=False): ] else: enc_inputs = [ - to_variable(src_word_np), to_variable(src_pos_np), - to_variable(src_slf_attn_bias_np) + to_variable( + src_word_np, name='src_word'), to_variable( + src_pos_np, name='src_pos'), to_variable( + src_slf_attn_bias_np, name='src_slf_attn_bias') ] dec_inputs = [ - to_variable(trg_word_np), to_variable(trg_pos_np), - to_variable(trg_slf_attn_bias_np), to_variable(trg_src_attn_bias_np) + to_variable( + trg_word_np, name='trg_word'), to_variable( + trg_pos_np, name='trg_pos'), to_variable( + trg_slf_attn_bias_np, name='trg_slf_attn_bias'), + to_variable( + trg_src_attn_bias_np, name='trg_src_attn_bias') ] - label = to_variable(lbl_word_np) - weight = to_variable(lbl_weight_np) + label = to_variable(lbl_word_np, name='lbl_word') + weight = to_variable(lbl_weight_np, name='lbl_weight') return enc_inputs, dec_inputs, label, weight @@ -211,7 +218,7 @@ def make_all_inputs(input_fields): # The placeholder for batch_size in compile time. Must be -1 currently to be # consistent with some ops' infer-shape output in compile time, such as the # sequence_expand op used in beamsearch decoder. -batch_size = 32 +batch_size = -1 # The placeholder for squence length in compile time. seq_len = ModelHyperParams.max_length # Here list the data shapes and data types of all inputs. @@ -304,35 +311,40 @@ def make_all_inputs(input_fields): batch_num = 5 -np.random.seed = 1 +np.random.seed = 90 src_word_np = np.random.randint( 1, ModelHyperParams.src_vocab_size - 1, - size=(batch_size, seq_len, 1), + size=(TrainTaskConfig.batch_size, seq_len, 1), dtype='int64') src_pos_np = np.random.randint( - 1, seq_len, size=(batch_size, seq_len, 1), dtype='int64') -src_slf_attn_bias_np = np.random.randn(batch_size, ModelHyperParams.n_head, - seq_len, seq_len).astype('float32') + 1, seq_len, size=(TrainTaskConfig.batch_size, seq_len, 1), dtype='int64') +src_slf_attn_bias_np = np.random.randn(TrainTaskConfig.batch_size, + ModelHyperParams.n_head, seq_len, + seq_len).astype('float32') trg_word_np = np.random.randint( 1, ModelHyperParams.src_vocab_size - 1, - size=(batch_size, seq_len, 1), + size=(TrainTaskConfig.batch_size, seq_len, 1), dtype='int64') trg_pos_np = np.random.randint( - 1, seq_len, size=(batch_size, seq_len, 1), dtype='int64') -trg_slf_attn_bias_np = np.random.randn(batch_size, ModelHyperParams.n_head, - seq_len, seq_len).astype('float32') -trg_src_attn_bias_np = np.random.randn(batch_size, ModelHyperParams.n_head, - seq_len, seq_len).astype('float32') + 1, seq_len, size=(TrainTaskConfig.batch_size, seq_len, 1), dtype='int64') +trg_slf_attn_bias_np = np.random.randn(TrainTaskConfig.batch_size, + ModelHyperParams.n_head, seq_len, + seq_len).astype('float32') +trg_src_attn_bias_np = np.random.randn(TrainTaskConfig.batch_size, + ModelHyperParams.n_head, seq_len, + seq_len).astype('float32') lbl_word_np = np.random.randint( 1, ModelHyperParams.src_vocab_size - 1, - size=(batch_size * seq_len, 1), + size=(TrainTaskConfig.batch_size * seq_len, 1), dtype='int64') -lbl_weight_np = np.random.randn(batch_size * seq_len, 1).astype('float32') + +lbl_weight_np = np.random.randn(TrainTaskConfig.batch_size * seq_len, + 1).astype('float32') pos_inp1 = position_encoding_init(ModelHyperParams.max_length, ModelHyperParams.d_model) @@ -447,7 +459,7 @@ def forward(self, queries, keys, values, attn_bias): x=v, shape=[0, 0, self._n_head, self._d_value], inplace=False) transpose_v = fluid.layers.transpose(x=reshaped_v, perm=[0, 2, 1, 3]) - #scale dot product attention + # scale dot product attention product = fluid.layers.matmul( x=transpose_q, y=transpose_k, @@ -971,16 +983,18 @@ def test_transformer_float32(self): enc_inputs, dec_inputs, label, weights = create_data() dy_sum_cost, dy_avg_cost, dy_predict, dy_token_num = transformer( enc_inputs, dec_inputs, label, weights) + if i == 0: for param in transformer.parameters(): - dy_param_init[param.name] = param._numpy() + dy_param_init[param.name] = param.numpy() - dy_avg_cost._backward() + dy_avg_cost.backward() optimizer.minimize(dy_avg_cost) transformer.clear_gradients() + if i == batch_num - 1: for param in transformer.parameters(): - dy_param_updated[param.name] = param._numpy() + dy_param_updated[param.name] = param.numpy() with new_program_scope(): fluid.default_startup_program().random_seed = seed @@ -1024,7 +1038,6 @@ def test_transformer_float32(self): static_param_name_list = list() static_sum_cost, static_avg_cost, static_predict, static_token_num = transformer( enc_inputs, dec_inputs, label, weights) - optimizer.minimize(static_avg_cost) for param in transformer.parameters(): static_param_name_list.append(param.name) @@ -1042,8 +1055,8 @@ def test_transformer_float32(self): static_sum_cost, static_avg_cost, static_predict, static_token_num ] - fetch_list.extend(static_param_name_list) + fetch_list.extend(static_param_name_list) out = exe.run(fluid.default_main_program(), feed=feed_dict, fetch_list=fetch_list) @@ -1057,13 +1070,13 @@ def test_transformer_float32(self): 4]] = out[k] self.assertTrue( - np.array_equal(static_avg_cost_value, dy_avg_cost._numpy())) + np.array_equal(static_avg_cost_value, dy_avg_cost.numpy())) self.assertTrue( - np.array_equal(static_sum_cost_value, dy_sum_cost._numpy())) + np.array_equal(static_sum_cost_value, dy_sum_cost.numpy())) self.assertTrue( - np.array_equal(static_predict_value, dy_predict._numpy())) + np.array_equal(static_predict_value, dy_predict.numpy())) self.assertTrue( - np.array_equal(static_token_num_value, dy_token_num._numpy())) + np.array_equal(static_token_num_value, dy_token_num.numpy())) for key, value in six.iteritems(static_param_init): self.assertTrue(np.array_equal(value, dy_param_init[key])) for key, value in six.iteritems(static_param_updated): diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 6cc3c6d90bdae0..6020246dae51f3 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -102,7 +102,7 @@ def test_layer_norm(self): dy_ret = lm(base.to_variable(inp)) self.assertTrue(np.allclose(static_ret, static_ret2)) - self.assertTrue(np.allclose(dy_ret._numpy(), static_ret2)) + self.assertTrue(np.allclose(dy_ret.numpy(), static_ret2)) def test_relu(self): with self.static_graph(): @@ -116,7 +116,7 @@ def test_relu(self): t = np.ones([3, 3], dtype='float32') dy_ret = layers.relu(base.to_variable(t)) - self.assertTrue(np.allclose(static_ret, dy_ret._numpy())) + self.assertTrue(np.allclose(static_ret, dy_ret.numpy())) def test_matmul(self): with self.static_graph(): @@ -137,7 +137,7 @@ def test_matmul(self): t2 = np.ones([3, 3], dtype='float32') dy_ret = layers.matmul(base.to_variable(t), base.to_variable(t2)) - self.assertTrue(np.allclose(static_ret, dy_ret._numpy())) + self.assertTrue(np.allclose(static_ret, dy_ret.numpy())) def test_conv2d(self): with self.static_graph(): @@ -164,7 +164,7 @@ def test_conv2d(self): 'conv2d', num_channels=3, num_filters=3, filter_size=[2, 2]) dy_ret = conv2d(base.to_variable(images)) - self.assertTrue(np.allclose(static_ret, dy_ret._numpy())) + self.assertTrue(np.allclose(static_ret, dy_ret.numpy())) self.assertTrue(np.allclose(static_ret, static_ret2)) def test_gru_unit(self): @@ -206,7 +206,7 @@ def test_gru_unit(self): for i in range(len(static_ret)): self.assertTrue(np.allclose(static_ret[i], static_ret2[i])) - self.assertTrue(np.allclose(static_ret[i], dy_ret[i]._numpy())) + self.assertTrue(np.allclose(static_ret[i], dy_ret[i].numpy())) def test_elementwise_math(self): n = np.ones([3, 3], dtype='float32') @@ -248,8 +248,8 @@ def test_elementwise_math(self): ret = layers.elementwise_sub(ret, n5) dy_ret = layers.elementwise_mul(ret, n6) self.assertTrue( - np.allclose(static_ret, dy_ret._numpy()), - '%s vs %s' % (static_ret, dy_ret._numpy())) + np.allclose(static_ret, dy_ret.numpy()), + '%s vs %s' % (static_ret, dy_ret.numpy())) def test_elementwise_minmax(self): n = np.ones([3, 3], dtype='float32') @@ -259,8 +259,8 @@ def test_elementwise_minmax(self): min_ret = layers.elementwise_min(n, n2) max_ret = layers.elementwise_max(n, n2) - self.assertTrue(np.allclose(n, min_ret._numpy())) - self.assertTrue(np.allclose(n2, max_ret._numpy())) + self.assertTrue(np.allclose(n, min_ret.numpy())) + self.assertTrue(np.allclose(n2, max_ret.numpy())) def test_sequence_conv(self): inp_np = np.arange(12).reshape([3, 4]).astype('float32') @@ -327,7 +327,7 @@ def test_conv2d_transpose(self): 'conv2d_transpose', num_filters=10, output_size=28) dy_rlt = conv2d_transpose(base.to_variable(inp_np)) self.assertTrue(np.allclose(static_rlt2, static_rlt)) - self.assertTrue(np.allclose(dy_rlt._numpy(), static_rlt)) + self.assertTrue(np.allclose(dy_rlt.numpy(), static_rlt)) def test_bilinear_tensor_product(self): inp_np_x = np.array([[1, 2, 3]]).astype('float32') @@ -370,7 +370,7 @@ def test_bilinear_tensor_product(self): dy_rlt = btp(base.to_variable(inp_np_x), base.to_variable(inp_np_y)) self.assertTrue(np.allclose(static_rlt2, static_rlt)) - self.assertTrue(np.allclose(dy_rlt._numpy(), static_rlt)) + self.assertTrue(np.allclose(dy_rlt.numpy(), static_rlt)) def test_prelu(self): inp_np = np.ones([5, 200, 100, 100]).astype('float32') @@ -411,7 +411,7 @@ def test_prelu(self): dy_rlt = prelu(base.to_variable(inp_np)) self.assertTrue(np.allclose(static_rlt2, static_rlt)) - self.assertTrue(np.allclose(dy_rlt._numpy(), static_rlt)) + self.assertTrue(np.allclose(dy_rlt.numpy(), static_rlt)) def test_embeding(self): inp_word = np.array([[[1]]]).astype('int64') @@ -444,7 +444,7 @@ def test_embeding(self): static_rlt3 = emb2(base.to_variable(inp_word)) self.assertTrue(np.allclose(static_rlt2, static_rlt)) - self.assertTrue(np.allclose(static_rlt3._numpy(), static_rlt)) + self.assertTrue(np.allclose(static_rlt3.numpy(), static_rlt)) def test_nce(self): window_size = 5 @@ -558,7 +558,7 @@ def test_nce(self): nce_loss3 = nce(embs3, words[label_word]) self.assertTrue(np.allclose(static_rlt2, static_rlt)) - self.assertTrue(np.allclose(nce_loss3._numpy(), static_rlt)) + self.assertTrue(np.allclose(nce_loss3.numpy(), static_rlt)) def test_conv3d(self): with self.static_graph(): @@ -585,7 +585,7 @@ def test_conv3d(self): conv3d = nn.Conv3D('conv3d', num_filters=3, filter_size=2) dy_ret = conv3d(base.to_variable(images)) - self.assertTrue(np.allclose(static_ret, dy_ret._numpy())) + self.assertTrue(np.allclose(static_ret, dy_ret.numpy())) self.assertTrue(np.allclose(static_ret, static_ret2)) def test_row_conv(self): @@ -679,7 +679,7 @@ def test_group_norm(self): groupNorm = nn.GroupNorm('GroupNorm', groups=2) dy_ret = groupNorm(base.to_variable(input)) - self.assertTrue(np.allclose(static_ret, dy_ret._numpy())) + self.assertTrue(np.allclose(static_ret, dy_ret.numpy())) self.assertTrue(np.allclose(static_ret, static_ret2)) def test_spectral_norm(self): @@ -729,7 +729,7 @@ def test_spectral_norm(self): spectralNorm = nn.SpectralNorm('SpectralNorm', dim=1, power_iters=2) dy_ret = spectralNorm(base.to_variable(input)) - self.assertTrue(np.allclose(static_ret, dy_ret._numpy())) + self.assertTrue(np.allclose(static_ret, dy_ret.numpy())) self.assertTrue(np.allclose(static_ret, static_ret2)) def test_tree_conv(self): @@ -802,7 +802,7 @@ def test_tree_conv(self): dy_ret = treeConv(base.to_variable(vectors), base.to_variable(adj)) self.assertTrue(np.allclose(static_ret, static_ret2)) - self.assertTrue(np.allclose(static_ret, dy_ret._numpy())) + self.assertTrue(np.allclose(static_ret, dy_ret.numpy())) def test_conv3d_transpose(self): input_array = np.arange(0, 48).reshape( @@ -832,7 +832,7 @@ def test_conv3d_transpose(self): use_cudnn=False) dy_rlt = conv3d_transpose(base.to_variable(input_array)) self.assertTrue(np.allclose(static_rlt2, static_rlt)) - self.assertTrue(np.allclose(dy_rlt._numpy(), static_rlt)) + self.assertTrue(np.allclose(dy_rlt.numpy(), static_rlt)) class TestBook(unittest.TestCase): @@ -1892,6 +1892,14 @@ def test_shuffle_channel(self): self.assertIsNotNone(out) print(str(program)) + def test_pixel_shuffle(self): + program = Program() + with program_guard(program): + x = layers.data(name="X", shape=[9, 4, 4], dtype="float32") + out = layers.pixel_shuffle(x, upscale_factor=3) + self.assertIsNotNone(out) + print(str(program)) + def test_fsp(self): program = Program() with program_guard(program): diff --git a/python/paddle/fluid/tests/unittests/test_pixel_shuffle.py b/python/paddle/fluid/tests/unittests/test_pixel_shuffle.py new file mode 100644 index 00000000000000..cc3ae2b3b9d4c4 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_pixel_shuffle.py @@ -0,0 +1,50 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +from op_test import OpTest + + +class TestPixelShuffle(OpTest): + def setUp(self): + self.op_type = "pixel_shuffle" + n, c, h, w = 2, 9, 4, 4 + up_factor = 3 + shape = [n, c, h, w] + x = np.random.random(shape).astype("float32") + new_shape = (n, c // (up_factor * up_factor), up_factor, up_factor, h, + w) + # reshape to (num,output_channel,upscale_factor,upscale_factor,h,w) + npresult = np.reshape(x, new_shape) + # transpose to (num,output_channel,h,upscale_factor,w,upscale_factor) + npresult = npresult.transpose(0, 1, 4, 2, 5, 3) + oshape = [n, c // (up_factor * up_factor), h * up_factor, w * up_factor] + npresult = np.reshape(npresult, oshape) + + self.inputs = {'X': x} + self.outputs = {'Out': npresult} + self.attrs = {'upscale_factor': up_factor} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out') + + +if __name__ == '__main__': + unittest.main()