diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 7a0f9d6f170318..5c9b6851caf54d 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -74,7 +74,7 @@ pass_library(anakin_fillconstant_elementwisemul_fuse inference) # be detected by our pass. The index here represents the number of structures in the # pattern. We use index 3 ~ 6, because these quantities of structures are # common in the models. -foreach (index RANGE 3 6) +foreach (index RANGE 2 6) file(APPEND ${pass_file} "USE_PASS(transpose_flatten${index}_concat_fuse_pass);\n") endforeach() diff --git a/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc b/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc index fda43948d56768..cab69c408defad 100644 --- a/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc +++ b/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc @@ -12,7 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include #include +#include #include #include "paddle/fluid/framework/ir/graph_viz_pass.h" @@ -123,6 +125,7 @@ std::unique_ptr TransposeFlattenConcatFusePass::ApplyImpl( } template class TransposeFlattenConcatFusePass<1>; +template class TransposeFlattenConcatFusePass<2>; template class TransposeFlattenConcatFusePass<3>; template class TransposeFlattenConcatFusePass<4>; template class TransposeFlattenConcatFusePass<5>; @@ -135,6 +138,9 @@ template class TransposeFlattenConcatFusePass<6>; REGISTER_PASS(transpose_flatten_concat_fuse_pass, paddle::framework::ir::TransposeFlattenConcatFusePass<1>); +REGISTER_PASS(transpose_flatten2_concat_fuse_pass, + paddle::framework::ir::TransposeFlattenConcatFusePass<2>); + REGISTER_PASS(transpose_flatten3_concat_fuse_pass, paddle::framework::ir::TransposeFlattenConcatFusePass<3>); diff --git a/paddle/fluid/inference/anakin/convert/op_converter.h b/paddle/fluid/inference/anakin/convert/op_converter.h index 2eb7f24ce544c6..4603681e1e8a3c 100644 --- a/paddle/fluid/inference/anakin/convert/op_converter.h +++ b/paddle/fluid/inference/anakin/convert/op_converter.h @@ -117,27 +117,14 @@ class AnakinOpConverter { } temp_max_input_shape[input] = input_shape; engine->SetInputShape(input, input_shape); - // engine->Graph()->RegistVar(input); // For share from data. + engine->Graph()->RegistVar(input); // For share from data. } engine->SetMaxInputShape(temp_max_input_shape); - engine->Optimize(); + + // For anakin share with fluid tensor. 
+ engine->AllocTmpMem(); engine->InitGraph(); - /* - for(auto& input : inputs) { - platform::CUDAPlace gpu_place(engine->GetDevice()); - auto input_var = scope->Var(); - auto input_tensor = input_var->GetMutable(); - auto input_max_shape = temp_max_input_shape[input]; - input_tensor->Resize(framework::make_ddim(input_max_shape)); - auto input_data = input_tensor->mutable_data(gpu_place); - auto* anakin_input = engine->Net()->get_in(input); - - ::anakin::saber::Tensor<::anakin::saber::NV> tmp_anakin_tensor(input_data, - ::anakin::saber::NV(), 0, input_max_shape); - anakin_input->share_from(tmp_anakin_tensor); - } - */ } void SetEngine(AnakinNvEngine *engine) { engine_ = engine; } diff --git a/paddle/fluid/inference/anakin/engine.cc b/paddle/fluid/inference/anakin/engine.cc index 176bc1254b5517..39be7865149c70 100644 --- a/paddle/fluid/inference/anakin/engine.cc +++ b/paddle/fluid/inference/anakin/engine.cc @@ -71,6 +71,7 @@ void AnakinEngine::Execute( const std::map &inputs, const std::map &outputs, cudaStream_t stream) { + cudaDeviceSynchronize(); for (const auto &input : inputs) { auto *tensor = input.second; auto *data = tensor->data(); @@ -97,15 +98,14 @@ void AnakinEngine::Execute( anakin_input = net_->get_in(input.first); } */ - anakin_input->reshape(fluid_input_shape); ::anakin::saber::Tensor tmp_anakin_tensor(data, TargetT(), 0, fluid_input_shape); anakin_input->copy_from(tmp_anakin_tensor); } - cudaDeviceSynchronize(); net_->prediction(); + cudaDeviceSynchronize(); for (const auto &output : outputs) { platform::CUDAPlace gpu_place(device_); auto *tensor = output.second; diff --git a/paddle/fluid/inference/anakin/engine.h b/paddle/fluid/inference/anakin/engine.h index 3835ead1946823..4845ffdf5b9dcf 100644 --- a/paddle/fluid/inference/anakin/engine.h +++ b/paddle/fluid/inference/anakin/engine.h @@ -84,17 +84,20 @@ class AnakinEngine { int GetMaxBatchSize() { return max_batch_size_; } void Freeze(); void Optimize(); + void AllocTmpMem() { + 
PADDLE_ENFORCE(net_->alloc_memory_first(*graph_), + "anakin alloc temp memory first failed"); + } void Save(std::string path) { graph_->save(path); } + + bool IsInit() { return initialized_; } int GetDevice() { return device_; } - // void SaveSerializedData(std::string& data) { graph_->save_to_string(data); - // } - // void LoadSerializedData(const std::string& data) { - // graph_->load_from_string(data); } void Execute(const std::map &inputs, const std::map &outputs, cudaStream_t stream); private: + bool initialized_{false}; int max_batch_size_; std::map> max_input_shape_; int device_; diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc index cf02901d963858..f1bf4caed5d0be 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc @@ -74,6 +74,19 @@ T *ZeroCopyTensor::data(PaddlePlace *place, int *size) const { return res; } +PaddleDType ZeroCopyTensor::type() { + EAGER_GET_TENSOR; + auto type = tensor->type(); + if (type == framework::proto::VarType::FP32) { + return PaddleDType::FLOAT32; + } else if (type == framework::proto::VarType::INT64) { + return PaddleDType::INT64; + } else { + LOG(ERROR) << "unknown type, only support float32 and int64 now."; + } + return PaddleDType::FLOAT32; +} + template void ZeroCopyTensor::copy_from_cpu(const T *data) { EAGER_GET_TENSOR; @@ -119,6 +132,7 @@ void ZeroCopyTensor::copy_to_cpu(T *data) { static_cast(pool.Get(gpu_place)); memory::Copy(platform::CPUPlace(), static_cast(data), gpu_place, t_data, ele_num * sizeof(T), dev_ctx->stream()); + cudaDeviceSynchronize(); #else PADDLE_THROW("Not compile with CUDA, should not reach here."); #endif diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h index f807289f6aee06..382dee79bb6f6d 100644 --- a/paddle/fluid/inference/api/paddle_api.h +++ b/paddle/fluid/inference/api/paddle_api.h @@ -176,6 +176,8 @@ 
class ZeroCopyTensor { device_ = device; } + PaddleDType type(); + protected: explicit ZeroCopyTensor(void* scope) : scope_{scope} {} void SetName(const std::string& name) { name_ = name; } @@ -190,6 +192,7 @@ class ZeroCopyTensor { // performance. mutable void* tensor_{nullptr}; PaddlePlace place_; + PaddleDType dtype_; int device_; }; diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index f17d2ca17c2e5f..2982f0911ae889 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -99,12 +99,10 @@ GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) { #endif }); - /* - for (int i = 6; i >= 3; i--) { + for (int i = 6; i >= 2; i--) { passes_.push_back("transpose_flatten" + std::to_string(i) + "_concat_fuse_pass"); } - */ use_gpu_ = true; } diff --git a/paddle/fluid/operators/anakin/anakin_engine_op.h b/paddle/fluid/operators/anakin/anakin_engine_op.h index bbe9a221b2cae7..5da3cc1777625d 100644 --- a/paddle/fluid/operators/anakin/anakin_engine_op.h +++ b/paddle/fluid/operators/anakin/anakin_engine_op.h @@ -97,23 +97,7 @@ class AnakinEngineOp : public framework::OperatorBase { if (param_names_.count(x)) continue; auto &t = inference::analysis::GetFromScope(scope, x); - /* - auto t_shape = framework::vectorize(t.dims()); - auto *anakin_input = engine->Net()->get_in(x); - auto net_shape = anakin_input->shape(); - size_t anakin_net_input_size = net_shape.count() * sizeof(float); - size_t fluid_input_size = t.memory_size(); - - if (fluid_input_size < anakin_net_input_size) { - framework::LoDTensor temp_t; - auto t_dims = t.dims(); - temp_t.Resize(t_dims); - TensorCopySync(t, dev_place, &temp_t); - t.Resize(framework::make_ddim(net_shape)); - t.mutable_data(dev_place); - TensorCopySync(temp_t, dev_place, &t); - } - */ + inputs.insert({x, &t}); } @@ -136,6 +120,41 @@ class AnakinEngineOp : public framework::OperatorBase { 
inference::Singleton::Global() .Get(engine_key_); } + // BUG here: we have observed that the tensor data pointer can change between runs. + // Will fix it later. + /* + // To share the tensor with fluid, we do the net init in the first net + predict. + if (!anakin_engine_->IsInit()) { + auto temp_max_input_shape = anakin_engine_->GetMaxInputShape(); + anakin_engine_->AllocTmpMem(); + for(auto& input : Inputs("Xs")) { + if (param_names_.count(input)) continue; + platform::CUDAPlace + gpu_place(boost::get(dev_place).device); + auto *input_var = scope.FindVar(input); + auto input_tensor = input_var->GetMutable(); + auto input_max_shape = temp_max_input_shape[input]; + + framework::LoDTensor temp_t; + auto t_dims = input_tensor->dims(); + temp_t.Resize(t_dims); + TensorCopySync(*input_tensor, dev_place, &temp_t); + input_tensor->Resize(framework::make_ddim(input_max_shape)); + input_tensor->mutable_data(dev_place); + TensorCopySync(temp_t, dev_place, input_tensor); + + auto* input_data = input_tensor->mutable_data(gpu_place); + auto* anakin_input = anakin_engine_->Net()->get_in(input); + + ::anakin::saber::Tensor<::anakin::saber::NV> + tmp_anakin_tensor(input_data, + ::anakin::saber::NV(), 0, input_max_shape); + anakin_input->share_from(tmp_anakin_tensor); + } + anakin_engine_->InitGraph(); + } + */ return anakin_engine_; } diff --git a/paddle/fluid/operators/conv_fusion_op.cu.cc b/paddle/fluid/operators/conv_fusion_op.cu.cc index 64152829b4f000..52c83d8a03fc3f 100644 --- a/paddle/fluid/operators/conv_fusion_op.cu.cc +++ b/paddle/fluid/operators/conv_fusion_op.cu.cc @@ -100,6 +100,7 @@ class CUDNNConvFusionOpKernel { int64_t max_user_size = std::max(static_cast(FLAGS_conv_workspace_size_limit), user_workspace_size); + max_user_size = 64; workspace_size_limit = max_user_size * 1024 * 1024; }