PaddlePaddle · chengduoZH · Jul 23, 2019 · Jul 16, 2019 · Jul 16, 2019 · Jul 17, 2019
diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h
@@ -88,7 +88,7 @@ struct BuildStrategy {
   bool fuse_elewise_add_act_ops_{false};
   // Fuse_all_optimizer_ops and fuse_all_reduce_ops require that gradients
   // should not be sparse types
-  bool fuse_all_optimizer_ops_{false};
+  bool fuse_all_optimizer_ops_{true};
   bool fuse_all_reduce_ops_{false};
   // fuse_relu_depthwise_conv can fuse the `relu ->
   // depthwise_conv`

diff --git a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc
@@ -17,26 +17,14 @@
 #include "paddle/fluid/framework/details/container_cast.h"
 #include "paddle/fluid/framework/details/reduce_and_gather.h"
 #include "paddle/fluid/framework/details/variable_visitor.h"
+#include "paddle/fluid/platform/device_memory_aligment.h"
 #include "paddle/fluid/platform/profiler.h"
 
 DEFINE_bool(skip_fused_all_reduce_check, false, "");
 namespace paddle {
 namespace framework {
 namespace details {
 
-// Note(zcd): Addresses should be aligned, otherwise, the results may have
-// diff.
-static size_t Alignment(size_t size, const platform::Place &place) {
-  // Allow to allocate the minimum chunk size is 4 KB.
-  size_t alignment = 1 << 12;
-  if (platform::is_gpu_place(place)) {
-    // Allow to allocate the minimum chunk size is 256 B.
-    alignment = 1 << 8;
-  }
-  size_t remaining = size % alignment;
-  return remaining == 0 ? size : size + (alignment - remaining);
-}
-
 typedef std::vector<std::vector<std::pair<std::string, const LoDTensor *>>>
     GradientAndLoDTensor;
 
@@ -121,7 +109,7 @@ void FusedAllReduceOpHandle::RunImpl() {
     for (size_t k = 1; k < g_tensor.size(); ++k) {
       const void *cur_address = g_tensor.at(k - 1).second->data<void>();
       int64_t len = g_tensor.at(k - 1).second->numel();
-      auto offset = Alignment(len * size_of_dtype, places_[0]);
+      auto offset = platform::Alignment(len * size_of_dtype, places_[0]);
       void *infer_next_address = reinterpret_cast<void *>(
           reinterpret_cast<uintptr_t>(cur_address) + offset);
       const void *next_address = g_tensor.at(k).second->data<void>();
@@ -241,8 +229,8 @@ void FusedAllReduceOpHandle::GetDTypeAndNumel(
     // Get element number
     int64_t len = grad_tensor.at(i).second->numel();
     PADDLE_ENFORCE_GT(len, 0);
-    //    Alignment(len)
-    *numel += Alignment(len * size_of_dtype, places_[0]) / size_of_dtype;
+    *numel +=
+        platform::Alignment(len * size_of_dtype, places_[0]) / size_of_dtype;
   }
 }
 

diff --git a/paddle/fluid/framework/details/multi_devices_helper.h b/paddle/fluid/framework/details/multi_devices_helper.h
@@ -65,6 +65,9 @@ typedef std::vector<std::pair<std::string, std::string>> ParamsAndGrads;
 constexpr char kParamsAndDenseGrads[] = "params_and_dense_grads";
 constexpr char kParamsAndSparseGrads[] = "params_and_sparse_grads";
 
+typedef std::vector<ProgramDesc> ProgramDescs;
+constexpr char kProgramDescs[] = "program_descs";
+
 typedef std::vector<std::vector<std::pair<std::string, std::string>>>
     GroupParamsAndGrads;
 constexpr char kGroupParamsAndGrads[] = "group_params_grads";

diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
@@ -17,6 +17,8 @@
 #include <string>
 #include <utility>
 #include <vector>
+#include "paddle/fluid/framework/details/multi_devices_helper.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/variable_helper.h"
 #include "paddle/fluid/platform/profiler.h"
 
@@ -70,6 +72,29 @@ void ScopeBufferedSSAGraphExecutor::InitVariables() {
       InitializeVariable(pair.first, pair.second);
     }
   }
+
+  const ir::Graph &graph = Graph();
+  if (graph.Has(details::kProgramDescs)) {
+    auto &program_descs =
+        graph.Get<details::ProgramDescs>(details::kProgramDescs);
+    // Create every fused var as a LoDTensor in each local execution scope.
+    auto &fused_grad_vars = graph.Get<details::FusedVars>(details::kFusedVars);
+    for (size_t i = 0; i < local_exec_scopes_.size(); ++i) {
+      for (auto &var_name : fused_grad_vars) {
+        auto var = local_exec_scopes_[i]->Var(var_name);
+        var->GetMutable<LoDTensor>();
+      }
+    }
+
+    for (auto &program_desc : program_descs) {
+      for (auto &op_desc : program_desc.Block(0).AllOps()) {
+        for (size_t i = 0; i < local_exec_scopes_.size(); ++i) {
+          auto op = OpRegistry::CreateOp(*op_desc);
+          op->Run(*local_exec_scopes_[i], places_[i]);
+        }
+      }
+    }
+  }
 }
 
 void ScopeBufferedSSAGraphExecutor::DropLocalExeScopes() {

diff --git a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc
@@ -40,7 +40,7 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const {
     if (node->Op()->Type() == fuse_op_type) {
       auto grad_name = node->Op()->Input(kGrad);
       PADDLE_ENFORCE_EQ(grad_name.size(), static_cast<size_t>(1));
-      if (IsLoDTensorType(GettypeOfVar(vars_info, grad_name[0]))) {
+      if (IsLoDTensorType(GetTypeOfVar(vars_info, grad_name[0]))) {
         opt_nodes.emplace_back(node);
       }
       ++opt_ops_num;
@@ -61,6 +61,9 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const {
   }
   result.Set(details::kFusedOptType, new details::FusedOptType);
   result.Get<details::FusedOptType>(details::kFusedOptType) = fuse_op_type;
+  if (!result.Has(details::kProgramDescs)) {
+    result.Set(details::kProgramDescs, new details::ProgramDescs);
+  }
 
   // Step 2: Insert fused_var_name to FusedVars, and the FusedVars need be
   // initialized in scopes before execution.
@@ -153,7 +156,7 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const {
   }
   aux_var_names.pop_back();
   InitFusedVarsAndAllocSpaceForVars(places, local_scopes, aux_var_names,
-                                    aux_var_set, fused_vars_name);
+                                    aux_var_set, fused_vars_name, &result);
 
   // Step 5: Fuse optimizer Ops and Scale Ops
   FuseOptimizerOps(aux_var_set, fused_vars_name, opt_nodes, &result);
@@ -203,37 +206,21 @@ void FuseOptimizerOpPass::InitFusedGradsAndAllocSpaceForGrads(
     PADDLE_ENFORCE(iter != vars_info.end());
     PADDLE_ENFORCE(!iter->second.empty());
     PADDLE_ENFORCE_NOT_NULL(iter->second.front()->Var());
-    PADDLE_ENFORCE(
-        iter->second.front()->Var()->GetType() == proto::VarType::LOD_TENSOR,
-        "Currently the gradient type only should be LoDTensor when "
-        "fusing optimizer ops.");
+    PADDLE_ENFORCE(IsLoDTensorType(iter->second.front()->Var()->GetType()),
+                   "Currently the gradient type only should be LoDTensor when "
+                   "fusing optimizer ops.");
     for (auto var : iter->second) {
       var->Var()->SetPersistable(true);
     }
   }
 
-  // Init Grads
-  for (auto it = local_scopes.rbegin(); it != local_scopes.rend(); ++it) {
-    auto &scope = *it;
-    VLOG(6) << "Init: " << fused_grad_name;
-    PADDLE_ENFORCE(scope->FindVar(fused_grad_name) == nullptr,
-                   "%s has existed in scope.", fused_grad_name);
-    scope->Var(fused_grad_name)->GetMutable<LoDTensor>();
-    for (auto &grad_var_name : grads) {
-      auto iter = vars_info.find(grad_var_name);
-      PADDLE_ENFORCE(iter != vars_info.end());
-      PADDLE_ENFORCE(!iter->second.empty());
-      PADDLE_ENFORCE_NOT_NULL(iter->second.front()->Var());
-      scope->Var(grad_var_name)->GetMutable<LoDTensor>();
-    }
-  }
   // Define Ops
-  ProgramDesc program_desc;
+  result->Get<details::ProgramDescs>(details::kProgramDescs).emplace_back();
+  ProgramDesc &program_desc =
+      result->Get<details::ProgramDescs>(details::kProgramDescs).back();
   auto *global_block = program_desc.MutableBlock(0);
   AppendAllocContinuousSpace(params, grads, fused_grad_name, global_block,
                              false, false);
-  // Run Ops
-  RunInitOps(places, local_scopes, *global_block);
 }
 
 std::unordered_map<std::string, std::vector<Node *>>
@@ -255,7 +242,7 @@ bool FuseOptimizerOpPass::IsLoDTensorType(
   return type == proto::VarType::LOD_TENSOR;
 }
 
-proto::VarType::Type FuseOptimizerOpPass::GettypeOfVar(
+proto::VarType::Type FuseOptimizerOpPass::GetTypeOfVar(
     const std::unordered_map<std::string, std::vector<Node *>> &var_nodes,
     const std::string &name) const {
   auto grad_iter = var_nodes.find(name);
@@ -271,47 +258,18 @@ void FuseOptimizerOpPass::InitFusedVarsAndAllocSpaceForVars(
     const std::vector<std::string> &aux_var_names,
     const std::unordered_map<std::string, std::vector<std::string>>
         &aux_var_set,
-    const std::unordered_map<std::string, std::string> &fused_vars_name) const {
-  // Init Vars
-  for (auto &var_name : aux_var_names) {
-    auto &fused_var_name = fused_vars_name.at(var_name);
-    InitVars(local_scopes, fused_var_name);
-  }
+    const std::unordered_map<std::string, std::string> &fused_vars_name,
+    ir::Graph *result) const {
   // Define Ops
-  ProgramDesc program_desc;
+  result->Get<details::ProgramDescs>(details::kProgramDescs).emplace_back();
+  ProgramDesc &program_desc =
+      result->Get<details::ProgramDescs>(details::kProgramDescs).back();
   auto *global_block = program_desc.MutableBlock(0);
   for (auto &var_name : aux_var_names) {
     AppendAllocContinuousSpace(
         aux_var_set.at(var_name), aux_var_set.at(var_name),
         fused_vars_name.at(var_name), global_block, true);
   }
-  // Run Ops
-  RunInitOps(places, local_scopes, *global_block);
-}
-
-void FuseOptimizerOpPass::RunInitOps(const std::vector<platform::Place> &places,
-                                     const std::vector<Scope *> &local_scopes,
-                                     const BlockDesc &global_block) const {
-  for (size_t i = 0; i < local_scopes.size(); ++i) {
-    for (auto &op_desc : global_block.AllOps()) {
-      auto op = OpRegistry::CreateOp(*op_desc);
-      op->Run(*local_scopes[i], places[i]);
-    }
-  }
-}
-
-void FuseOptimizerOpPass::InitVars(const std::vector<Scope *> &local_scopes,
-                                   const std::string &fused_var_name) const {
-  // Alloc parameters and auxiliary vars in the respective scope.
-  size_t idx = local_scopes.size();
-  for (auto iter = local_scopes.rbegin(); iter != local_scopes.rend();
-       ++iter, --idx) {
-    auto &scope = *iter;
-    VLOG(6) << "Init: " << fused_var_name;
-    PADDLE_ENFORCE(scope->FindVar(fused_var_name) == nullptr,
-                   "%s has exist in scope[%d]", fused_var_name, idx);
-    scope->Var(fused_var_name)->GetMutable<LoDTensor>();
-  }
 }
 
 void FuseOptimizerOpPass::SortParametersAndAuxVars(

diff --git a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.h b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.h
@@ -79,20 +79,13 @@ class FuseOptimizerOpPass : public ir::Pass {
       const std::vector<std::string> &aux_var_names,
       const std::unordered_map<std::string, std::vector<std::string>>
           &aux_var_set,
-      const std::unordered_map<std::string, std::string> &fused_vars_name)
-      const;
-
-  void RunInitOps(const std::vector<platform::Place> &places,
-                  const std::vector<Scope *> &local_scopes,
-                  const BlockDesc &global_block) const;
-
-  void InitVars(const std::vector<Scope *> &local_scopes,
-                const std::string &fused_var_name) const;
+      const std::unordered_map<std::string, std::string> &fused_vars_name,
+      ir::Graph *result) const;
 
   std::unordered_map<std::string, std::vector<Node *>> GetVarInfo(
       const Graph &result) const;
 
-  proto::VarType::Type GettypeOfVar(
+  proto::VarType::Type GetTypeOfVar(
       const std::unordered_map<std::string, std::vector<Node *>> &var_nodes,
       const std::string &name) const;
 

diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
@@ -88,6 +88,7 @@ set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code
 if (WITH_GPU)
   set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv prelu)
 endif()
+set(COMMON_OP_DEPS ${COMMON_OP_DEPS} device_memory_aligment)
 
 # FIXME(typhoonzero): operator deps may not needed.
 # op_library(lod_tensor_to_array_op DEPS lod_rank_table_op)

diff --git a/paddle/fluid/operators/coalesce_tensor_op.cc b/paddle/fluid/operators/coalesce_tensor_op.cc
@@ -18,6 +18,7 @@
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/var_type.h"
 #include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/platform/device_memory_aligment.h"
 
 namespace paddle {
 namespace operators {
@@ -86,8 +87,8 @@ class CoalesceTensorOp : public framework::OpKernel<T> {
         framework::TensorCopy(*in_tensors[i], context.GetPlace(), dev_ctx,
                               &sub_tensor);
 
-        offset +=
-            Alignment(len * size_of_dtype, context.GetPlace()) / size_of_dtype;
+        offset += platform::Alignment(len * size_of_dtype, context.GetPlace()) /
+                  size_of_dtype;
       }
     } else if (context.Attr<bool>("set_constant")) {
       math::SetConstant<DeviceContext, T> set_constant;
@@ -106,7 +107,8 @@ class CoalesceTensorOp : public framework::OpKernel<T> {
           ->ShareDataWith(fused_tensor->Slice(
               static_cast<int64_t>(offset), static_cast<int64_t>(offset + len)))
           .Resize(dim);
-      len = Alignment(len * size_of_dtype, context.GetPlace()) / size_of_dtype;
+      len = platform::Alignment(len * size_of_dtype, context.GetPlace()) /
+            size_of_dtype;
       offset += len;
       ss << "output(" << out_var_names[i] << ")  dim:(" << dim << ")"
          << " address: " << out_tensors[i]->data<void>() << ", ";
@@ -115,19 +117,6 @@ class CoalesceTensorOp : public framework::OpKernel<T> {
   }
 
  private:
-  // Note(zcd): Addresses should be aligned, otherwise, the results may have
-  // diff.
-  size_t Alignment(size_t size, const platform::Place &place) const {
-    // Allow to allocate the minimum chunk size is 4 KB.
-    size_t alignment = 1 << 12;
-    if (platform::is_gpu_place(place)) {
-      // Allow to allocate the minimum chunk size is 256 B.
-      alignment = 1 << 8;
-    }
-    size_t remaining = size % alignment;
-    return remaining == 0 ? size : size + (alignment - remaining);
-  }
-
   void GetMemSizeAndDtype(
       const std::vector<const framework::LoDTensor *> &lod_tensors,
       const std::vector<std::string> var_names, size_t *numel,
@@ -156,7 +145,8 @@ class CoalesceTensorOp : public framework::OpKernel<T> {
       PADDLE_ENFORCE_GT(size, 0);
       ss << "input(" << var_names[i] << ") dim:(" << lod_tensors[i]->dims()
          << "), ";
-      *numel += Alignment(static_cast<size_t>(size) * size_of_dtype, place) /
+      *numel += platform::Alignment(static_cast<size_t>(size) * size_of_dtype,
+                                    place) /
                 size_of_dtype;
     }
 

diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt
@@ -102,17 +102,17 @@ cc_test(lodtensor_printer_test SRCS lodtensor_printer_test.cc DEPS lodtensor_pri
 cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto ${GPU_CTX_DEPS})
 if(WITH_GPU)
   nv_library(profiler SRCS profiler.cc profiler.cu DEPS device_tracer gpu_info enforce)
+  nv_test(cuda_helper_test SRCS cuda_helper_test.cu)
+  nv_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info gpu_info place)
 else()
-    cc_library(profiler SRCS profiler.cc DEPS device_tracer enforce)
+  cc_library(profiler SRCS profiler.cc DEPS device_tracer enforce)
+  cc_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info place)
 endif()
 cc_test(profiler_test SRCS profiler_test.cc DEPS profiler)
 
 nv_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor)
 cc_test(float16_test SRCS float16_test.cc DEPS lod_tensor)
 
-IF(WITH_GPU)
-  nv_test(cuda_helper_test SRCS cuda_helper_test.cu)
-ENDIF()
 nv_library(cuda_device_guard SRCS cuda_device_guard.cc DEPS gpu_info)
 
 if(WITH_GPU)

diff --git a/paddle/fluid/platform/device_memory_aligment.cc b/paddle/fluid/platform/device_memory_aligment.cc
@@ -0,0 +1,34 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/platform/device_memory_aligment.h"
+
+namespace paddle {
+namespace platform {
+// Rounds `size` up to the next multiple of the minimum chunk size: the CPU one
+// for CPU places, the GPU one otherwise (throws if built without CUDA).
+size_t Alignment(size_t size, const platform::Place &place) {
+  const bool on_cpu = platform::is_cpu_place(place);
+#ifdef PADDLE_WITH_CUDA
+  const size_t chunk = on_cpu ? CpuMinChunkSize() : GpuMinChunkSize();
+#else
+  if (!on_cpu) PADDLE_THROW("Fluid is not compiled with CUDA");
+  const size_t chunk = CpuMinChunkSize();
+#endif
+  const size_t tail = size % chunk;
+  // A zero tail means `size` is already aligned; otherwise pad up to the chunk.
+  return tail == 0 ? size : size + (chunk - tail);
+}
+}  // namespace platform
+}  // namespace paddle