Skip to content
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion paddle/fluid/framework/details/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -95,5 +95,5 @@ cc_library(build_strategy SRCS build_strategy.cc DEPS
fuse_elewise_add_act_pass multi_batch_merge_pass
fuse_relu_depthwise_conv_pass
memory_optimize_pass lock_free_optimize_pass
alloc_continuous_space_for_grad_pass fuse_all_reduce_op_pass backward_optimizer_op_deps_pass
coalesce_grad_tensor_pass fuse_all_reduce_op_pass backward_optimizer_op_deps_pass
fuse_adam_op_pass fuse_sgd_op_pass fuse_momentum_op_pass record_skip_memory_opt_vars_pass)
12 changes: 6 additions & 6 deletions paddle/fluid/framework/details/build_strategy.cc
Original file line number Diff line number Diff line change
Expand Up @@ -108,10 +108,10 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
}

// for single card training, fuse_all_reduce_ops is unnecessary.
// alloc_continuous_space_for_grad_pass should be before of MultiDevPass.
// coalesce_grad_tensor_pass should be before of MultiDevPass.
if (strategy_.fuse_all_reduce_ops_) {
VLOG(1) << "Add alloc_continuous_space_for_grad_pass";
AppendPass("alloc_continuous_space_for_grad_pass");
VLOG(1) << "Add coalesce_grad_tensor_pass";
AppendPass("coalesce_grad_tensor_pass");
}

if (strategy_.fuse_all_optimizer_ops_) {
Expand Down Expand Up @@ -301,7 +301,7 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph,
pass->Erase(kNCCLCtxs);
pass->SetNotOwned<platform::NCCLCommunicator>(kNCCLCtxs, nctx);
#endif
} else if (pass->Type() == "alloc_continuous_space_for_grad_pass" ||
} else if (pass->Type() == "coalesce_grad_tensor_pass" ||
pass->Type() == "fuse_adam_op_pass" ||
pass->Type() == "fuse_sgd_op_pass" ||
pass->Type() == "fuse_momentum_op_pass" ||
Expand All @@ -321,7 +321,7 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph,
new bool(use_hierarchical_allreduce_));
#endif
}
} else if (pass->Type() == "alloc_continuous_space_for_grad_pass") {
} else if (pass->Type() == "coalesce_grad_tensor_pass") {
pass->Erase(kPlaces);
pass->SetNotOwned<const std::vector<platform::Place>>(kPlaces, &places);
pass->Erase(kLocalScopes);
Expand Down Expand Up @@ -389,7 +389,7 @@ USE_PASS(backward_optimizer_op_deps_pass);
USE_PASS(modify_op_lock_and_record_event_pass);
USE_PASS(inplace_pass);
USE_PASS(lock_free_optimize_pass);
USE_PASS(alloc_continuous_space_for_grad_pass);
USE_PASS(coalesce_grad_tensor_pass);
USE_PASS(graph_to_program_pass);
USE_PASS(fuse_adam_op_pass);
USE_PASS(fuse_sgd_op_pass);
Expand Down
20 changes: 4 additions & 16 deletions paddle/fluid/framework/details/fused_all_reduce_op_handle.cc
Original file line number Diff line number Diff line change
Expand Up @@ -17,26 +17,14 @@
#include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/reduce_and_gather.h"
#include "paddle/fluid/framework/details/variable_visitor.h"
#include "paddle/fluid/platform/device_memory_aligment.h"
#include "paddle/fluid/platform/profiler.h"

DEFINE_bool(skip_fused_all_reduce_check, false, "");
namespace paddle {
namespace framework {
namespace details {

// Note(zcd): Buffer addresses must be aligned; otherwise the fused
// all-reduce results may differ from the unfused ones.
static size_t Alignment(size_t size, const platform::Place &place) {
  // Minimum allocation chunk: 4 KB on CPU, 256 B on GPU.
  const size_t alignment =
      platform::is_gpu_place(place) ? (1 << 8) : (1 << 12);
  const size_t remainder = size % alignment;
  // Round `size` up to the next multiple of `alignment`.
  return remainder == 0 ? size : size + (alignment - remainder);
}

typedef std::vector<std::vector<std::pair<std::string, const LoDTensor *>>>
GradientAndLoDTensor;

Expand Down Expand Up @@ -121,7 +109,7 @@ void FusedAllReduceOpHandle::RunImpl() {
for (size_t k = 1; k < g_tensor.size(); ++k) {
const void *cur_address = g_tensor.at(k - 1).second->data<void>();
int64_t len = g_tensor.at(k - 1).second->numel();
auto offset = Alignment(len * size_of_dtype, places_[0]);
auto offset = platform::Alignment(len * size_of_dtype, places_[0]);
void *infer_next_address = reinterpret_cast<void *>(
reinterpret_cast<uintptr_t>(cur_address) + offset);
const void *next_address = g_tensor.at(k).second->data<void>();
Expand Down Expand Up @@ -241,8 +229,8 @@ void FusedAllReduceOpHandle::GetDTypeAndNumel(
// Get element number
int64_t len = grad_tensor.at(i).second->numel();
PADDLE_ENFORCE_GT(len, 0);
// Alignment(len)
*numel += Alignment(len * size_of_dtype, places_[0]) / size_of_dtype;
*numel +=
platform::Alignment(len * size_of_dtype, places_[0]) / size_of_dtype;
}
}

Expand Down
6 changes: 5 additions & 1 deletion paddle/fluid/framework/details/multi_devices_helper.h
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,11 @@ typedef std::vector<std::string> FusedGrads;
constexpr char kFusedGrads[] = "fused_gradients";

typedef std::vector<std::pair<std::string, std::string>> ParamsAndGrads;
constexpr char kParamsAndGrads[] = "params_grads";
constexpr char kParamsAndDenseGrads[] = "params_and_dense_grads";
constexpr char kParamsAndSparseGrads[] = "params_and_sparse_grads";

typedef std::vector<ProgramDesc> ProgramDescs;
constexpr char kProgramDescs[] = "program_descs";

typedef std::vector<std::vector<std::pair<std::string, std::string>>>
GroupParamsAndGrads;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
#include <string>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/variable_helper.h"
#include "paddle/fluid/platform/profiler.h"

Expand Down Expand Up @@ -70,6 +72,29 @@ void ScopeBufferedSSAGraphExecutor::InitVariables() {
InitializeVariable(pair.first, pair.second);
}
}

const ir::Graph &graph = Graph();
if (graph.Has(details::kProgramDescs)) {
auto &program_descs =
graph.Get<details::ProgramDescs>(details::kProgramDescs);
// Init vars
auto &fused_grad_vars = graph.Get<details::FusedVars>(details::kFusedVars);
for (size_t i = 0; i < local_exec_scopes_.size(); ++i) {
for (auto &var_name : fused_grad_vars) {
auto var = local_exec_scopes_[i]->Var(var_name);
var->GetMutable<LoDTensor>();
}
}

for (auto &program_desc : program_descs) {
for (auto &op_desc : program_desc.Block(0).AllOps()) {
for (size_t i = 0; i < local_exec_scopes_.size(); ++i) {
auto op = OpRegistry::CreateOp(*op_desc);
op->Run(*local_exec_scopes_[i], places_[i]);
}
}
}
}
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Define the fused variables in the local execution scope.
Some models have more than one program, and those programs may share parameters. Under the previous strategy, the gradients of those shared parameters were shared across programs as well, which is problematic. Therefore we now define the fused gradient variables in the local execution scope instead.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It may be better to copy these lines into a code comment.
Also, which unit test covers this change?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

}

void ScopeBufferedSSAGraphExecutor::DropLocalExeScopes() {
Expand Down
2 changes: 1 addition & 1 deletion paddle/fluid/framework/ir/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ cc_library(graph_traits SRCS graph_traits.cc DEPS graph)
cc_library(graph_pattern_detector SRCS graph_pattern_detector.cc DEPS graph graph_helper graph_traits)
cc_library(fuse_pass_base SRCS fuse_pass_base.cc DEPS pass)

cc_library(alloc_continuous_space_for_grad_pass SRCS alloc_continuous_space_for_grad_pass.cc DEPS graph graph_helper)
cc_library(coalesce_grad_tensor_pass SRCS coalesce_grad_tensor_pass.cc DEPS graph graph_helper)

pass_library(graph_to_program_pass base)
pass_library(graph_viz_pass base)
Expand Down
Loading