Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
58a4f9d
add fuse_grad_op
Jan 23, 2019
a65d5a7
Add Fused AllReduceOpHandle
Jan 24, 2019
c98880e
add unit test
Jan 25, 2019
faffa13
Fuse Gradient Space
Jan 28, 2019
ce7a584
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
Jan 28, 2019
0ea6bee
enable fuse all_reduce
Jan 28, 2019
59247a8
refine fused_all_reduce_op_handle
Jan 29, 2019
e8cbcb7
Refine alloc_space_for_var_op
Jan 29, 2019
e5cc50b
polish code
Jan 29, 2019
f0ec72e
remove ophandle from graph attribute
Jan 30, 2019
d7d582d
NeedCollectiveOps
Jan 31, 2019
aabe6cf
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
Feb 1, 2019
b24acdc
refine NeedCollectiveOps
Feb 1, 2019
20c7364
Fuse Adam Pass
Jan 31, 2019
e4008e2
Fix bug
Feb 2, 2019
3debcf3
Add RunOnlyOnceProgram
Feb 3, 2019
7bd42bd
polish code
Feb 12, 2019
54d21e9
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
Feb 12, 2019
23f9ce3
polish code
Feb 12, 2019
ee07038
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
Feb 12, 2019
c527f36
fix lambda func
Feb 13, 2019
80b439f
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
Feb 13, 2019
b206305
mv alloc_space_for_vars_op.cc to alloc_continuous_space_for_grad_op.cc
Feb 13, 2019
cec9ef4
mv fuse_gradient_space_pass.cc => alloc_continuous_space_for_grad_pas…
Feb 13, 2019
544bdfe
Add FuseOptimizerOpPass
Feb 13, 2019
195ce18
Add sgd_fuse_pass
Feb 13, 2019
fcb9c81
Polish code
Feb 13, 2019
57fc341
create fuse_all_reduce_op_pass
Feb 13, 2019
46cccba
Polish code
Feb 17, 2019
97be9b6
Add group for gradient
Feb 17, 2019
79dff0b
fix bug
Feb 18, 2019
6696b7c
Polish code
Feb 19, 2019
58df23b
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
Feb 19, 2019
e136462
Add block for OpDesc
Feb 19, 2019
82210b1
add test_alloc_continuous_space_op
Feb 20, 2019
487d31e
Polish code
Feb 20, 2019
f450247
Add alloc_continuous_params_pass
Feb 20, 2019
57b2479
add test_alloc_continuous_param_pass.py
Feb 20, 2019
c8f1316
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
Feb 24, 2019
71a314d
remove alloc_continuous_space_for_grad_op.
Feb 25, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 16 additions & 3 deletions paddle/fluid/framework/details/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,12 @@ cc_library(rpc_op_handle SRCS rpc_op_handle.cc DEPS framework_proto scope place
cc_library(multi_devices_helper SRCS multi_devices_helper.cc DEPS graph graph_helper)
cc_library(multi_devices_graph_print_pass SRCS multi_devices_graph_print_pass.cc DEPS multi_devices_helper)
cc_library(multi_devices_graph_check_pass SRCS multi_devices_graph_check_pass.cc DEPS multi_devices_helper)
cc_library(alloc_continuous_space_for_grad_pass SRCS alloc_continuous_space_for_grad_pass.cc DEPS graph graph_helper)

cc_library(fuse_adam_op_pass SRCS fuse_adam_op_pass.cc fuse_optimizer_op_pass.cc DEPS graph graph_helper)
cc_library(fuse_sgd_op_pass SRCS fuse_sgd_op_pass.cc fuse_optimizer_op_pass.cc DEPS graph graph_helper)

cc_library(fuse_parameters_pass SRCS fuse_parameters_pass.cc DEPS graph graph_helper)

cc_library(variable_visitor SRCS variable_visitor.cc DEPS lod_tensor selected_rows)

Expand All @@ -22,6 +28,8 @@ endif()
if(WITH_GPU)
nv_library(all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
dynload_cuda variable_visitor)
nv_library(fused_all_reduce_op_handle SRCS fused_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
dynload_cuda variable_visitor)
if(WITH_DISTRIBUTE)
nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope
ddim dynload_cuda selected_rows_functor sendrecvop_rpc)
Expand All @@ -35,6 +43,8 @@ if(WITH_GPU)
else()
cc_library(all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
variable_visitor)
cc_library(fused_all_reduce_op_handle SRCS fused_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
variable_visitor)
if(WITH_DISTRIBUTE)
cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope
ddim selected_rows_functor sendrecvop_rpc)
Expand Down Expand Up @@ -68,7 +78,10 @@ cc_library(sequential_execution_pass SRCS sequential_execution_pass.cc DEPS grap
cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS graph graph_helper pass)

cc_library(multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle
scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle fused_broadcast_op_handle)
scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle
data_balance_op_handle fused_broadcast_op_handle)

cc_library(fuse_all_reduce_op_pass SRCS fuse_all_reduce_op_pass.cc DEPS graph graph_helper fused_all_reduce_op_handle)

set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass reference_count_pass eager_deletion_pass memory_optimize_pass inplace_op_pass)
if (WITH_GPU)
Expand Down Expand Up @@ -97,5 +110,5 @@ cc_library(build_strategy SRCS build_strategy.cc DEPS
graph_viz_pass multi_devices_graph_pass
multi_devices_graph_print_pass multi_devices_graph_check_pass
fuse_elewise_add_act_pass multi_batch_merge_pass
fuse_relu_depthwise_conv_pass
memory_optimize_pass lock_free_optimize_pass)
fuse_relu_depthwise_conv_pass fuse_parameters_pass
memory_optimize_pass lock_free_optimize_pass alloc_continuous_space_for_grad_pass fuse_adam_op_pass fuse_sgd_op_pass fuse_all_reduce_op_pass)
213 changes: 213 additions & 0 deletions paddle/fluid/framework/details/alloc_continuous_space_for_grad_pass.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,213 @@
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <algorithm>
#include <string>
#include <utility>
#include <vector>

#include "paddle/fluid/framework/details/build_strategy.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/op_registry.h"

namespace paddle {
namespace framework {
namespace details {

class AllocContinuousSpaceForGradPass : public ir::Pass {
protected:
// Collects every (parameter, gradient) pair produced by backward ops,
// groups the gradients, marks them persistable, and records a run-once
// program containing an alloc_continuous_space op for them.
std::unique_ptr<ir::Graph> ApplyImpl(
    std::unique_ptr<ir::Graph> graph) const override {
  ir::Graph& result = *graph;

  // Start from a clean slate so re-running this pass is idempotent.
  ResetAttribute<ParamsAndGrads>(kParamsAndGrads, &result);
  ResetAttribute<GroupGradsAndParams>(kGroupGradsAndParams, &result);

  // NOTE: The operator nodes should be in topology order.
  auto& params_grads = result.Get<ParamsAndGrads>(kParamsAndGrads);
  for (ir::Node* op_node : ir::TopologySortOperations(result)) {
    RecordParamsAndGrads(op_node, &params_grads);
  }

  // Map variable name -> one representative node. The graph may hold
  // several nodes with the same name (e.g. a parameter is both an op
  // input and an optimizer output); emplace keeps the first one seen.
  std::unordered_map<std::string, ir::Node*> vars;
  for (ir::Node* node : result.Nodes()) {
    if (node->IsVar() && node->Var()) {
      vars.emplace(node->Var()->Name(), node);
    }
  }

  // Note: Sort the parameters and gradient variables according
  // to parameters' name to make variables' name correspond correctly.
  auto& group_params_grads =
      result.Get<GroupGradsAndParams>(kGroupGradsAndParams);
  SortParamsAndGrads(vars, &params_grads);
  SetGroupGradsAndParams(vars, params_grads, &group_params_grads);

  // Set gradients persistable so later passes never reuse their memory,
  // and verify all gradients share a single dtype. The value 0 casts to
  // VarType BOOL, which a gradient dtype is never expected to be, so it
  // serves as the "unset" sentinel here.
  auto dtype = static_cast<proto::VarType::Type>(0);
  for (auto& p_g : params_grads) {
    // Locate the gradient variable node by name.
    auto iter = vars.find(p_g.second);
    PADDLE_ENFORCE(iter != vars.end(), "%s is not found.", p_g.second);
    iter->second->Var()->SetPersistable(true);

    PADDLE_ENFORCE(IsSupportedVarType(iter->second->Var()->GetType()));

    auto ele_dtype = iter->second->Var()->GetDataType();
    if (dtype == static_cast<proto::VarType::Type>(0)) {
      dtype = ele_dtype;
      PADDLE_ENFORCE_NE(ele_dtype, static_cast<proto::VarType::Type>(0));
    }
    PADDLE_ENFORCE_EQ(ele_dtype, dtype);
  }

  // Split the pairs into two parallel name lists for the op's inputs.
  std::vector<std::string> params_name;
  std::vector<std::string> grads_name;
  params_name.reserve(params_grads.size());
  grads_name.reserve(params_grads.size());
  for (auto& p_g : params_grads) {
    params_name.emplace_back(p_g.first);
    grads_name.emplace_back(p_g.second);
  }

  // The prefix makes the fused variable recognizable by name and avoids
  // clashes with existing variable names.
  const std::string prefix(kFusedVarNamePrefix);
  auto fused_var_name = prefix + "_GRAD";
  if (!result.Has(kFusedVars)) {
    result.Set(kFusedVars, new FusedVars);
  }
  result.Get<FusedVars>(kFusedVars).emplace_back(fused_var_name);

  // Insert alloc_continuous_space_for_grad into RunOnlyOnceProgram,
  // which is executed before running the model with ParallelExecutor.
  if (!result.Has(kRunOnlyOnceProgram)) {
    result.Set(kRunOnlyOnceProgram, new RunOnlyOnceProgram);
  }
  result.Get<RunOnlyOnceProgram>(kRunOnlyOnceProgram).emplace_back();
  auto& program_desc =
      result.Get<RunOnlyOnceProgram>(kRunOnlyOnceProgram).back();
  auto* global_block = program_desc.MutableBlock(0);

  AppendAllocSpaceForVarsOp(params_name, grads_name, fused_var_name,
                            global_block);

  return std::move(graph);
}

// Replaces any pre-existing graph attribute `attr_name` with a fresh,
// default-constructed AttrType. Erasing first keeps the result
// identical no matter how many times the pass is applied.
template <typename AttrType>
void ResetAttribute(const std::string& attr_name, ir::Graph* graph) const {
  const bool already_present = graph->Has(attr_name);
  if (already_present) {
    VLOG(10) << attr_name << " is reset.";
    graph->Erase(attr_name);
  }
  graph->Set(attr_name, new AttrType);
}

// Currently a deliberate no-op: the (param, grad) pairs keep the order
// in which RecordParamsAndGrads collected them (topological order of
// the backward ops). A name-based std::sort used to live here but was
// disabled; the dead commented-out code has been removed. Parameters
// are kept so existing call sites stay unchanged.
// TODO(zcd): Remove this hook entirely once it is confirmed the sort
// is not needed.
void SortParamsAndGrads(
    const std::unordered_map<std::string, ir::Node*>& var_nodes,
    ParamsAndGrads* params_grads) const {}

// Partitions params_grads into fixed-size groups, reversing each pair
// to (grad, param) order in the output. Logs every group at VLOG(10).
// `var_nodes` is currently unused but kept for interface stability.
void SetGroupGradsAndParams(
    const std::unordered_map<std::string, ir::Node*>& var_nodes,
    const ParamsAndGrads& params_grads,
    GroupGradsAndParams* group_params_grads) const {
  // Grouping factor: 3 is an experimentally chosen value.
  const size_t group_size = 3;
  const size_t num_groups =
      (params_grads.size() + group_size - 1) / group_size;
  group_params_grads->reserve(num_groups);

  size_t idx = 0;
  for (size_t group = 0; group < num_groups; ++group) {
    group_params_grads->emplace_back();
    auto& cur_group = group_params_grads->back();
    cur_group.reserve(group_size);
    VLOG(10) << "Group:" << group;
    std::stringstream out;
    while (idx < params_grads.size()) {
      const auto& pg = params_grads.at(idx);
      // Output pairs are stored grad-first.
      cur_group.emplace_back(
          std::make_pair(pg.second /*grad*/, pg.first /*param*/));
      out << pg.second << "[" << pg.first << "] ";
      ++idx;
      if (idx % group_size == 0) break;
    }
    VLOG(10) << out.str();
  }
}

private:
// Only LOD_TENSOR variables are currently supported for fusion.
bool IsSupportedVarType(const proto::VarType::Type& type) const {
  switch (type) {
    case proto::VarType::LOD_TENSOR:
      return true;
    default:
      return false;
  }
}

// Appends one alloc_continuous_space op to `global_block`, wiring
// `params_name` as Input, `grads_name` as Output, and the single
// `fused_var_name` as FusedOutput.
void AppendAllocSpaceForVarsOp(const std::vector<std::string>& params_name,
                               const std::vector<std::string>& grads_name,
                               const std::string& fused_var_name,
                               BlockDesc* global_block) const {
  auto* alloc_op = global_block->AppendOp();
  alloc_op->SetType("alloc_continuous_space");
  alloc_op->SetInput("Input", params_name);
  alloc_op->SetOutput("Output", grads_name);
  alloc_op->SetOutput("FusedOutput", {fused_var_name});
}

// Appends the (parameter, gradient) name pairs carried by a backward
// op node's OpRoleVar attribute to *params_grads. Nodes whose
// attributes have an unexpected type throw boost::bad_get, which is
// swallowed on purpose: such nodes are simply not backward ops with
// role-var info and carry nothing to record.
void RecordParamsAndGrads(ir::Node* node,
                          ParamsAndGrads* params_grads) const {
  try {
    bool is_bk_op =
        static_cast<bool>(boost::get<int>(node->Op()->GetAttr(
                              OpProtoAndCheckerMaker::OpRoleAttrName())) &
                          static_cast<int>(OpRole::kBackward));
    if (!is_bk_op) return;

    // Currently, we assume that once gradient is generated, it can be
    // broadcast, and each gradient is only broadcast once.
    auto backward_vars =
        boost::get<std::vector<std::string>>(node->Op()->GetNullableAttr(
            OpProtoAndCheckerMaker::OpRoleVarAttrName()));
    // The attribute is a flat [param0, grad0, param1, grad1, ...] list.
    PADDLE_ENFORCE_EQ(backward_vars.size() % 2, static_cast<size_t>(0));

    for (size_t i = 0; i < backward_vars.size(); i += 2) {
      VLOG(10) << "Trainable parameter: " << backward_vars[i]
               << ", gradient: " << backward_vars[i + 1];

      // emplace_back constructs the pair in place (was
      // emplace_back(std::make_pair(...)), an extra temporary).
      params_grads->emplace_back(backward_vars[i] /*param*/,
                                 backward_vars[i + 1] /*grad*/);
    }
  } catch (const boost::bad_get& e) {
    // Catch by const reference (was by value, which copies the
    // exception and risks slicing); intentionally ignored.
  }
}
};

} // namespace details
} // namespace framework
} // namespace paddle

REGISTER_PASS(alloc_continuous_space_for_grad_pass,
paddle::framework::details::AllocContinuousSpaceForGradPass);
Loading