Merged
34 commits
fe240d9
fix dataset reading and add support for full dataset (#16559)
Apr 2, 2019
627ca4a
fix repeating passes (#16606)
Apr 2, 2019
bbb654e
fix preprocess script with processbar, integrity check and logs (#16608)
lidanqing-vv Apr 2, 2019
d3b6291
Merge pull request #16643 from sfraczek/fix-repeating-passes
luotao1 Apr 3, 2019
46e18cb
Merge pull request #16634 from junjun315/my-cool-stuff
junjun315 Apr 3, 2019
7bcb070
merge confict, test=release/1.4
junjun315 Apr 3, 2019
4cc6144
[Cherry-pick]Fix the bug of all_reduce_deps_pass (#16648)
Apr 3, 2019
ab6600e
Fix bug of FastThreadedExecutor (#16666)
Apr 4, 2019
af07056
Merge pull request #16671 from junjun315/cherry-pick-fix-load
junjun315 Apr 4, 2019
53c6890
test_analyzer_int8 tests use default pass order
luotao1 Apr 3, 2019
1e21e8b
Merge pull request #16611 from Shixiaowei02/release/1.4
Shixiaowei02 Apr 4, 2019
c6e6cdc
Merge pull request #16681 from luotao1/release/1.4_int8_cache
luotao1 Apr 4, 2019
2b80092
fix avx option (#16684)
tensor-tang Apr 4, 2019
2ebc20f
Cherry-pick #16515 INT8v2 readme to Release 1.4 (#16686)
bingyanghuang Apr 5, 2019
7964366
fix batchmerge (#16626)
gongweibao Apr 7, 2019
065ffcc
fix dgcclipnorm bug test=develop (#16629)
gongweibao Apr 7, 2019
89d09b8
Cherry pick 1.4/ptb fix (#16607)
JiabinYang Apr 8, 2019
29f3441
Cherry pick/fix transformer (#16620)
JiabinYang Apr 8, 2019
7e56055
Imperative fix bugs (cherry pick to 1.4) (#16680)
velconia Apr 8, 2019
266cdf7
Fix dgc bug. (#16709)
gongweibao Apr 9, 2019
a0af374
fix training validation test=release/1.4 (#16716)
baojun-nervana Apr 9, 2019
44f50cf
test=release/1.4, fix test_impertative_transformer (#16708)
JiabinYang Apr 9, 2019
c7cca0a
Fix auto growth bug of optimizer in dygraph mode
velconia Apr 9, 2019
cb9c59b
cherry-pick PR 16547,16736,16739 test=release/1.4 (#16748)
liupluswei Apr 10, 2019
b90c08c
merge conflict, test=release/1.4
junjun315 Apr 10, 2019
b07584d
test=release/1.4, refine test_imperative_transformer (#16737)
JiabinYang Apr 10, 2019
df339c0
Fix mistakes in dgc document. (#16731)
gongweibao Apr 10, 2019
4914da1
Merge pull request #16744 from velconia/locl_rel_1_4_imperative_fix_g…
velconia Apr 10, 2019
ed0f1ae
cherry-pick (#16760), test=release/1.4 (#16775)
liupluswei Apr 10, 2019
463f88a
Merge pull request #16672 from junjun315/cherry-pick-move-nn
junjun315 Apr 10, 2019
8643dbc
cherry-pick from 16691:Anakin subgraph support yolo_v3 and faster-rcnn
NHZlX Apr 11, 2019
7ad182e
Cherry-Pick from 16662 : Anakin subgraph cpu support
NHZlX Apr 11, 2019
e14ab18
Cherry-pick from 1662, 16797.. : add anakin int8 support
NHZlX Apr 11, 2019
480302d
Cherry-pick from 16813 : change singleton to graph RegistBlock
NHZlX Apr 11, 2019
3 changes: 2 additions & 1 deletion cmake/anakin_subgraph.cmake
@@ -25,8 +25,9 @@ endif()

if(ANAKIN_FOUND)
message(STATUS "Current ANAKIN header is ${ANAKIN_INCLUDE_DIR}/anakin_config.h. ")
include_directories(${ANAKIN_ROOT})
include_directories(${ANAKIN_ROOT}/include)
include_directories(${ANAKIN_ROOT}/include/saber)
include_directories(${ANAKIN_ROOT}/saber)
link_directories(${ANAKIN_ROOT})
add_definitions(-DPADDLE_WITH_ANAKIN)
endif()
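
For context, the effect of this hunk is that Anakin headers resolve by bare name inside code guarded by `PADDLE_WITH_ANAKIN`. A minimal hedged sketch of a consumer translation unit — `anakin_config.h` is the header named in the STATUS message above, and whether it sits directly under `${ANAKIN_ROOT}` or `${ANAKIN_ROOT}/include` is an assumption here:

```cpp
// Sketch only: with the new include directories, Anakin headers can be
// included without a path prefix inside PADDLE_WITH_ANAKIN-guarded code.
#ifdef PADDLE_WITH_ANAKIN
#include "anakin_config.h"  // assumed to resolve via one of the added include dirs
#endif
```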
241 changes: 151 additions & 90 deletions paddle/fluid/framework/details/all_reduce_deps_pass.cc
@@ -13,125 +13,186 @@
// limitations under the License.

#include <algorithm>
#include <memory>
#include <map>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>

#include "paddle/fluid/framework/details/all_reduce_deps_pass.h"
#include "paddle/fluid/framework/details/all_reduce_op_handle.h"
#include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/details/op_graph_view.h"
#include "paddle/fluid/framework/details/var_handle.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/framework/op_proto_maker.h"

namespace paddle {
namespace framework {
namespace details {

VarHandle* GetValidInput(const OpHandleBase* a) {
for (auto p : a->Inputs()) {
VarHandle* b = dynamic_cast<VarHandle*>(p);
if (b) {
return b;
class AllReduceDepsPass : public ir::Pass {
protected:
void ApplyImpl(ir::Graph* graph) const override {
std::vector<AllReduceOpHandle*> all_reduce_op_handles =
GetSortedAllReduceOps(*graph);

for (size_t i = 1; i < all_reduce_op_handles.size(); ++i) {
auto* dep_var = new DummyVarHandle(graph->CreateControlDepVar());
graph->Get<GraphDepVars>(kGraphDepVars).emplace(dep_var);
all_reduce_op_handles[i - 1]->AddOutput(dep_var);
all_reduce_op_handles[i]->AddInput(dep_var);
}
}

return nullptr;
}

void AllReduceDepsPass::ApplyImpl(ir::Graph* graph) const {
auto graph_ops = ir::FilterByNodeWrapper<OpHandleBase>(*graph);

// get vars order
int order = 0;
std::unordered_map<std::string, int> vars;
// TODO(gongwb): use graph topology sort to find the order of operators.
// Note that must assert topology sort is stable
auto& ops = graph->Get<const std::vector<OpDesc*>>(kStaleProgramOpDescs);
for (auto* op_desc : ops) {
try {
bool is_bk_op =
static_cast<bool>(boost::get<int>(op_desc->GetAttr(
OpProtoAndCheckerMaker::OpRoleAttrName())) &
static_cast<int>(OpRole::kBackward));
if (!is_bk_op) continue;

auto backward_vars =
boost::get<std::vector<std::string>>(op_desc->GetNullableAttr(
OpProtoAndCheckerMaker::OpRoleVarAttrName()));
PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0);

auto outputs = op_desc->Outputs();
for (auto& o_it : outputs) {
for (auto& v : o_it.second) { // values
vars[v] = order;
VLOG(10) << "in all_reduce_deps_pass:" << v;
}
}
order++;
} catch (boost::bad_get e) {
if (VLOG_IS_ON(10)) {
DebugString(*graph, all_reduce_op_handles);
}
}

std::vector<OpHandleBase*> dist_ops;
// get allreduce ops.
for (auto& op : graph_ops) {
// FIXME(gongwb):add broad cast.
if (op->Name() == "all_reduce" || op->Name() == "reduce") {
dist_ops.push_back(op);
std::vector<AllReduceOpHandle*> GetSortedAllReduceOps(
const ir::Graph& graph) const {
std::vector<AllReduceOpHandle*> all_reduce_op_handles;
std::unordered_map<OpHandleBase*, size_t> pending_ops;
std::unordered_set<OpHandleBase*> ready_ops;
std::unordered_set<OpHandleBase*> next_ready_ops;

auto op_handles = ir::FilterByNodeWrapper<OpHandleBase>(graph);
size_t num_of_ops = op_handles.size();
for (OpHandleBase* op : op_handles) {
size_t not_ready_vars = op->NotReadyInputSize();
if (not_ready_vars) {
pending_ops.insert({op, not_ready_vars});
} else {
ready_ops.insert(op);
}
}
}

VLOG(10) << "dist_ops size:" << dist_ops.size()
<< ", outputs size:" << vars.size() << ", ops size:" << ops.size();

std::sort(dist_ops.begin(), dist_ops.end(), [&](OpHandleBase* op1,
OpHandleBase* op2) {
VarHandle* i0 = dynamic_cast<VarHandle*>(GetValidInput(op1));
VarHandle* i1 = dynamic_cast<VarHandle*>(GetValidInput(op2));

PADDLE_ENFORCE(i0 != nullptr && i1 != nullptr, "%s convert to %s error",
op1->DebugString(), op2->DebugString());

auto l_it = vars.find(i0->name());
auto r_it = vars.find(i1->name());

PADDLE_ENFORCE(l_it != vars.end() && r_it != vars.end(),
"can't find var's name %s and %s in opdesc", i0->name(),
i1->name());

if (l_it->second < r_it->second) return true;
GetSortedAllReduceOps(ready_ops, &all_reduce_op_handles);

size_t has_run_ops = ready_ops.size();
while (has_run_ops != num_of_ops) {
for (auto* op : ready_ops) {
for (auto& ready_var : op->Outputs()) {
for (auto* pend_op : ready_var->PendingOps()) {
auto& deps = --pending_ops[pend_op];
if (deps == 0) {
next_ready_ops.insert(pend_op);
}
}
}
}

if (l_it->second == r_it->second) {
return i0->name() < i1->name();
PADDLE_ENFORCE_NE(next_ready_ops.size(), 0, "There maybe have a cycle.");
ready_ops.clear();
std::swap(ready_ops, next_ready_ops);
GetSortedAllReduceOps(ready_ops, &all_reduce_op_handles);
has_run_ops += ready_ops.size();
}
return all_reduce_op_handles;
}

return false;
});

// add dependency.
auto& sorted_ops = dist_ops;
for (size_t i = 1; i < sorted_ops.size(); ++i) {
auto* dep_var = new DummyVarHandle(graph->CreateControlDepVar());

auto* pre_op = sorted_ops[i - 1];
auto* op = sorted_ops[i];

pre_op->AddOutput(dep_var);
op->AddInput(dep_var);
graph->Get<GraphDepVars>(kGraphDepVars).emplace(dep_var);
void GetSortedAllReduceOps(
const std::unordered_set<OpHandleBase*>& ready_ops,
std::vector<AllReduceOpHandle*>* all_reduce_op_handles) const {
std::vector<AllReduceOpHandle*> current_all_reduce_op_handles;
for (auto& op_handle : ready_ops) {
auto all_reduce_op_handle = dynamic_cast<AllReduceOpHandle*>(op_handle);
if (all_reduce_op_handle) {
current_all_reduce_op_handles.emplace_back(all_reduce_op_handle);
}
}

VLOG(10) << "add all_reduce sequential dependencies between " << pre_op
<< " and " << op;
// NOTE(zcd): For distributed training, it is important to keep the order of
// allReduce on each node consistent. Otherwise, hang may occur.
// Sort the current_all_reduce_op_handles according to the name of input.
sort(current_all_reduce_op_handles.begin(),
current_all_reduce_op_handles.end(),
[](const AllReduceOpHandle* left,
const AllReduceOpHandle* right) -> bool {
auto left_in_vars = DynamicCast<VarHandle>(left->Inputs());
auto right_in_vars = DynamicCast<VarHandle>(right->Inputs());
PADDLE_ENFORCE_GT(left_in_vars.size(), 0);
PADDLE_ENFORCE_EQ(left_in_vars.size(), right_in_vars.size());
return left_in_vars[0]->Name() > right_in_vars[0]->Name();
});

all_reduce_op_handles->insert(all_reduce_op_handles->end(),
current_all_reduce_op_handles.begin(),
current_all_reduce_op_handles.end());
}

VLOG(10) << "pre_op:" << pre_op->DebugString()
<< ", op:" << op->DebugString();
void DebugString(
const ir::Graph& graph,
const std::vector<AllReduceOpHandle*>& all_reduce_op_handles) const {
// get vars order
std::map<int, std::vector<std::string>> vars =
GetSoredGradientsFromStaleProgram(graph);
std::stringstream out;
size_t grads_of_stale_program = 0;
out << "Get Order From kStaleProgramOpDescs: ";
for (auto& var : vars) {
out << "Order " << var.first << " [";
for (auto& var_name : var.second) {
out << var_name << ", ";
++grads_of_stale_program;
}
out << "], ";
}
VLOG(10) << out.str();

std::stringstream out2;
out2 << "Get Order From Topological order: ";
for (auto& op : all_reduce_op_handles) {
bool find_valid_input = false;
for (auto& in_var : op->Inputs()) {
if (dynamic_cast<VarHandle*>(in_var)) {
out2 << in_var->Name() << ", ";
find_valid_input = true;
break;
}
}
PADDLE_ENFORCE(find_valid_input, "Doesn't find valid input.");
}
VLOG(10) << out2.str();
if (grads_of_stale_program != all_reduce_op_handles.size()) {
VLOG(10)
<< "The gradients number of stale program and graph is not equal.";
}
}
}

std::map<int, std::vector<std::string>> GetSoredGradientsFromStaleProgram(
const ir::Graph& graph) const {
std::map<int, std::vector<std::string>> vars;
auto ops = graph.Get<const std::vector<OpDesc*>>(kStaleProgramOpDescs);
int order = 0;
for (auto* op_desc : ops) {
try {
bool is_bk_op =
static_cast<bool>(boost::get<int>(op_desc->GetAttr(
OpProtoAndCheckerMaker::OpRoleAttrName())) &
static_cast<int>(OpRole::kBackward));
if (!is_bk_op) continue;

auto backward_vars =
boost::get<std::vector<std::string>>(op_desc->GetNullableAttr(
OpProtoAndCheckerMaker::OpRoleVarAttrName()));
if (backward_vars.empty()) continue;

PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0);
for (size_t i = 1; i < backward_vars.size(); i += 2) {
vars[order].emplace_back(backward_vars[i]);
VLOG(1) << "get parameter and gradient: " << backward_vars[i - 1]
<< ", " << backward_vars[i];
}
order++;
} catch (boost::bad_get e) {
}
}
return vars;
}
};
} // namespace details
} // namespace framework
} // namespace paddle
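
The rewrite drops the old `kStaleProgramOpDescs`-based ordering in favor of a level-by-level topological walk of the op graph: each wave of ready ops is scanned for all_reduce handles, those are sorted by the name of their first input variable (so every trainer derives the same sequence, per the NOTE above), and consecutive all_reduce ops are chained with a control-dependency variable. The snippet below is a self-contained sketch of that ordering scheme with simplified stand-in types — `Op` here is hypothetical, not the real `OpHandleBase` — and mirrors the logic of `GetSortedAllReduceOps` rather than reproducing the production code:

```cpp
#include <algorithm>
#include <string>
#include <vector>

// Illustrative stand-in only; not the real OpHandleBase / VarHandle classes.
struct Op {
  std::string type;                 // e.g. "all_reduce"
  std::vector<std::string> inputs;  // input variable names
  std::vector<int> consumers;       // indices of ops fed by this op's outputs
  int not_ready = 0;                // number of producers that have not run yet
};

// Returns indices of "all_reduce" ops in the deterministic order the pass uses:
// level by level in topological order, each level sorted by first input name.
std::vector<int> SortedAllReduceOps(std::vector<Op> graph) {
  std::vector<int> ready;
  std::vector<int> result;
  for (int i = 0; i < static_cast<int>(graph.size()); ++i) {
    if (graph[i].not_ready == 0) ready.push_back(i);
  }

  while (!ready.empty()) {
    // Collect this wave's all_reduce ops and order them by the name of their
    // first input, mirroring the comparator in GetSortedAllReduceOps above.
    std::vector<int> level;
    for (int i : ready) {
      if (graph[i].type == "all_reduce") level.push_back(i);
    }
    std::sort(level.begin(), level.end(), [&graph](int l, int r) {
      return graph[l].inputs.front() > graph[r].inputs.front();
    });
    result.insert(result.end(), level.begin(), level.end());

    // Release ops whose producers have all finished; an empty next wave while
    // unvisited ops remain would indicate a cycle, which the real pass rejects.
    std::vector<int> next;
    for (int i : ready) {
      for (int c : graph[i].consumers) {
        if (--graph[c].not_ready == 0) next.push_back(c);
      }
    }
    ready.swap(next);
  }
  return result;  // the pass chains result[i-1] -> result[i] with a control dep
}
```

In the pass itself, the returned order is then materialized as `DummyVarHandle` control-dependency edges between consecutive all_reduce handles (the loop at the top of `ApplyImpl`).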
8 changes: 6 additions & 2 deletions paddle/fluid/framework/details/all_reduce_op_handle.cc
@@ -28,7 +28,7 @@
// asynchronous nccl allreduce or synchronous issue:
// https://github.com/PaddlePaddle/Paddle/issues/15049
DEFINE_bool(
sync_nccl_allreduce, false,
sync_nccl_allreduce, true,
"If set true, will call `cudaStreamSynchronize(nccl_stream)`"
"after allreduce, this mode can get better performance in some scenarios.");

@@ -53,6 +53,10 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
this->SetDeviceContext(p, nccl_ctxs_->DevCtx(p));
}
}
// TODO(gongwb) :polish them!
if (is_encoded) {
VLOG(1) << "Use dgc allreduce mode";
}
}
#else
AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
@@ -86,7 +90,7 @@ void AllReduceOpHandle::RunImplEncoded() {
paddle::framework::GradOriginalVarName(in_var_handles[i]->name());
auto encode_var_name = original_name + g_dgc_encoded;
auto *in_var = local_scope->FindVar(encode_var_name);
PADDLE_ENFORCE_NOT_NULL(in_var);
PADDLE_ENFORCE_NOT_NULL(in_var, "%s should not be null", encode_var_name);
auto &in = in_var->Get<LoDTensor>();
ins.emplace_back(&in);

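Note that the default of `sync_nccl_allreduce` flips from `false` to `true` here, so `cudaStreamSynchronize(nccl_stream)` now runs after every allreduce unless the flag is overridden. A small sketch of opting back out, assuming the standard gflags pattern that `DEFINE_bool` implies (the flag can equally be set on the command line):

```cpp
#include <gflags/gflags.h>

DECLARE_bool(sync_nccl_allreduce);  // defined next to the DEFINE_bool shown above

// Hedged sketch: restore the pre-1.4 asynchronous behaviour from code.
void UseAsyncNcclAllReduce() { FLAGS_sync_nccl_allreduce = false; }
```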
17 changes: 8 additions & 9 deletions paddle/fluid/framework/details/build_strategy.cc
@@ -163,22 +163,21 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
"graph_printer", new details::GraphvizSSAGraphPrinter);
}

// Verify that the graph is correct for multi-device executor.
AppendPass("multi_devices_check_pass");

if (VLOG_IS_ON(2)) {
AppendPass("all_reduce_deps_pass");
}

if (SeqOnlyAllReduceOps(strategy_)) {
VLOG(10) << "Add all_reduce_deps_pass";
// experimental shows that the program will be faster if append
// all_reduce_deps_pass here.
if (!strategy_.enable_parallel_graph_ &&
(SeqOnlyAllReduceOps(strategy_) ||
strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce)) {
AppendPass("all_reduce_deps_pass");
}

if (strategy_.remove_unnecessary_lock_) {
VLOG(10) << "Add modify_op_lock_and_record_event_pass";
AppendPass("modify_op_lock_and_record_event_pass");
}

// Verify that the graph is correct for multi-device executor.
AppendPass("multi_devices_check_pass");
}

// Convert graph to run on multi-devices.
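Net effect of this hunk: `all_reduce_deps_pass` is no longer gated on the VLOG level or on `SeqOnlyAllReduceOps` alone; it is appended whenever the parallel-graph executor is off and either sequential all-reduce is requested or the reduce strategy is `kAllReduce`, and `multi_devices_check_pass` is now appended after these passes instead of before them. A hedged restatement of the new condition as a standalone predicate — `SeqOnlyAllReduceOps` is not shown in this hunk, so its result is passed in here:

```cpp
#include "paddle/fluid/framework/details/build_strategy.h"

// Sketch only: condenses the branch above into a single predicate.
bool ShouldAppendAllReduceDepsPass(
    const paddle::framework::details::BuildStrategy &strategy,
    bool seq_only_all_reduce_ops) {
  using ReduceStrategy =
      paddle::framework::details::BuildStrategy::ReduceStrategy;
  return !strategy.enable_parallel_graph_ &&
         (seq_only_all_reduce_ops ||
          strategy.reduce_ == ReduceStrategy::kAllReduce);
}
```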
10 changes: 5 additions & 5 deletions paddle/fluid/framework/details/build_strategy.h
@@ -83,11 +83,11 @@ struct BuildStrategy {

bool sync_batch_norm_{false};

bool memory_optimize_{true};
// TODO(dzhwinter):
// make enable_inplace, memory_optimize_
// memory_early_delete_ true by default
bool enable_inplace_{true};
// FIXME(liuwei1031) disable memory_optimzie and enable_inplace in 1.4
// to open them by default, we need to solve the fetch variable issue
bool memory_optimize_{false};

bool enable_inplace_{false};

bool enable_sequential_execution_{false};

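Because both knobs now default to off, code that relied on the previous defaults has to opt back in explicitly. A minimal sketch, assuming the struct is populated directly from C++ (in practice it is usually driven through the Python-side BuildStrategy wrapper):

```cpp
#include "paddle/fluid/framework/details/build_strategy.h"

namespace details = paddle::framework::details;

// Hedged sketch: explicitly re-enable the optimizations disabled by default in 1.4.
details::BuildStrategy MakeOptimizedStrategy() {
  details::BuildStrategy strategy;
  strategy.memory_optimize_ = true;  // see the FIXME above: off until the fetch-variable issue is solved
  strategy.enable_inplace_ = true;
  return strategy;
}
```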