6 changes: 5 additions & 1 deletion paddle/fluid/framework/details/all_reduce_op_handle.cc
@@ -53,6 +53,10 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
       this->SetDeviceContext(p, nccl_ctxs_->DevCtx(p));
     }
   }
+  // TODO(gongwb): polish them!
+  if (is_encoded) {
+    VLOG(1) << "Use dgc allreduce mode";
+  }
 }
 #else
 AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
@@ -86,7 +90,7 @@ void AllReduceOpHandle::RunImplEncoded() {
         paddle::framework::GradOriginalVarName(in_var_handles[i]->name());
     auto encode_var_name = original_name + g_dgc_encoded;
     auto *in_var = local_scope->FindVar(encode_var_name);
-    PADDLE_ENFORCE_NOT_NULL(in_var);
+    PADDLE_ENFORCE_NOT_NULL(in_var, "%s should not be null", encode_var_name);
     auto &in = in_var->Get<LoDTensor>();
     ins.emplace_back(&in);
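Note: RunImplEncoded() locates each gradient's compressed companion tensor by a naming convention, and the new enforce message reports which variable is missing instead of failing with a bare null check. A minimal sketch of that convention, written in Python for illustration; the suffix literal below is a hypothetical placeholder for the C++ constant g_dgc_encoded, not its verified value:

    GRAD_SUFFIX = "@GRAD"            # framework::kGradVarSuffix
    DGC_ENCODED = "__dgc_encoded__"  # hypothetical placeholder for g_dgc_encoded

    def encode_var_name(grad_var_name):
        # mirrors framework::GradOriginalVarName plus the suffix concatenation above
        original = grad_var_name.split(GRAD_SUFFIX)[0]
        return original + DGC_ENCODED

    print(encode_var_name("fc_0.w_0@GRAD"))  # fc_0.w_0__dgc_encoded__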
2 changes: 1 addition & 1 deletion python/paddle/fluid/optimizer.py
@@ -752,7 +752,7 @@ def _append_dgc_ops(self, param_and_grads):
             force_cpu=True)
 
         for param_var, grad_var in param_and_grads:
-            var_numel = reduce(lambda x, y: x * y, param_var.shape)
+            var_numel = abs(reduce(lambda x, y: x * y, param_var.shape))
             if var_numel < 16384 or \
                 param_var.type == core.VarDesc.VarType.SELECTED_ROWS or \
                 grad_var.type == core.VarDesc.VarType.SELECTED_ROWS or \
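Note: the added abs() guards against a shape containing -1 (a dimension left to be inferred at runtime), which is the only way the raw product of the dims can go negative; a negative count would always pass the "< 16384" small-tensor test and wrongly exclude a large parameter from DGC. A quick illustration:

    from functools import reduce

    shape = (-1, 128)                        # -1: dimension inferred at runtime
    raw = reduce(lambda x, y: x * y, shape)  # -128, always < 16384
    var_numel = abs(raw)                     # 128, the count actually compared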
10 changes: 8 additions & 2 deletions python/paddle/fluid/parallel_executor.py
@@ -104,10 +104,11 @@ def __init__(self,
         self._scope = scope if scope is not None else executor.global_scope()
 
         if main_program is not None and main_program._enable_dgc:
+            assert num_trainers > 1, "dgc is not useful for single trainer training."
             assert build_strategy.reduce_strategy == BuildStrategy.ReduceStrategy.AllReduce
             assert num_trainers * len(
-                self._places) > 1, "dgc is not useful for single card training"
-            assert use_cuda
+                self._places) > 1, "dgc is not useful for single card training."
+            assert use_cuda, "dgc only used when cuda is used."
 
         main_program = main_program if main_program is not None \
             else framework.default_main_program()
@@ -123,6 +124,11 @@
                 exec_strategy=exec_strategy,
                 share_vars_from=share_vars_from._compiled_program
                 if share_vars_from else None)
+
+        # FIXME(gongwb): I will move dgc from dist mode to allreduce mode in next pr.
+        if main_program._enable_dgc:
+            self._compiled_program._build_strategy.is_distribution = True
+
         self._place = core.CUDAPlace(0) if use_cuda else core.CPUPlace()
         self._exe = executor.Executor(self._place)
         self._compiled_program._compile(place=self._place, scope=self._scope)
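Note: together, the new asserts spell out DGC's preconditions: more than one trainer, more than one card overall, CUDA, and the AllReduce reduce strategy. A hedged usage sketch under those constraints, assuming the fluid 1.x API of this PR's era and that DGCMomentumOptimizer's minimize() is what sets _enable_dgc on the program; a real run also needs a multi-trainer launch environment:

    import paddle.fluid as fluid

    main_program = fluid.Program()
    startup_program = fluid.Program()
    with fluid.program_guard(main_program, startup_program):
        x = fluid.layers.data(name='x', shape=[32], dtype='float32')
        y = fluid.layers.data(name='y', shape=[1], dtype='float32')
        loss = fluid.layers.mean(
            fluid.layers.square_error_cost(fluid.layers.fc(input=x, size=1), y))
        # assumed to mark main_program._enable_dgc
        opt = fluid.optimizer.DGCMomentumOptimizer(
            learning_rate=0.001, momentum=0.9, rampup_begin_step=0)
        opt.minimize(loss, startup_program=startup_program)

    build_strategy = fluid.BuildStrategy()
    build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce

    pe = fluid.ParallelExecutor(
        use_cuda=True,                  # assert use_cuda
        loss_name=loss.name,
        main_program=main_program,
        build_strategy=build_strategy,  # must be ReduceStrategy.AllReduce
        num_trainers=2,                 # assert num_trainers > 1
        trainer_id=0)                   # trainers * cards must exceed 1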