
Commit a5a7918

Revert "[cherry-pick][hybrid performance] Grad fuse for gradient merge under pipeline mode (PaddlePaddle#35004) (PaddlePaddle#35299)"
This reverts commit e931cd1.
1 parent 7e1dee7 commit a5a7918

10 files changed: 11 additions (+) and 534 deletions (-)

paddle/fluid/framework/distributed_strategy.proto

Lines changed: 0 additions & 1 deletion

@@ -200,7 +200,6 @@ message DistributedStrategy {
   optional int32 fuse_grad_size_in_num = 31 [ default = 8 ];
   optional bool calc_comm_same_stream = 32 [ default = false ];
   optional bool asp = 33 [ default = false ];
-  optional bool fuse_grad_merge = 34 [ default = false ];

   optional RecomputeConfig recompute_configs = 101;
   optional AMPConfig amp_configs = 102;
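
For reference, fields in this proto surface as attributes of fleet.DistributedStrategy in Python. A minimal sketch, assuming the public fleet API; the reverted fuse_grad_merge field is shown only as it existed before this revert:

    import paddle.distributed.fleet as fleet

    strategy = fleet.DistributedStrategy()
    strategy.fuse_grad_size_in_num = 8   # corresponds to field 31 above
    # Before this revert, the deleted field 34 could be toggled the same way:
    # strategy.fuse_grad_merge = True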

paddle/fluid/operators/coalesce_tensor_op.cc

Lines changed: 6 additions & 59 deletions

@@ -20,49 +20,10 @@
 #include "paddle/fluid/framework/var_type.h"
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/device_memory_aligment.h"
-#ifdef PADDLE_WITH_ASCEND_CL
-#include "paddle/fluid/operators/npu_op_runner.h"
-#endif

 namespace paddle {
 namespace operators {

-template <typename DeviceContext>
-struct FillConstantVisitor {
-  FillConstantVisitor(const DeviceContext &dev_ctx,
-                      framework::LoDTensor *tensor, const float value)
-      : dev_ctx_(dev_ctx), tensor_(tensor), value_(value) {}
-
-  template <typename T>
-  void apply(typename std::enable_if<std::is_same<T, int8_t>::value ||
-                                     std::is_same<T, int16_t>::value>::type * =
-                 nullptr) const {
-    PADDLE_THROW(platform::errors::InvalidArgument(
-        "Not support data type for set_constant attr"));
-  }
-
-  template <typename T>
-  void apply(typename std::enable_if<!(std::is_same<T, int8_t>::value ||
-                                       std::is_same<T, int16_t>::value)>::type
-                 * = nullptr) const {
-#ifdef PADDLE_WITH_ASCEND_CL
-    if (platform::is_npu_place(dev_ctx_.GetPlace())) {
-      FillNpuTensorWithConstant<T>(tensor_, static_cast<T>(value_));
-    } else {
-      math::SetConstant<DeviceContext, T> set_constant;
-      set_constant(dev_ctx_, tensor_, static_cast<T>(value_));
-    }
-#else
-    math::SetConstant<DeviceContext, T> set_constant;
-    set_constant(dev_ctx_, tensor_, static_cast<T>(value_));
-#endif
-  }
-
-  const DeviceContext &dev_ctx_;
-  framework::LoDTensor *tensor_;
-  float value_;
-};
-
 template <typename DeviceContext, typename T>
 class CoalesceTensorOpKernel : public framework::OpKernel<T> {
  public:

@@ -109,7 +70,6 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
     auto in_tensors = context.MultiInput<framework::LoDTensor>("Input");
     bool use_align = context.Attr<bool>("use_align");
     auto align_size = context.Attr<int>("align_size");
-    auto size_of_dtype = context.Attr<int>("user_defined_size_of_dtype");

     if (context.Attr<bool>("check_name")) {
       for (size_t i = 0; i < in_var_names.size(); ++i) {

@@ -134,9 +94,7 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
     size_t numel = 0;
     auto dtype = static_cast<framework::proto::VarType::Type>(
         context.Attr<int>("dtype"));
-    if (size_of_dtype == -1) {
-      size_of_dtype = framework::SizeOfType(dtype);
-    }
+    size_t size_of_dtype = framework::SizeOfType(dtype);
     GetMemSizeAndDtype(in_tensors, in_var_names, &numel, size_of_dtype,
                        context.GetPlace(), use_align, align_size);

@@ -163,9 +121,10 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
                            : len;
       }
     } else if (context.Attr<bool>("set_constant")) {
-      framework::VisitDataType(
-          dtype, FillConstantVisitor<DeviceContext>(
-                     dev_ctx, fused_tensor, context.Attr<float>("constant")));
+      // TODO(Liu yuang) ADD NPU SET_CONSTANT FUNCTION.
+      math::SetConstant<DeviceContext, T> set_constant;
+      set_constant(dev_ctx, fused_tensor,
+                   static_cast<T>(context.Attr<float>("constant")));
     } else if (context.Attr<bool>("persist_output")) {
       for (size_t i = 0; i < out_var_names.size(); ++i) {
         size_t len = static_cast<size_t>(out_tensors[i]->numel());

@@ -268,13 +227,10 @@ class CoalesceTensorOp : public framework::OperatorWithKernel {
     }
     auto use_align = ctx->Attrs().Get<bool>("use_align");
     auto align_size = ctx->Attrs().Get<int>("align_size");
-    auto size_of_dtype = ctx->Attrs().Get<int>("user_defined_size_of_dtype");

     auto dtype = static_cast<framework::proto::VarType::Type>(
         ctx->Attrs().Get<int>("dtype"));
-    if (size_of_dtype == -1) {
-      size_of_dtype = framework::SizeOfType(dtype);
-    }
+    size_t size_of_dtype = framework::SizeOfType(dtype);

     auto alignment = [](size_t size, size_t align_size) {
       size_t remaining = size % align_size;

@@ -352,15 +308,6 @@ class CoalesceTensorOpMaker : public framework::OpProtoAndCheckerMaker {
         .SetDefault(true);
     AddAttr<int>("align_size", "The alignment size when use_align is True")
         .SetDefault(-1);
-    AddAttr<int>("user_defined_size_of_dtype",
-                 "The user defined size of dtype. This is used to coalesce "
-                 "grad vars and merged_grad vars at the same time. For some "
-                 "strategy, the dtype of fused_grad_vars and the dtype of "
-                 "fused_grad_merged_vars are not identical, which will cause "
-                 "the shape of these two coalesced vars are different. To "
-                 "make sure the shape of these two vars are identical with "
-                 "each other, this attr is added.")
-        .SetDefault(-1);
     AddComment(R"DOC(
 CoalesceTensor Operator.
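
The deleted user_defined_size_of_dtype attribute existed because grad vars and merged-grad vars can have different dtypes (for example fp16 grads vs fp32 merged grads), so their coalesced buffers receive different amounts of alignment padding unless both are sized with a common dtype width. A rough, hypothetical Python illustration of that mismatch, mirroring the alignment lambda kept above; the 256-byte alignment and the element counts are made up for the example:

    def aligned(size_in_bytes, align_size=256):
        # Round a byte size up to the next multiple of align_size.
        remaining = size_in_bytes % align_size
        return size_in_bytes if remaining == 0 else size_in_bytes + align_size - remaining

    numels = [1000, 4096, 7]  # hypothetical per-variable element counts

    # Fused buffer length in elements when each variable is padded per dtype:
    fp16_len = sum(aligned(n * 2) // 2 for n in numels)  # 2-byte grads
    fp32_len = sum(aligned(n * 4) // 4 for n in numels)  # 4-byte merged grads

    print(fp16_len, fp32_len)  # 5248 vs 5184: per-dtype padding differs, so the
                               # two coalesced vars do not get identical shapes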

python/paddle/distributed/fleet/base/distributed_strategy.py

Lines changed: 0 additions & 22 deletions

@@ -967,28 +967,6 @@ def _calc_comm_same_stream(self, same):
                 "WARNING: calc_comm_same_stream should have value of boolean type"
             )

-    @property
-    def fuse_grad_merge(self):
-        """
-        Set whether fuse the grad for gradient merge.
-        Note: this flag will only effect the gradient merge under pipeline mode
-        The default value for the fuse_grad_merge is False
-        Examples:
-          .. code-block:: python
-            import paddle.distributed.fleet as fleet
-            strategy = fleet.DistributedStrategy()
-            strategy.fuse_param_grad = True
-        """
-        return self.strategy.fuse_grad_merge
-
-    @fuse_grad_merge.setter
-    @is_strict_auto
-    def fuse_grad_merge(self, fuse_grad_merge):
-        if isinstance(fuse_grad_merge, bool):
-            self.strategy.fuse_grad_merge = fuse_grad_merge
-        else:
-            print("WARNING: fuse_grad_merge should have value of boolean type")
-
     @property
     def fuse_grad_size_in_num(self):
         """

python/paddle/distributed/fleet/meta_optimizers/sharding/offload_helper.py

Lines changed: 0 additions & 3 deletions

@@ -122,9 +122,6 @@ def remove_param(input_name):
         for idx, op in enumerate(block.ops):
             if is_optimizer_op(op):
                 break
-            # TODO (Yuang Liu): tmp solution for fuse_grad_merge + optimize_cast
-            if not offload and op.type == 'coalesce_tensor':
-                continue
             for input_name in op.desc.input_arg_names():
                 if input_name not in param_to_idx:
                     continue

python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py

Lines changed: 1 addition & 5 deletions

@@ -341,11 +341,7 @@ def insert_allreduce_ops(block,
     if len(allreduce_vars) == 0:
         return

-    if user_defined_strategy and \
-            user_defined_strategy.fuse_all_reduce_ops and \
-            not user_defined_strategy.fuse_grad_merge:
-        # If fuse_grad_merge is enable, the grad vars have already been fused during
-        # gradient merge pass, therefore, those vars are not need to be fused here
+    if user_defined_strategy and user_defined_strategy.fuse_all_reduce_ops:
         insert_fused_allreduce_ops(block, insert_idx, ring_id, allreduce_vars,
                                    op_role, use_calc_stream,
                                    user_defined_strategy.fuse_grad_size_in_MB)
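
For context, the branch kept by this revert is gated only by strategy flags that still exist after the change. A minimal usage sketch, assuming the public fleet API; the 32 MB bucket size is an arbitrary example value:

    import paddle.distributed.fleet as fleet

    strategy = fleet.DistributedStrategy()
    strategy.fuse_all_reduce_ops = True   # take the insert_fused_allreduce_ops branch above
    strategy.fuse_grad_size_in_MB = 32    # bucket size forwarded to the fused allreduce pass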

python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py

Lines changed: 1 addition & 3 deletions

@@ -319,9 +319,7 @@ def _insert_allreduce_for_pp(self):
             main_block._remove_op(idx)

         accumulated_grad_names = self._pp_optimizer._accumulate_gradients(
-            main_block,
-            fp16_allreduce=fp16_allreduce,
-            user_defined_strategy=strategy)
+            main_block, fp16_allreduce=fp16_allreduce)

         len_of_ops = len(main_block.ops)
         first_optimize_op_index = get_first_optimize_op_idx(main_block)
