Commit 20a1e3a

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into all_in_dense_tensor
2 parents: e7d62d0 + 8c20d66

398 files changed: +4541 -2907 lines


paddle/fluid/distributed/fleet_executor/carrier.cc

Lines changed: 2 additions & 2 deletions
@@ -221,8 +221,8 @@ static std::shared_ptr<framework::GarbageCollector> GetGC(
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   if (platform::is_gpu_place(place)) {
     if (framework::IsFastEagerDeletionModeEnabled()) {
-      gc.reset(new framework::UnsafeFastGPUGarbageCollector(
-          BOOST_GET_CONST(platform::CUDAPlace, place), max_memory_size));
+      gc.reset(new framework::UnsafeFastGPUGarbageCollector(place,
+                                                            max_memory_size));
     }
   }
 #endif
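
The recurring change in this commit (the all_in_dense_tensor branch merge) is visible here: callees such as the garbage-collector constructors and memory::Copy now accept the unified platform::Place directly, so call sites no longer extract a concrete CUDAPlace with BOOST_GET_CONST. A minimal standalone sketch of the before/after shape of such an API, using std::variant as a stand-in for the old variant-style Place (illustrative only, not Paddle code):

#include <iostream>
#include <variant>

struct CPUPlace {};
struct CUDAPlace { int device_id = 0; };
using Place = std::variant<CPUPlace, CUDAPlace>;

// Old style: the callee demands a concrete CUDAPlace, so every caller must
// first do the equivalent of BOOST_GET_CONST(platform::CUDAPlace, place).
void CopyOld(const CUDAPlace& dst) {
  std::cout << "copy to cuda:" << dst.device_id << "\n";
}

// New style: the callee takes the unified Place and dispatches internally.
void CopyNew(const Place& dst) {
  if (const auto* cuda = std::get_if<CUDAPlace>(&dst)) {
    std::cout << "copy to cuda:" << cuda->device_id << "\n";
  } else {
    std::cout << "copy to cpu\n";
  }
}

int main() {
  Place place = CUDAPlace{1};
  CopyOld(std::get<CUDAPlace>(place));  // caller-side extraction (old)
  CopyNew(place);                       // extraction moved inside (new)
}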

paddle/fluid/distributed/ps.proto

Lines changed: 2 additions & 0 deletions
@@ -172,6 +172,8 @@ message CommonAccessorParameter {
   optional string entry = 7;
   optional int32 trainer_num = 8;
   optional bool sync = 9;
+  optional uint32 table_num = 10;
+  optional uint32 table_dim = 11;
 }

 message TableAccessorSaveParameter {

paddle/fluid/distributed/service/brpc_ps_client.cc

Lines changed: 2 additions & 2 deletions
@@ -1071,8 +1071,8 @@ int32_t BrpcPsClient::recv_and_save_table(const uint64_t table_id,
   for (size_t i = 0; i < worker_param.downpour_table_param_size(); ++i) {
     if (worker_param.downpour_table_param(i).table_id() == table_id) {
       var_name = worker_param.downpour_table_param(i).common().table_name();
-      var_num = worker_param.downpour_table_param(i).accessor().fea_dim();
-      var_shape = worker_param.downpour_table_param(i).accessor().embedx_dim();
+      var_num = worker_param.downpour_table_param(i).common().table_num();
+      var_shape = worker_param.downpour_table_param(i).common().table_dim();
       break;
     }
   }
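
This hunk is the consumer of the two fields added to CommonAccessorParameter in ps.proto above: the saved dense variable's dimensions now come from common().table_num() and common().table_dim() instead of the accessor's fea_dim()/embedx_dim(). A hedged sketch of the mapping (struct and helper names are illustrative, not Paddle's):

#include <cstdint>
#include <vector>

// Stand-in for the proto message; the real code reads the generated
// CommonAccessorParameter accessors shown in the diff.
struct CommonParams {
  uint32_t table_num = 0;  // presumably the row count of the dense table
  uint32_t table_dim = 0;  // presumably the width of each row
};

// recv_and_save_table uses table_num as var_num and table_dim as var_shape.
std::vector<int64_t> SavedVarShape(const CommonParams& common) {
  return {static_cast<int64_t>(common.table_num),
          static_cast<int64_t>(common.table_dim)};
}

int main() { return SavedVarShape({10, 64}).size() == 2 ? 0 : 1; }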

paddle/fluid/distributed/service/brpc_utils.cc

Lines changed: 28 additions & 31 deletions
@@ -109,13 +109,12 @@ void SerializeLodTensor(framework::Variable* var,
     iobuf->append(reinterpret_cast<const char*>(tensor->data()), data_len);
   } else {
 #ifdef PADDLE_WITH_CUDA
-    char* temp_ptr =
-        new char[tensor->numel() * framework::SizeOfType(tensor->type())];
+    char* temp_ptr = new char[tensor->numel() *
+                              framework::SizeOfType(tensor->type())];  // NOLINT
     auto stream =
         reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
     memory::Copy(
-        platform::CPUPlace(), temp_ptr,
-        BOOST_GET_CONST(platform::CUDAPlace, tensor->place()), tensor->data(),
+        platform::CPUPlace(), temp_ptr, tensor->place(), tensor->data(),
         tensor->numel() * framework::SizeOfType(tensor->type()), stream);
     auto data_len = tensor->numel() * framework::SizeOfType(tensor->type());
     iobuf->append(reinterpret_cast<const char*>(&data_len), 8);
@@ -151,13 +150,12 @@ void SerializeSelectedRows(framework::Variable* var,
     iobuf->append(reinterpret_cast<const char*>(tensor->data()), data_len);
   } else {
 #ifdef PADDLE_WITH_CUDA
-    char* temp_ptr =
-        new char[tensor->numel() * framework::SizeOfType(tensor->type())];
+    char* temp_ptr = new char[tensor->numel() *
+                              framework::SizeOfType(tensor->type())];  // NOLINT
     auto stream =
         reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
     memory::Copy(
-        platform::CPUPlace(), temp_ptr,
-        BOOST_GET_CONST(platform::CUDAPlace, tensor->place()), tensor->data(),
+        platform::CPUPlace(), temp_ptr, tensor->place(), tensor->data(),
         tensor->numel() * framework::SizeOfType(tensor->type()), stream);
     auto data_len = tensor->numel() * framework::SizeOfType(tensor->type());
     iobuf->append(reinterpret_cast<const char*>(&data_len), 8);
@@ -207,7 +205,7 @@ void DeserializeFromMultiVarMsgAndIOBuf(const MultiVarMsg& multi_msg,
 }

 void DeserializeLodTensor(framework::Variable* var, const VarMsg& msg,
-                          butil::IOBufBytesIterator& io_buffer_itr,
+                          butil::IOBufBytesIterator& io_buffer_itr,  // NOLINT
                           const platform::DeviceContext& ctx) {
   const auto place = ctx.GetPlace();
   framework::LoDTensor* tensor = var->GetMutable<framework::LoDTensor>();
@@ -232,30 +230,30 @@ void DeserializeLodTensor(framework::Variable* var, const VarMsg& msg,

   // IO Buffer
   if (platform::is_cpu_place(place)) {
-    unsigned long data_len;
-    io_buffer_itr.copy_and_forward((void*)(&data_len), 8);
+    unsigned long data_len;  // NOLINT
+    io_buffer_itr.copy_and_forward((void*)(&data_len), 8);  // NOLINT
     io_buffer_itr.copy_and_forward(tensor_data, data_len);
   } else if (platform::is_gpu_place(place)) {
 #ifdef PADDLE_WITH_CUDA
-    unsigned long data_len;
-    char* temp_ptr =
-        new char[tensor->numel() * framework::SizeOfType(tensor->type())];
-    io_buffer_itr.copy_and_forward((void*)(&data_len), 8);
-    io_buffer_itr.copy_and_forward((void*)temp_ptr, data_len);
+    unsigned long data_len;  // NOLINT
+    char* temp_ptr = new char[tensor->numel() *
+                              framework::SizeOfType(tensor->type())];  // NOLINT
+    io_buffer_itr.copy_and_forward((void*)(&data_len), 8);  // NOLINT
+    io_buffer_itr.copy_and_forward((void*)temp_ptr, data_len);  // NOLINT
     auto stream =
         reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
-    memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, place), tensor_data,
-                 platform::CPUPlace(), (void*)temp_ptr,
-                 tensor->numel() * framework::SizeOfType(tensor->type()),
-                 stream);
+    memory::Copy(
+        place, tensor_data, platform::CPUPlace(), (void*)temp_ptr,  // NOLINT
+        tensor->numel() * framework::SizeOfType(tensor->type()), stream);
     delete[] temp_ptr;
 #endif
   }
 }

-void DeserializeSelectedRows(framework::Variable* var, const VarMsg& msg,
-                             butil::IOBufBytesIterator& io_buffer_itr,
-                             const platform::DeviceContext& ctx) {
+void DeserializeSelectedRows(
+    framework::Variable* var, const VarMsg& msg,
+    butil::IOBufBytesIterator& io_buffer_itr,  // NOLINT
+    const platform::DeviceContext& ctx) {
   const auto place = ctx.GetPlace();
   auto* slr = var->GetMutable<framework::SelectedRows>();
   framework::Tensor* tensor = slr->mutable_value();
@@ -272,20 +270,19 @@ void DeserializeSelectedRows(framework::Variable* var, const VarMsg& msg,
   tensor->mutable_data(place, VarMessageToVarType(msg.data_type()));
   // IO Buffer
   if (platform::is_cpu_place(place)) {
-    unsigned long data_len;
-    io_buffer_itr.copy_and_forward((void*)(&data_len), 8);
+    unsigned long data_len;  // NOLINT
+    io_buffer_itr.copy_and_forward((void*)(&data_len), 8);  // NOLINT
     io_buffer_itr.copy_and_forward(tensor_data, data_len);
   } else if (platform::is_gpu_place(place)) {
 #ifdef PADDLE_WITH_CUDA
-    char* temp_ptr =
-        new char[tensor->numel() * framework::SizeOfType(tensor->type())];
-    unsigned long data_len;
-    io_buffer_itr.copy_and_forward((void*)(&data_len), 8);
+    char* temp_ptr = new char[tensor->numel() *
+                              framework::SizeOfType(tensor->type())];  // NOLINT
+    unsigned long data_len;  // NOLINT
+    io_buffer_itr.copy_and_forward((void*)(&data_len), 8);  // NOLINT
     io_buffer_itr.copy_and_forward(temp_ptr, data_len);
     auto stream =
         reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
-    memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, place), tensor_data,
-                 platform::CPUPlace(), temp_ptr,
+    memory::Copy(place, tensor_data, platform::CPUPlace(), temp_ptr,
                  tensor->numel() * framework::SizeOfType(tensor->type()),
                  stream);
     delete[] temp_ptr;
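
Most of the // NOLINT markers above silence clang-tidy on the raw new char[] staging buffers and C-style casts that this refactor touches but does not restructure. As a design note, a hedged alternative sketch (not what the commit does) that would make the staging buffer exception-safe via RAII:

#include <cstddef>
#include <cstring>
#include <memory>

// The buffer is freed on every path, including early returns and
// exceptions; std::memcpy stands in for the memory::Copy staging step.
void StageBytes(const char* src, std::size_t len) {
  std::unique_ptr<char[]> temp(new char[len]);
  std::memcpy(temp.get(), src, len);
  // ... hand temp.get() to the consumer; no explicit delete[] needed.
}

int main() {
  const char bytes[4] = {1, 2, 3, 4};
  StageBytes(bytes, sizeof(bytes));
}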

paddle/fluid/distributed/service/heter_client.cc

Lines changed: 1 addition & 2 deletions
@@ -44,8 +44,7 @@ int GetMicroId(const platform::DeviceContext& ctx,
   auto stream =
       reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
   memory::Copy(
-      platform::CPUPlace(), temp_ptr,
-      BOOST_GET_CONST(platform::CUDAPlace, tensor->place()), tensor->data(),
+      platform::CPUPlace(), temp_ptr, tensor->place(), tensor->data(),
       tensor->numel() * framework::SizeOfType(tensor->type()), stream);
   float* temp_ptr_float = reinterpret_cast<float*>(temp_ptr);
   micro_id = static_cast<int>(temp_ptr_float[0]);

paddle/fluid/distributed/table/ctr_accessor.cc

Lines changed: 2 additions & 1 deletion
@@ -305,7 +305,8 @@ std::string CtrCommonAccessor::parse_to_string(const float* v, int param) {
   auto show = common_feature_value.show(const_cast<float*>(v));
   auto click = common_feature_value.click(const_cast<float*>(v));
   auto score = show_click_score(show, click);
-  if (score >= _config.embedx_threshold()) {
+  if (score >= _config.embedx_threshold() &&
+      param > common_feature_value.embedx_w_index()) {
     for (auto i = common_feature_value.embedx_w_index();
          i < common_feature_value.dim(); ++i) {
       os << " " << v[i];
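
A hedged reading of the new condition: param appears to bound how many leading value fields the caller wants serialized, so the embedx weights are appended only when the show/click score clears the threshold and the caller actually asked for fields beyond embedx_w_index. Illustrative sketch only (names and semantics here are assumptions, not Paddle's):

#include <sstream>
#include <string>

// Assumed semantics: fields [0, embedx_start) are the leading stats, and
// param caps how many fields the caller requested.
std::string DumpValue(const float* v, int dim, int param, int embedx_start,
                      float score, float threshold) {
  std::ostringstream os;
  for (int i = 0; i < embedx_start && i < param; ++i) os << " " << v[i];
  if (score >= threshold && param > embedx_start) {
    for (int i = embedx_start; i < dim; ++i) os << " " << v[i];
  }
  return os.str();
}

int main() {
  const float v[6] = {1, 2, 3, 0.1f, 0.2f, 0.3f};
  return DumpValue(v, 6, 3, 3, 0.5f, 0.4f).empty() ? 1 : 0;
}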

paddle/fluid/distributed/table/memory_sparse_table.cc

Lines changed: 1 addition & 2 deletions
@@ -27,7 +27,7 @@ namespace paddle {
 namespace distributed {

 // TODO(zhaocaibei123): configure
-bool FLAGS_pserver_create_value_when_push = false;
+bool FLAGS_pserver_create_value_when_push = true;
 int FLAGS_pserver_table_save_max_retry = 3;
 bool FLAGS_pserver_enable_create_feasign_randomly = false;

@@ -494,7 +494,6 @@ int32_t MemorySparseTable::push_sparse(const uint64_t* keys,
         values + push_data_idx * update_value_col;
     auto itr = local_shard.find(key);
     if (itr == local_shard.end()) {
-      VLOG(0) << "sparse table push_sparse: " << key << "not found!";
       if (FLAGS_pserver_enable_create_feasign_randomly &&
           !_value_accesor->create_value(1, update_data)) {
         continue;

paddle/fluid/eager/accumulation/gradient_accumulation.cc

Lines changed: 16 additions & 16 deletions
@@ -43,7 +43,7 @@ class TensorAddFunctor : public boost::static_visitor<> {
   TensorAddFunctor(int64_t numel, const T* x, T* y)
       : numel_(numel), x_(x), y_(y) {}

-  void operator()(const paddle::platform::CPUPlace& place) {
+  void operator()(const paddle::platform::CPUPlace& place) const {
     paddle::platform::CPUDeviceContext* ctx =
         dynamic_cast<paddle::platform::CPUDeviceContext*>(
             paddle::platform::DeviceContextPool::Instance().Get(place));
@@ -56,7 +56,7 @@ class TensorAddFunctor : public boost::static_visitor<> {
   // TODO(jiabin): Support xpu here from gradient_accumulator.cc

 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  void operator()(const paddle::platform::CUDAPlace& place) {
+  void operator()(const paddle::platform::CUDAPlace& place) const {
     paddle::platform::CUDADeviceContext* ctx =
         dynamic_cast<paddle::platform::CUDADeviceContext*>(
             paddle::platform::DeviceContextPool::Instance().Get(place));
@@ -66,7 +66,7 @@ class TensorAddFunctor : public boost::static_visitor<> {
     blas.AXPY(numel_, 1., x_, y_);
   }
 #else
-  void operator()(const paddle::platform::CUDAPlace& place) {
+  void operator()(const paddle::platform::CUDAPlace& place) const {
     PADDLE_THROW(paddle::platform::errors::PermissionDenied(
         "Gradient accumulation on place (%s) "
         "is not supported in imperative mode",
@@ -76,22 +76,22 @@ class TensorAddFunctor : public boost::static_visitor<> {

   // TODO(jiabin): Support Npu here from gradient_accumulator.cc
   // there is NO blas in CUDAPinnedPlace
-  void operator()(const paddle::platform::CUDAPinnedPlace& place) {
+  void operator()(const paddle::platform::CUDAPinnedPlace& place) const {
     PADDLE_THROW(paddle::platform::errors::PermissionDenied(
         "Gradient accumulation on place (%s) "
         "is not supported in imperative mode",
         place));
   }

 #ifdef PADDLE_WITH_ASCEND_CL
-  void operator()(const paddle::platform::NPUPlace& place) {
+  void operator()(const paddle::platform::NPUPlace& place) const {
     PADDLE_THROW(paddle::platform::errors::PermissionDenied(
         "Gradient accumulation on place (%s) "
         "is not supported in imperative mode",
         place));
   }
 #else
-  void operator()(const paddle::platform::NPUPlace& place) {
+  void operator()(const paddle::platform::NPUPlace& place) const {
     PADDLE_THROW(paddle::platform::errors::PermissionDenied(
         "Gradient accumulation on place (%s) "
         "is not supported in imperative mode",
@@ -100,14 +100,14 @@ class TensorAddFunctor : public boost::static_visitor<> {
 #endif

 #ifdef PADDLE_WITH_XPU
-  void operator()(const paddle::platform::XPUPlace& place) {
+  void operator()(const paddle::platform::XPUPlace& place) const {
     paddle::platform::XPUDeviceContext* ctx =
         dynamic_cast<paddle::platform::XPUDeviceContext*>(
             paddle::platform::DeviceContextPool::Instance().Get(place));
     xpu::add<T>(ctx->x_context(), x_, y_, y_, static_cast<int>(numel_));
   }
 #else
-  void operator()(const paddle::platform::XPUPlace& place) {
+  void operator()(const paddle::platform::XPUPlace& place) const {
     PADDLE_THROW(paddle::platform::errors::PermissionDenied(
         "Gradient accumulation on place (%s) "
         "is not supported in imperative mode",
@@ -116,14 +116,14 @@ class TensorAddFunctor : public boost::static_visitor<> {
 #endif

 #ifdef PADDLE_WITH_MLU
-  void operator()(const paddle::platform::MLUPlace& place) {
+  void operator()(const paddle::platform::MLUPlace& place) const {
     PADDLE_THROW(paddle::platform::errors::PermissionDenied(
         "Gradient accumulation on place (%s) "
         "is not supported in imperative mode",
         place));
   }
 #else
-  void operator()(const paddle::platform::MLUPlace& place) {
+  void operator()(const paddle::platform::MLUPlace& place) const {
     PADDLE_THROW(paddle::platform::errors::PermissionDenied(
         "Gradient accumulation on place (%s) "
         "is not supported in imperative mode",
@@ -132,22 +132,22 @@ class TensorAddFunctor : public boost::static_visitor<> {
 #endif

 #ifdef PADDLE_WITH_IPU
-  void operator()(const paddle::platform::IPUPlace& place) {
+  void operator()(const paddle::platform::IPUPlace& place) const {
     PADDLE_THROW(paddle::platform::errors::PermissionDenied(
         "Gradient accumulation on place (%s) "
         "is not supported in imperative mode",
         place));
   }
 #else
-  void operator()(const paddle::platform::IPUPlace& place) {
+  void operator()(const paddle::platform::IPUPlace& place) const {
     PADDLE_THROW(paddle::platform::errors::PermissionDenied(
         "Gradient accumulation on place (%s) "
         "is not supported in imperative mode",
         place));
   }
 #endif

-  void operator()(const paddle::platform::NPUPinnedPlace& place) {
+  void operator()(const paddle::platform::NPUPinnedPlace& place) const {
     PADDLE_THROW(paddle::platform::errors::PermissionDenied(
         "Gradient accumulation on place (%s) "
         "is not supported in imperative mode",
@@ -157,7 +157,7 @@ class TensorAddFunctor : public boost::static_visitor<> {
  private:
   int64_t numel_;
   const T* x_;
-  T* y_;
+  mutable T* y_;
 };

 template <typename DeviceContext, typename T>
@@ -218,7 +218,7 @@ void TensorAdd(const egr::EagerTensor& src, egr::EagerTensor* dst) {
   if (data_type == paddle::framework::DataTypeTrait<cpp_type>::DataType()) { \
     TensorAddFunctor<cpp_type> func(numel, src_tensor->data<cpp_type>(),     \
                                     dst_tensor->mutable_data<cpp_type>());   \
-    boost::apply_visitor(func, place);                                       \
+    paddle::platform::VisitPlace(place, func);                               \
     return;                                                                  \
   }

@@ -294,7 +294,7 @@ void VariableAdd(const egr::EagerTensor& src, egr::EagerTensor* dst) {
     TensorAddFunctor<cpp_type> func(                                         \
         numel, src_tensor.data<cpp_type>(),                                  \
         dst_tensor->mutable_data<cpp_type>(place));                          \
-    boost::apply_visitor(func, place);                                       \
+    paddle::platform::VisitPlace(place, func);                               \
     return;                                                                  \
   }

paddle/fluid/eager/auto_code_generator/eager_generator.cc

Lines changed: 1 addition & 1 deletion
@@ -1852,7 +1852,7 @@ static std::string GenerateGradNodeCCContents(
       "  %s\n"
       "  return outputs;\n";
   generated_grad_function_body = paddle::string::Sprintf(
-      BWD_RETURN_TEMPLATE, outs_size, generated_grad_function_body);
+      BWD_RETURN_TEMPLATE, in_vars.size(), generated_grad_function_body);

   // [Generation] Get Full Grad Function
   const char* GRAD_FUNCTION_TEMPLATE =

paddle/fluid/eager/backward.cc

Lines changed: 39 additions & 10 deletions
@@ -103,7 +103,17 @@ void RunBackward(const std::vector<egr::EagerTensor>& tensors,
     VLOG(2) << "Out Rank of Tensor is slot: " << input_info.first
             << ", rank: " << input_info.second;
     // Get target GradNodeBase from target tensors
-    GradNodeBase* grad_node = auto_grad_meta->GetMutableGradNode().get();
+    auto shared_grad_node = auto_grad_meta->GetMutableGradNode();
+
+    if (shared_grad_node == nullptr || shared_grad_node.get() == nullptr ||
+        auto_grad_meta->StopGradient()) {
+      VLOG(3) << "Skip auto grad since there is no grad op for var or loss is "
+                 "stop_gradient=True: "
+              << tensor.name();
+      continue;
+    }
+
+    GradNodeBase* grad_node = shared_grad_node.get();

     // Prepare GradTensorHolder
     if (!node_input_buffers_dict.count(grad_node)) {
@@ -192,19 +202,38 @@ void RunBackward(const std::vector<egr::EagerTensor>& tensors,
       // Since we make edge has as same rank as bwd outputs, we indexing them
       // with
      // the same rank(i, j)
-      VLOG(6) << "Get Edge with slot: " << i << ", rank: " << j;
-      egr::EagerTensor& grad_output_tensor = grad_output_tensors[i][j];
-      if (!grad_output_tensor.defined() ||
-          !grad_output_tensor.initialized()) {
-        VLOG(6) << "We get grad_output_tensor with slot: " << i
-                << ", rank: " << j << " as uninitialized or undefined tensor";
-      }
-      GradNodeBase* next_node = edge.GetMutableGradNode().get();
+      auto next_node_shared = edge.GetMutableGradNode();

       // Next node could be nullptr if it is leaf tensor with no
       // AccumulationNode attached
       // Or it could also originated from dispensable inputs
-      if (!next_node) continue;
+      if (!next_node_shared || !next_node_shared.get() ||
+          grad_output_tensors[i].empty()) {
+        continue;
+      }
+      PADDLE_ENFORCE_LT(
+          j, grad_output_tensors[i].size(),
+          paddle::platform::errors::Fatal(
+              "Rank of grad_output_tensors should be less than "
+              "grad_output_tensors[i].size(), which is: %d. This error may "
+              "indicate autoprune or autograd api error. ",
+              grad_output_tensors.size()));
+      egr::EagerTensor& grad_output_tensor = grad_output_tensors[i][j];
+
+      if ((!grad_output_tensor.defined() ||
+           !grad_output_tensor.initialized())) {
+        if (!grad_output_tensor.Var().IsInitialized()) {
+          VLOG(6)
+              << "We get grad_output_tensor with slot: " << i
+              << ", rank: " << j
+              << " as uninitialized or undefined in both tensor and variable";
+        }
+      }
+      VLOG(6) << "Get Edge and grad_output_tensor with slot: " << i
+              << ", rank: " << j
+              << " 's name is: " << grad_output_tensor.name();
+
+      auto* next_node = next_node_shared.get();

       if (!node_input_buffers_dict.count(next_node)) {
         node_input_buffers_dict[next_node] =
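
Both hunks harden RunBackward against dereferencing empty shared_ptrs and out-of-range indexing: the grad node is fetched as a shared_ptr and checked (together with stop_gradient) before use, and PADDLE_ENFORCE_LT guards grad_output_tensors[i][j]. A standalone sketch of the guard pattern (not Paddle code):

#include <cstddef>
#include <iostream>
#include <memory>
#include <vector>

struct Node { const char* name = "node"; };

int main() {
  std::shared_ptr<Node> maybe_node;        // may be empty for leaf tensors
  std::vector<std::vector<int>> grads(1);  // grads[0] is empty here

  std::size_t i = 0, j = 0;
  // Hold the shared_ptr, then bail out early instead of calling .get()
  // inline and dereferencing a possibly-null raw pointer.
  if (!maybe_node || grads[i].empty()) {
    std::cout << "skip: no grad node or no grad outputs\n";
    return 0;
  }
  // Bounds check before indexing (the PADDLE_ENFORCE_LT analogue).
  if (j >= grads[i].size()) return 1;
  Node* next_node = maybe_node.get();
  std::cout << next_node->name << ": " << grads[i][j] << "\n";
}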
