Skip to content

Commit 05da032

Browse files
Merge pull request #27 from PaddlePaddle/develop
update
2 parents e1a92d6 + f13dcfb commit 05da032

File tree

129 files changed

+9609
-1192
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

129 files changed

+9609
-1192
lines changed

cmake/external/xpu.cmake

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ ELSE ()
3535
ENDIF()
3636

3737
SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev")
38-
SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20210818")
38+
SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20210830")
3939
SET(XPU_XRE_URL "${XPU_BASE_URL}/${XPU_XRE_DIR_NAME}.tar.gz" CACHE STRING "" FORCE)
4040
SET(XPU_XDNN_URL "${XPU_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz" CACHE STRING "" FORCE)
4141
SET(XPU_XCCL_URL "${XPU_BASE_URL_WITHOUT_DATE}/20210623/${XPU_XCCL_DIR_NAME}.tar.gz" CACHE STRING "" FORCE)

cmake/operators.cmake

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -183,6 +183,7 @@ function(op_library TARGET)
183183
list(REMOVE_ITEM miopen_cu_cc_srcs "affine_grid_cudnn_op.cu.cc")
184184
list(REMOVE_ITEM miopen_cu_cc_srcs "grid_sampler_cudnn_op.cu.cc")
185185
list(REMOVE_ITEM hip_srcs "cholesky_op.cu")
186+
list(REMOVE_ITEM hip_srcs "svd_op.cu")
186187
list(REMOVE_ITEM hip_srcs "multinomial_op.cu")
187188
list(REMOVE_ITEM hip_srcs "decode_jpeg_op.cu")
188189
hip_library(${TARGET} SRCS ${cc_srcs} ${hip_cc_srcs} ${miopen_cu_cc_srcs} ${miopen_cu_srcs} ${mkldnn_cc_srcs} ${hip_srcs} DEPS ${op_library_DEPS}

paddle/fluid/framework/distributed_strategy.proto

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -158,6 +158,7 @@ message PipelineConfig {
158158
optional int32 micro_batch_size = 1 [ default = 1 ];
159159
optional int32 accumulate_steps = 2 [ default = 1 ];
160160
optional string schedule_mode = 3 [ default = '1F1B' ];
161+
optional bool p2p_cache_shape = 4 [ default = true ];
161162
}
162163

163164
message TensorParallelConfig {

paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ class Optimizer {
4444
if (w < optimizer_config::min_bound) w = optimizer_config::min_bound;
4545
if (w > optimizer_config::max_bound) w = optimizer_config::max_bound;
4646

47-
add_g2sum = scaled_grad * scaled_grad;
47+
add_g2sum += scaled_grad * scaled_grad;
4848

4949
g2sum += add_g2sum;
5050
}
@@ -64,7 +64,7 @@ class Optimizer {
6464
w[i] = optimizer_config::mf_min_bound;
6565
if (w[i] > optimizer_config::mf_max_bound)
6666
w[i] = optimizer_config::mf_max_bound;
67-
add_g2sum = scaled_grad * scaled_grad;
67+
add_g2sum += scaled_grad * scaled_grad;
6868
}
6969

7070
g2sum += add_g2sum / n;

paddle/fluid/framework/new_executor/interpretercore.cc

Lines changed: 4 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -143,8 +143,7 @@ InterpreterCore::InterpreterCore(const platform::Place& place,
143143
main_program_(main_prog),
144144
global_scope_(global_scope),
145145
d2h_ctx_pool_({place}),
146-
h2d_ctx_pool_({place}),
147-
fetch_context_pool_({place}) {
146+
h2d_ctx_pool_({place}) {
148147
is_build_ = false;
149148

150149
garbages_.reset(new GarbageQueue());
@@ -339,9 +338,6 @@ void InterpreterCore::BuildInstructionCtx(Instruction* instr_node,
339338
new RuntimeInferShapeContext(*op_base, *instr_node->runtime_ctx_.get()));
340339

341340
auto* dev_ctx = instr_node->dev_ctx_;
342-
if (instr_node->kernel_func_.operator_base_->Type() == "fetch_v2") {
343-
dev_ctx = fetch_context_pool_.Get(place);
344-
}
345341
Scope scope;
346342

347343
instr_node->execution_ctx_.reset(new ExecutionContext(
@@ -356,12 +352,6 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) {
356352
instr_node.kernel_func_.operator_base_)
357353
->InferShape(instr_node.infershape_ctx_.get());
358354

359-
if (instr_node.kernel_func_.operator_base_->Type() == "fetch_v2") {
360-
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
361-
auto* dev_ctx = pool.Get(place_);
362-
dev_ctx->Wait(); // TODO(wanghuancoder)
363-
}
364-
365355
instr_node.kernel_func_.compute_func_(*instr_node.execution_ctx_.get());
366356
}
367357

@@ -411,8 +401,6 @@ void InterpreterCore::ExecuteInstructionList(
411401
working_var_ref);
412402
}
413403

414-
fetch_context_pool_.Get(place)->Wait();
415-
416404
for (size_t i = 0; i < working_var_ref.size(); ++i) {
417405
if (working_var_ref[i].var_ref_count_ != 0) {
418406
std::cerr << " var ref is not zero " << i << std::endl;
@@ -671,6 +659,9 @@ void InterpreterCore::BuildOpFuncList(const platform::Place& place,
671659
expected_kernel_key);
672660
if (!platform::is_same_place(kernel_type_for_var.place_,
673661
expected_kernel_key.place_)) {
662+
if (op_base->Type() == "fetch_v2") {
663+
op_base->SetAttr("deepcopy", false);
664+
}
674665
// need trans place
675666
// 1. add var in scope
676667
// 2. add copy op

paddle/fluid/framework/new_executor/interpretercore.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -114,8 +114,6 @@ class InterpreterCore {
114114
size_t max_memory_size_;
115115
size_t cur_memory_size_;
116116
std::unique_ptr<WorkQueue> gc_queue_;
117-
118-
platform::DeviceContextPool fetch_context_pool_;
119117
};
120118
} // namespace framework
121119
} // namespace paddle

paddle/fluid/framework/operator.cc

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1254,10 +1254,10 @@ void OperatorWithKernel::ChooseKernel(const RuntimeContext& ctx,
12541254
}
12551255
#endif
12561256
#ifdef PADDLE_WITH_XPU
1257-
if ((kernel_iter == kernels.end() &&
1258-
is_xpu_place(expected_kernel_key.place_) &&
1259-
!paddle::platform::is_xpu_support_op(type_, expected_kernel_key)) ||
1260-
paddle::platform::is_in_xpu_black_list(type_)) {
1257+
if (is_xpu_place(expected_kernel_key.place_) &&
1258+
(kernel_iter == kernels.end() ||
1259+
!paddle::platform::is_xpu_support_op(type_, expected_kernel_key) ||
1260+
paddle::platform::is_in_xpu_black_list(type_))) {
12611261
VLOG(3) << "missing XPU kernel: " << type_
12621262
<< ", expected_kernel_key:" << expected_kernel_key
12631263
<< ", fallbacking to CPU one!";

paddle/fluid/framework/ps_gpu_trainer.cc

Lines changed: 26 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -57,8 +57,6 @@ void PSGPUTrainer::Initialize(const TrainerDesc& trainer_desc,
5757
trainer_desc.downpour_param().stat_var_names(i));
5858
}
5959
VLOG(3) << "going to initialize pull dense worker";
60-
pull_dense_worker_ = PullDenseWorker::GetInstance();
61-
pull_dense_worker_->Initialize(trainer_desc);
6260
SetDebug(trainer_desc.debug());
6361
trainer_desc_ = trainer_desc;
6462
workers_.resize(place_num);
@@ -112,15 +110,21 @@ void PSGPUTrainer::InitTrainerEnv(const ProgramDesc& main_program,
112110
}
113111
}
114112
}
113+
for (auto& var : main_program.Block(0).AllVars()) {
114+
if (var->Persistable()) {
115+
auto it = std::find(need_merge_var_names_.begin(),
116+
need_merge_var_names_.end(), var->Name());
117+
if (it == need_merge_var_names_.end()) {
118+
VLOG(2) << "train param: " << var->Name();
119+
trainable_param_.push_back(var->Name());
120+
}
121+
}
122+
}
115123
place_ = place;
116124
return;
117125
}
118126

119127
void PSGPUTrainer::InitOtherEnv(const ProgramDesc& main_program) {
120-
pull_dense_worker_->SetRootScope(root_scope_);
121-
for (size_t i = 0; i < places_.size(); ++i) {
122-
pull_dense_worker_->AddThreadScope(workers_[i]->GetThreadScope());
123-
}
124128
VLOG(3) << "init other env done.";
125129
}
126130

@@ -141,15 +145,27 @@ Scope* PSGPUTrainer::GetWorkerScope(int thread_id) { return nullptr; }
141145
template <typename T>
142146
void PSGPUTrainer::MergeToRootScope(LoDTensor* root_tensor, LoDTensor* tensor) {
143147
LoDTensor tmp_root;
144-
TensorCopy(*root_tensor, platform::CPUPlace(), &tmp_root);
148+
TensorCopySync(*root_tensor, platform::CPUPlace(), &tmp_root);
145149
T* tmp_root_data = tmp_root.data<T>();
146150
LoDTensor tmp_tensor;
147-
TensorCopy(*tensor, platform::CPUPlace(), &tmp_tensor);
151+
TensorCopySync(*tensor, platform::CPUPlace(), &tmp_tensor);
148152
T* data = tmp_tensor.data<T>();
149153
for (int i = 0; i < tmp_tensor.numel(); i++) {
150154
tmp_root_data[i] += data[i];
151155
}
152-
TensorCopy(tmp_root, platform::CPUPlace(), root_tensor);
156+
TensorCopySync(tmp_root, platform::CPUPlace(), root_tensor);
157+
}
158+
159+
void PSGPUTrainer::MergeDenseParam() {
160+
auto thread_scope = workers_[0]->GetThreadScope();
161+
for (auto& name : trainable_param_) {
162+
VLOG(2) << "merge var " << name << " to root scope";
163+
Variable* root_var = root_scope_->FindVar(name);
164+
LoDTensor* root_tensor = root_var->GetMutable<LoDTensor>();
165+
Variable* var = thread_scope->FindVar(name);
166+
LoDTensor* tensor = var->GetMutable<LoDTensor>();
167+
TensorCopySync((*tensor), root_tensor->place(), root_tensor);
168+
}
153169
}
154170

155171
void PSGPUTrainer::Finalize() {
@@ -187,7 +203,7 @@ void PSGPUTrainer::Finalize() {
187203
_ForEachDataType_(MergeCallback);
188204
}
189205
}
190-
pull_dense_worker_->MergeDenseParam();
206+
MergeDenseParam();
191207
root_scope_->DropKids();
192208
}
193209
} // namespace framework

paddle/fluid/framework/trainer.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -265,6 +265,7 @@ class PSGPUTrainer : public TrainerBase {
265265
}
266266
virtual std::string GetDumpPath(int tid) { return ""; }
267267
virtual void InitDumpEnv() {}
268+
virtual void MergeDenseParam();
268269

269270
template <typename T>
270271
void MergeToRootScope(LoDTensor* root_tensor, LoDTensor* thread_tensor);
@@ -274,6 +275,7 @@ class PSGPUTrainer : public TrainerBase {
274275
DownpourWorkerParameter param_;
275276
std::map<uint64_t, std::vector<std::string>> dense_grad_names_;
276277
std::vector<std::string> need_merge_var_names_;
278+
std::vector<std::string> trainable_param_;
277279
float scale_datanorm_;
278280
paddle::platform::Place place_;
279281
ProgramDesc program_;

paddle/fluid/imperative/prepared_operator.cc

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -131,10 +131,10 @@ PreparedOp PrepareImpl(const NameVarMap<VarType>& ins,
131131
auto& kernels = kernels_iter->second;
132132
auto kernel_iter = kernels.find(expected_kernel_key);
133133
#ifdef PADDLE_WITH_XPU
134-
if ((kernel_iter == kernels.end() &&
135-
is_xpu_place(expected_kernel_key.place_) &&
136-
!paddle::platform::is_xpu_support_op(op.Type(), expected_kernel_key)) ||
137-
paddle::platform::is_in_xpu_black_list(op.Type())) {
134+
if (is_xpu_place(expected_kernel_key.place_) &&
135+
(kernel_iter == kernels.end() ||
136+
!paddle::platform::is_xpu_support_op(op.Type(), expected_kernel_key) ||
137+
paddle::platform::is_in_xpu_black_list(op.Type()))) {
138138
VLOG(3) << "missing XPU kernel: " << op.Type()
139139
<< ", expected_kernel_key:" << expected_kernel_key
140140
<< ", fallbacking to CPU one!";

0 commit comments

Comments (0)