Skip to content

Commit 3ce9983

Browse files
Merge pull request #19 from PaddlePaddle/develop
update
2 parents 25ba21c + dd33d28 commit 3ce9983

70 files changed

Lines changed: 2257 additions & 578 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

paddle/fluid/framework/device_worker_factory.cc

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -69,11 +69,6 @@ REGISTER_DEVICE_WORKER_CLASS(DownpourWorkerOpt);
6969
REGISTER_DEVICE_WORKER_CLASS(HeterCpuWorker);
7070
#endif
7171

72-
#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \
73-
(defined PADDLE_WITH_PSLIB)
74-
REGISTER_DEVICE_WORKER_CLASS(HeterBoxWorker);
75-
#endif
76-
7772
#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \
7873
(defined PADDLE_WITH_PSLIB)
7974
REGISTER_DEVICE_WORKER_CLASS(PSGPUWorker);

paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,11 @@ IF(WITH_GPU)
88
SET(HETERPS_DEPS ${HETERPS_DEPS} ${RPC_DEPS})
99
endif()
1010
nv_library(heter_comm SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h hashtable.h DEPS ${HETERPS_DEPS})
11-
nv_test(test_heter_comm SRCS test_heter_comm.cu feature_value.h DEPS heter_comm)
11+
nv_test(test_heter_comm SRCS feature_value.h DEPS heter_comm)
1212
nv_library(heter_ps SRCS heter_ps.cu DEPS heter_comm)
1313
ENDIF()
1414
IF(WITH_ROCM)
1515
hip_library(heter_comm SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h hashtable.h DEPS cub device_context)
16-
hip_test(test_heter_comm SRCS test_heter_comm.cu feature_value.h DEPS heter_comm)
16+
hip_test(test_heter_comm SRCS feature_value.h DEPS heter_comm)
1717
hip_library(heter_ps SRCS heter_ps.cu DEPS heter_comm)
1818
ENDIF()

paddle/fluid/framework/fleet/heter_ps/cudf/concurrent_unordered_map.cuh.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -765,7 +765,7 @@ x.second );
765765
unsigned long long get_num_collisions() const { return m_collisions; }
766766

767767
void print() {
768-
for (size_type i = 0; i < 10; ++i) {
768+
for (size_type i = 0; i < 5; ++i) {
769769
std::cout << i << ": " << m_hashtbl_values[i].first << ","
770770
<< m_hashtbl_values[i].second << std::endl;
771771
}

paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,7 @@ void HeterComm<KeyType, ValType, GradType>::init_path() {
115115
path_.resize(total_gpu);
116116

117117
if (!topo_aware_) {
118-
VLOG(1) << "init path without topo aware";
118+
VLOG(3) << "init path without topo aware";
119119
for (int i = 0; i < total_gpu; ++i) {
120120
path_[i].resize(total_gpu);
121121
for (int j = 0; j < total_gpu; ++j) {
@@ -130,7 +130,7 @@ void HeterComm<KeyType, ValType, GradType>::init_path() {
130130
}
131131
}
132132
} else {
133-
VLOG(1) << "init path with topo aware";
133+
VLOG(3) << "init path with topo aware";
134134
for (int i = 0; i < total_gpu; ++i) {
135135
path_[i].resize(total_gpu);
136136
for (int j = 0; j < total_gpu; ++j) {

paddle/fluid/framework/fleet/ps_gpu_wrapper.cc

Lines changed: 117 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -40,8 +40,7 @@ namespace framework {
4040
std::shared_ptr<PSGPUWrapper> PSGPUWrapper::s_instance_ = NULL;
4141
bool PSGPUWrapper::is_initialized_ = false;
4242

43-
void PSGPUWrapper::BuildTask(std::shared_ptr<HeterContext> gpu_task,
44-
uint64_t table_id, int feature_dim) {
43+
void PSGPUWrapper::BuildTask(std::shared_ptr<HeterContext> gpu_task) {
4544
VLOG(3) << "PSGPUWrapper::BuildGPUPSTask begin";
4645
platform::Timer timeline;
4746
timeline.Start();
@@ -68,8 +67,6 @@ void PSGPUWrapper::BuildTask(std::shared_ptr<HeterContext> gpu_task,
6867
thread_keys_.resize(thread_keys_thread_num_);
6968
for (int i = 0; i < thread_keys_thread_num_; i++) {
7069
thread_keys_[i].resize(thread_keys_shard_num_);
71-
for (int j = 0; j < thread_keys_shard_num_; j++) {
72-
}
7370
}
7471
const std::deque<Record>& vec_data = input_channel->GetData();
7572
size_t total_len = vec_data.size();
@@ -139,17 +136,16 @@ void PSGPUWrapper::BuildTask(std::shared_ptr<HeterContext> gpu_task,
139136
local_ptr[i].resize(local_keys[i].size());
140137
}
141138
timeline.Start();
142-
auto ptl_func = [this, &local_keys, &local_ptr, &table_id,
143-
&fleet_ptr](int i) {
139+
auto ptl_func = [this, &local_keys, &local_ptr, &fleet_ptr](int i) {
144140
size_t key_size = local_keys[i].size();
145141
#ifdef PADDLE_WITH_PSLIB
146142
auto tt = fleet_ptr->pslib_ptr_->_worker_ptr->pull_sparse_ptr(
147-
reinterpret_cast<char**>(local_ptr[i].data()), table_id,
143+
reinterpret_cast<char**>(local_ptr[i].data()), this->table_id_,
148144
local_keys[i].data(), key_size);
149145
#endif
150146
#ifdef PADDLE_WITH_PSCORE
151147
auto tt = fleet_ptr->_worker_ptr->pull_sparse_ptr(
152-
reinterpret_cast<char**>(local_ptr[i].data()), table_id,
148+
reinterpret_cast<char**>(local_ptr[i].data()), this->table_id_,
153149
local_keys[i].data(), key_size);
154150
#endif
155151
tt.wait();
@@ -255,7 +251,7 @@ void PSGPUWrapper::BuildTask(std::shared_ptr<HeterContext> gpu_task,
255251
}
256252
}
257253
#endif
258-
VLOG(1) << "GpuPs build hbmps done";
254+
VLOG(3) << "GpuPs build hbmps done";
259255

260256
device_mutex[dev]->unlock();
261257
}
@@ -272,11 +268,8 @@ void PSGPUWrapper::BuildTask(std::shared_ptr<HeterContext> gpu_task,
272268
<< " seconds.";
273269
}
274270

275-
void PSGPUWrapper::BuildGPUPS(uint64_t table_id, int feature_dim) {
271+
void PSGPUWrapper::BuildGPUTask(std::shared_ptr<HeterContext> gpu_task) {
276272
int device_num = heter_devices_.size();
277-
std::shared_ptr<HeterContext> gpu_task = gpu_task_pool_.Get();
278-
gpu_task->Reset();
279-
BuildTask(gpu_task, table_id, feature_dim);
280273
platform::Timer timeline;
281274
timeline.Start();
282275

@@ -291,15 +284,21 @@ void PSGPUWrapper::BuildGPUPS(uint64_t table_id, int feature_dim) {
291284
delete HeterPs_;
292285
HeterPs_ = nullptr;
293286
}
287+
if (size_max <= 0) {
288+
VLOG(1) << "Skip build gpu ps cause feasign nums = " << size_max;
289+
return;
290+
}
294291
std::vector<std::thread> threads(device_num);
295292
HeterPs_ = HeterPsBase::get_instance(size_max, resource_);
296293
HeterPs_->set_nccl_comm_and_size(inner_comms_, inter_comms_, node_size_);
297294
auto build_func = [this, &gpu_task, &feature_keys_count](int i) {
298-
std::cout << "building table: " << i << std::endl;
295+
VLOG(3) << "building table: " << i;
299296
this->HeterPs_->build_ps(i, gpu_task->device_keys_[i].data(),
300297
gpu_task->device_values_[i].data(),
301298
feature_keys_count[i], 500000, 2);
302-
HeterPs_->show_one_table(i);
299+
if (feature_keys_count[i] > 0) {
300+
HeterPs_->show_one_table(i);
301+
}
303302
};
304303
for (size_t i = 0; i < threads.size(); i++) {
305304
threads[i] = std::thread(build_func, i);
@@ -310,7 +309,109 @@ void PSGPUWrapper::BuildGPUPS(uint64_t table_id, int feature_dim) {
310309
timeline.Pause();
311310
VLOG(1) << "GpuPs build table total costs: " << timeline.ElapsedSec()
312311
<< " s.";
313-
gpu_task_pool_.Push(gpu_task);
312+
}
313+
314+
void PSGPUWrapper::LoadIntoMemory(bool is_shuffle) {
315+
platform::Timer timer;
316+
VLOG(3) << "Begin LoadIntoMemory(), dataset[" << dataset_ << "]";
317+
timer.Start();
318+
dataset_->LoadIntoMemory();
319+
timer.Pause();
320+
VLOG(0) << "LoadIntoMemory cost: " << timer.ElapsedSec() << "s";
321+
322+
// local shuffle
323+
if (is_shuffle) {
324+
dataset_->LocalShuffle();
325+
}
326+
327+
std::shared_ptr<HeterContext> gpu_task = gpu_task_pool_.Get();
328+
gpu_task->Reset();
329+
data_ready_channel_->Put(gpu_task);
330+
VLOG(3) << "End LoadIntoMemory(), dataset[" << dataset_ << "]";
331+
}
332+
333+
void PSGPUWrapper::start_build_thread() {
334+
running_ = true;
335+
VLOG(3) << "start build CPU&GPU ps thread.";
336+
build_cpu_threads_ = std::thread([this] { build_cpu_thread(); });
337+
build_gpu_threads_ = std::thread([this] { build_gpu_thread(); });
338+
}
339+
340+
void PSGPUWrapper::build_cpu_thread() {
341+
while (running_) {
342+
std::shared_ptr<HeterContext> gpu_task = nullptr;
343+
if (!data_ready_channel_->Get(gpu_task)) {
344+
continue;
345+
}
346+
VLOG(3) << "thread BuildTask start.";
347+
platform::Timer timer;
348+
timer.Start();
349+
// build cpu ps data process
350+
BuildTask(gpu_task);
351+
timer.Pause();
352+
VLOG(1) << "thread BuildTask end, cost time: " << timer.ElapsedSec() << "s";
353+
buildcpu_ready_channel_->Put(gpu_task);
354+
}
355+
VLOG(3) << "build cpu thread end";
356+
}
357+
358+
void PSGPUWrapper::build_gpu_thread() {
359+
while (running_) {
360+
std::shared_ptr<HeterContext> gpu_task = nullptr;
361+
if (!gpu_free_channel_->Get(gpu_task)) {
362+
continue;
363+
}
364+
if (!buildcpu_ready_channel_->Get(gpu_task)) {
365+
continue;
366+
}
367+
VLOG(3) << "thread BuildGPUTask start.";
368+
platform::Timer timer;
369+
timer.Start();
370+
BuildGPUTask(gpu_task);
371+
timer.Pause();
372+
VLOG(1) << "thread BuildGPUTask end, cost time: " << timer.ElapsedSec()
373+
<< "s";
374+
375+
gpu_task_pool_.Push(gpu_task);
376+
train_ready_channel_->Put(gpu_task);
377+
}
378+
VLOG(3) << "build gpu thread end";
379+
}
380+
381+
void PSGPUWrapper::BeginPass() {
382+
platform::Timer timer;
383+
timer.Start();
384+
if (current_task_) {
385+
PADDLE_THROW(
386+
platform::errors::Fatal("[BeginPass] current task is not ended."));
387+
}
388+
// load+build done
389+
if (!train_ready_channel_->Get(current_task_)) {
390+
PADDLE_THROW(platform::errors::Fatal("train_ready_channel_ failed."));
391+
}
392+
timer.Pause();
393+
VLOG(1) << "BeginPass end, cost time: " << timer.ElapsedSec() << "s";
394+
}
395+
396+
void PSGPUWrapper::EndPass() {
397+
if (!current_task_) {
398+
PADDLE_THROW(
399+
platform::errors::Fatal("[EndPass] current task has been ended."));
400+
}
401+
platform::Timer timer;
402+
timer.Start();
403+
size_t keysize_max = 0;
404+
// in case of feasign_num = 0, skip dump_to_cpu
405+
for (size_t i = 0; i < heter_devices_.size(); i++) {
406+
keysize_max = std::max(keysize_max, current_task_->device_keys_[i].size());
407+
}
408+
if (keysize_max != 0) {
409+
HeterPs_->end_pass();
410+
}
411+
current_task_ = nullptr;
412+
gpu_free_channel_->Put(current_task_);
413+
timer.Pause();
414+
VLOG(1) << "EndPass end, cost time: " << timer.ElapsedSec() << "s";
314415
}
315416

316417
void PSGPUWrapper::PullSparse(const paddle::platform::Place& place,

paddle/fluid/framework/fleet/ps_gpu_wrapper.h

Lines changed: 67 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -82,9 +82,33 @@ class PSGPUWrapper {
8282
const int hidden_size, const int64_t total_length,
8383
const int batch_size);
8484

85-
void BuildGPUPS(const uint64_t table_id, int feature_dim);
86-
void BuildTask(std::shared_ptr<HeterContext> gpu_task, uint64_t table_id,
87-
int feature_dim);
85+
void BuildGPUTask(std::shared_ptr<HeterContext> gpu_task);
86+
void BuildTask(std::shared_ptr<HeterContext> gpu_task);
87+
void LoadIntoMemory(bool is_shuffle);
88+
void BeginPass();
89+
void EndPass();
90+
void start_build_thread();
91+
void build_cpu_thread();
92+
void build_gpu_thread();
93+
94+
void Finalize() {
95+
VLOG(3) << "PSGPUWrapper Begin Finalize.";
96+
if (s_instance_ == nullptr) {
97+
return;
98+
}
99+
data_ready_channel_->Close();
100+
buildcpu_ready_channel_->Close();
101+
gpu_free_channel_->Close();
102+
train_ready_channel_->Close();
103+
running_ = false;
104+
VLOG(3) << "begin stop build_cpu_threads_";
105+
build_cpu_threads_.join();
106+
VLOG(3) << "begin stop build_gpu_threads_";
107+
build_gpu_threads_.join();
108+
s_instance_ = nullptr;
109+
VLOG(3) << "PSGPUWrapper Finalize Finished.";
110+
}
111+
88112
void InitializeGPU(const std::vector<int>& dev_ids) {
89113
if (s_instance_ != NULL && is_initialized_ == false) {
90114
VLOG(3) << "PSGPUWrapper Begin InitializeGPU";
@@ -129,6 +153,24 @@ class PSGPUWrapper {
129153
#endif
130154
}
131155
heter_devices_ = dev_ids;
156+
data_ready_channel_->Open();
157+
data_ready_channel_->SetCapacity(3);
158+
buildcpu_ready_channel_->Open();
159+
buildcpu_ready_channel_->SetCapacity(3);
160+
gpu_free_channel_->Open();
161+
gpu_free_channel_->SetCapacity(1);
162+
train_ready_channel_->Open();
163+
train_ready_channel_->SetCapacity(1);
164+
165+
current_task_ = nullptr;
166+
gpu_free_channel_->Put(current_task_);
167+
168+
table_id_ = 1;
169+
#ifdef PADDLE_WITH_PSLIB
170+
table_id_ = 0;
171+
#endif
172+
// start build cpu&gpu ps thread
173+
start_build_thread();
132174
}
133175
}
134176

@@ -206,18 +248,8 @@ class PSGPUWrapper {
206248
slot_vector_ = slot_vector;
207249
}
208250

209-
void EndPass() { HeterPs_->end_pass(); }
210251
void ShowOneTable(int index) { HeterPs_->show_one_table(index); }
211252

212-
void Finalize() {
213-
VLOG(3) << "PSGPUWrapper Begin Finalize.";
214-
if (s_instance_ == nullptr) {
215-
return;
216-
}
217-
s_instance_ = nullptr;
218-
VLOG(3) << "PSGPUWrapper Finalize Finished.";
219-
}
220-
221253
private:
222254
static std::shared_ptr<PSGPUWrapper> s_instance_;
223255
Dataset* dataset_;
@@ -231,6 +263,7 @@ class PSGPUWrapper {
231263
std::vector<int> slot_vector_;
232264
int multi_node_{0};
233265
int node_size_;
266+
uint64_t table_id_;
234267
std::vector<ncclComm_t> inner_comms_;
235268
std::vector<ncclComm_t> inter_comms_;
236269
std::vector<ncclUniqueId> inter_ncclids_;
@@ -242,6 +275,27 @@ class PSGPUWrapper {
242275
int thread_keys_shard_num_ = 37;
243276
uint64_t max_fea_num_per_pass_ = 5000000000;
244277

278+
std::shared_ptr<
279+
paddle::framework::ChannelObject<std::shared_ptr<HeterContext>>>
280+
data_ready_channel_ =
281+
paddle::framework::MakeChannel<std::shared_ptr<HeterContext>>();
282+
std::shared_ptr<
283+
paddle::framework::ChannelObject<std::shared_ptr<HeterContext>>>
284+
buildcpu_ready_channel_ =
285+
paddle::framework::MakeChannel<std::shared_ptr<HeterContext>>();
286+
std::shared_ptr<
287+
paddle::framework::ChannelObject<std::shared_ptr<HeterContext>>>
288+
gpu_free_channel_ =
289+
paddle::framework::MakeChannel<std::shared_ptr<HeterContext>>();
290+
std::shared_ptr<
291+
paddle::framework::ChannelObject<std::shared_ptr<HeterContext>>>
292+
train_ready_channel_ =
293+
paddle::framework::MakeChannel<std::shared_ptr<HeterContext>>();
294+
std::shared_ptr<HeterContext> current_task_ = nullptr;
295+
std::thread build_cpu_threads_;
296+
std::thread build_gpu_threads_;
297+
bool running_ = false;
298+
245299
protected:
246300
static bool is_initialized_;
247301
};

0 commit comments

Comments
 (0)