
Commit e603334

Authored by SmallBird, Liaoxionglei1234, rensilin, root, and zmxdream
Base remote/lxch pre stable (PaddlePaddle#30)
* parquet parser
* fix IsThreadLocalCapturing
* run cuda kernel: CalcAucKernel with 512 threads
* fix_afs_api_download_dnn_plugin
* fix_fleet_last_base
* add ps core so
* chg cmake
* fix libjvm lost
* add dymf (PaddlePaddle#10): dymf tmp, local test change, pull thread pool, fix conflict, delete unused log, local change for mirror 0, fix dymf, code clean, add endpass optimize, fix endpass optimize
* pipeline build (#9)
* Fix eigvals_op (PaddlePaddle#12)
* merge pre-stable
* test
* passid memory && Generalization
* fix code style

Co-authored-by: rensilin <[email protected]>
Co-authored-by: root <[email protected]>
Co-authored-by: yaoxuefeng6 <[email protected]>
Co-authored-by: Thunderbrook <[email protected]>
Co-authored-by: xionglei1234 <[email protected]>
Co-authored-by: zmxdream <[email protected]>
Co-authored-by: liaoxiaochao <[email protected]>
1 parent bb9733e · commit e603334

30 files changed: +2064 additions, -3153 deletions

.gitignore

Lines changed: 1 addition & 0 deletions
@@ -66,3 +66,4 @@ paddle/infrt/tests/lit.cfg.py
 paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launchers.cc
 paddle/fluid/pybind/eager_final_state_op_function_impl.h
 paddle/fluid/pybind/tmp_eager_final_state_op_function_impl.h
+builder

cmake/external/pslib.cmake

Lines changed: 1 addition & 2 deletions
@@ -49,8 +49,7 @@ ExternalProject_Add(
   ${EXTERNAL_PROJECT_LOG_ARGS}
   PREFIX ${PSLIB_PREFIX_DIR}
   DOWNLOAD_DIR ${PSLIB_DOWNLOAD_DIR}
-  DOWNLOAD_COMMAND wget --no-check-certificate ${PSLIB_URL} -c -q -O ${PSLIB_NAME}.tar.gz
-    && tar zxvf ${PSLIB_NAME}.tar.gz
+  DOWNLOAD_COMMAND cp /root/paddlejob/new1_code/ps/baidu/paddlepaddle/pslib/pslib.tar.gz ./ && tar zxvf ${PSLIB_NAME}.tar.gz
   DOWNLOAD_NO_PROGRESS 1
   UPDATE_COMMAND ""
   CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${PSLIB_INSTALL_ROOT}

paddle/fluid/framework/data_set.h

Lines changed: 10 additions & 1 deletion
@@ -160,7 +160,8 @@ class Dataset {
   virtual void SetFleetSendSleepSeconds(int seconds) = 0;
 
   virtual std::vector<std::string> GetSlots() = 0;
-
+  virtual void SetPassId(uint32_t pass_id) = 0;
+  virtual uint32_t GetPassID() = 0;
  protected:
   virtual int ReceiveFromClient(int msg_type, int client_id,
                                 const std::string& msg) = 0;
@@ -249,6 +250,13 @@ class DatasetImpl : public Dataset {
   virtual void DynamicAdjustReadersNum(int thread_num);
   virtual void SetFleetSendSleepSeconds(int seconds);
   virtual std::vector<std::string> GetSlots();
+  virtual void SetPassId(uint32_t pass_id) {
+    pass_id_ = pass_id;
+  }
+  virtual uint32_t GetPassID() {
+    return pass_id_;
+  }
+
   /* for enable_heterps_
   virtual void EnableHeterps(bool enable_heterps) {
     enable_heterps_ = enable_heterps;
@@ -275,6 +283,7 @@ class DatasetImpl : public Dataset {
     // TODO(yaoxuefeng) for SlotRecordDataset
     return -1;
   }
+  uint32_t pass_id_ = 0;
   std::vector<std::shared_ptr<paddle::framework::DataFeed>> readers_;
   std::vector<std::shared_ptr<paddle::framework::DataFeed>> preload_readers_;
   paddle::framework::Channel<T> input_channel_;
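
Dataset gains a pass-id tag through a virtual setter/getter pair, and DatasetImpl stores it in the new pass_id_ member. A rough sketch of how a training driver might use it; the RunOnePass function below is hypothetical, not part of this commit:

```cpp
// Hypothetical driver sketch: tag the dataset with the current pass id so
// downstream consumers (e.g. per-pass table stats) can read it back.
#include <cstdint>

#include "paddle/fluid/framework/data_set.h"

void RunOnePass(paddle::framework::Dataset* dataset, uint32_t pass_id) {
  dataset->SetPassId(pass_id);             // tag the data of this pass
  // ... feed and train on this pass ...
  uint32_t tagged = dataset->GetPassID();  // read the tag back later
  (void)tagged;
}
```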

paddle/fluid/framework/fleet/fleet_wrapper.cc

Lines changed: 2 additions & 2 deletions
@@ -1388,9 +1388,9 @@ void FleetWrapper::SetDate(const uint64_t table_id, const std::string& date) {
 #endif
 }
 
-void FleetWrapper::PrintTableStat(const uint64_t table_id) {
+void FleetWrapper::PrintTableStat(const uint64_t table_id, uint32_t pass_id, size_t threshold) {
 #ifdef PADDLE_WITH_PSLIB
-  auto ret = pslib_ptr_->_worker_ptr->print_table_stat(table_id);
+  auto ret = pslib_ptr_->_worker_ptr->print_table_stat(table_id, pass_id, threshold);
   ret.wait();
   int32_t err_code = ret.get();
   if (err_code == -1) {

paddle/fluid/framework/fleet/fleet_wrapper.h

Lines changed: 1 addition & 1 deletion
@@ -265,7 +265,7 @@ class FleetWrapper {
                      std::vector<std::string> table_var_list,
                      bool load_combine);
 
-  void PrintTableStat(const uint64_t table_id);
+  void PrintTableStat(const uint64_t table_id, uint32_t pass_id, uint64_t threshold);
   void SetFileNumOneShard(const uint64_t table_id, int file_num);
   // mode = 0, load all feature
   // mode = 1, load delta feature, which means load diff
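
Together, the two FleetWrapper changes extend PrintTableStat so server-side table statistics can be attributed to a pass and filtered by a threshold (note the declaration uses uint64_t for threshold while the definition uses size_t; the two coincide on the 64-bit targets this code builds for). A minimal hypothetical call site, assuming the usual FleetWrapper singleton accessor; the threshold value is illustrative:

```cpp
// Hypothetical call site for the extended PrintTableStat; pass_id and
// threshold values here are illustrative, not taken from this commit.
#include <cstdint>

#include "paddle/fluid/framework/fleet/fleet_wrapper.h"

void DumpTableStat(uint64_t table_id, uint32_t pass_id) {
  auto fleet = paddle::framework::FleetWrapper::GetInstance();
  // threshold presumably lets the server skip entries below a given
  // count when printing per-pass table statistics.
  fleet->PrintTableStat(table_id, pass_id, /*threshold=*/100);
}
```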

paddle/fluid/framework/fleet/heter_context.h

Lines changed: 37 additions & 188 deletions
@@ -39,227 +39,76 @@ namespace framework {
 
 class HeterContext {
  public:
-  virtual ~HeterContext() {
-    if (!multi_mf_dim_) {
-      for (size_t i = 0; i < mutex_.size(); ++i) {
-        delete mutex_[i];
-      }
-      mutex_.clear();
-    } else {
-      for (size_t i = 0; i < dim_mutex_.size(); ++i) {
-        for (size_t j = 0; j < dim_mutex_[i].size(); j++) {
-          delete dim_mutex_[i][j];
-        }
-        dim_mutex_[i].clear();
-      }
-    }
-  }
-  Scope* scope_{nullptr};
-  std::vector<std::vector<FeatureKey>> feature_keys_;
-  std::vector<std::vector<std::vector<FeatureKey>>> feature_dim_keys_;
-  std::vector<std::vector<std::vector<FeatureKey>>> device_task_keys_;
-
+  // Deduplicated keys to look up in the table: level 1 = table shard,
+  // level 2 = embedding dim, level 3 = the keys themselves.
+  std::vector<std::vector<std::vector<FeatureKey>>> feature_keys_;
+  // Values fetched for those keys; same layout as feature_keys_.
 #ifdef PADDLE_WITH_PSLIB
-  std::vector<std::vector<paddle::ps::DownpourFixedFeatureValue*>> value_ptr_;
-  std::vector<std::vector<std::vector<paddle::ps::DownpourFixedFeatureValue*>>>
-      device_task_ptr_;
-  std::vector<std::vector<std::vector<paddle::ps::DownpourFixedFeatureValue*>>>
-      value_dim_ptr_;
   std::vector<std::vector<std::vector<paddle::ps::DownpourFixedFeatureValue*>>>
-      device_dim_ptr_;
+      value_ptr_;
 #endif
 #ifdef PADDLE_WITH_PSCORE
-  std::vector<std::vector<paddle::distributed::FixedFeatureValue*>> value_ptr_;
   std::vector<std::vector<std::vector<paddle::distributed::FixedFeatureValue*>>>
-      value_dim_ptr_;
-  std::vector<std::vector<std::vector<paddle::distributed::FixedFeatureValue*>>>
-      device_task_ptr_;
-  std::vector<std::vector<std::vector<paddle::distributed::FixedFeatureValue*>>>
-      device_dim_ptr_;
+      value_ptr_;
 #endif
-  std::vector<std::vector<FeatureValue>> device_values_;
-  std::vector<std::vector<FeatureKey>> device_keys_;
-  std::vector<std::vector<std::vector<FeatureKey>>> device_dim_keys_;
-  std::vector<std::vector<std::vector<FeatureValue>>> device_dim_values_;
-  std::vector<std::mutex*> mutex_;
-  std::vector<std::vector<std::mutex*>> dim_mutex_;
-  int multi_mf_dim_ = 0;
-
-  uint32_t shard_num_ = 37;
-  uint64_t size() {
-    uint64_t total_size = 0;
-    for (auto& keys : feature_keys_) {
-      total_size += keys.size();
-    }
-    return total_size;
-  }
-  void SetShardNum(uint32_t shard_num) { shard_num_ = shard_num; }
-  uint32_t ShardNum() { return shard_num_; }
-  void init(int shard_num, int device_num) {
-    shard_num_ = shard_num;
-    feature_keys_.resize(shard_num_);
-    value_ptr_.resize(shard_num_);
-    device_task_ptr_.resize(shard_num_);
-    device_task_keys_.resize(shard_num_);
-    for (size_t i = 0; i < device_task_ptr_.size(); i++) {
-      device_task_ptr_[i].resize(device_num);
-      device_task_keys_[i].resize(device_num);
-    }
-
-    device_values_.resize(device_num);
-    device_keys_.resize(device_num);
-    mutex_.resize(device_num);
-    for (size_t i = 0; i < mutex_.size(); ++i) {
-      mutex_[i] = new std::mutex();
-    }
-  }
+  // Deduplicated keys already in the GPU table: level 1 = device,
+  // level 2 = embedding dim, level 3 = the keys themselves.
+  std::vector<std::vector<std::vector<FeatureKey>>> device_keys_;
 
+  // Initialization: size the three-level containers and clear every bucket.
   void init(int shard_num, int device_num, int dim_num) {
-    shard_num_ = shard_num;
-    feature_keys_.resize(shard_num_);
-    feature_dim_keys_.resize(shard_num_);
-    value_ptr_.resize(shard_num_);
-    value_dim_ptr_.resize(shard_num_);
-    device_task_ptr_.resize(shard_num_);
-    device_task_keys_.resize(shard_num_);
-    for (size_t i = 0; i < device_task_ptr_.size(); i++) {
-      device_task_ptr_[i].resize(device_num);
-      device_task_keys_[i].resize(device_num);
-    }
-    for (size_t i = 0; i < feature_dim_keys_.size(); i++) {
-      feature_dim_keys_[i].resize(dim_num);
-      value_dim_ptr_[i].resize(dim_num);
-    }
-    device_values_.resize(device_num);
-    device_dim_values_.resize(device_num);
-    device_keys_.resize(device_num);
-
-    device_dim_keys_.resize(device_num);
-    device_dim_ptr_.resize(device_num);
-    mutex_.resize(device_num);
-    dim_mutex_.resize(device_num);
-    for (size_t i = 0; i < mutex_.size(); ++i) {
-      mutex_[i] = new std::mutex();
-    }
-    for (size_t i = 0; i < dim_mutex_.size(); ++i) {
-      dim_mutex_[i].resize(dim_num);
-      for (int j = 0; j < dim_num; j++) {
-        dim_mutex_[i][j] = new std::mutex();
+    feature_keys_.resize(shard_num);
+    for (auto& iter : feature_keys_) {
+      iter.resize(dim_num);
+      for (auto& iter1 : iter) {
+        iter1.clear();
       }
     }
-    multi_mf_dim_ = dim_num;
-  }
-
-  void Reset() {
-    if (!multi_mf_dim_) {
-      for (size_t i = 0; i < feature_keys_.size(); ++i) {
-        feature_keys_[i].clear();
-      }
-      for (size_t i = 0; i < value_ptr_.size(); ++i) {
-        value_ptr_[i].clear();
-      }
-      for (size_t i = 0; i < device_values_.size(); ++i) {
-        device_values_[i].clear();
-      }
-      for (size_t i = 0; i < device_keys_.size(); ++i) {
-        device_keys_[i].clear();
-      }
-      for (size_t i = 0; i < device_task_ptr_.size(); ++i) {
-        for (size_t j = 0; j < device_task_ptr_[i].size(); ++j) {
-          device_task_ptr_[i][j].clear();
-          device_task_keys_[i][j].clear();
-        }
-      }
-    } else {
-      VLOG(3) << "Reset gpu task with dynamic mf dimention";
-      for (size_t i = 0; i < feature_dim_keys_.size(); i++) {
-        for (size_t j = 0; j < feature_dim_keys_[i].size(); j++) {
-          feature_dim_keys_[i][j].clear();
-        }
-      }
-      for (size_t i = 0; i < value_dim_ptr_.size(); i++) {
-        for (size_t j = 0; j < value_dim_ptr_[i].size(); j++) {
-          value_dim_ptr_[i][j].clear();
-        }
-      }
-
-      for (size_t i = 0; i < device_dim_keys_.size(); i++) {
-        for (size_t j = 0; j < device_dim_keys_[i].size(); j++) {
-          device_dim_keys_[i][j].clear();
-        }
-      }
-      for (size_t i = 0; i < device_dim_ptr_.size(); i++) {
-        for (size_t j = 0; j < device_dim_ptr_[i].size(); j++) {
-          device_dim_ptr_[i][j].clear();
-        }
+    value_ptr_.resize(shard_num);
+    for (auto& iter : value_ptr_) {
+      iter.resize(dim_num);
+      for (auto& iter1 : iter) {
+        iter1.clear();
       }
     }
-  }
-  void batch_add_keys(
-      const std::vector<std::unordered_set<uint64_t>>& thread_keys) {
-    assert(thread_keys.size() == feature_keys_.size());
-
-    for (uint32_t i = 0; i < shard_num_; i++) {
-      int idx = 0;
-      idx = feature_keys_[i].size();
-      feature_keys_[i].resize(feature_keys_[i].size() + thread_keys[i].size());
-      std::copy(thread_keys[i].begin(), thread_keys[i].end(),
-                feature_keys_[i].begin() + idx);
+    device_keys_.resize(device_num);
+    for (auto& iter : device_keys_) {
+      iter.resize(dim_num);
+      for (auto& iter1 : iter) {
+        iter1.clear();
+      }
     }
-  }
 
-  void batch_add_keys(int shard_num,
-                      const robin_hood::unordered_set<uint64_t>& shard_keys) {
-    int idx = feature_keys_[shard_num].size();
-    feature_keys_[shard_num].resize(feature_keys_[shard_num].size() +
-                                    shard_keys.size());
-    std::copy(shard_keys.begin(), shard_keys.end(),
-              feature_keys_[shard_num].begin() + idx);
   }
-
+  // Append coarsely deduplicated keys; fine-grained dedup happens later.
   void batch_add_keys(int shard_num, int dim_id,
                       const robin_hood::unordered_set<uint64_t>& shard_keys) {
-    int idx = feature_dim_keys_[shard_num][dim_id].size();
-    feature_dim_keys_[shard_num][dim_id].resize(
-        feature_dim_keys_[shard_num][dim_id].size() + shard_keys.size());
+    int idx = feature_keys_[shard_num][dim_id].size();
+    feature_keys_[shard_num][dim_id].resize(
+        feature_keys_[shard_num][dim_id].size() + shard_keys.size());
     std::copy(shard_keys.begin(), shard_keys.end(),
-              feature_dim_keys_[shard_num][dim_id].begin() + idx);
+              feature_keys_[shard_num][dim_id].begin() + idx);
   }
-
-  void UniqueKeys() {
+  void unique_keys() {
     std::vector<std::thread> threads;
-    auto unique_func = [this](int i) {
-      auto& cur_keys = feature_keys_[i];
-      std::sort(cur_keys.begin(), cur_keys.end());
-      std::vector<FeatureKey>::iterator it;
-      it = std::unique(cur_keys.begin(), cur_keys.end());
-      cur_keys.resize(std::distance(cur_keys.begin(), it));
-    };
-    auto unique_dynamic_mf_func = [this](int i, int j) {
-      auto& cur_keys = feature_dim_keys_[i][j];
+    auto unique_func = [this](int i, int j) {
+      auto& cur_keys = feature_keys_[i][j];
       std::sort(cur_keys.begin(), cur_keys.end());
       std::vector<FeatureKey>::iterator it;
      it = std::unique(cur_keys.begin(), cur_keys.end());
      cur_keys.resize(std::distance(cur_keys.begin(), it));
    };
-    if (!multi_mf_dim_) {
-      for (uint32_t i = 0; i < shard_num_; i++) {
-        threads.push_back(std::thread(unique_func, i));
-      }
-    } else {
-      for (uint32_t i = 0; i < shard_num_; i++) {
-        for (int j = 0; j < multi_mf_dim_; j++) {
-          threads.push_back(std::thread(unique_dynamic_mf_func, i, j));
-        }
+    for (size_t i = 0; i < feature_keys_.size(); i++) {
+      for (size_t j = 0; j < feature_keys_[i].size(); j++) {
+        threads.push_back(std::thread(unique_func, i, j));
       }
-      VLOG(3) << "heter_context unique keys with dynamic mf dimention";
     }
     for (std::thread& t : threads) {
       t.join();
     }
   }
+  uint16_t pass_id_;
 };
 
+
 }  // end namespace framework
 }  // end namespace paddle
 #endif
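
After the rewrite, HeterContext keeps a single three-level layout, [shard or device][dim][entries], for keys and value pointers. A rough usage sketch follows; the BuildPassKeys function and the shard/device/dim counts are assumptions for illustration, not code from this commit:

```cpp
// Hypothetical sketch of the new HeterContext flow: size the three-level
// containers, append coarsely deduplicated keys, then dedup each bucket.
#include <cstdint>

#include "paddle/fluid/framework/fleet/heter_context.h"

void BuildPassKeys(paddle::framework::HeterContext* ctx,
                   const robin_hood::unordered_set<uint64_t>& coarse_keys) {
  const int shard_num = 37, device_num = 8, dim_num = 2;  // assumed counts
  ctx->init(shard_num, device_num, dim_num);  // resize and clear all buckets
  ctx->batch_add_keys(/*shard_num=*/0, /*dim_id=*/0, coarse_keys);
  ctx->unique_keys();  // sort + std::unique per [shard][dim] bucket, threaded
}
```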

paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt

Lines changed: 4 additions & 4 deletions
@@ -7,9 +7,9 @@ IF(WITH_GPU)
     get_property(RPC_DEPS GLOBAL PROPERTY RPC_DEPS)
     SET(HETERPS_DEPS ${HETERPS_DEPS} ${RPC_DEPS})
   endif()
-  nv_library(heter_comm SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h hashtable.h mem_pool.h DEPS ${HETERPS_DEPS})
+  nv_library(heter_comm SRCS heter_comm.h feature_value.h dy_gpu_value_inl.h feature_value_inl.h gpu_value_inl.h heter_resource.cc heter_resource.h hashtable.h mem_pool.h DEPS ${HETERPS_DEPS})
   nv_test(test_heter_comm SRCS feature_value.h DEPS heter_comm)
-  nv_library(heter_ps SRCS heter_ps.cu DEPS heter_comm)
+  nv_library(heter_ps SRCS heter_ps.cu feature_value.cu DEPS heter_comm)
   if(WITH_PSCORE)
     nv_library(graph_gpu_ps SRCS graph_gpu_ps_table.h DEPS heter_comm table)
     nv_test(test_graph_comm SRCS test_graph.cu DEPS graph_gpu_ps)
@@ -20,7 +20,7 @@ IF(WITH_GPU)
   endif()
 ENDIF()
 IF(WITH_ROCM)
-  hip_library(heter_comm SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h hashtable.h DEPS cub device_context)
+  hip_library(heter_comm SRCS heter_comm.h feature_value.h dy_gpu_value_inl.h feature_value_inl.h gpu_value_inl.h heter_resource.cc heter_resource.h hashtable.h DEPS cub device_context)
   hip_test(test_heter_comm SRCS feature_value.h DEPS heter_comm)
-  hip_library(heter_ps SRCS heter_ps.cu DEPS heter_comm)
+  hip_library(heter_ps SRCS heter_ps.cu feature_value.cu DEPS heter_comm)
 ENDIF()
