Skip to content

Commit c645624

Browse files
committed
Merge remote-tracking branch 'upstream/develop' into add_searchsorted_op
2 parents ed2173d + f05e444 commit c645624

File tree

509 files changed

+36365
-6172
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

509 files changed

+36365
-6172
lines changed

CMakeLists.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -222,6 +222,9 @@ option(WITH_MIPS "Compile PaddlePaddle with mips support" OFF)
222222
option(WITH_MUSL "Compile with musl libc instead of glibc" OFF)
223223
option(WITH_UNITY_BUILD "Compile with UnityBuild mode" OFF)
224224
option(WITH_STRIP "Strip so files of Whl packages" OFF)
225+
option(NEW_RELEASE_CUBIN "PaddlePaddle next-level release strategy for pypi cubin package" OFF)
226+
option(NEW_RELEASE_JIT "PaddlePaddle next-level release strategy for backup jit package" OFF)
227+
option(WITH_ASCEND_INT64 "Compile with int64 kernel for ascend NPU" OFF)
225228

226229
# PY_VERSION
227230
if(NOT PY_VERSION)

cmake/configure.cmake

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,10 @@ if(WITH_TESTING)
2020
add_definitions(-DPADDLE_WITH_TESTING)
2121
endif(WITH_TESTING)
2222

23+
if(WITH_INFERENCE_API_TEST)
24+
add_definitions(-DPADDLE_WITH_INFERENCE_API_TEST)
25+
endif(WITH_INFERENCE_API_TEST)
26+
2327
if(NOT WITH_PROFILER)
2428
add_definitions(-DPADDLE_DISABLE_PROFILER)
2529
endif(NOT WITH_PROFILER)
@@ -86,6 +90,10 @@ if(WITH_ASCEND_CL)
8690
add_definitions(-DPADDLE_WITH_ASCEND_CL)
8791
endif()
8892

93+
if(WITH_ASCEND_INT64)
94+
add_definitions(-DPADDLE_WITH_ASCEND_INT64)
95+
endif()
96+
8997
if(WITH_XPU)
9098
message(STATUS "Compile with XPU!")
9199
add_definitions(-DPADDLE_WITH_XPU)

cmake/cuda.cmake

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,22 @@ if(NOT WITH_GPU)
33
endif()
44

55

6-
if (WITH_NV_JETSON)
6+
if(WITH_NV_JETSON)
77
add_definitions(-DWITH_NV_JETSON)
88
set(paddle_known_gpu_archs "53 62 72")
99
set(paddle_known_gpu_archs10 "53 62 72")
10+
elseif(NEW_RELEASE_CUBIN)
11+
message("Using New Release Strategy - Cubin Package")
12+
add_definitions(-DNEW_RELEASE_CUBIN)
13+
set(paddle_known_gpu_archs "35 37 50 52 60 61 70 75 80 86")
14+
set(paddle_known_gpu_archs10 "50 60 70 75")
15+
set(paddle_known_gpu_archs11 "60 70 75 80")
16+
elseif(NEW_RELEASE_JIT)
17+
message("Using New Release Strategy - JIT Package")
18+
add_definitions(-DNEW_RELEASE_JIT)
19+
set(paddle_known_gpu_archs "35 37 50 52 60 61 70 75 80 86")
20+
set(paddle_known_gpu_archs10 "35 50 60 70 75")
21+
set(paddle_known_gpu_archs11 "35 50 60 70 75 80")
1022
else()
1123
set(paddle_known_gpu_archs "35 50 52 60 61 70 75 80")
1224
set(paddle_known_gpu_archs10 "35 50 52 60 61 70 75")
@@ -130,11 +142,17 @@ function(select_nvcc_arch_flags out_variable)
130142
set(cuda_arch_bin ${CUDA_ARCH_BIN})
131143
endif()
132144

145+
if(NEW_RELEASE_JIT)
146+
set(cuda_arch_ptx "${cuda_arch_ptx}${cuda_arch_bin}")
147+
set(cuda_arch_bin "")
148+
endif()
149+
133150
# remove dots and convert to lists
134151
string(REGEX REPLACE "\\." "" cuda_arch_bin "${cuda_arch_bin}")
135152
string(REGEX REPLACE "\\." "" cuda_arch_ptx "${CUDA_ARCH_PTX}")
136153
string(REGEX MATCHALL "[0-9()]+" cuda_arch_bin "${cuda_arch_bin}")
137154
string(REGEX MATCHALL "[0-9]+" cuda_arch_ptx "${cuda_arch_ptx}")
155+
138156
list(REMOVE_DUPLICATES cuda_arch_bin)
139157
list(REMOVE_DUPLICATES cuda_arch_ptx)
140158

cmake/external/python.cmake

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ print(s.get_config_var('LDVERSION') or s.get_config_var('VERSION'));
2727
OUTPUT_VARIABLE _PYTHON_VALUES
2828
ERROR_VARIABLE _PYTHON_ERROR_VALUE)
2929

30-
if(NOT _PYTHON_SUCCESS MATCHES 0)
30+
if(NOT _PYTHON_SUCCESS EQUAL 0)
3131
set(PYTHONLIBS_FOUND FALSE)
3232
return()
3333
endif()

cmake/external/xpu.cmake

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ ELSE ()
3535
ENDIF()
3636

3737
SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev")
38-
SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20210818")
38+
SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20210830")
3939
SET(XPU_XRE_URL "${XPU_BASE_URL}/${XPU_XRE_DIR_NAME}.tar.gz" CACHE STRING "" FORCE)
4040
SET(XPU_XDNN_URL "${XPU_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz" CACHE STRING "" FORCE)
4141
SET(XPU_XCCL_URL "${XPU_BASE_URL_WITHOUT_DATE}/20210623/${XPU_XCCL_DIR_NAME}.tar.gz" CACHE STRING "" FORCE)

cmake/generic.cmake

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -431,6 +431,8 @@ function(cc_test_run TARGET_NAME)
431431
if (APPLE)
432432
set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 20)
433433
endif()
434+
elseif(WITH_TESTING AND NOT TEST ${TARGET_NAME})
435+
add_test(NAME ${TARGET_NAME} COMMAND ${CMAKE_COMMAND} -E echo CI skip ${TARGET_NAME}.)
434436
endif()
435437
endfunction()
436438

@@ -459,6 +461,8 @@ function(cc_test TARGET_NAME)
459461
COMMAND ${TARGET_NAME}
460462
ARGS ${cc_test_ARGS})
461463
endif()
464+
elseif(WITH_TESTING AND NOT TEST ${TARGET_NAME})
465+
add_test(NAME ${TARGET_NAME} COMMAND ${CMAKE_COMMAND} -E echo CI skip ${TARGET_NAME}.)
462466
endif()
463467
endfunction(cc_test)
464468

cmake/operators.cmake

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -183,6 +183,8 @@ function(op_library TARGET)
183183
list(REMOVE_ITEM miopen_cu_cc_srcs "affine_grid_cudnn_op.cu.cc")
184184
list(REMOVE_ITEM miopen_cu_cc_srcs "grid_sampler_cudnn_op.cu.cc")
185185
list(REMOVE_ITEM hip_srcs "cholesky_op.cu")
186+
list(REMOVE_ITEM hip_srcs "matrix_rank_op.cu")
187+
list(REMOVE_ITEM hip_srcs "svd_op.cu")
186188
list(REMOVE_ITEM hip_srcs "multinomial_op.cu")
187189
list(REMOVE_ITEM hip_srcs "decode_jpeg_op.cu")
188190
hip_library(${TARGET} SRCS ${cc_srcs} ${hip_cc_srcs} ${miopen_cu_cc_srcs} ${miopen_cu_srcs} ${mkldnn_cc_srcs} ${hip_srcs} DEPS ${op_library_DEPS}

paddle/fluid/framework/CMakeLists.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -354,10 +354,10 @@ cc_library(executor_cache SRCS executor_cache.cc DEPS parallel_executor)
354354
if(WITH_PSCORE)
355355
get_property(RPC_DEPS GLOBAL PROPERTY RPC_DEPS)
356356
cc_test(dist_multi_trainer_test SRCS dist_multi_trainer_test.cc DEPS
357-
conditional_block_op executor ${RPC_DEPS})
357+
conditional_block_op executor gloo_wrapper ${RPC_DEPS})
358358
else()
359359
cc_test(dist_multi_trainer_test SRCS dist_multi_trainer_test.cc DEPS
360-
conditional_block_op executor)
360+
conditional_block_op executor gloo_wrapper)
361361
endif()
362362
cc_library(prune SRCS prune.cc DEPS framework_proto boost)
363363
cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)

paddle/fluid/framework/block_desc.cc

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -263,7 +263,27 @@ void BlockDesc::MoveFrom(BlockDesc *block) {
263263
}
264264
ops_.clear();
265265
for (const auto &src_op : block->ops_) {
266-
AppendOp()->CopyFrom(*src_op);
266+
auto *dst_op = AppendOp();
267+
dst_op->CopyFrom(*src_op);
268+
for (const auto &pair : src_op->GetAttrMap()) {
269+
const auto &attr_name = pair.first;
270+
const auto &attr_value = pair.second;
271+
auto attr_type = static_cast<proto::AttrType>(attr_value.which() - 1);
272+
if (attr_type == proto::AttrType::BLOCK) {
273+
auto block_id = BOOST_GET_CONST(BlockDesc *, attr_value)->ID();
274+
dst_op->SetBlockAttr(attr_name, prog_->MutableBlock(block_id));
275+
VLOG(10) << "Set block attr " << attr_name << " id " << block_id;
276+
} else if (attr_type == proto::AttrType::BLOCKS) {
277+
auto old_blocks = BOOST_GET_CONST(std::vector<BlockDesc *>, attr_value);
278+
std::vector<BlockDesc *> new_blocks;
279+
new_blocks.reserve(old_blocks.size());
280+
for (auto *b : old_blocks) {
281+
VLOG(10) << "Set block attr " << attr_name << " id " << b->ID();
282+
new_blocks.push_back(prog_->MutableBlock(b->ID()));
283+
}
284+
dst_op->SetBlocksAttr(attr_name, new_blocks);
285+
}
286+
}
267287
}
268288
need_update_ = true;
269289
Flush();

paddle/fluid/framework/data_feed.cc

Lines changed: 157 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -257,6 +257,11 @@ bool InMemoryDataFeed<T>::Start() {
257257
output_channel_->Write(std::move(data));
258258
}
259259
#endif
260+
if (batch_offsets_.size() > 0) {
261+
VLOG(3) << "batch_size offsets: " << batch_offsets_.size();
262+
enable_heterps_ = true;
263+
this->offset_index_ = 0;
264+
}
260265
this->finish_start_ = true;
261266
return true;
262267
}
@@ -265,34 +270,64 @@ template <typename T>
265270
int InMemoryDataFeed<T>::Next() {
266271
#ifdef _LINUX
267272
this->CheckStart();
268-
CHECK(output_channel_ != nullptr);
269-
CHECK(consume_channel_ != nullptr);
270-
VLOG(3) << "output_channel_ size=" << output_channel_->Size()
271-
<< ", consume_channel_ size=" << consume_channel_->Size()
272-
<< ", thread_id=" << thread_id_;
273-
int index = 0;
274-
T instance;
275-
std::vector<T> ins_vec;
276-
ins_vec.reserve(this->default_batch_size_);
277-
while (index < this->default_batch_size_) {
278-
if (output_channel_->Size() == 0) {
279-
break;
273+
if (!enable_heterps_) {
274+
CHECK(output_channel_ != nullptr);
275+
CHECK(consume_channel_ != nullptr);
276+
VLOG(3) << "output_channel_ size=" << output_channel_->Size()
277+
<< ", consume_channel_ size=" << consume_channel_->Size()
278+
<< ", thread_id=" << thread_id_;
279+
int index = 0;
280+
T instance;
281+
std::vector<T> ins_vec;
282+
ins_vec.reserve(this->default_batch_size_);
283+
while (index < this->default_batch_size_) {
284+
if (output_channel_->Size() == 0) {
285+
break;
286+
}
287+
output_channel_->Get(instance);
288+
ins_vec.push_back(instance);
289+
++index;
290+
consume_channel_->Put(std::move(instance));
291+
}
292+
this->batch_size_ = index;
293+
VLOG(3) << "batch_size_=" << this->batch_size_
294+
<< ", thread_id=" << thread_id_;
295+
if (this->batch_size_ != 0) {
296+
PutToFeedVec(ins_vec);
297+
} else {
298+
VLOG(3) << "finish reading, output_channel_ size="
299+
<< output_channel_->Size()
300+
<< ", consume_channel_ size=" << consume_channel_->Size()
301+
<< ", thread_id=" << thread_id_;
280302
}
281-
output_channel_->Get(instance);
282-
ins_vec.push_back(instance);
283-
++index;
284-
consume_channel_->Put(std::move(instance));
285-
}
286-
this->batch_size_ = index;
287-
VLOG(3) << "batch_size_=" << this->batch_size_
288-
<< ", thread_id=" << thread_id_;
289-
if (this->batch_size_ != 0) {
290-
PutToFeedVec(ins_vec);
291303
} else {
292-
VLOG(3) << "finish reading, output_channel_ size="
293-
<< output_channel_->Size()
294-
<< ", consume_channel_ size=" << consume_channel_->Size()
304+
VLOG(3) << "enable heter NEXT: " << offset_index_
305+
<< " batch_offsets: " << batch_offsets_.size();
306+
if (offset_index_ >= batch_offsets_.size()) {
307+
VLOG(3) << "offset_index: " << offset_index_
308+
<< " batch_offsets: " << batch_offsets_.size();
309+
return 0;
310+
}
311+
auto& batch = batch_offsets_[offset_index_++];
312+
this->batch_size_ = batch.second;
313+
VLOG(3) << "batch_size_=" << this->batch_size_
295314
<< ", thread_id=" << thread_id_;
315+
if (this->batch_size_ != 0) {
316+
PutToFeedVec(&records_[batch.first], this->batch_size_);
317+
} else {
318+
VLOG(3) << "finish reading for heterps, batch size zero, thread_id="
319+
<< thread_id_;
320+
}
321+
/*
322+
if (offset_index_ == batch_offsets_.size() - 1) {
323+
std::vector<Record> data;
324+
output_channel_->ReadAll(data);
325+
consume_channel_->Write(std::move(data));
326+
}
327+
*/
328+
VLOG(3) << "#15 enable heter NEXT: " << offset_index_
329+
<< " batch_offsets: " << batch_offsets_.size()
330+
<< " batch_size: " << this->batch_size_;
296331
}
297332
return this->batch_size_;
298333
#else
@@ -1141,6 +1176,103 @@ bool MultiSlotInMemoryDataFeed::ParseOneInstance(Record* instance) {
11411176
return false;
11421177
}
11431178

1179+
void MultiSlotInMemoryDataFeed::PutToFeedVec(const Record* ins_vec, int num) {
1180+
#ifdef _LINUX
1181+
for (size_t i = 0; i < batch_float_feasigns_.size(); ++i) {
1182+
batch_float_feasigns_[i].clear();
1183+
batch_uint64_feasigns_[i].clear();
1184+
offset_[i].clear();
1185+
offset_[i].push_back(0);
1186+
}
1187+
ins_content_vec_.clear();
1188+
ins_content_vec_.reserve(num);
1189+
ins_id_vec_.clear();
1190+
ins_id_vec_.reserve(num);
1191+
for (int i = 0; i < num; ++i) {
1192+
auto& r = ins_vec[i];
1193+
ins_id_vec_.push_back(r.ins_id_);
1194+
ins_content_vec_.push_back(r.content_);
1195+
for (auto& item : r.float_feasigns_) {
1196+
batch_float_feasigns_[item.slot()].push_back(item.sign().float_feasign_);
1197+
visit_[item.slot()] = true;
1198+
}
1199+
for (auto& item : r.uint64_feasigns_) {
1200+
batch_uint64_feasigns_[item.slot()].push_back(
1201+
item.sign().uint64_feasign_);
1202+
visit_[item.slot()] = true;
1203+
}
1204+
for (size_t j = 0; j < use_slots_.size(); ++j) {
1205+
const auto& type = all_slots_type_[j];
1206+
if (visit_[j]) {
1207+
visit_[j] = false;
1208+
} else {
1209+
// fill slot value with default value 0
1210+
if (type[0] == 'f') { // float
1211+
batch_float_feasigns_[j].push_back(0.0);
1212+
} else if (type[0] == 'u') { // uint64
1213+
batch_uint64_feasigns_[j].push_back(0);
1214+
}
1215+
}
1216+
// get offset of this ins in this slot
1217+
if (type[0] == 'f') { // float
1218+
offset_[j].push_back(batch_float_feasigns_[j].size());
1219+
} else if (type[0] == 'u') { // uint64
1220+
offset_[j].push_back(batch_uint64_feasigns_[j].size());
1221+
}
1222+
}
1223+
}
1224+
1225+
for (size_t i = 0; i < use_slots_.size(); ++i) {
1226+
if (feed_vec_[i] == nullptr) {
1227+
continue;
1228+
}
1229+
int total_instance = offset_[i].back();
1230+
const auto& type = all_slots_type_[i];
1231+
if (type[0] == 'f') { // float
1232+
float* feasign = batch_float_feasigns_[i].data();
1233+
float* tensor_ptr =
1234+
feed_vec_[i]->mutable_data<float>({total_instance, 1}, this->place_);
1235+
CopyToFeedTensor(tensor_ptr, feasign, total_instance * sizeof(float));
1236+
} else if (type[0] == 'u') { // uint64
1237+
// no uint64_t type in paddlepaddle
1238+
uint64_t* feasign = batch_uint64_feasigns_[i].data();
1239+
int64_t* tensor_ptr = feed_vec_[i]->mutable_data<int64_t>(
1240+
{total_instance, 1}, this->place_);
1241+
CopyToFeedTensor(tensor_ptr, feasign, total_instance * sizeof(int64_t));
1242+
}
1243+
auto& slot_offset = offset_[i];
1244+
if (this->input_type_ == 0) {
1245+
LoD data_lod{slot_offset};
1246+
feed_vec_[i]->set_lod(data_lod);
1247+
} else if (this->input_type_ == 1) {
1248+
if (!use_slots_is_dense_[i]) {
1249+
std::vector<size_t> tmp_offset;
1250+
PADDLE_ENFORCE_EQ(slot_offset.size(), 2,
1251+
platform::errors::InvalidArgument(
1252+
"In batch reader, the sparse tensor lod size "
1253+
"must be 2, but received %d.",
1254+
slot_offset.size()));
1255+
const auto& max_size = slot_offset[1];
1256+
tmp_offset.reserve(max_size + 1);
1257+
for (unsigned int k = 0; k <= max_size; k++) {
1258+
tmp_offset.emplace_back(k);
1259+
}
1260+
slot_offset = tmp_offset;
1261+
LoD data_lod{slot_offset};
1262+
feed_vec_[i]->set_lod(data_lod);
1263+
}
1264+
}
1265+
if (use_slots_is_dense_[i]) {
1266+
if (inductive_shape_index_[i] != -1) {
1267+
use_slots_shape_[i][inductive_shape_index_[i]] =
1268+
total_instance / total_dims_without_inductive_[i];
1269+
}
1270+
feed_vec_[i]->Resize(framework::make_ddim(use_slots_shape_[i]));
1271+
}
1272+
}
1273+
#endif
1274+
}
1275+
11441276
void MultiSlotInMemoryDataFeed::PutToFeedVec(
11451277
const std::vector<Record>& ins_vec) {
11461278
#ifdef _LINUX

0 commit comments

Comments
 (0)