diff --git a/CMakeLists.txt b/CMakeLists.txt index f30671bd3a87e8..28dc39920c6d07 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -20,6 +20,13 @@ set(PADDLE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}) include(system) +# Note(zhouwei): Ninja Generator will set CMAKE_BUILD_TYPE to Debug +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE "Release" CACHE STRING + "Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel" + FORCE) +endif() + project(paddle CXX C) # enable language CUDA @@ -213,12 +220,6 @@ if(NOT PY_VERSION) endif() set(PYBIND11_PYTHON_VERSION ${PY_VERSION}) -# CMAKE_BUILD_TYPE -if(NOT CMAKE_BUILD_TYPE) - set(CMAKE_BUILD_TYPE "Release" CACHE STRING - "Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel" - FORCE) -endif() # the type of sanitizer, options are: Address, Leak, Memory, Thread, Undefined. Default: OFF if(SANITIZER_TYPE AND NOT "${SANITIZER_TYPE}" MATCHES "^(Address|Leak|Memory|Thread|Undefined)$") diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index 7f2addb02d36dd..033b40622e2594 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -205,23 +205,16 @@ set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-extended-lambda") if(WIN32) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler \"/wd4244 /wd4267 /wd4819 \"") set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler /bigobj") - if(CMAKE_BUILD_TYPE STREQUAL "Debug") - # match the cl's _ITERATOR_DEBUG_LEVEL - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler \"-g -G -D_DEBUG\"") - if(MSVC_STATIC_CRT) - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler /MTd") - else() - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler /MDd") - endif() - elseif(CMAKE_BUILD_TYPE STREQUAL "Release") - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler \"-DNDEBUG\"") - if(MSVC_STATIC_CRT) - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler /MT") - else() - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler /MD") - endif() - else() - message(FATAL "Windows only support Release or Debug build now. 
Please set visual studio build type to Release/Debug, x64 build.") + if(MSVC_STATIC_CRT) + set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -Xcompiler /MTd") + set(CMAKE_CUDA_FLAGS_RELEASE "${CMAKE_CUDA_FLAGS_RELEASE} -Xcompiler /MT") + foreach(flag_var + CMAKE_CUDA_FLAGS CMAKE_CUDA_FLAGS_DEBUG CMAKE_CUDA_FLAGS_RELEASE + CMAKE_CUDA_FLAGS_MINSIZEREL CMAKE_CUDA_FLAGS_RELWITHDEBINFO) + if(${flag_var} MATCHES "-MD") + string(REGEX REPLACE "-MD" "-MT" ${flag_var} "${${flag_var}}") + endif() + endforeach(flag_var) endif() endif() diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake index 4e0768fc10f03f..c37e28523f43c5 100644 --- a/cmake/external/mkldnn.cmake +++ b/cmake/external/mkldnn.cmake @@ -60,8 +60,8 @@ ExternalProject_Add( DEPENDS ${MKLDNN_DEPENDS} PREFIX ${MKLDNN_PREFIX_DIR} SOURCE_DIR ${MKLDNN_SOURCE_DIR} - BUILD_ALWAYS 1 - # UPDATE_COMMAND "" + UPDATE_COMMAND "" + #BUILD_ALWAYS 1 CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake index c591a9391dfa5d..b0ea338d20525d 100644 --- a/cmake/external/warpctc.cmake +++ b/cmake/external/warpctc.cmake @@ -100,9 +100,9 @@ else() "${WARPCTC_DOWNLOAD_CMD}" PREFIX ${WARPCTC_PREFIX_DIR} SOURCE_DIR ${WARPCTC_SOURCE_DIR} - #UPDATE_COMMAND "" + UPDATE_COMMAND "" PATCH_COMMAND "" - BUILD_ALWAYS 1 + #BUILD_ALWAYS 1 CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -DCMAKE_C_FLAGS=${WARPCTC_C_FLAGS} diff --git a/cmake/flags.cmake b/cmake/flags.cmake index a2ddad557c2956..94fd29b905009b 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -28,7 +28,12 @@ function(CheckCompilerCXX14Flag) endfunction() CheckCompilerCXX14Flag() -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14") +if(NOT WIN32) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14") +else() + set(CMAKE_CXX_STANDARD 14) +endif() + # safe_set_flag # # Set a compile flag only if compiler is support diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 53dcde616b261d..cea65f17fbe836 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -92,7 +92,7 @@ include_directories(${CMAKE_CURRENT_BINARY_DIR}) # including io directory for inference lib paddle_api.h include_directories("${PADDLE_SOURCE_DIR}/paddle/fluid/framework/io") -if(NOT APPLE) +if(NOT APPLE AND NOT WIN32) find_package(Threads REQUIRED) link_libraries(${CMAKE_THREAD_LIBS_INIT}) if(WITH_PSLIB OR WITH_DISTRIBUTE) @@ -100,7 +100,7 @@ if(NOT APPLE) else() set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -pthread -ldl -lrt") endif() -endif(NOT APPLE) +endif() set_property(GLOBAL PROPERTY FLUID_MODULES "") # find all fluid modules is used for paddle fluid static library diff --git a/cmake/init.cmake b/cmake/init.cmake index 4bdcaeb4c5f3c0..0ebcdc8ceeebca 100644 --- a/cmake/init.cmake +++ b/cmake/init.cmake @@ -17,16 +17,30 @@ if(NOT WIN32) set(CMAKE_CXX_FLAGS_RELEASE "-O3 -DNDEBUG") set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O2 -g -DNDEBUG") set(CMAKE_CXX_FLAGS_MINSIZEREL "-Os -DNDEBUG") + + if(WITH_GPU) + set(CMAKE_CUDA_FLAGS_DEBUG "-g") + set(CMAKE_CUDA_FLAGS_RELEASE "-O3 -DNDEBUG") + set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO "-O2 -g -DNDEBUG") + set(CMAKE_CUDA_FLAGS_MINSIZEREL "-O1 -DNDEBUG") + endif() else() - set(CMAKE_C_FLAGS_DEBUG "/Zi /DEBUG") - set(CMAKE_C_FLAGS_RELEASE "/O2 /DNDEBUG") - set(CMAKE_C_FLAGS_RELWITHDEBINFO "/O2 /DNDEBUG") - set(CMAKE_C_FLAGS_MINSIZEREL "/Os /DNDEBUG") + 
set(CMAKE_C_FLAGS_DEBUG "/MDd /Zi /Ob0 /Od /RTC1") + set(CMAKE_C_FLAGS_RELEASE "/MD /O2 /Ob2 /DNDEBUG") + set(CMAKE_C_FLAGS_RELWITHDEBINFO "/MD /Zi /O2 /Ob1 /DNDEBUG") + set(CMAKE_C_FLAGS_MINSIZEREL "/MD /O1 /Ob1 /DNDEBUG") - set(CMAKE_CXX_FLAGS_DEBUG "/Zi /DEBUG") - set(CMAKE_CXX_FLAGS_RELEASE "/O2 /DNDEBUG") - set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "/O2 /DNDEBUG") - set(CMAKE_CXX_FLAGS_MINSIZEREL "/Os /DNDEBUG") + set(CMAKE_CXX_FLAGS_DEBUG "/MDd /Zi /Ob0 /Od /RTC1") + set(CMAKE_CXX_FLAGS_RELEASE "/MD /O2 /Ob2 /DNDEBUG") + set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "/MD /Zi /O2 /Ob1 /DNDEBUG") + set(CMAKE_CXX_FLAGS_MINSIZEREL "/MD /O1 /Ob1 /DNDEBUG") + + if(WITH_GPU) + set(CMAKE_CUDA_FLAGS_DEBUG "-Xcompiler=\"-MDd -Zi -Ob0 -Od /RTC1\"") + set(CMAKE_CUDA_FLAGS_RELEASE "-Xcompiler=\"-MD -O2 -Ob2\" -DNDEBUG") + set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO "-Xcompiler=\"-MD -Zi -O2 -Ob1\" -DNDEBUG") + set(CMAKE_CUDA_FLAGS_MINSIZEREL "-Xcompiler=\"-MD -O1 -Ob1\" -DNDEBUG") + endif() # It can specify CUDA compile flag manualy, # its use is to remvoe /Zi to reduce GPU static library size. But it's dangerous @@ -34,10 +48,3 @@ else() # Now, it's only used in VS2015 + CUDA:[10.0, 10.2] set(WIN_PROPS ${CMAKE_SOURCE_DIR}/cmake/paddle_win.props) endif() - -if(WITH_GPU) - set(CMAKE_CUDA_FLAGS_DEBUG "-g") - set(CMAKE_CUDA_FLAGS_RELEASE "-O3 -DNDEBUG") - set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO "-O2 -g -DNDEBUG") - set(CMAKE_CUDA_FLAGS_MINSIZEREL "-O1 -DNDEBUG") -endif() diff --git a/paddle/fluid/distributed/common/sparse_sharding_merge.h b/paddle/fluid/distributed/common/sparse_sharding_merge.h new file mode 100644 index 00000000000000..3f84b5c4b212e2 --- /dev/null +++ b/paddle/fluid/distributed/common/sparse_sharding_merge.h @@ -0,0 +1,311 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#pragma once
+#include <ThreadPool.h>
+
+#include <fstream>
+#include <iostream>
+#include <memory>
+#include <thread>  // NOLINT
+#include <vector>
+
+#include <sys/time.h>
+#include "boost/lexical_cast.hpp"
+#include "glog/logging.h"
+#include "paddle/fluid/distributed/common/utils.h"
+#include "paddle/fluid/framework/blocking_queue.h"
+#include "paddle/fluid/framework/dim.h"
+#include "paddle/fluid/framework/framework.pb.h"
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/framework/tensor_util.h"
+#include "paddle/fluid/string/split.h"
+
+constexpr int FG = 256 * 1024 * 1024;
+constexpr int Q_SIZE = 10000;
+constexpr int BUCKET = 10;
+constexpr char XEOF[] = "EOF";
+
+using boost::lexical_cast;
+
+inline double GetCurrentUS() {
+  struct timeval time;
+  gettimeofday(&time, NULL);
+  return 1e+6 * time.tv_sec + time.tv_usec;
+}
+
+namespace paddle {
+namespace distributed {
+
+class ShardingMerge {
+ public:
+  ShardingMerge() {}
+  ~ShardingMerge() {}
+
+  void Merge(const std::vector<std::string> &inputs,
+             const std::vector<int64_t> &feasigns, const std::string &output,
+             const int embedding_dim) {
+    pool_.reset(new ::ThreadPool(inputs.size()));
+
+    std::vector<std::future<int>> tasks(inputs.size());
+    std::vector<std::vector<int64_t>> rows;
+    rows.resize(inputs.size());
+
+    auto begin = GetCurrentUS();
+    for (int x = 0; x < inputs.size(); ++x) {
+      tasks[x] = pool_->enqueue([this, x, &rows, &inputs, &feasigns]() -> int {
+        DeserializeRowsFromFile(inputs[x], feasigns[x], &rows[x]);
+        return 0;
+      });
+    }
+
+    for (size_t x = 0; x < tasks.size(); ++x) {
+      tasks[x].wait();
+    }
+
+    int64_t total_rows = 0;
+    for (auto x = 0; x < rows.size(); x++) {
+      total_rows += rows[x].size();
+    }
+
+    auto end = GetCurrentUS();
+
+    VLOG(0) << "got " << total_rows
+            << " feasign ids from sparse embedding using " << end - begin;
+
+    std::vector<int64_t> total_dims = {total_rows,
+                                       static_cast<int64_t>(embedding_dim)};
+
+    std::vector<std::vector<int64_t>> batch_buckets;
+    batch_buckets.resize(inputs.size());
+
+    for (int x = 0; x < rows.size(); ++x) {
+      batch_buckets[x] = bucket(rows[x].size(), BUCKET);
+    }
+
+    std::ofstream out(output, std::ios::binary);
+
+    begin = GetCurrentUS();
+    SerializeRowsToStream(out, rows, batch_buckets, total_rows);
+    end = GetCurrentUS();
+    VLOG(0) << "write rows to ostream using " << end - begin;
+
+    begin = GetCurrentUS();
+    SerializePreTensorToStream(out, total_dims);
+    end = GetCurrentUS();
+    VLOG(0) << "write pretensor to ostream using " << end - begin;
+
+    begin = GetCurrentUS();
+    SerializeValueToStream(out, inputs, batch_buckets, embedding_dim);
+    end = GetCurrentUS();
+    VLOG(0) << "write values to ostream using " << end - begin;
+  }
+
+ private:
+  void SerializeRowsToStream(std::ostream &os,
+                             const std::vector<std::vector<int64_t>> &rows,
+                             const std::vector<std::vector<int64_t>> &batch_buckets,
+                             int64_t total_rows) {
+    { // the 1st field, uint32_t version
+      constexpr uint32_t version = 0;
+      os.write(reinterpret_cast<const char *>(&version), sizeof(version));
+    }
+
+    {
+      // the 2nd field, rows information
+      os.write(reinterpret_cast<const char *>(&total_rows),
+               sizeof(total_rows));
+
+      for (int b = 0; b < BUCKET; ++b) {
+        for (int x = 0; x < batch_buckets.size(); ++x) {
+          auto begin = batch_buckets[x][b];
+          auto end = batch_buckets[x][b + 1];
+
+          if (end - begin == 0) continue;
+
+          os.write(reinterpret_cast<const char *>(rows[x].data() + begin),
+                   sizeof(int64_t) * (end - begin));
+        }
+      }
+
+      // the 3rd field, the height of SelectedRows
+      int64_t height = total_rows;
+      os.write(reinterpret_cast<const char *>(&height), sizeof(height));
+    }
+  }
+
+  void SerializePreTensorToStream(std::ostream &os,
+                                  const std::vector<int64_t> &dims) {
+    { // the 1st field, uint32_t version
+      constexpr uint32_t version = 0;
+      
os.write(reinterpret_cast(&version), sizeof(version)); + } + { // the 2nd field, tensor description + // int32_t size + framework::proto::VarType::TensorDesc desc; + desc.set_data_type(framework::proto::VarType::FP32); + auto *pb_dims = desc.mutable_dims(); + pb_dims->Resize(static_cast(dims.size()), 0); + std::copy(dims.begin(), dims.end(), pb_dims->begin()); + int32_t size = desc.ByteSize(); + os.write(reinterpret_cast(&size), sizeof(size)); + auto out = desc.SerializeAsString(); + os.write(out.data(), size); + } + } + + void SerializeValueToVec(std::ifstream &in, const int batch, + const int embedding_dim, std::vector *out) { + auto queue = + std::make_shared>>(); + + auto read = [batch, &in, &queue]() { + std::string line; + std::vector columns; + std::vector values_str; + + int count = 0; + + while (std::getline(in, line)) { + ++count; + columns = string::Split(line, '\t'); + + if (columns.size() != 5) { + VLOG(0) << "unexpected line: " << line << ", skip it"; + continue; + } + + values_str = string::Split(columns[4], ','); + queue->Push(values_str); + + if (count >= batch) { + break; + } + } + queue->Push({}); + }; + + auto write = [embedding_dim, &out, &queue]() { + std::vector values_str; + std::string line; + + while (true) { + queue->Pop(&values_str); + + if (values_str.size() == 0) { + break; + } + + for (int x = 0; x < embedding_dim; ++x) { + float v = 0.0; + try { + v = lexical_cast(values_str[x]); + } catch (boost::bad_lexical_cast &e) { + VLOG(0) << " get unexpected line: " << line; + } + out->push_back(v); + } + } + }; + + std::thread p_read(read); + std::thread p_write(write); + p_read.join(); + p_write.join(); + } + + void SerializeVecToStream(std::ostream &out, + const std::vector &value) { + out.write(reinterpret_cast(value.data()), + static_cast(sizeof(float) * value.size())); + } + + void SerializeValueToStream( + std::ostream &out, const std::vector &ins, + const std::vector> &batch_buckets, + const int embedding_dim) { + std::vector> in_streams; + + for (int x = 0; x < ins.size(); ++x) { + in_streams.emplace_back(std::make_shared(ins[x])); + } + + std::vector> tasks(ins.size()); + + for (int b = 0; b < BUCKET; ++b) { + std::vector> values; + values.resize(tasks.size()); + + auto begin = GetCurrentUS(); + + for (int x = 0; x < tasks.size(); ++x) { + auto batch = batch_buckets[x][b + 1] - batch_buckets[x][b]; + values[x].clear(); + values[x].reserve(batch * embedding_dim); + } + + for (int x = 0; x < tasks.size(); ++x) { + tasks[x] = + pool_->enqueue([this, b, x, &out, &in_streams, &batch_buckets, + &values, embedding_dim]() -> int { + auto batch = batch_buckets[x][b + 1] - batch_buckets[x][b]; + if (batch == 0) return 0; + SerializeValueToVec(*(in_streams[x].get()), batch, embedding_dim, + &values[x]); + return 0; + }); + } + + for (size_t x = 0; x < tasks.size(); ++x) { + tasks[x].wait(); + } + + auto end = GetCurrentUS(); + + auto begin1 = GetCurrentUS(); + for (size_t x = 0; x < tasks.size(); ++x) { + SerializeVecToStream(out, values[x]); + } + auto end1 = GetCurrentUS(); + + VLOG(0) << "serialize buckets " << b << " read using " << end - begin + << ", to oostream using " << end1 - begin1; + } + } + + void DeserializeRowsFromFile(const std::string &input_file, + const int64_t feasigns, + std::vector *rows) { + std::string line; + std::vector columns; + std::ifstream file(input_file); + + rows->reserve(feasigns); + + while (std::getline(file, line)) { + columns = string::Split(line, '\t'); + if (columns.size() != 5) { + VLOG(0) << "unexpected line: " << line << ", 
skip it"; + continue; + } + rows->push_back(std::stoull(columns[0])); + } + + VLOG(0) << "parse " << rows->size() << " embedding rows from " + << input_file; + } + + private: + std::unique_ptr<::ThreadPool> pool_; +}; +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/common/utils.h b/paddle/fluid/distributed/common/utils.h index f81f84b1e11751..2305001ad6f8f9 100644 --- a/paddle/fluid/distributed/common/utils.h +++ b/paddle/fluid/distributed/common/utils.h @@ -14,6 +14,8 @@ #pragma once +#include + #include #include #include @@ -83,5 +85,11 @@ std::string to_string(const std::vector& vec) { } return ss.str(); } + +inline double GetCurrentUS() { + struct timeval time; + gettimeofday(&time, NULL); + return 1e+6 * time.tv_sec + time.tv_usec; } -} +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/table/common_sparse_table.cc b/paddle/fluid/distributed/table/common_sparse_table.cc index a4f672c2963a84..b667aec186f9e3 100644 --- a/paddle/fluid/distributed/table/common_sparse_table.cc +++ b/paddle/fluid/distributed/table/common_sparse_table.cc @@ -134,10 +134,23 @@ void ProcessALine(const std::vector& columns, const Meta& meta, } } -int64_t SaveToText(std::ostream* os, std::shared_ptr block, - const int mode) { - int64_t save_num = 0; +void SaveMetaToText(std::ostream* os, const CommonAccessorParameter& common, + const size_t shard_idx, const int64_t total) { + // save meta + std::stringstream stream; + stream << "param=" << common.table_name() << "\n"; + stream << "shard_id=" << shard_idx << "\n"; + stream << "row_names=" << paddle::string::join_strings(common.params(), ',') + << "\n"; + stream << "row_dims=" << paddle::string::join_strings(common.dims(), ',') + << "\n"; + stream << "count=" << total << "\n"; + os->write(stream.str().c_str(), sizeof(char) * stream.str().size()); +} +int64_t SaveValueToText(std::ostream* os, std::shared_ptr block, + std::shared_ptr<::ThreadPool> pool, const int mode) { + int64_t save_num = 0; for (auto& table : block->values_) { for (auto& value : table) { if (mode == SaveMode::delta && !value.second->need_save_) { @@ -334,16 +347,24 @@ int32_t CommonSparseTable::set_global_lr(float* lr) { int32_t CommonSparseTable::load(const std::string& path, const std::string& param) { + auto begin = GetCurrentUS(); rwlock_->WRLock(); - VLOG(3) << "sparse table load with " << path << " with meta " << param; LoadFromText(path, param, _shard_idx, _shard_num, task_pool_size_, &shard_values_); rwlock_->UNLock(); + auto end = GetCurrentUS(); + + auto varname = _config.common().table_name(); + VLOG(0) << "load " << varname << " with value: " << path + << " , meta: " << param + << " using: " << std::to_string((end - begin) / 1e+6) << " seconds"; + return 0; } int32_t CommonSparseTable::save(const std::string& dirname, const std::string& param) { + auto begin = GetCurrentUS(); rwlock_->WRLock(); int mode = std::stoi(param); VLOG(3) << "sparse table save: " << dirname << " mode: " << mode; @@ -356,36 +377,33 @@ int32_t CommonSparseTable::save(const std::string& dirname, VLOG(3) << "save " << varname << " in dir: " << var_store << " begin"; std::vector params(_config.common().params().begin(), _config.common().params().end()); + std::string shard_var_pre = string::Sprintf("%s.block%d", varname, _shard_idx); std::string value_ = string::Sprintf("%s/%s.txt", var_store, shard_var_pre); - std::unique_ptr value_out(new std::ofstream(value_)); + std::unique_ptr vs(new std::ofstream(value_)); int64_t total_ins = 0; 
for (int shard_id = 0; shard_id < task_pool_size_; ++shard_id) { // save values - total_ins += SaveToText(value_out.get(), shard_values_[shard_id], mode); + auto shard_save_num = SaveValueToText(vs.get(), shard_values_[shard_id], + _shards_task_pool[shard_id], mode); + total_ins += shard_save_num; } - value_out->close(); + vs->close(); - // save meta - std::stringstream stream; - stream << "param=" << _config.common().table_name() << "\n"; - stream << "shard_id=" << _shard_idx << "\n"; - stream << "row_names=" - << paddle::string::join_strings(_config.common().params(), ',') - << "\n"; - stream << "row_dims=" - << paddle::string::join_strings(_config.common().dims(), ',') << "\n"; - stream << "count=" << total_ins << "\n"; std::string meta_ = string::Sprintf("%s/%s.meta", var_store, shard_var_pre); - std::unique_ptr meta_out(new std::ofstream(meta_)); - meta_out->write(stream.str().c_str(), sizeof(char) * stream.str().size()); - meta_out->close(); - VLOG(3) << "save " << varname << " in dir: " << var_store << " done"; + std::unique_ptr ms(new std::ofstream(meta_)); + SaveMetaToText(ms.get(), _config.common(), _shard_idx, total_ins); + ms->close(); + + auto end = GetCurrentUS(); rwlock_->UNLock(); + VLOG(0) << "save " << varname << " with path: " << value_ + << " using: " << std::to_string((end - begin) / 1e+6) << " seconds"; + return 0; } @@ -403,8 +421,6 @@ std::pair CommonSparseTable::print_table_stat() { } int32_t CommonSparseTable::pour() { - rwlock_->RDLock(); - std::vector values; std::vector keys; @@ -421,14 +437,11 @@ int32_t CommonSparseTable::pour() { _push_sparse(keys.data(), values.data(), pull_reservoir_.size()); pull_reservoir_.clear(); - rwlock_->UNLock(); return 0; } int32_t CommonSparseTable::pull_sparse(float* pull_values, const PullSparseValue& pull_value) { - rwlock_->RDLock(); - auto shard_num = task_pool_size_; std::vector> tasks(shard_num); @@ -464,7 +477,6 @@ int32_t CommonSparseTable::pull_sparse(float* pull_values, for (size_t shard_id = 0; shard_id < tasks.size(); ++shard_id) { tasks[shard_id].wait(); } - rwlock_->UNLock(); return 0; } @@ -507,7 +519,6 @@ int32_t CommonSparseTable::pull_sparse_ptr(char** pull_values, int32_t CommonSparseTable::_push_sparse(const uint64_t* keys, const float* values, size_t num) { - rwlock_->RDLock(); std::vector> offset_bucket; offset_bucket.resize(task_pool_size_); @@ -531,7 +542,6 @@ int32_t CommonSparseTable::_push_sparse(const uint64_t* keys, for (size_t shard_id = 0; shard_id < tasks.size(); ++shard_id) { tasks[shard_id].wait(); } - rwlock_->UNLock(); return 0; } @@ -569,7 +579,6 @@ int32_t CommonSparseTable::push_sparse(const uint64_t* keys, int32_t CommonSparseTable::_push_sparse(const uint64_t* keys, const float** values, size_t num) { - rwlock_->RDLock(); std::vector> offset_bucket; offset_bucket.resize(task_pool_size_); @@ -596,14 +605,11 @@ int32_t CommonSparseTable::_push_sparse(const uint64_t* keys, for (size_t shard_id = 0; shard_id < tasks.size(); ++shard_id) { tasks[shard_id].wait(); } - rwlock_->UNLock(); return 0; } int32_t CommonSparseTable::push_sparse_param(const uint64_t* keys, const float* values, size_t num) { - rwlock_->RDLock(); - std::vector> offset_bucket; offset_bucket.resize(task_pool_size_); @@ -635,14 +641,12 @@ int32_t CommonSparseTable::push_sparse_param(const uint64_t* keys, for (size_t shard_id = 0; shard_id < tasks.size(); ++shard_id) { tasks[shard_id].wait(); } - rwlock_->UNLock(); return 0; } int32_t CommonSparseTable::flush() { return 0; } int32_t CommonSparseTable::shrink(const std::string& 
param) { - rwlock_->WRLock(); int threshold = std::stoi(param); VLOG(3) << "sparse table shrink: " << threshold; @@ -651,7 +655,6 @@ int32_t CommonSparseTable::shrink(const std::string& param) { VLOG(4) << shard_id << " " << task_pool_size_ << " begin shrink"; shard_values_[shard_id]->Shrink(threshold); } - rwlock_->UNLock(); return 0; } diff --git a/paddle/fluid/distributed/table/table.h b/paddle/fluid/distributed/table/table.h index 81a1ff5eced2bb..55fc92c9b57859 100644 --- a/paddle/fluid/distributed/table/table.h +++ b/paddle/fluid/distributed/table/table.h @@ -36,7 +36,7 @@ class Table { Table() {} virtual ~Table() {} virtual int32_t initialize(const TableParameter &config, - const FsClientParameter &fs_config) final; + const FsClientParameter &fs_config); virtual int32_t pull_dense(float *values, size_t num) = 0; virtual int32_t push_dense(const float *values, size_t num) = 0; @@ -58,7 +58,9 @@ class Table { virtual int32_t push_sparse(const uint64_t *keys, const float *values, size_t num) = 0; virtual int32_t push_sparse(const uint64_t *keys, const float **values, - size_t num){}; + size_t num) { + return 0; + } virtual int32_t push_sparse_param(const uint64_t *keys, const float *values, size_t num) { return 0; @@ -108,7 +110,7 @@ class Table { virtual int32_t save(const std::string &path, const std::string &converter) = 0; - virtual int32_t set_shard(size_t shard_idx, size_t shard_num) final { + virtual int32_t set_shard(size_t shard_idx, size_t shard_num) { _shard_idx = shard_idx; _shard_num = shard_num; return initialize_shard(); @@ -123,7 +125,7 @@ class Table { protected: virtual int32_t initialize() = 0; - virtual int32_t initialize_accessor() final; + virtual int32_t initialize_accessor(); virtual int32_t initialize_shard() = 0; virtual std::string table_dir(const std::string &model_dir) { return paddle::string::format_string("%s/%03d/", model_dir.c_str(), diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index db2f9c9fc5fc55..8d1ae4926a8012 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -27,6 +27,7 @@ add_subdirectory(fleet) add_subdirectory(io) #ddim lib proto_library(framework_proto SRCS framework.proto) +proto_library(op_def_proto SRCS op_def.proto) proto_library(heter_service_proto SRCS heter_service.proto) proto_library(data_feed_proto SRCS data_feed.proto) proto_library(trainer_desc_proto SRCS trainer_desc.proto DEPS framework_proto diff --git a/paddle/fluid/framework/device_worker.h b/paddle/fluid/framework/device_worker.h index 84369011476c77..db83cd55889c43 100644 --- a/paddle/fluid/framework/device_worker.h +++ b/paddle/fluid/framework/device_worker.h @@ -195,6 +195,9 @@ class DeviceWorker { virtual void SetReaderPlace(const paddle::platform::Place& place) { device_reader_->SetPlace(place); } + virtual void SetDeviceContext(platform::DeviceContext* dev_ctx) { + dev_ctx_ = dev_ctx; + } virtual Scope* GetThreadScope() { return thread_scope_; } DataFeed* device_reader_ = nullptr; @@ -221,6 +224,7 @@ class DeviceWorker { int dump_mode_ = 0; int dump_interval_ = 10000; ChannelWriter writer_; + platform::DeviceContext* dev_ctx_ = nullptr; }; class CPUWorkerBase : public DeviceWorker { @@ -266,9 +270,6 @@ class HogwildWorker : public CPUWorkerBase { HogwildWorkerParameter param_; std::vector skip_ops_; std::map stat_var_name_map_; -#ifdef PADDLE_WITH_HETERPS - platform::DeviceContext* dev_ctx_ = nullptr; -#endif }; class DownpourWorker : public HogwildWorker { @@ -622,7 +623,6 @@ 
class PSGPUWorker : public HogwildWorker { gpuStream_t copy_stream_; int batch_cnt_{0}; std::atomic done_cnt_{0}; - platform::DeviceContext* dev_ctx_ = nullptr; double total_time_; double read_time_; diff --git a/paddle/fluid/framework/hogwild_worker.cc b/paddle/fluid/framework/hogwild_worker.cc index b2d170888e28fc..0c66622ed7b9a6 100644 --- a/paddle/fluid/framework/hogwild_worker.cc +++ b/paddle/fluid/framework/hogwild_worker.cc @@ -39,9 +39,6 @@ void HogwildWorker::Initialize(const TrainerDesc &desc) { for (int i = 0; i < param_.stat_var_names_size(); ++i) { stat_var_name_map_[param_.stat_var_names(i)] = 1; } -#ifdef PADDLE_WITH_HETERPS - dev_ctx_ = platform::DeviceContextPool::Instance().Get(place_); -#endif } void HogwildWorker::CreateThreadOperators(const ProgramDesc &program) { diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 01536fd36ff83b..7e7f1fed5ad58d 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -52,7 +52,7 @@ cc_library(graph_pattern_detector SRCS graph_pattern_detector.cc DEPS ${GRAPH_PA cc_library(op_compat_sensible_pass SRCS op_compat_sensible_pass.cc DEPS graph_pattern_detector) cc_library(subgraph_detector SRCS subgraph_detector.cc DEPS graph_pattern_detector executor) -cc_library(fuse_pass_base SRCS fuse_pass_base.cc DEPS pass) +cc_library(fuse_pass_base SRCS fuse_pass_base.cc DEPS op_compat_sensible_pass) cc_library(placement_pass_base SRCS placement_pass_base.cc DEPS pass) cc_library(coalesce_grad_tensor_pass SRCS coalesce_grad_tensor_pass.cc DEPS graph graph_helper) diff --git a/paddle/fluid/framework/ir/fuse_pass_base.h b/paddle/fluid/framework/ir/fuse_pass_base.h index ce7635bb35ce61..bc5fc2a16d3939 100644 --- a/paddle/fluid/framework/ir/fuse_pass_base.h +++ b/paddle/fluid/framework/ir/fuse_pass_base.h @@ -17,7 +17,7 @@ #include #include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/framework/ir/op_compat_sensible_pass.h" #include "paddle/fluid/framework/scope.h" namespace paddle { @@ -46,7 +46,7 @@ enum FuseOptions { FUSE_MKLDNN // fusing will be done with MKL-DNN }; -class FusePassBase : public Pass { +class FusePassBase : public OpCompatSensiblePass { public: void Init(const std::string& repr, Graph* graph) const; Scope* param_scope() const; diff --git a/paddle/fluid/framework/ir/op_compat_sensible_pass.cc b/paddle/fluid/framework/ir/op_compat_sensible_pass.cc index f7312ca5555311..b056c3b07a2f65 100644 --- a/paddle/fluid/framework/ir/op_compat_sensible_pass.cc +++ b/paddle/fluid/framework/ir/op_compat_sensible_pass.cc @@ -15,7 +15,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/ir/op_compat_sensible_pass.h" - +#include "paddle/fluid/framework/op_info.h" namespace paddle { namespace framework { namespace ir { @@ -51,11 +51,33 @@ AttrCompat& AttrCompat::IsIntIn(const std::set& candidates) { } //! Todo: append the definition. 
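The definition appended in the next hunk turns IsLeftDefault() into a real check: an attribute passes only if its op is registered and the attribute equals the registered default. A gtest-style sketch of that contract, mirroring the compatOpAttribute test added further down (the bare-bones "fc" registration is the same trick that test uses, not production setup):

#include "gtest/gtest.h"
#include "paddle/fluid/framework/ir/op_compat_sensible_pass.h"
#include "paddle/fluid/framework/op_info.h"

namespace paddle {
namespace framework {
namespace ir {

TEST(AttrCompat, IsLeftDefaultSketch) {
  // Register a minimal "fc" OpInfo whose checker records 1 as the
  // default of in_num_col_dims.
  OpInfo info;
  info.checker_ = new OpAttrChecker();
  info.checker_->AddAttrChecker<int>("in_num_col_dims").SetDefault(1);
  OpInfoMap::Instance().Insert("fc", info);

  OpCompat compat("fc");
  compat.AddAttr("in_num_col_dims").IsLeftDefault().End();

  OpDesc fc_op;
  fc_op.SetAttr("in_num_col_dims", 1);  // left at the registered default
  EXPECT_TRUE(compat.Judge(fc_op));

  fc_op.SetAttr("in_num_col_dims", 2);  // overridden, so rejected
  EXPECT_FALSE(compat.Judge(fc_op));
  delete info.checker_;
}

}  // namespace ir
}  // namespace framework
}  // namespace paddle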
-AttrCompat& AttrCompat::IsLeftDefault() { return *this; } +AttrCompat& AttrCompat::IsLeftDefault() { + const std::string& op_name = op_compat_->Name(); + if (!OpInfoMap::Instance().Has(op_name)) { + VLOG(3) << "Op (" << op_name << ") is not registered!"; + conditions_.emplace_back([](const Attribute& attr) { return false; }); + return *this; + } + const OpInfo& op_info = OpInfoMap::Instance().Get(op_name); + const AttributeMap attrs = op_info.Checker()->GetAttrsDefaultValuesMap(); + if (attrs.find(attr_name_) == attrs.end()) { + VLOG(3) << "Op (" << op_name << ") has no default attr:" << attr_name_; + conditions_.emplace_back([](const Attribute& attr) { return false; }); + } else { + Attribute default_attr = attrs.at(attr_name_); + conditions_.emplace_back([default_attr](const Attribute& attr) -> bool { + return attr == default_attr; + }); + } + return *this; +} bool AttrCompat::operator()(const OpDesc& op_desc) { + if (conditions_.empty()) { + return true; + } if (!op_desc.HasAttr(attr_name_)) { - return false; + return optional_; } const Attribute attr = op_desc.GetAttr(attr_name_); for (auto& func : conditions_) { @@ -65,6 +87,10 @@ bool AttrCompat::operator()(const OpDesc& op_desc) { } return true; } +AttrCompat& AttrCompat::IsOptional() { + optional_ = true; + return *this; +} AttrCompat& AttrCompat::IsBoolEQ(bool v) { conditions_.emplace_back([v](const Attribute& attr) -> bool { @@ -98,8 +124,12 @@ bool InputOrOutputCompat::operator()( } AttrCompat& OpCompat::AddAttr(const std::string& attr_name) { - attr_compats_.emplace_back(attr_name, this); - return attr_compats_.back(); + PADDLE_ENFORCE_EQ( + attr_compats_.find(attr_name), attr_compats_.end(), + platform::errors::InvalidArgument( + "The attrubute compat with the same name has been added")); + attr_compats_.emplace(attr_name, AttrCompat(attr_name, this)); + return attr_compats_.at(attr_name); } InputOrOutputCompat& OpCompat::AddInput(const std::string& name) { @@ -119,8 +149,19 @@ InputOrOutputCompat& OpCompat::AddOutput(const std::string& name) { } bool OpCompat::Judge(const OpDesc& op_desc) { + for (auto& attr_map : op_desc.GetAttrMap()) { + if (attr_compats_.find(attr_map.first) == attr_compats_.end()) { + if (!AttrCompat(attr_map.first, this).IsLeftDefault()(op_desc)) { + VLOG(3) << "The Attr(" << attr_map.first << ") of Op (" << op_name_ + << ") not reigistered in OpCompat, not equal to default value!"; + return false; + } + } + } for (auto& attr_compat : attr_compats_) { - if (!attr_compat(op_desc)) { + if (!attr_compat.second(op_desc)) { + VLOG(3) << " Check the Attr(" << attr_compat.first << ") of Op(" + << op_name_ << ") failed!"; return false; } } @@ -129,6 +170,8 @@ bool OpCompat::Judge(const OpDesc& op_desc) { for (auto& input_desc : inputs_map) { if (input_compats_.find(input_desc.first) == input_compats_.end()) { if (!input_desc.second.empty()) { + VLOG(3) << "The Input (" << input_desc.first << ") of Operator (" + << op_name_ << ") not reigistered in OpCompat!"; return false; } } @@ -136,10 +179,14 @@ bool OpCompat::Judge(const OpDesc& op_desc) { for (auto& input_val : input_compats_) { if (inputs_map.find(input_val.first) == inputs_map.end()) { if (!input_val.second.Optional()) { + VLOG(3) << "The No optional Input (" << input_val.first + << ") of Operator (" << op_name_ << ") not find in op_desc!"; return false; } } else { if (!input_val.second(inputs_map.at(input_val.first))) { + VLOG(3) << "The Input (" << input_val.first << ") of Operator (" + << op_name_ << ") compat check failed!"; return false; } } @@ 
-149,6 +196,8 @@ bool OpCompat::Judge(const OpDesc& op_desc) { for (auto& output_desc : outputs_map) { if (output_compats_.find(output_desc.first) == output_compats_.end()) { if (!output_desc.second.empty()) { + VLOG(3) << "The Output (" << output_desc.first << ") of Operator (" + << op_name_ << ") not reigistered in OpCompat!"; return false; } } @@ -156,10 +205,14 @@ bool OpCompat::Judge(const OpDesc& op_desc) { for (auto& output_val : output_compats_) { if (outputs_map.find(output_val.first) == outputs_map.end()) { if (!output_val.second.Optional()) { + VLOG(3) << "The No optional Output (" << output_val.first + << ") of Operator (" << op_name_ << ") not find in op_desc!"; return false; } } else { if (!output_val.second(outputs_map.at(output_val.first))) { + VLOG(3) << "The Output (" << output_val.first << ") of Operator (" + << op_name_ << ") compat check failed!"; return false; } } diff --git a/paddle/fluid/framework/ir/op_compat_sensible_pass.h b/paddle/fluid/framework/ir/op_compat_sensible_pass.h index 6c0860549fbfee..3f2ea673d879b8 100644 --- a/paddle/fluid/framework/ir/op_compat_sensible_pass.h +++ b/paddle/fluid/framework/ir/op_compat_sensible_pass.h @@ -29,7 +29,7 @@ class OpCompat; class AttrCompat { public: AttrCompat(const std::string& attr_name, OpCompat* op_compat) - : attr_name_(attr_name), op_compat_(op_compat) {} + : optional_(false), attr_name_(attr_name), op_compat_(op_compat) {} // @{ String-related methods //! Assert the attribute is an string in the `candidates` domain. @@ -70,12 +70,15 @@ class AttrCompat { //! Tell whether this attribute is left as default value. AttrCompat& IsLeftDefault(); + AttrCompat& IsOptional(); + //! Jump back to retrieve OpCompat instance. OpCompat& End() { return *op_compat_; } bool operator()(const OpDesc& op_desc); private: + bool optional_; std::string attr_name_; OpCompat* op_compat_; std::vector> conditions_; @@ -134,7 +137,7 @@ class OpCompat { private: std::string op_name_; - std::vector attr_compats_; + std::unordered_map attr_compats_; std::unordered_map input_compats_; std::unordered_map output_compats_; }; @@ -179,15 +182,6 @@ class OpCompat { * }; */ class OpCompatSensiblePass : public Pass { - public: - //! Access the subgraph and pattern. - void AccessSubgraph(const GraphPatternDetector::subgraph_t& subgraph, - Graph* g) { - if (IsCompat(subgraph, g)) { - AccessSubgraphImpl(subgraph, g); - } - } - protected: /** * Developer should push the compatibility `teller` for each kind of Op in the @@ -197,12 +191,6 @@ class OpCompatSensiblePass : public Pass { */ OpCompat& AddOpCompat(OpCompat&& op_compat); - //! Modify the subgraph. - virtual bool AccessSubgraphImpl( - const GraphPatternDetector::subgraph_t& subgraph, Graph* g) const { - return true; - } - //! Tell the Op compability of a subgraph. bool IsCompat(const GraphPatternDetector::subgraph_t& subgraph, Graph* g) const { @@ -212,7 +200,7 @@ class OpCompatSensiblePass : public Pass { // Check the all the ops in the subgraph are contained in the // op_compat. 
for (auto& node_pair : subgraph) { - if (!node_pair.first->IsOp()) continue; + if (!node_pair.second->IsOp()) continue; auto op_type = node_pair.second->Op()->Type(); if (!op_compat_judgers_.count(op_type)) { return false; diff --git a/paddle/fluid/framework/ir/op_compat_sensible_pass_tester.cc b/paddle/fluid/framework/ir/op_compat_sensible_pass_tester.cc index 3d0863a6d12d95..0878e4d9890d35 100644 --- a/paddle/fluid/framework/ir/op_compat_sensible_pass_tester.cc +++ b/paddle/fluid/framework/ir/op_compat_sensible_pass_tester.cc @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/ir/op_compat_sensible_pass.h" - #include "gtest/gtest.h" +#include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/program_desc.h" namespace paddle { @@ -23,7 +23,7 @@ namespace ir { TEST(OpCompatSensiblePass, compatOp) { auto lambda = [](const std::string& str) { return str == "tanh"; }; - OpCompat compat("FC"); + OpCompat compat("fc"); compat.AddAttr("in_num_col_dims") .IsIntIn({1, 2}) .IsNumLE(1) @@ -67,10 +67,75 @@ TEST(OpCompatSensiblePass, compatOp) { fc_op.SetInput("Bias", std::vector{"test_input_1"}); fc_op.SetOutput("Out", std::vector{"test_output"}); - EXPECT_STREQ(compat.Name().c_str(), "FC"); + EXPECT_STREQ(compat.Name().c_str(), "fc"); + EXPECT_FALSE(compat.Judge(fc_op)); +} + +TEST(OpCompatSensiblePass, compatOpAttribute) { + OpCompat compat("fc"); + + OpDesc fc_op; + + std::unordered_map attr_map; + attr_map["in_num_col_dims"] = 1; + fc_op.SetAttrMap(attr_map); + + OpInfo info; + info.checker_ = new OpAttrChecker(); + OpInfoMap::Instance().Insert("fc", info); + + EXPECT_FALSE(compat.Judge(fc_op)); + + info.checker_->AddAttrChecker("in_num_col_dims").SetDefault(1); + + EXPECT_TRUE(compat.Judge(fc_op)); + delete info.checker_; +} + +TEST(OpCompatSensiblePass, compatOpAttributeOptional) { + OpCompat compat("fc"); + compat.AddAttr("activation_type") + .IsOptional() + .IsStringIn({"tanh", "sigmoid"}); + OpDesc fc_op; EXPECT_TRUE(compat.Judge(fc_op)); } +TEST(OpCompatSensiblePass, compatOpInput) { + OpCompat compat("fc"); + + OpDesc fc_op; + fc_op.SetInput("Input", std::vector{"test_input"}); + + EXPECT_FALSE(compat.Judge(fc_op)); + + compat.AddInput("Input").IsTensor().End().AddInput("Bias").IsTensor().End(); + EXPECT_FALSE(compat.Judge(fc_op)); + + fc_op.SetInput("Bias", std::vector{"test_input", ""}); + EXPECT_FALSE(compat.Judge(fc_op)); +} + +TEST(OpCompatSensiblePass, compatOutput) { + OpCompat compat("fc"); + + OpDesc fc_op; + fc_op.SetOutput("Output", std::vector{"test_output"}); + + EXPECT_FALSE(compat.Judge(fc_op)); + + compat.AddOutput("Output") + .IsTensor() + .End() + .AddOutput("Output_2") + .IsTensor() + .End(); + EXPECT_FALSE(compat.Judge(fc_op)); + + fc_op.SetOutput("Output_2", std::vector{"test_output", ""}); + EXPECT_FALSE(compat.Judge(fc_op)); +} + class OpCompatSensiblePassTest : public OpCompatSensiblePass { public: OpCompatSensiblePassTest(); @@ -78,7 +143,7 @@ class OpCompatSensiblePassTest : public OpCompatSensiblePass { }; OpCompatSensiblePassTest::OpCompatSensiblePassTest() { - AddOpCompat(OpCompat("FC")) + AddOpCompat(OpCompat("fc")) .AddAttr("in_num_col_dims") .IsNumLE(1) .End() @@ -102,7 +167,7 @@ OpCompatSensiblePassTest::OpCompatSensiblePassTest() { TEST(OpCompatSensiblePass, IsCompat) { OpCompatSensiblePassTest test; OpDesc fc_op; - fc_op.SetType("FC"); + fc_op.SetType("fc"); std::unordered_map attr_map; attr_map["in_num_col_dims"] = 1; 
attr_map["activation_type"] = std::string("tanh"); @@ -114,18 +179,6 @@ TEST(OpCompatSensiblePass, IsCompat) { fc_op.SetOutput("Out", std::vector{"test_output"}); EXPECT_TRUE(test.TestIsCompat(fc_op)); - - ProgramDesc prog; - std::unique_ptr g(new Graph(prog)); - Node* o1 = g->CreateOpNode(&fc_op); - - GraphPatternDetector detector; - PDNode* op2 = - detector.mutable_pattern()->NewNode([](Node* x) { return true; }); - GraphPatternDetector::subgraph_t subgraph; - subgraph[op2] = o1; - - test.AccessSubgraph(subgraph, g.get()); } } // namespace ir diff --git a/paddle/fluid/framework/multi_trainer.cc b/paddle/fluid/framework/multi_trainer.cc index 7afa76c3fbd23a..c0ccc196348a57 100644 --- a/paddle/fluid/framework/multi_trainer.cc +++ b/paddle/fluid/framework/multi_trainer.cc @@ -112,6 +112,8 @@ void MultiTrainer::InitTrainerEnv(const ProgramDesc& main_program, #ifdef PADDLE_WITH_HETERPS workers_[i]->SetPlace(places_[i]); workers_[i]->SetReaderPlace(places_[i]); + workers_[i]->SetDeviceContext( + platform::DeviceContextPool::Instance().Get(places_[i])); #else workers_[i]->SetPlace(place); workers_[i]->SetReaderPlace(place); diff --git a/paddle/fluid/framework/op_def.proto b/paddle/fluid/framework/op_def.proto new file mode 100644 index 00000000000000..7c4b42b1344b8b --- /dev/null +++ b/paddle/fluid/framework/op_def.proto @@ -0,0 +1,43 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +syntax = "proto2"; + +import "framework.proto"; +package paddle.framework.proto; + +message OpDef { + + message VarDef { + required string name = 1; + + // For the type of input / output variables. 
+ reserved 2; + } + + message AttrDef { + required string name = 1; + required AttrType type = 2; + } + + message Desc { + repeated VarDef inputs = 1; + repeated VarDef outputs = 2; + repeated AttrDef attrs = 3; + } + + required string type = 1; + required Desc def = 2; + optional Desc extra = 3; +} diff --git a/paddle/fluid/inference/tensorrt/convert/activation_op.cc b/paddle/fluid/inference/tensorrt/convert/activation_op.cc index 9244b9af0bbd6c..e6a0ecf4aececc 100644 --- a/paddle/fluid/inference/tensorrt/convert/activation_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/activation_op.cc @@ -52,11 +52,6 @@ class ActivationOpConverter : public OpConverter { engine_->GetITensor(op_desc.Input("X")[0]); auto op_pair = ops.find(op_type_); - if (op_pair == ops.end()) { - PADDLE_THROW(platform::errors::Fatal( - "Wrong activation op type, the trt do not support the %s act type.", - op_type_)); - } nvinfer1::IActivationLayer* layer = TRT_ENGINE_ADD_LAYER( engine_, Activation, *const_cast(input_tensor), diff --git a/paddle/fluid/inference/tensorrt/convert/affine_channel_op.cc b/paddle/fluid/inference/tensorrt/convert/affine_channel_op.cc index 813342c08483b7..eba67c3c098ca6 100644 --- a/paddle/fluid/inference/tensorrt/convert/affine_channel_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/affine_channel_op.cc @@ -55,16 +55,6 @@ class AffineChannelOpConverter : public OpConverter { auto* bias_t = bias_v->GetMutable(); float* bias_ptr = engine_->GetWeightCPUData(bias_name, bias_t, false); - auto data_layout = framework::StringToDataLayout( - BOOST_GET_CONST(std::string, op_desc.GetAttr("data_layout"))); - - PADDLE_ENFORCE_EQ( - data_layout, framework::DataLayout::kNCHW, - platform::errors::InvalidArgument( - "TensorRT affine channel converter can only convert NCHW format. " - "Other format should be run in fluid mode. 
Report a bug on github " - "issue if you see this line.")); - // tensorrt scalend layer only support spatial dims >= 2, // so nhwc is not availabe (spatial dims == 0) const int channel_axis = engine_->with_dynamic_shape(); diff --git a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc index 47f5cc97d39cdf..df2400854414c3 100644 --- a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc @@ -25,10 +25,6 @@ static bool CheckDims(const nvinfer1::Dims& dims_x, return false; } for (int i = 0; i < dims_x.nbDims; i++) { - // conservative judgment - if (dims_x.d[i] == -1 || dims_y.d[i] == -1) { - return false; - } if (dims_x.d[i] != dims_y.d[i]) { return false; } diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 54fc9492b7193e..9df3ec0445ad1c 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -143,6 +143,19 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, BOOST_GET_CONST(std::vector, desc.GetAttr("paddings")); if (paddings.size() > 2) return false; +// strides > 1 is only supported by trt7.0 above +#if !IS_TRT_VERSION_GE(7000) + if (desc.HasAttr("strides")) { + const std::vector strides = + BOOST_GET_CONST(std::vector, desc.GetAttr("strides")); + // there is no issue if strides.size() less than 2 + if (strides.size() > 1) { + for (size_t i = 0; i < strides.size(); i++) { + if (strides[i] > 1) return false; + } + } + } +#endif } if (op_type == "pool2d") { @@ -225,6 +238,20 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, << desc.Output("Output").size() << " output."; return false; } + +// strides > 1 is only supported by trt7.0 above +#if !IS_TRT_VERSION_GE(7000) + if (desc.HasAttr("strides")) { + const std::vector strides = + BOOST_GET_CONST(std::vector, desc.GetAttr("strides")); + // there is no issue if strides.size() less than 2 + if (strides.size() > 1) { + for (size_t i = 0; i < strides.size(); i++) { + if (strides[i] > 1) return false; + } + } + } +#endif } if (op_type == "matmul") { diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 60479f806f3667..a5f075b8dc68c2 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -176,7 +176,7 @@ if(NOT APPLE AND WITH_MKLML) inference_analysis_api_test(test_analyzer_seq_pool1_fuse_compare_zero_copy ${SEQ_POOL1_INSTALL_DIR} analyzer_seq_pool1_fuse_compare_zero_copy_tester.cc) inference_analysis_api_test(test_analyzer_seq_pool1_fuse_statis ${SEQ_POOL1_INSTALL_DIR} analyzer_seq_pool1_fuse_statis_tester.cc) inference_analysis_api_test(test_analyzer_seq_pool1_profile ${SEQ_POOL1_INSTALL_DIR} analyzer_seq_pool1_profile_tester.cc) - if(NOT WIN32) + if(NOT WIN32 AND NOT "$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON") set_tests_properties(test_analyzer_seq_pool1_compare_determine PROPERTIES TIMEOUT 120) set_tests_properties(test_analyzer_seq_pool1 PROPERTIES TIMEOUT 120) set_tests_properties(test_analyzer_seq_pool1_fuse_compare_zero_copy PROPERTIES TIMEOUT 120) diff --git a/paddle/fluid/operators/cast_op.cc b/paddle/fluid/operators/cast_op.cc index 7252ed72b20836..952e9ca329f102 100644 --- a/paddle/fluid/operators/cast_op.cc +++ b/paddle/fluid/operators/cast_op.cc @@ -27,6 +27,9 @@ class CastOpProtoMaker : public 
framework::OpProtoAndCheckerMaker { AddOutput("Out", "The output tensor of cast op"); AddAttr("out_dtype", "output data type"); AddAttr("in_dtype", "input data type"); + AddAttr("use_mkldnn", + "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); AddComment(R"DOC( Cast Operator. @@ -50,6 +53,7 @@ class CastOpGradMaker : public framework::SingleGradOpMaker { grad->SetOutput("Out", this->InputGrad("X")); grad->SetAttr("out_dtype", this->GetAttr("in_dtype")); grad->SetAttr("in_dtype", this->GetAttr("out_dtype")); + grad->SetAttr("use_mkldnn", this->GetAttr("use_mkldnn")); } }; @@ -77,6 +81,28 @@ class CastOp : public framework::OperatorWithKernel { if (platform::is_cuda_pinned_place(tensor_place)) { return framework::OpKernelType(tensor->type(), ctx.device_context()); } + +#ifdef PADDLE_WITH_MKLDNN + int in_dtype = ctx.Attr("in_dtype"); + int out_dtype = ctx.Attr("out_dtype"); + + auto MKLDNNSupportsCast = [&]() -> bool { + int dtype_fp32 = static_cast(framework::proto::VarType::FP32); + int dtype_bf16 = static_cast(framework::proto::VarType::BF16); + + if ((in_dtype != dtype_fp32 && in_dtype != dtype_bf16) || + (out_dtype != dtype_fp32 && out_dtype != dtype_bf16)) + return false; + + return true; + }; + + if (this->CanMKLDNNBeUsed(ctx, tensor->type()) && MKLDNNSupportsCast()) { + return framework::OpKernelType(tensor->type(), ctx.GetPlace(), + framework::DataLayout::kMKLDNN, + framework::LibraryType::kMKLDNN); + } +#endif return framework::OpKernelType(tensor->type(), tensor_place); } }; diff --git a/paddle/fluid/operators/compat/while.pbtxt b/paddle/fluid/operators/compat/while.pbtxt new file mode 100644 index 00000000000000..34435e1d9e5ff3 --- /dev/null +++ b/paddle/fluid/operators/compat/while.pbtxt @@ -0,0 +1,49 @@ +type: "while" +def { + inputs { + name: "X" + } + inputs { + name: "Condition" + } + outputs { + name: "Out" + } + outputs { + name: "StepScopes" + } + attrs { + name: "sub_block" + type: BLOCK + } +} +extra { + attrs { + name: "is_test" + type: BOOLEAN + } + attrs { + name: "skip_eager_deletion_vars" + type: STRINGS + } + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cc b/paddle/fluid/operators/elementwise/elementwise_add_op.cc index b551629169deed..67e2e3a1e96772 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cc @@ -20,8 +20,8 @@ limitations under the License. 
*/ namespace paddle { namespace platform { -struct complex128; -struct complex64; +template +struct complex; } // namespace platform } // namespace paddle @@ -135,9 +135,9 @@ REGISTER_OP_CPU_KERNEL( ops::ElementwiseAddKernel, ops::ElementwiseAddKernel, ops::ElementwiseAddKernel, + paddle::platform::complex>, ops::ElementwiseAddKernel); + paddle::platform::complex>); REGISTER_OP_CPU_KERNEL( elementwise_add_grad, ops::ElementwiseAddGradKernel, @@ -145,9 +145,9 @@ REGISTER_OP_CPU_KERNEL( ops::ElementwiseAddGradKernel, ops::ElementwiseAddGradKernel, ops::ElementwiseAddGradKernel, + paddle::platform::complex>, ops::ElementwiseAddGradKernel); + paddle::platform::complex>); REGISTER_OP_CPU_KERNEL( elementwise_add_grad_grad, ops::ElementwiseAddDoubleGradKernel, ops::ElementwiseAddDoubleGradKernel, + paddle::platform::complex>, ops::ElementwiseAddDoubleGradKernel); + paddle::platform::complex>); // A specialization elementwise_add operator, used in gradient accumulation with // inplace addto. @@ -178,9 +178,9 @@ REGISTER_OP_CPU_KERNEL( ops::ElementwiseAddKernel, ops::ElementwiseAddKernel, ops::ElementwiseAddKernel, + paddle::platform::complex>, ops::ElementwiseAddKernel); + paddle::platform::complex>); REGISTER_OP_VERSION(elementwise_add) .AddCheckpoint( diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cu b/paddle/fluid/operators/elementwise/elementwise_add_op.cu index a4b97301a2611b..37e5fa5a206577 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cu @@ -13,8 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_add_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" -#include "paddle/fluid/platform/complex128.h" -#include "paddle/fluid/platform/complex64.h" +#include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/float16.h" namespace ops = paddle::operators; @@ -141,8 +140,8 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseAddKernel, ops::ElementwiseAddKernel, ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel); + ops::ElementwiseAddKernel>, + ops::ElementwiseAddKernel>); REGISTER_OP_CUDA_KERNEL( elementwise_add_grad, ops::ElementwiseAddGradKernel, @@ -150,8 +149,10 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseAddGradKernel, ops::ElementwiseAddGradKernel, ops::ElementwiseAddGradKernel, - ops::ElementwiseAddGradKernel, - ops::ElementwiseAddGradKernel); + ops::ElementwiseAddGradKernel>, + ops::ElementwiseAddGradKernel>); REGISTER_OP_CUDA_KERNEL( elementwise_add_grad_grad, ops::ElementwiseAddDoubleGradKernel, @@ -160,9 +161,9 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseAddDoubleGradKernel, ops::ElementwiseAddDoubleGradKernel, ops::ElementwiseAddDoubleGradKernel, + plat::complex>, ops::ElementwiseAddDoubleGradKernel); + plat::complex>); REGISTER_OP_CUDA_KERNEL( grad_add, ops::ElementwiseAddKernel, @@ -170,5 +171,5 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseAddKernel, ops::ElementwiseAddKernel, ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel); + ops::ElementwiseAddKernel>, + ops::ElementwiseAddKernel>); diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.cc b/paddle/fluid/operators/elementwise/elementwise_div_op.cc index 0252e6dfff5d75..9a899ec11b4c17 100644 --- a/paddle/fluid/operators/elementwise/elementwise_div_op.cc +++ 
b/paddle/fluid/operators/elementwise/elementwise_div_op.cc @@ -17,8 +17,7 @@ limitations under the License. */ #include #include "paddle/fluid/operators/elementwise/elementwise_op.h" -#include "paddle/fluid/platform/complex128.h" -#include "paddle/fluid/platform/complex64.h" +#include "paddle/fluid/platform/complex.h" namespace paddle { namespace operators { @@ -135,9 +134,9 @@ REGISTER_OP_CPU_KERNEL( ops::ElementwiseDivKernel, ops::ElementwiseDivKernel, ops::ElementwiseDivKernel, + paddle::platform::complex>, ops::ElementwiseDivKernel); + paddle::platform::complex>); REGISTER_OP_CPU_KERNEL( elementwise_div_grad, ops::ElementwiseDivGradKernel, @@ -145,9 +144,9 @@ REGISTER_OP_CPU_KERNEL( ops::ElementwiseDivGradKernel, ops::ElementwiseDivGradKernel, ops::ElementwiseDivGradKernel, + paddle::platform::complex>, ops::ElementwiseDivGradKernel); + paddle::platform::complex>); REGISTER_OP_CPU_KERNEL( elementwise_div_grad_grad, @@ -160,9 +159,9 @@ REGISTER_OP_CPU_KERNEL( ops::ElementwiseDivDoubleGradKernel, ops::ElementwiseDivDoubleGradKernel, + paddle::platform::complex>, ops::ElementwiseDivDoubleGradKernel); + paddle::platform::complex>); REGISTER_OP_VERSION(elementwise_div) .AddCheckpoint( diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.cu b/paddle/fluid/operators/elementwise/elementwise_div_op.cu index 0cf9294c9de67f..b10ed57af901f0 100644 --- a/paddle/fluid/operators/elementwise/elementwise_div_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_div_op.cu @@ -14,8 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_div_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.cu.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" -#include "paddle/fluid/platform/complex128.h" -#include "paddle/fluid/platform/complex64.h" +#include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/float16.h" namespace ops = paddle::operators; @@ -76,18 +75,21 @@ static __global__ void SimpleElemwiseDivGradCUDAKernel(const T* x, const T* y, } template <> -__global__ void SimpleElemwiseDivGradCUDAKernel( - const paddle::platform::complex64* x, const paddle::platform::complex64* y, - const paddle::platform::complex64* out, - const paddle::platform::complex64* dout, int64_t size, - paddle::platform::complex64* dx, paddle::platform::complex64* dy) { +__global__ void +SimpleElemwiseDivGradCUDAKernel>( + const paddle::platform::complex* x, + const paddle::platform::complex* y, + const paddle::platform::complex* out, + const paddle::platform::complex* dout, int64_t size, + paddle::platform::complex* dx, + paddle::platform::complex* dy) { int col = blockIdx.x * blockDim.x + threadIdx.x; while (col < size) { - paddle::platform::complex64 o = dout[col]; - paddle::platform::complex64 y_conj(y[col].real, -y[col].imag); - paddle::platform::complex64 out_div_y_conj((out[col] / y[col]).real, - -(out[col] / y[col]).imag); + paddle::platform::complex o = dout[col]; + paddle::platform::complex y_conj(y[col].real, -y[col].imag); + paddle::platform::complex out_div_y_conj((out[col] / y[col]).real, + -(out[col] / y[col]).imag); dx[col] = o / y_conj; dy[col] = -o * out_div_y_conj; col += blockDim.x * gridDim.x; @@ -95,19 +97,21 @@ __global__ void SimpleElemwiseDivGradCUDAKernel( } template <> -__global__ void SimpleElemwiseDivGradCUDAKernel( - const paddle::platform::complex128* x, - const paddle::platform::complex128* y, - const paddle::platform::complex128* out, - const paddle::platform::complex128* 
dout, int64_t size, - paddle::platform::complex128* dx, paddle::platform::complex128* dy) { +__global__ void +SimpleElemwiseDivGradCUDAKernel>( + const paddle::platform::complex* x, + const paddle::platform::complex* y, + const paddle::platform::complex* out, + const paddle::platform::complex* dout, int64_t size, + paddle::platform::complex* dx, + paddle::platform::complex* dy) { int col = blockIdx.x * blockDim.x + threadIdx.x; while (col < size) { - paddle::platform::complex128 o = dout[col]; - paddle::platform::complex128 y_conj(y[col].real, -y[col].imag); - paddle::platform::complex128 out_div_y_conj((out[col] / y[col]).real, - -(out[col] / y[col]).imag); + paddle::platform::complex o = dout[col]; + paddle::platform::complex y_conj(y[col].real, -y[col].imag); + paddle::platform::complex out_div_y_conj((out[col] / y[col]).real, + -(out[col] / y[col]).imag); dx[col] = o / y_conj; dy[col] = -o * out_div_y_conj; col += blockDim.x * gridDim.x; @@ -145,9 +149,9 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseDivKernel, ops::ElementwiseDivKernel, ops::ElementwiseDivKernel, + paddle::platform::complex>, ops::ElementwiseDivKernel); + paddle::platform::complex>); REGISTER_OP_CUDA_KERNEL( elementwise_div_grad, ops::ElementwiseDivGradKernel, @@ -157,9 +161,9 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseDivGradKernel, ops::ElementwiseDivGradKernel, ops::ElementwiseDivGradKernel, + paddle::platform::complex>, ops::ElementwiseDivGradKernel); + paddle::platform::complex>); REGISTER_OP_CUDA_KERNEL( elementwise_div_grad_grad, ops::ElementwiseDivDoubleGradKernel, ops::ElementwiseDivDoubleGradKernel, + paddle::platform::complex>, ops::ElementwiseDivDoubleGradKernel); + paddle::platform::complex>); diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.h b/paddle/fluid/operators/elementwise/elementwise_div_op.h index 0be8d934b17af7..a0b9633acb2e59 100644 --- a/paddle/fluid/operators/elementwise/elementwise_div_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_div_op.h @@ -74,23 +74,13 @@ struct DivGradDX { HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout / y; } }; -template <> -struct DivGradDX { - HOSTDEVICE paddle::platform::complex64 operator()( - paddle::platform::complex64 x, paddle::platform::complex64 y, - paddle::platform::complex64 out, paddle::platform::complex64 dout) const { - paddle::platform::complex64 y_conj(y.real, -y.imag); - return dout / y_conj; - } -}; - -template <> -struct DivGradDX { - HOSTDEVICE paddle::platform::complex128 operator()( - paddle::platform::complex128 x, paddle::platform::complex128 y, - paddle::platform::complex128 out, - paddle::platform::complex128 dout) const { - paddle::platform::complex128 y_conj(y.real, -y.imag); +template +struct DivGradDX> { + HOSTDEVICE paddle::platform::complex operator()( + paddle::platform::complex x, paddle::platform::complex y, + paddle::platform::complex out, + paddle::platform::complex dout) const { + paddle::platform::complex y_conj(y.real, -y.imag); return dout / y_conj; } }; @@ -102,23 +92,13 @@ struct DivGradDY { } }; -template <> -struct DivGradDY { - HOSTDEVICE paddle::platform::complex64 operator()( - paddle::platform::complex64 x, paddle::platform::complex64 y, - paddle::platform::complex64 out, paddle::platform::complex64 dout) const { - paddle::platform::complex64 out_div_y_conj((out / y).real, -(out / y).imag); - return -dout * out_div_y_conj; - } -}; - -template <> -struct DivGradDY { - HOSTDEVICE paddle::platform::complex128 operator()( - paddle::platform::complex128 x, 
paddle::platform::complex128 y, - paddle::platform::complex128 out, - paddle::platform::complex128 dout) const { - paddle::platform::complex128 out_div_y_conj((out / y).real, +template +struct DivGradDY> { + HOSTDEVICE paddle::platform::complex operator()( + paddle::platform::complex x, paddle::platform::complex y, + paddle::platform::complex out, + paddle::platform::complex dout) const { + paddle::platform::complex out_div_y_conj((out / y).real, -(out / y).imag); return -dout * out_div_y_conj; } diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.cc b/paddle/fluid/operators/elementwise/elementwise_mul_op.cc index 6bf296f0e0b57a..0045f00ecc6c25 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.cc @@ -16,8 +16,7 @@ limitations under the License. */ #include #include #include "paddle/fluid/operators/elementwise/elementwise_op.h" -#include "paddle/fluid/platform/complex128.h" -#include "paddle/fluid/platform/complex64.h" +#include "paddle/fluid/platform/complex.h" namespace paddle { namespace operators { @@ -134,9 +133,9 @@ REGISTER_OP_CPU_KERNEL( ops::ElementwiseMulKernel, ops::ElementwiseMulKernel, ops::ElementwiseMulKernel, + paddle::platform::complex>, ops::ElementwiseMulKernel); + paddle::platform::complex>); REGISTER_OP_CPU_KERNEL( elementwise_mul_grad, ops::ElementwiseMulGradKernel, @@ -144,9 +143,9 @@ REGISTER_OP_CPU_KERNEL( ops::ElementwiseMulGradKernel, ops::ElementwiseMulGradKernel, ops::ElementwiseMulGradKernel, + paddle::platform::complex>, ops::ElementwiseMulGradKernel); + paddle::platform::complex>); REGISTER_OP_CPU_KERNEL( elementwise_mul_grad_grad, ops::ElementwiseMulDoubleGradKernel, ops::ElementwiseMulDoubleGradKernel, + paddle::platform::complex>, ops::ElementwiseMulDoubleGradKernel); + paddle::platform::complex>); REGISTER_OP_VERSION(elementwise_mul) .AddCheckpoint( diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu index e01b5eb5fb73d9..8fd4609c3aa850 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu @@ -14,8 +14,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/elementwise/elementwise_mul_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.cu.h" -#include "paddle/fluid/platform/complex128.h" -#include "paddle/fluid/platform/complex64.h" +#include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/float16.h" namespace ops = paddle::operators; @@ -76,31 +75,31 @@ static __global__ void SimpleElemwiseMulGradCUDAKernel(const T* x, const T* y, } template <> -__global__ void SimpleElemwiseMulGradCUDAKernel( - const plat::complex64* x, const plat::complex64* y, - const plat::complex64* out, const plat::complex64* dout, int64_t size, - plat::complex64* dx, plat::complex64* dy) { +__global__ void SimpleElemwiseMulGradCUDAKernel>( + const plat::complex* x, const plat::complex* y, + const plat::complex* out, const plat::complex* dout, + int64_t size, plat::complex* dx, plat::complex* dy) { int col = blockIdx.x * blockDim.x + threadIdx.x; while (col < size) { - plat::complex64 o = dout[col]; - dx[col] = plat::complex64(y[col].real, -y[col].imag) * o; - dy[col] = plat::complex64(x[col].real, -x[col].imag) * o; + plat::complex o = dout[col]; + dx[col] = plat::complex(y[col].real, -y[col].imag) * o; + dy[col] = plat::complex(x[col].real, -x[col].imag) * o; col += blockDim.x * gridDim.x; } } template <> -__global__ void SimpleElemwiseMulGradCUDAKernel( - const plat::complex128* x, const plat::complex128* y, - const plat::complex128* out, const plat::complex128* dout, int64_t size, - plat::complex128* dx, plat::complex128* dy) { +__global__ void SimpleElemwiseMulGradCUDAKernel>( + const plat::complex* x, const plat::complex* y, + const plat::complex* out, const plat::complex* dout, + int64_t size, plat::complex* dx, plat::complex* dy) { int col = blockIdx.x * blockDim.x + threadIdx.x; while (col < size) { - plat::complex128 o = dout[col]; - dx[col] = plat::complex128(y[col].real, -y[col].imag) * o; - dy[col] = plat::complex128(x[col].real, -x[col].imag) * o; + plat::complex o = dout[col]; + dx[col] = plat::complex(y[col].real, -y[col].imag) * o; + dy[col] = plat::complex(x[col].real, -x[col].imag) * o; col += blockDim.x * gridDim.x; } } @@ -133,8 +132,8 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseMulKernel, ops::ElementwiseMulKernel, ops::ElementwiseMulKernel, - ops::ElementwiseMulKernel, - ops::ElementwiseMulKernel); + ops::ElementwiseMulKernel>, + ops::ElementwiseMulKernel>); REGISTER_OP_CUDA_KERNEL( elementwise_mul_grad, ops::ElementwiseMulGradKernel, @@ -142,8 +141,10 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseMulGradKernel, ops::ElementwiseMulGradKernel, ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel); + ops::ElementwiseMulGradKernel>, + ops::ElementwiseMulGradKernel>); REGISTER_OP_CUDA_KERNEL( elementwise_mul_grad_grad, ops::ElementwiseMulDoubleGradKernel, @@ -152,6 +153,6 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseMulDoubleGradKernel, ops::ElementwiseMulDoubleGradKernel, ops::ElementwiseMulDoubleGradKernel, + plat::complex>, ops::ElementwiseMulDoubleGradKernel); + plat::complex>); diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.h b/paddle/fluid/operators/elementwise/elementwise_mul_op.h index 46a00268e4134a..10e69491643c92 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.h @@ -132,23 +132,13 @@ struct MulGradDX { HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout * y; } }; -template <> -struct MulGradDX { - HOSTDEVICE 
paddle::platform::complex64 operator()( - paddle::platform::complex64 x, paddle::platform::complex64 y, - paddle::platform::complex64 out, paddle::platform::complex64 dout) const { - paddle::platform::complex64 y_conj(y.real, -y.imag); - return dout * y_conj; - } -}; - -template <> -struct MulGradDX { - HOSTDEVICE paddle::platform::complex128 operator()( - paddle::platform::complex128 x, paddle::platform::complex128 y, - paddle::platform::complex128 out, - paddle::platform::complex128 dout) const { - paddle::platform::complex128 y_conj(y.real, -y.imag); +template +struct MulGradDX> { + HOSTDEVICE paddle::platform::complex operator()( + paddle::platform::complex x, paddle::platform::complex y, + paddle::platform::complex out, + paddle::platform::complex dout) const { + paddle::platform::complex y_conj(y.real, -y.imag); return dout * y_conj; } }; @@ -158,23 +148,13 @@ struct MulGradDY { HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout * x; } }; -template <> -struct MulGradDY { - HOSTDEVICE paddle::platform::complex64 operator()( - paddle::platform::complex64 x, paddle::platform::complex64 y, - paddle::platform::complex64 out, paddle::platform::complex64 dout) const { - paddle::platform::complex64 x_conj(x.real, -x.imag); - return dout * x_conj; - } -}; - -template <> -struct MulGradDY { - HOSTDEVICE paddle::platform::complex128 operator()( - paddle::platform::complex128 x, paddle::platform::complex128 y, - paddle::platform::complex128 out, - paddle::platform::complex128 dout) const { - paddle::platform::complex128 x_conj(x.real, -x.imag); +template +struct MulGradDY> { + HOSTDEVICE paddle::platform::complex operator()( + paddle::platform::complex x, paddle::platform::complex y, + paddle::platform::complex out, + paddle::platform::complex dout) const { + paddle::platform::complex x_conj(x.real, -x.imag); return dout * x_conj; } }; diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.cc b/paddle/fluid/operators/elementwise/elementwise_sub_op.cc index 1951ed7f5da673..84aa189b89e909 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.cc @@ -20,8 +20,8 @@ limitations under the License. */ namespace paddle { namespace platform { -struct complex128; -struct complex64; +template +struct complex; } // namespace platform } // namespace paddle @@ -134,9 +134,9 @@ REGISTER_OP_CPU_KERNEL( ops::ElementwiseSubKernel, ops::ElementwiseSubKernel, ops::ElementwiseSubKernel, + paddle::platform::complex>, ops::ElementwiseSubKernel); + paddle::platform::complex>); REGISTER_OP_CPU_KERNEL( elementwise_sub_grad, ops::ElementwiseSubGradKernel, @@ -144,9 +144,9 @@ REGISTER_OP_CPU_KERNEL( ops::ElementwiseSubGradKernel, ops::ElementwiseSubGradKernel, ops::ElementwiseSubGradKernel, + paddle::platform::complex>, ops::ElementwiseSubGradKernel); + paddle::platform::complex>); REGISTER_OP_CPU_KERNEL( elementwise_sub_grad_grad, ops::ElementwiseSubDoubleGradKernel, ops::ElementwiseSubDoubleGradKernel, + paddle::platform::complex>, ops::ElementwiseSubDoubleGradKernel); + paddle::platform::complex>); REGISTER_OP_VERSION(elementwise_sub) .AddCheckpoint( diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.cu b/paddle/fluid/operators/elementwise/elementwise_sub_op.cu index 192999fd2ac831..19cbbb7bf04287 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.cu @@ -14,8 +14,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/elementwise/elementwise_op_function.cu.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/operators/elementwise/elementwise_sub_op.h" -#include "paddle/fluid/platform/complex128.h" -#include "paddle/fluid/platform/complex64.h" +#include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/float16.h" namespace ops = paddle::operators; @@ -103,9 +102,9 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseSubKernel, ops::ElementwiseSubKernel, ops::ElementwiseSubKernel, + paddle::platform::complex>, ops::ElementwiseSubKernel); + paddle::platform::complex>); REGISTER_OP_CUDA_KERNEL( elementwise_sub_grad, ops::ElementwiseSubGradKernel, @@ -115,9 +114,9 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseSubGradKernel, ops::ElementwiseSubGradKernel, ops::ElementwiseSubGradKernel, + paddle::platform::complex>, ops::ElementwiseSubGradKernel); + paddle::platform::complex>); REGISTER_OP_CUDA_KERNEL( elementwise_sub_grad_grad, ops::ElementwiseSubDoubleGradKernel, ops::ElementwiseSubDoubleGradKernel, + paddle::platform::complex>, ops::ElementwiseSubDoubleGradKernel); + paddle::platform::complex>); diff --git a/paddle/fluid/operators/mkldnn/cast_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/cast_mkldnn_op.cc new file mode 100644 index 00000000000000..9cfeace6bef99f --- /dev/null +++ b/paddle/fluid/operators/mkldnn/cast_mkldnn_op.cc @@ -0,0 +1,73 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/platform/mkldnn_reuse.h" + +namespace paddle { +namespace operators { + +using paddle::framework::Tensor; + +template +class CastMKLDNNKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + this->RunKernel(ctx); + } + + void RunKernel(const framework::ExecutionContext& ctx) const { + const auto& dev_ctx = + ctx.template device_context(); + + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); + + int in_dtype = ctx.Attr("in_dtype"); + int out_dtype = ctx.Attr("out_dtype"); + + auto x_paddle_type = framework::proto::VarType::Type(in_dtype); + auto out_paddle_type = framework::proto::VarType::Type(out_dtype); + + mkldnn::memory::data_type x_type = + framework::ToMKLDNNDataType(x_paddle_type); + mkldnn::memory::data_type out_type = + framework::ToMKLDNNDataType(out_paddle_type); + + auto x_tz = framework::vectorize(x->dims()); + + std::string key = + platform::CreateKey(dev_ctx, x_tz, x->format(), x->format(), x_type); + platform::ReorderMKLDNNHandler reorder_handler( + x_tz, x_paddle_type, x_type, out_paddle_type, out_type, dev_ctx, + dev_ctx.GetEngine(), key); + + auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory( + x->format(), platform::to_void_cast(x->data())); + auto reorder_dst_memory_p = + reorder_handler.AcquireDstMemory(out, x->format(), dev_ctx.GetPlace()); + auto reorder_p = reorder_handler.AcquireReorder(reorder_dst_memory_p, + reorder_src_memory_p); + + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); + reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p); + astream.wait(); + + out->set_layout(framework::DataLayout::kMKLDNN); + out->set_format(platform::GetMKLDNNFormat(*reorder_dst_memory_p)); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_KERNEL(cast, MKLDNN, paddle::platform::CPUPlace, + ops::CastMKLDNNKernel, + ops::CastMKLDNNKernel); diff --git a/paddle/fluid/operators/unity_build_rule.cmake b/paddle/fluid/operators/unity_build_rule.cmake index e9bc351de4d692..8262273b7ca7da 100644 --- a/paddle/fluid/operators/unity_build_rule.cmake +++ b/paddle/fluid/operators/unity_build_rule.cmake @@ -30,6 +30,7 @@ register_unity_group(cc bmm_op.cc bpr_loss_op.cc cast_op.cc + mkldnn/cast_mkldnn_op.cc cholesky_op.cc chunk_eval_op.cc clip_by_norm_op.cc diff --git a/paddle/fluid/platform/cuda_device_function.h b/paddle/fluid/platform/cuda_device_function.h index dde9531e591442..4095720f71eb71 100644 --- a/paddle/fluid/platform/cuda_device_function.h +++ b/paddle/fluid/platform/cuda_device_function.h @@ -16,8 +16,7 @@ limitations under the License. */ // NOTE(): support float16 to half in header file. 
#define PADDLE_CUDA_FP16 -#include "paddle/fluid/platform/complex128.h" -#include "paddle/fluid/platform/complex64.h" +#include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/float16.h" namespace paddle { @@ -82,28 +81,52 @@ __forceinline__ __device__ T CudaShuffleXorSync(unsigned mask, T val, #endif } -// CUDA 9.0 have native compatible float16 shfl_down #if defined(PADDLE_WITH_HIP) template <> __forceinline__ __device__ float16 CudaShuffleDownSync(unsigned mask, float16 val, int delta, int width) { -#ifdef PADDLE_WITH_HIP return float16(__shfl_down(static_cast(val), static_cast(delta), width)); -#else - return float16( - __shfl_down(static_cast(val), static_cast(delta), width)); -#endif } + +template <> +__forceinline__ __device__ paddle::platform::complex CudaShuffleDownSync( + unsigned mask, paddle::platform::complex val, int delta, int width) { + float real = __shfl_down(val.real, delta, width); + float imag = __shfl_down(val.imag, delta, width); + return paddle::platform::complex(real, imag); +} + +template <> +__forceinline__ __device__ paddle::platform::complex +CudaShuffleDownSync(unsigned mask, paddle::platform::complex val, + int delta, int width) { + double real = __shfl_down(val.real, delta, width); + double imag = __shfl_down(val.imag, delta, width); + return paddle::platform::complex(real, imag); +} + template <> __forceinline__ __device__ float16 CudaShuffleXorSync(unsigned mask, float16 val, int width) { -#ifdef PADDLE_WITH_HIP return float16(__shfl_xor(static_cast(val), width)); -#else - return float16(__shfl_xor(static_cast(val), width)); -#endif +} + +template <> +__forceinline__ __device__ paddle::platform::complex CudaShuffleXorSync( + unsigned mask, paddle::platform::complex val, int width) { + float real = __shfl_xor(val.real, width); + float imag = __shfl_xor(val.imag, width); + return paddle::platform::complex(real, imag); +} + +template <> +__forceinline__ __device__ paddle::platform::complex CudaShuffleXorSync( + unsigned mask, paddle::platform::complex val, int width) { + double real = __shfl_xor(val.real, width); + double imag = __shfl_xor(val.imag, width); + return paddle::platform::complex(real, imag); } #else template <> @@ -115,25 +138,26 @@ __forceinline__ __device__ float16 CudaShuffleDownSync(unsigned mask, } template <> -__forceinline__ __device__ paddle::platform::complex64 CudaShuffleDownSync( - unsigned mask, paddle::platform::complex64 val, int delta, int width) { +__forceinline__ __device__ paddle::platform::complex CudaShuffleDownSync( + unsigned mask, paddle::platform::complex val, int delta, int width) { float real = static_cast(__shfl_down_sync( mask, static_cast(val.real), static_cast(delta), width)); float imag = static_cast(__shfl_down_sync( mask, static_cast(val.imag), static_cast(delta), width)); - return paddle::platform::complex64(real, imag); + return paddle::platform::complex(real, imag); } template <> -__forceinline__ __device__ paddle::platform::complex128 CudaShuffleDownSync( - unsigned mask, paddle::platform::complex128 val, int delta, int width) { +__forceinline__ __device__ paddle::platform::complex +CudaShuffleDownSync(unsigned mask, paddle::platform::complex val, + int delta, int width) { double real = static_cast( __shfl_down_sync(mask, static_cast(val.real), static_cast(delta), width)); double imag = static_cast( __shfl_down_sync(mask, static_cast(val.imag), static_cast(delta), width)); - return paddle::platform::complex128(real, imag); + return paddle::platform::complex(real, imag); } template <> @@ 
-143,23 +167,23 @@ __forceinline__ __device__ float16 CudaShuffleXorSync(unsigned mask, } template <> -__forceinline__ __device__ paddle::platform::complex64 CudaShuffleXorSync( - unsigned mask, paddle::platform::complex64 val, int width) { +__forceinline__ __device__ paddle::platform::complex CudaShuffleXorSync( + unsigned mask, paddle::platform::complex val, int width) { float real = static_cast( __shfl_xor_sync(mask, static_cast(val.real), width)); float imag = static_cast( __shfl_xor_sync(mask, static_cast(val.imag), width)); - return paddle::platform::complex64(real, imag); + return paddle::platform::complex(real, imag); } template <> -__forceinline__ __device__ paddle::platform::complex128 CudaShuffleXorSync( - unsigned mask, paddle::platform::complex128 val, int width) { +__forceinline__ __device__ paddle::platform::complex CudaShuffleXorSync( + unsigned mask, paddle::platform::complex val, int width) { double real = static_cast( __shfl_xor_sync(mask, static_cast(val.real), width)); double imag = static_cast( __shfl_xor_sync(mask, static_cast(val.imag), width)); - return paddle::platform::complex128(real, imag); + return paddle::platform::complex(real, imag); } #endif diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index 5ff6f893a89531..d6563be48fe484 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -926,7 +926,23 @@ class ReorderMKLDNNHandler : public MKLDNNHandler { : platform::MKLDNNHandler(dev_ctx, engine, base_key), dims_(dims), vtype_(vtype), - dtype_(dtype) {} + vtype_dst_(vtype), + dtype_(dtype), + dtype_dst_(dtype) {} + + ReorderMKLDNNHandler(std::vector& dims, // NOLINT + framework::proto::VarType::Type vtype, + mkldnn::memory::data_type dtype, + framework::proto::VarType::Type vtype_dst, + mkldnn::memory::data_type dtype_dst, + const platform::MKLDNNDeviceContext& dev_ctx, + mkldnn::engine engine, const std::string& base_key) + : platform::MKLDNNHandler(dev_ctx, engine, base_key), + dims_(dims), + vtype_(vtype), + vtype_dst_(vtype_dst), + dtype_(dtype), + dtype_dst_(dtype_dst) {} std::shared_ptr AcquireSrcMemory( const MKLDNNMemoryFormat& fmt, void* ptr) { @@ -940,15 +956,16 @@ class ReorderMKLDNNHandler : public MKLDNNHandler { auto mem_p = std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); if (mem_p == nullptr) { - auto dst_md = platform::MKLDNNMemDesc(dims_, dtype_, fmt); - auto dst_data = output->mutable_data(place, vtype_, dst_md.get_size()); + auto dst_md = platform::MKLDNNMemDesc(dims_, dtype_dst_, fmt); + auto dst_data = + output->mutable_data(place, vtype_dst_, dst_md.get_size()); mem_p = std::make_shared(dst_md, engine_, dst_data); dev_ctx_.SetBlob(local_key, mem_p); } else { // Even if memory object exists , we may be using it for diffrent tensor auto dst_data = - output->mutable_data(place, vtype_, mem_p->get_desc().get_size()); + output->mutable_data(place, vtype_dst_, mem_p->get_desc().get_size()); mem_p->set_data_handle(dst_data); } return mem_p; @@ -970,8 +987,8 @@ class ReorderMKLDNNHandler : public MKLDNNHandler { private: std::vector dims_; - framework::proto::VarType::Type vtype_; - mkldnn::memory::data_type dtype_; + framework::proto::VarType::Type vtype_, vtype_dst_; + mkldnn::memory::data_type dtype_, dtype_dst_; }; template diff --git a/paddle/fluid/pybind/fleet_py.cc b/paddle/fluid/pybind/fleet_py.cc index 91461aa26f341a..fa14ad4f63be08 100644 --- a/paddle/fluid/pybind/fleet_py.cc +++ b/paddle/fluid/pybind/fleet_py.cc @@ -28,6 +28,7 @@ limitations under the 
License. */ #include #include +#include "paddle/fluid/distributed/common/sparse_sharding_merge.h" #include "paddle/fluid/distributed/communicator_common.h" #include "paddle/fluid/distributed/fleet.h" #include "paddle/fluid/distributed/index_dataset/index_sampler.h" @@ -48,6 +49,7 @@ using paddle::distributed::GraphNode; using paddle::distributed::GraphPyServer; using paddle::distributed::GraphPyClient; using paddle::distributed::FeatureNode; +using paddle::distributed::ShardingMerge; namespace paddle { namespace pybind { @@ -85,6 +87,12 @@ void BindPSHost(py::module* m) { .def("to_string", &distributed::PSHost::to_string); } +void BindSparseShardingTools(py::module* m) { + py::class_(*m, "ShardingMerge") + .def(py::init<>()) + .def("merge", &ShardingMerge::Merge); +} + void BindCommunicatorContext(py::module* m) { py::class_(*m, "CommContext") .def( diff --git a/paddle/fluid/pybind/fleet_py.h b/paddle/fluid/pybind/fleet_py.h index 206a69f5a80197..4dc0f002ad3c1d 100644 --- a/paddle/fluid/pybind/fleet_py.h +++ b/paddle/fluid/pybind/fleet_py.h @@ -36,5 +36,6 @@ void BindIndexNode(py::module* m); void BindTreeIndex(py::module* m); void BindIndexWrapper(py::module* m); void BindIndexSampler(py::module* m); +void BindSparseShardingTools(py::module* m); } // namespace pybind } // namespace paddle diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 560d8c892b09f9..6dd08e5dfa4bf2 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -3159,7 +3159,7 @@ All parameter, weight, gradient are variables in Paddle. BindTreeIndex(&m); BindIndexWrapper(&m); BindIndexSampler(&m); - + BindSparseShardingTools(&m); #endif } } // namespace pybind diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat index 69138a37f461ca..dd8146aa3a1147 100644 --- a/paddle/scripts/paddle_build.bat +++ b/paddle/scripts/paddle_build.bat @@ -18,7 +18,7 @@ rem Paddle CI Task On Windows Platform rem ================================================= @ECHO ON -setlocal +setlocal enabledelayedexpansion rem -------clean up environment----------- set work_dir=%cd% @@ -63,7 +63,7 @@ if not defined WITH_PYTHON set WITH_PYTHON=ON if not defined ON_INFER set ON_INFER=ON if not defined WITH_INFERENCE_API_TEST set WITH_INFERENCE_API_TEST=ON if not defined WITH_STATIC_LIB set WITH_STATIC_LIB=ON -if not defined WITH_TPCACHE set WITH_TPCACHE=ON +if not defined WITH_TPCACHE set WITH_TPCACHE=OFF if not defined WITH_CLCACHE set WITH_CLCACHE=OFF if not defined WITH_CACHE set WITH_CACHE=OFF if not defined WITH_UNITY_BUILD set WITH_UNITY_BUILD=OFF @@ -236,6 +236,8 @@ call "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary set DISTUTILS_USE_SDK=1 rem Windows 10 Kit bin dir set PATH=C:\Program Files (x86)\Windows Kits\10\bin\10.0.17763.0\x64;%PATH% +rem Use 64-bit ToolSet to compile +set PreferredToolArchitecture=x64 for /F %%# in ('wmic os get localdatetime^|findstr 20') do set start=%%# set start=%start:~4,10% @@ -263,12 +265,12 @@ rem ------initialize the python environment------ @ECHO ON set PYTHON_EXECUTABLE=%PYTHON_ROOT%\python.exe set PATH=%PYTHON_ROOT%;%PYTHON_ROOT%\Scripts;%PATH% -if %WITH_PYTHON% == "ON" ( +if "%WITH_PYTHON%" == "ON" ( where python where pip pip install wheel --user pip install -r %work_dir%\python\requirements.txt --user - if %ERRORLEVEL% NEQ 0 ( + if !ERRORLEVEL! NEQ 0 ( echo pip install requirements.txt failed! exit /b 7 ) @@ -329,14 +331,14 @@ if "%WITH_GPU%"=="ON" ( ) :cmake_impl -echo cmake .. 
-G %GENERATOR% -T host=x64 -DCMAKE_BUILD_TYPE=Release -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% ^ +echo cmake .. -G %GENERATOR% -DCMAKE_BUILD_TYPE=Release -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% ^ -DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DPYTHON_EXECUTABLE=%PYTHON_EXECUTABLE% -DON_INFER=%ON_INFER% ^ -DWITH_INFERENCE_API_TEST=%WITH_INFERENCE_API_TEST% -DTHIRD_PARTY_PATH=%THIRD_PARTY_PATH% ^ -DINFERENCE_DEMO_INSTALL_DIR=%INFERENCE_DEMO_INSTALL_DIR% -DWITH_STATIC_LIB=%WITH_STATIC_LIB% ^ -DWITH_TENSORRT=%WITH_TENSORRT% -DTENSORRT_ROOT="%TENSORRT_ROOT%" -DMSVC_STATIC_CRT=%MSVC_STATIC_CRT% ^ -DWITH_UNITY_BUILD=%WITH_UNITY_BUILD% -DCUDA_ARCH_NAME=%CUDA_ARCH_NAME% -cmake .. -G %GENERATOR% -DCMAKE_BUILD_TYPE=Release -T host=x64 -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% ^ +cmake .. -G %GENERATOR% -DCMAKE_BUILD_TYPE=Release -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% ^ -DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DPYTHON_EXECUTABLE=%PYTHON_EXECUTABLE% -DON_INFER=%ON_INFER% ^ -DWITH_INFERENCE_API_TEST=%WITH_INFERENCE_API_TEST% -DTHIRD_PARTY_PATH=%THIRD_PARTY_PATH% ^ -DINFERENCE_DEMO_INSTALL_DIR=%INFERENCE_DEMO_INSTALL_DIR% -DWITH_STATIC_LIB=%WITH_STATIC_LIB% ^ @@ -366,7 +368,7 @@ echo Build third_party the %build_times% time: if %GENERATOR% == "Ninja" ( ninja third_party ) else ( - MSBuild /m /p:PreferredToolArchitecture=x64 /p:Configuration=Release /verbosity:quiet third_party.vcxproj + MSBuild /m /p:PreferredToolArchitecture=x64 /p:Configuration=Release /verbosity:%LOG_LEVEL% third_party.vcxproj ) if %ERRORLEVEL% NEQ 0 ( set /a build_times=%build_times%+1 @@ -412,10 +414,10 @@ if "%WITH_TESTING%"=="ON" ( echo Build Paddle the %build_times% time: if %GENERATOR% == "Ninja" ( - ninja -j %PARALLEL_PROJECT_COUNT% + ninja all ) else ( if "%WITH_CLCACHE%"=="OFF" ( - MSBuild /m:%PARALLEL_PROJECT_COUNT% /p:PreferredToolArchitecture=x64 /p:TrackFileAccess=false /p:Configuration=Release /verbosity:%LOG_LEVEL% ALL_BUILD.vcxproj + MSBuild /m:%PARALLEL_PROJECT_COUNT% /p:PreferredToolArchitecture=x64 /p:Configuration=Release /verbosity:%LOG_LEVEL% ALL_BUILD.vcxproj ) else ( MSBuild /m:%PARALLEL_PROJECT_COUNT% /p:PreferredToolArchitecture=x64 /p:TrackFileAccess=false /p:CLToolExe=clcache.exe /p:CLToolPath=%PYTHON_ROOT%\Scripts /p:Configuration=Release /verbosity:%LOG_LEVEL% ALL_BUILD.vcxproj ) @@ -644,7 +646,7 @@ echo git fetch upstream $BRANCH # develop is not fetched>> check_change_of_ echo fi>> check_change_of_unittest.sh echo git checkout -b origin_pr >> check_change_of_unittest.sh echo git checkout -f $BRANCH >> check_change_of_unittest.sh -echo cmake .. -G %GENERATOR% -T host=x64 -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% ^ +echo cmake .. 
-G %GENERATOR% -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% ^
-DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DPYTHON_EXECUTABLE=%PYTHON_EXECUTABLE% -DON_INFER=%ON_INFER% ^
-DWITH_INFERENCE_API_TEST=%WITH_INFERENCE_API_TEST% -DTHIRD_PARTY_PATH=%THIRD_PARTY_PATH% ^
-DINFERENCE_DEMO_INSTALL_DIR=%INFERENCE_DEMO_INSTALL_DIR% -DWITH_STATIC_LIB=%WITH_STATIC_LIB% ^
diff --git a/python/paddle/distributed/collective.py b/python/paddle/distributed/collective.py
index d3df57fcf6b7d3..4f3a6f4768933d 100644
--- a/python/paddle/distributed/collective.py
+++ b/python/paddle/distributed/collective.py
@@ -977,6 +977,11 @@ def _parallel_linear(x,
                      group=None):
    """
    Parallel Linear
+
+    axis is the dimension of the linear layer's parameter to be split:
+    axis = 0: the row dimension
+    axis = 1: the column dimension
+
    """
    if group is not None and not group.is_member():
        return
@@ -1008,6 +1013,12 @@ def _parallel_linear(x,
    main_block = paddle.static.default_main_program().global_block()
    startup_block.vars[linear.weight.name].is_distributed = True
    main_block.vars[linear.weight.name].is_distributed = True
+    # set is_distributed for the split bias
+    # if a linear layer is split by row, each rank holds a complete bias and the biases should be identical across ranks.
+    # if a linear layer is split by column, the bias is also split across ranks, like its weight
+    if axis == 1 and linear._bias_attr != False:
+        startup_block.vars[linear.bias.name].is_distributed = True
+        main_block.vars[linear.bias.name].is_distributed = True
    if not gather_out:
        return linear_out
diff --git a/python/paddle/distributed/fleet/base/distributed_strategy.py b/python/paddle/distributed/fleet/base/distributed_strategy.py
old mode 100755
new mode 100644
index f9cd623afef76a..0a989fe90f96a6
--- a/python/paddle/distributed/fleet/base/distributed_strategy.py
+++ b/python/paddle/distributed/fleet/base/distributed_strategy.py
@@ -814,7 +814,7 @@ def sharding_configs(self):
                "sharding_segment_strategy": "segment_broadcast_MB",
                "segment_broadcast_MB": 32,
                "sharding_degree": 8,
-                "sharding_degree": 2,
+                "dp_degree": 2,
                "gradient_merge_acc_step": 4,
            }
        """
diff --git a/python/paddle/distributed/fleet/runtime/the_one_ps.py b/python/paddle/distributed/fleet/runtime/the_one_ps.py
index d31fa549ad5623..f18b82eaecd76a 100644
--- a/python/paddle/distributed/fleet/runtime/the_one_ps.py
+++ b/python/paddle/distributed/fleet/runtime/the_one_ps.py
@@ -847,8 +847,6 @@ def _init_server(self, dirname=None, var_names=None, **kwargs):
            dirname = os.path.normpath(dirname)
            pserver_id = self.role_maker._role_id()
-            import time
-            begin = time.time()
            for var_name in load_varnames:
                table_id = sparse_table_maps[var_name]
                path = os.path.join(dirname, var_name + PSERVER_SAVE_SUFFIX,
@@ -856,9 +854,6 @@ def _init_server(self, dirname=None, var_names=None, **kwargs):
                meta = os.path.join(dirname, var_name + PSERVER_SAVE_SUFFIX,
                                    "{}.block{}.meta".format(var_name, pserver_id))
                self._server.load_sparse(path, meta, table_id)
-            end = time.time()
-            print("init sparse variables: {} cost time: {}".format(load_varnames,
-                                                                   end - begin))

    def _run_server(self):
        if self.role_maker._is_heter_worker():
diff --git a/python/paddle/fluid/contrib/__init__.py b/python/paddle/fluid/contrib/__init__.py
index 30981f531289ae..0221a42e2a3e78 100644
--- a/python/paddle/fluid/contrib/__init__.py
+++ b/python/paddle/fluid/contrib/__init__.py
@@ -1,4 +1,5 @@
 # Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+# Copyright (c) 2021 NVIDIA Corporation. All rights reserved.
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -32,6 +33,8 @@ from . import layers from .layers import * from . import optimizer +from . import sparsity +from .sparsity import * __all__ = [] __all__ += decoder.__all__ @@ -42,3 +45,4 @@ __all__ += ['mixed_precision'] __all__ += layers.__all__ __all__ += optimizer.__all__ +__all__ += sparsity.__all__ diff --git a/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py b/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py index f940f6a3143a09..2913d99ee6b217 100644 --- a/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py +++ b/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py @@ -145,6 +145,7 @@ def _update_list(self): 'sign', 'cast', 'fused_bn_add_activation', + 'c_identity', } # The set of ops that don't support fp16 calculation diff --git a/python/paddle/fluid/contrib/sparsity/__init__.py b/python/paddle/fluid/contrib/sparsity/__init__.py new file mode 100644 index 00000000000000..f78ea1b1c38b85 --- /dev/null +++ b/python/paddle/fluid/contrib/sparsity/__init__.py @@ -0,0 +1,21 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2021 NVIDIA Corporation. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +from . import utils +from .utils import * + +__all__ = utils.__all__ diff --git a/python/paddle/fluid/contrib/sparsity/utils.py b/python/paddle/fluid/contrib/sparsity/utils.py new file mode 100644 index 00000000000000..f1108c327407ff --- /dev/null +++ b/python/paddle/fluid/contrib/sparsity/utils.py @@ -0,0 +1,587 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2021 NVIDIA Corporation. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Utilities of Auto SParsity (ASP). +""" + +from __future__ import print_function + +import sys +import math +import collections +import numpy as np +from enum import Enum +from itertools import permutations +import threading + +__all__ = [ + 'density', 'check_mask_1d', 'get_mask_1d', 'check_mask_2d', + 'get_mask_2d_greedy', 'get_mask_2d_best', 'create_mask', 'check_sparsity', + 'MaskAlgo', 'CheckMethod' +] + + +class MaskAlgo(Enum): + r""" + A collection of all mask generating algorithms. 
+    There are currently three algorithms: `MASK_1D`, `MASK_2D_GREEDY` and `MASK_2D_BEST`.
+    """
+    MASK_1D = 'get_mask_1d'
+    MASK_2D_GREEDY = 'get_mask_2d_greedy'
+    MASK_2D_BEST = 'get_mask_2d_best'
+
+
+class CheckMethod(Enum):
+    r"""
+    A collection of all sparsity checking approaches.
+    There are currently two methods: `CHECK_1D` and `CHECK_2D`.
+    """
+    CHECK_1D = 'check_mask_1d'
+    CHECK_2D = 'check_mask_2d'
+
+    @staticmethod
+    def get_checking_method(mask_algo):
+        r"""
+        Get the sparsity checking method for a mask generating algorithm.
+
+        Args:
+            mask_algo (MaskAlgo): The mask generating algorithm.
+        Returns:
+            CheckMethod: The corresponding sparsity checking method.
+        Examples:
+            .. code-block:: python
+
+              import numpy as np
+              from paddle.fluid.contrib.sparsity import MaskAlgo, CheckMethod
+
+              CheckMethod.get_checking_method(MaskAlgo.MASK_1D)
+              # CheckMethod.CHECK_1D
+
+              CheckMethod.get_checking_method(MaskAlgo.MASK_2D_GREEDY)
+              # CheckMethod.CHECK_2D
+
+              CheckMethod.get_checking_method(MaskAlgo.MASK_2D_BEST)
+              # CheckMethod.CHECK_2D
+        """
+        assert type(mask_algo) == MaskAlgo, \
+               "mask_algo should be MaskAlgo type"
+        if mask_algo == MaskAlgo.MASK_1D:
+            return CheckMethod.CHECK_1D
+        else:
+            return CheckMethod.CHECK_2D
+
+
+def density(x):
+    r"""
+    Return the density of the input tensor.
+
+    Args:
+        x (nparray): The input tensor.
+    Returns:
+        float: The density of :attr:`x`.
+    Examples:
+        .. code-block:: python
+
+          import numpy as np
+          import paddle.fluid.contrib.sparsity as sparsity
+
+          x = np.array([[0, 1, 3, 0],
+                        [1, 1, 0, 1]])
+          sparsity.density(x) # 0.625
+    """
+    x_flattened = x.flatten()
+    return float(np.nonzero(x_flattened)[0].size) / x_flattened.size
+
+
+def reshape_1d(mat, m):
+    r"""
+    Reshape the input matrix to shape (-1, m).
+    If the second dimension of :attr:`mat` is not a multiple of :attr:`m`,
+    this function pads the remainder with 0 before reshaping.
+
+    .. math::
+
+        remainder = mat.shape[1] % m
+
+    Args:
+        mat (nparray): The input matrix.
+        m (int): The second dimension of the reshaped matrix.
+    Returns:
+        tuple: A pair of the reshaped and padded matrix and the shape of the padded matrix (before reshaping).
+    """
+    remainder = mat.shape[1] % m
+    if mat.shape[1] % m > 0:
+        mat_padded = np.zeros((mat.shape[0], mat.shape[1] + (m - remainder)))
+        mat_padded[:, :mat.shape[1]] = mat
+        shape = mat_padded.shape
+        return mat_padded.reshape(-1, m), shape
+    else:
+        return mat.reshape(-1, m), mat.shape
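+
+# Example: with m=4, a (1, 5) input is zero-padded to (1, 8) and viewed as
+# (2, 4), so reshape_1d(np.array([[1, 2, 3, 4, 5]]), 4) returns
+# (array([[1., 2., 3., 4.], [5., 0., 0., 0.]]), (1, 8)).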
+
+
+def check_mask_1d(mat, n, m):
+    r"""
+    Check if every row of the input matrix :attr:`mat` is in a 1D `n:m` sparse pattern.
+    This function pads the second dimension of :attr:`mat` with zeros
+    to be a multiple of :attr:`m` if necessary.
+
+    1D `n:m` sparse pattern: At least :attr:`n` zeros in every :math:`1 \times m` block.
+
+    Args:
+        mat (nparray): The input matrix.
+        n (int): n of `n:m` sparse pattern.
+        m (int): m of `n:m` sparse pattern.
+    Returns:
+        bool: True if every row of :attr:`mat` is in a 1D n:m sparse pattern, else False.
+    Examples:
+        .. code-block:: python
+
+          import numpy as np
+          import paddle.fluid.contrib.sparsity as sparsity
+
+          x = np.array([[0, 1, 3, 0],
+                        [1, 0, 0, 1]])
+          sparsity.check_mask_1d(x, 2, 4) # True
+
+          x = np.array([[0, 1, 5, 4],
+                        [1, 0, 0, 1]])
+          sparsity.check_mask_1d(x, 2, 4) # False
+
+          # x would be padded to shape (2, 8)
+          x = np.array([[0, 1, 0, 4, 6],
+                        [1, 0, 0, 1, 7]])
+          sparsity.check_mask_1d(x, 2, 4) # True
+    """
+    if len(mat.shape) <= 1:
+        mat_flattern, shape = reshape_1d(mat.reshape(1, mat.shape[0]), m)
+    else:
+        mat_flattern, shape = reshape_1d(mat, m)
+
+    for sub_mat in mat_flattern:
+        if np.nonzero(sub_mat)[0].size > (m - n):
+            return False
+    return True
+
+
+def get_mask_1d(mat, n, m):
+    r"""
+    Generate a 1D `n:m` sparse pattern mask of the input matrix :attr:`mat`
+    in the row direction. This function pads the second dimension of :attr:`mat`
+    with zeros to be a multiple of :attr:`m` before mask generation.
+
+    1D `n:m` sparse pattern: At least :attr:`n` zeros in every :math:`1 \times m` block.
+
+    Args:
+        mat (nparray): The input matrix.
+        n (int): n of `n:m` sparse pattern.
+        m (int): m of `n:m` sparse pattern.
+    Returns:
+        nparray: The 1D `n:m` sparse mask of :attr:`mat`.
+    Examples:
+        .. code-block:: python
+
+          import numpy as np
+          import paddle.fluid.contrib.sparsity as sparsity
+
+          mat = np.array([[0, 1, 5, 4],
+                          [2, 7, 3, 6]])
+          mask = sparsity.get_mask_1d(mat, 2, 4)
+          # nparray([[0, 0, 1, 1],
+          #          [0, 1, 0, 1]])
+          sparsity.check_mask_1d(mask, 2, 4) # True
+    """
+    mat_flattern, shape = reshape_1d(mat, m)
+
+    mask_flattern = np.ones_like(mat_flattern)
+    mask = np.ones_like(mat)
+    for i in range(mat_flattern.shape[0]):
+        sub_mat = mat_flattern[i]
+        min_order_indices = np.argsort(np.absolute(sub_mat))
+        mask_flattern[i, min_order_indices[:n].tolist()] = 0
+    mask_flattern = mask_flattern.reshape(shape)
+    mask[:, :] = mask_flattern[:, :mat.shape[1]]
+    return mask
+
+
+def reshape_2d(mat, m):
+    r"""
+    Reshape the input matrix to shape (-1, :math:`m \times m`).
+    If either dimension of :attr:`mat` is not a multiple of :attr:`m`,
+    this function pads the remainder with 0 before reshaping.
+
+    .. math::
+
+        remainder_0 = mat.shape[0] % m \\
+        remainder_1 = mat.shape[1] % m
+
+    Args:
+        mat (nparray): The input matrix.
+        m (int): The square root of the second dimension of the reshaped matrix.
+    Returns:
+        tuple: A pair of the reshaped and padded matrix and the shape of the padded matrix (before reshaping).
+    """
+    remainder_0 = mat.shape[0] % m
+    remainder_1 = mat.shape[1] % m
+
+    new_shape = (mat.shape[0] if remainder_0 == 0 \
+                 else mat.shape[0] + (m - remainder_0),
+                 mat.shape[1] if remainder_1 == 0 \
+                 else mat.shape[1] + (m - remainder_1))
+    mat_padded = np.zeros(new_shape)
+    mat_padded[:mat.shape[0], :mat.shape[1]] = mat
+
+    mat_flattern = np.empty(new_shape).reshape(-1, m * m)
+    curr_idx = 0
+    for row_start in range(0, mat_padded.shape[0], m):
+        row_end = row_start + m
+        for col_start in range(0, mat_padded.shape[1], m):
+            col_end = col_start + m
+            sub_mat = np.squeeze(mat_padded[row_start:row_end, \
+                                            col_start:col_end] \
+                                 .reshape(-1))
+            mat_flattern[curr_idx] = sub_mat
+            curr_idx += 1
+    return mat_flattern, mat_padded.shape
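+
+# Example: with m=4, a (5, 5) input is zero-padded to (8, 8) and returned as
+# four flattened 4x4 blocks, i.e. an array of shape (4, 16) plus the padded
+# shape (8, 8).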
+
+
+def check_mask_2d(mat, n, m):
+    r"""
+    Check if every :math:`m \times m` block of the input matrix :attr:`mat` is in a 2D `n:m` sparse pattern.
+    This function pads each dimension of :attr:`mat` with zeros to be a multiple of
+    :attr:`m` if necessary.
+
+    2D `n:m` sparse pattern: At least :math:`n \times n` zeros in every :math:`m \times m` block
+    under the constraint of at least :attr:`n` zeros for each row and column.
+
+    Args:
+        mat (nparray): The input matrix.
+        n (int): n of `n:m` sparse pattern.
+        m (int): m of `n:m` sparse pattern.
+    Returns:
+        bool: True if every :math:`m \times m` block of the input matrix :attr:`mat` is in a 2D `n:m` sparse pattern, else False.
+    Examples:
+        .. code-block:: python
+
+          import numpy as np
+          import paddle.fluid.contrib.sparsity as sparsity
+
+          x = np.array([[0, 8, 9, 0],
+                        [9, 0, 0, 10],
+                        [5, 0, 0, 6],
+                        [0, 4, 6, 0]])
+          sparsity.check_mask_2d(x, 2, 4) # True
+
+          x = np.array([[0, 8, 0, 9],
+                        [9, 0, 0, 10],
+                        [0, 5, 0, 6],
+                        [0, 4, 6, 0]])
+          sparsity.check_mask_2d(x, 2, 4) # False
+
+          # x would be padded to shape (8, 8)
+          x = np.array([[0, 8, 0, 9],
+                        [9, 0, 7, 0],
+                        [0, 5, 0, 6],
+                        [3, 0, 6, 0],
+                        [1, 1, 0, 1]])
+          sparsity.check_mask_2d(x, 2, 4) # True
+    """
+    mat_padded, shape = reshape_2d(mat, m)
+    for sub_mat in mat_padded:
+        sub_mask = np.absolute(np.squeeze(sub_mat.reshape(m, m))) > 0
+        if (np.sum(np.sum(sub_mask, axis=1) > (m-n)) != 0) and \
+           (np.sum(np.sum(sub_mask, axis=0) > (m-n)) != 0):
+            return False
+    return True
+
+
+def get_mask_2d_greedy(mat, n, m):
+    r"""
+    Greedily generate a 2D `n:m` sparse pattern mask of the input matrix :attr:`mat`.
+    This function pads each dimension of :attr:`mat` with zeros to be a multiple of :attr:`m` before mask generation.
+
+    2D `n:m` sparse pattern: At least :math:`n \times n` zeros in every :math:`m \times m` block
+    under the constraint of at least :attr:`n` zeros for each row and column.
+    Greedy generation: for each :math:`m \times m` block, select the values to keep in descending order.
+
+    Args:
+        mat (nparray): The input matrix.
+        n (int): n of `n:m` sparse pattern.
+        m (int): m of `n:m` sparse pattern.
+    Returns:
+        nparray: The 2D `n:m` sparse mask of :attr:`mat`.
+    Examples:
+        .. code-block:: python
+
+          import numpy as np
+          import paddle.fluid.contrib.sparsity as sparsity
+
+          mat = np.array([[9, 8, 3, 7],
+                          [9, 2, 1, 10],
+                          [5, 1, 3, 6],
+                          [2, 4, 6, 1]])
+          mask = sparsity.get_mask_2d_greedy(mat, 2, 4)
+          # nparray([[1. 1. 0. 0.]
+          #          [1. 0. 0. 1.]
+          #          [0. 0. 1. 1.]
+          #          [0. 1. 1. 0.]])
+          sparsity.check_mask_2d(mask, 2, 4) # True
+    """
+    mat_padded, shape = reshape_2d(mat, m)
+    mask_padded = np.zeros_like(mat_padded).reshape(-1, m, m)
+
+    for idx in range(len(mat_padded)):
+        sub_mat = np.absolute(np.squeeze(mat_padded[idx]))
+        sub_mask = np.squeeze(mask_padded[idx])
+
+        min_order_1d_indices = np.argsort(sub_mat)
+        min_order_2d_indices = [(int(x / m), x % m)
+                                for x in min_order_1d_indices]
+        row_counter = collections.Counter()
+        col_counter = collections.Counter()
+
+        for i in range(len(min_order_1d_indices) - 1, -1, -1):
+            matrix_entry = min_order_2d_indices[i]
+            if (row_counter[matrix_entry[0]] == n) or \
+               (col_counter[matrix_entry[1]] == n):
+                continue
+
+            sub_mask[matrix_entry[0], matrix_entry[1]] = 1.0
+            row_counter[matrix_entry[0]] += 1
+            col_counter[matrix_entry[1]] += 1
+
+    mask = np.empty(shape)
+    curr_idx = 0
+    for row_start in range(0, shape[0], m):
+        row_end = row_start + m
+        for col_start in range(0, shape[1], m):
+            col_end = col_start + m
+            mask[row_start:row_end, col_start:col_end] = mask_padded[curr_idx]
+            curr_idx += 1
+    return mask[:mat.shape[0], :mat.shape[1]]
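+
+# Module-level cache of valid 2D patterns, keyed by '{m}_{n}'. The lock keeps
+# concurrent callers of compute_valid_2d_patterns from racing on the first
+# (expensive) pattern enumeration.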
+valid_2d_patterns_lock = threading.Lock()
+valid_2d_patterns = {}
+
+
+def compute_valid_2d_patterns(n, m):
+    r"""
+    Compute all valid 2D `n:m` sparse patterns.
+
+    2D `n:m` sparse pattern: At least :math:`n \times n` zeros in every :math:`m \times m` block
+    under the constraint of at least :attr:`n` zeros for each row and column.
+
+    Args:
+        n (int): n of `n:m` sparse pattern.
+        m (int): m of `n:m` sparse pattern.
+    Returns:
+        nparray: All valid 2D `n:m` sparse patterns, also cached in a module-level dictionary under key *m_n* (string).
+    """
+    global valid_2d_patterns_lock
+    global valid_2d_patterns
+
+    valid_key = '{}_{}'.format(m, n)
+    if valid_key in valid_2d_patterns:
+        return valid_2d_patterns[valid_key]
+    else:
+        patterns = np.zeros(m)
+        patterns[:n] = 1
+        patterns = list(set(permutations(patterns.tolist())))
+        patterns = patterns + patterns
+        patterns = np.asarray(list(set(permutations(patterns, m))))
+
+        valid = ((patterns.sum(axis=1) <= n).sum(axis=1) == m
+                 ).nonzero()[0].reshape(-1)
+        valid_patterns = np.empty((valid.shape[0], m, m))
+        valid_patterns[:] = patterns[valid[:]]
+
+        valid_2d_patterns_lock.acquire()
+        valid_2d_patterns[valid_key] = valid_patterns
+        valid_2d_patterns_lock.release()
+
+        return valid_patterns
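+
+# For the default 2:4 case this enumerates the 90 binary 4x4 matrices whose
+# rows and columns each contain exactly two ones, cached under key '4_2'.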
+
+
+def get_mask_2d_best(mat, n, m):
+    r"""
+    Generate a 2D `n:m` sparse pattern mask of the input matrix :attr:`mat`
+    that forms a sparse matrix with maximum L1 norm. This function pads each
+    dimension of :attr:`mat` with zeros to be a multiple of :attr:`m` before mask generation.
+
+    2D `n:m` sparse pattern: At least :math:`n \times n` zeros in every :math:`m \times m` block
+    under the constraint of at least :attr:`n` zeros for each row and column.
+
+    *Note*: The L1 norm of a sparse matrix from the `Best` API is greater than or equal to the one from `Greedy`.
+
+    Args:
+        mat (nparray): The input matrix.
+        n (int): n of `n:m` sparse pattern.
+        m (int): m of `n:m` sparse pattern.
+    Returns:
+        nparray: The 2D `n:m` sparse mask of :attr:`mat`.
+    Examples:
+        .. code-block:: python
+
+          import numpy as np
+          import paddle.fluid.contrib.sparsity as sparsity
+
+          mat = np.array([[2, 8, 9, 9],
+                          [9, 1, 3, 9],
+                          [5, 6, 3, 9],
+                          [2, 4, 6, 9]])
+          mask_greedy = sparsity.get_mask_2d_greedy(mat, 2, 4)
+          mask_best = sparsity.get_mask_2d_best(mat, 2, 4)
+          print("L1 norm of `greedy` sparse matrix", np.multiply(mat, mask_greedy).sum()) # 56
+          print("L1 norm of `best` sparse matrix", np.multiply(mat, mask_best).sum()) # 61
+    """
+    patterns = compute_valid_2d_patterns(n, m)
+
+    mat_flattern, shape = reshape_2d(mat, m)
+    mask_flattern = np.ones_like(mat_flattern).reshape(-1, m, m)
+    pmax = np.argmax(
+        np.matmul(mat_flattern, patterns.reshape(patterns.shape[0], m * m).T),
+        axis=1)
+
+    mask_flattern[:] = patterns[pmax[:]]
+    mask = np.empty(shape)
+
+    curr_idx = 0
+    for row_start in range(0, shape[0], m):
+        row_end = row_start + m
+        for col_start in range(0, shape[1], m):
+            col_end = col_start + m
+            mask[row_start:row_end, col_start:col_end] = mask_flattern[curr_idx]
+            curr_idx += 1
+    return mask[:mat.shape[0], :mat.shape[1]]
+
+
+def create_mask(tensor, func_name=MaskAlgo.MASK_1D, n=2, m=4):
+    r"""
+    Create an `n:m` sparse pattern mask of the input tensor via the function given by :attr:`func_name`.
+    Currently only tensors with dimension less than or equal to 4 are supported.
+
+    Args:
+        tensor (nparray): The input tensor.
+        func_name (MaskAlgo, optional): The function name to generate the sparse mask. Default is `MaskAlgo.MASK_1D`. For all options please refer to `MaskAlgo`.
+        n (int, optional): n of `n:m` sparse pattern. Default is 2.
+        m (int, optional): m of `n:m` sparse pattern. Default is 4.
+    Returns:
+        nparray: The `n:m` sparse mask of :attr:`tensor` generated by :attr:`func_name`.
+    Examples:
+        .. code-block:: python
+
+          import numpy as np
+          import paddle.fluid.contrib.sparsity as sparsity
+
+          tensor = np.array([[2, 8, 9, 9],
+                             [9, 1, 3, 9],
+                             [5, 6, 3, 9],
+                             [2, 4, 6, 9]])
+          mask_1d = sparsity.create_mask(tensor, func_name=sparsity.MaskAlgo.MASK_1D)
+          # nparray([[0 0 1 1],
+          #          [1 0 0 1],
+          #          [0 1 0 1],
+          #          [0 0 1 1]])
+          mask_2d = sparsity.create_mask(tensor, func_name=sparsity.MaskAlgo.MASK_2D_BEST)
+          # nparray([[0 1 1 0],
+          #          [1 0 0 1],
+          #          [1 1 0 0],
+          #          [0 0 1 1]])
+    """
+    shape = tensor.shape
+    dtype = tensor.dtype
+    t = tensor.astype(float)
+
+    assert type(func_name) == MaskAlgo, \
+           "func_name argument of create_mask is only accepted as type MaskAlgo. " \
+           "But got {}".format(type(func_name))
+    func = getattr(sys.modules[__name__], func_name.value, None)
+    if len(shape) == 1:
+        t = t.reshape(1, shape[0])
+        mask = func(t, n=n, m=m)
+        return mask.reshape(shape).astype(dtype)
+    elif len(shape) == 2:
+        t = t.reshape(shape[0], shape[1])
+        mask = func(t, n=n, m=m)
+        return mask.reshape(shape).astype(dtype)
+    elif len(shape) == 3:
+        t = t.reshape(shape[0] * shape[1], shape[2])
+        mask = func(t, n=n, m=m)
+        return mask.reshape(shape).astype(dtype)
+    # 4d-tensor conv (out, in, h, w) -> (out, in*h*w) in GemmConvKernel Op
+    elif len(shape) == 4:
+        t = t.reshape(shape[0], shape[1] * shape[2] * shape[3])
+        mask = func(t, n=n, m=m)
+        return mask.reshape(shape).astype(dtype)
+    else:
+        assert False, "The dimension of the input tensor is not supported in create_mask. " \
+                      "Only dimension <= 4 is supported, but got {}".format(len(shape))
" \ + "But got {}".format(type(func_name)) + func = getattr(sys.modules[__name__], func_name.value, None) + if len(shape) == 1: + t = t.reshape(1, shape[0]) + return func(t, n=n, m=m) + elif len(shape) == 2: + t = t.reshape(shape[0], shape[1]) + return func(t, n=n, m=m) + elif len(shape) == 3: + t = t.reshape(shape[0] * shape[1], shape[2]) + return func(t, n=n, m=m) + # 4d-tensor conv (out, in, h, w) -> (out, in*h*w) in GemmConvKernel Op + elif len(shape) == 4: + t = t.reshape(shape[0], shape[1] * shape[2] * shape[3]) + return func(t, n=n, m=m) + else: + assert True, "The dimension of input tensor is not supported in check_sparsity, " \ + "Only dimension < 4 is supported but got {}".format(len(shape)) + + return False diff --git a/python/paddle/fluid/tests/unittests/column_parallel_linear_api.py b/python/paddle/fluid/tests/unittests/column_parallel_linear_api.py index cfe70cf2922392..815018dc4b2f4e 100644 --- a/python/paddle/fluid/tests/unittests/column_parallel_linear_api.py +++ b/python/paddle/fluid/tests/unittests/column_parallel_linear_api.py @@ -69,7 +69,7 @@ def get_model(self, main_prog, startup_program, rank): axis=1, num_partitions=2, weight_attr=param_attr, - bias_attr=False, ) + bias_attr=True, ) return [linear_out] diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_alexnet.py b/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_alexnet.py index 14d7e960f4a68c..912849ffbeb71c 100644 --- a/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_alexnet.py +++ b/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_alexnet.py @@ -113,7 +113,7 @@ def test_pp_model(self): print("loss: ", loss_a.numpy(), loss_b.numpy()) np.testing.assert_allclose( - loss_a.numpy(), loss_b.numpy(), rtol=1e-5) + loss_a.numpy(), loss_b.numpy(), rtol=5e-5) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_cast_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_cast_mkldnn_op.py new file mode 100644 index 00000000000000..95de37fdc0251a --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_cast_mkldnn_op.py @@ -0,0 +1,78 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import numpy as np + +import paddle +import paddle.fluid.core as core +import paddle.fluid as fluid +from paddle.fluid import compiler, Program, program_guard +from paddle.fluid.tests.unittests.op_test import OpTest, convert_float_to_uint16 + + +@unittest.skipIf(not core.supports_bfloat16(), + "place does not support BF16 evaluation") +class TestCastBF16ToFP32MKLDNNOp(OpTest): + def init_data(self): + self.out = np.random.random(size=[10, 10]).astype("float32") + self.x = convert_float_to_uint16(self.out) + + def setUp(self): + self.init_data() + self.inputs = {'X': self.x} + self.outputs = {'Out': self.out} + prepare_dtype = lambda x: int(core.VarDesc.VarType.BF16 if x.dtype != np.float32 else core.VarDesc.VarType.FP32) + self.attrs = { + 'in_dtype': prepare_dtype(self.x), + 'out_dtype': prepare_dtype(self.out), + 'use_mkldnn': True + } + self.op_type = 'cast' + + def test_check_output(self): + self.check_output(check_dygraph=False) + + def test_check_grad(self): + self.check_grad_with_place( + core.CPUPlace(), ["X"], + "Out", + check_dygraph=False, + user_defined_grads=[self.inputs['X']], + user_defined_grad_outputs=[self.outputs['Out']]) + + +class TestCastFP32ToBF16MKLDNNOp(TestCastBF16ToFP32MKLDNNOp): + def init_data(self): + self.x = np.random.random(size=[2, 6]).astype("float32") + self.out = convert_float_to_uint16(self.x) + + +class TestCastBF16ToBF16MKLDNNOp(TestCastBF16ToFP32MKLDNNOp): + def init_data(self): + self.x = np.random.random(size=[6, 13]).astype("uint16") + self.out = self.x + + +class TestCastFP32ToFP32MKLDNNOp(TestCastBF16ToFP32MKLDNNOp): + def init_data(self): + self.x = np.random.random(size=[7, 15]).astype("float32") + self.out = self.x + + +if __name__ == '__main__': + paddle.enable_static() + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index 3524d1e553d1bb..654723d8629900 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -1191,8 +1191,12 @@ def find_actual(target_name, fetch_list): np.float32, np.float64 ]: actual_t = convert_uint16_to_float(actual_t) - atol = 0.03 + atol = max(atol, 0.03) + if expect_t.dtype == np.uint16 and actual_t.dtype == np.uint16: + expect_t = convert_uint16_to_float(expect_t) + actual_t = convert_uint16_to_float(actual_t) + atol = max(atol, 0.03) # NOTE(zhiqiu): np.allclose([], [1.]) returns True # see details: https://stackoverflow.com/questions/38331703/why-does-numpys-broadcasting-sometimes-allow-comparing-arrays-of-different-leng if expect_t.size == 0: @@ -1501,13 +1505,21 @@ def check_grad_with_place(self, # comparison of bf16 results will happen as fp32 # loop over list of grads and convert bf16 to fp32 - fp32_grads = [] + fp32_analytic_grads = [] for grad in analytic_grads: if grad.dtype == np.uint16: grad = convert_uint16_to_float(grad) max_relative_error = 0.03 - fp32_grads.append(grad) - analytic_grads = fp32_grads + fp32_analytic_grads.append(grad) + analytic_grads = fp32_analytic_grads + + fp32_numeric_grads = [] + for grad in numeric_grads: + if grad.dtype == np.uint16: + grad = convert_uint16_to_float(grad) + max_relative_error = 0.03 + fp32_numeric_grads.append(grad) + numeric_grads = fp32_numeric_grads self._assert_is_close(numeric_grads, analytic_grads, inputs_to_check, max_relative_error, diff --git a/python/paddle/fluid/tests/unittests/row_parallel_linear_api.py 
diff --git a/python/paddle/fluid/tests/unittests/row_parallel_linear_api.py b/python/paddle/fluid/tests/unittests/row_parallel_linear_api.py
index a62e3c05508a16..a24c0874482113 100644
--- a/python/paddle/fluid/tests/unittests/row_parallel_linear_api.py
+++ b/python/paddle/fluid/tests/unittests/row_parallel_linear_api.py
@@ -65,12 +65,12 @@ def get_model(self, main_prog, startup_program, rank):
         linear_out = paddle.distributed.split(
             data,
-            size=(1000, 8),
+            size=(1000, 16),
             operation='linear',
             axis=0,
             num_partitions=2,
             weight_attr=param_attr,
-            bias_attr=False, )
+            bias_attr=True, )
 
         return [linear_out]
diff --git a/python/paddle/fluid/tests/unittests/test_asp_utils.py b/python/paddle/fluid/tests/unittests/test_asp_utils.py
new file mode 100644
index 00000000000000..faffd477ae5661
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_asp_utils.py
@@ -0,0 +1,189 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2021 NVIDIA Corporation. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import threading, time
+import paddle
+from paddle.fluid.contrib import sparsity
+import numpy as np
+
+
+class TestASPUtils(unittest.TestCase):
+    def test_get_check_method(self):
+        self.assertEqual(
+            sparsity.CheckMethod.get_checking_method(sparsity.MaskAlgo.MASK_1D),
+            sparsity.CheckMethod.CHECK_1D)
+        self.assertEqual(
+            sparsity.CheckMethod.get_checking_method(
+                sparsity.MaskAlgo.MASK_2D_GREEDY),
+            sparsity.CheckMethod.CHECK_2D)
+        self.assertEqual(
+            sparsity.CheckMethod.get_checking_method(
+                sparsity.MaskAlgo.MASK_2D_BEST), sparsity.CheckMethod.CHECK_2D)
+
+    def test_density(self):
+        x = np.array([[1.0, 1.0, 1.0, 0.0, 1.0], [1.0, 1.0, 0.0, 0.0, 1.0],
+                      [1.0, 0.0, 0.0, 0.0, 1.0], [1.0, 1.0, 0.0, 0.0, 1.0],
+                      [0.0, 1.0, 0.0, 0.0, 1.0]])
+        self.assertEqual(sparsity.density(x), 0.56)
+        x[:, 0] = 0.0
+        self.assertEqual(sparsity.density(x), 0.4)
+
+    def test_check_mask_1d(self):
+        x = np.array([[1.0, 0.0, 0.0, 1.0, 1.0], [1.0, 1.0, 0.0, 0.0, 1.0],
+                      [1.0, 1.0, 0.0, 0.0, 1.0], [1.0, 1.0, 0.0, 0.0, 1.0],
+                      [0.0, 1.0, 0.0, 0.0, 1.0]])
+        self.assertTrue(sparsity.check_mask_1d(x, 2, 4))
+        self.assertFalse(sparsity.check_mask_1d(x, 3, 4))
+        self.assertTrue(sparsity.check_mask_1d(x, 2, 5))
+        self.assertFalse(sparsity.check_mask_1d(x, 3, 5))
+        self.assertTrue(sparsity.check_mask_1d(x, 3, 6))
+        self.assertFalse(sparsity.check_mask_1d(x, 4, 6))
+
+    def test_get_mask_1d(self):
+        for _ in range(10):
+            x = np.random.randint(10, size=(5, 5))
+            x = sparsity.get_mask_1d(x, 2, 4)
+            self.assertTrue(sparsity.check_mask_1d(x, 2, 4))
+
+            x = np.random.randn(5, 4)
+            x = sparsity.get_mask_1d(x, 2, 4)
+            self.assertTrue(sparsity.check_mask_1d(x, 2, 4))
+
+    def test_check_mask_2d(self):
+        x = np.array([[1.0, 0.0, 0.0, 1.0, 1.0], [0.0, 1.0, 0.0, 0.0, 0.0],
+                      [0.0, 0.0, 1.0, 0.0, 1.0], [1.0, 1.0, 0.0, 0.0, 0.0],
+                      [0.0, 1.0, 0.0, 0.0, 1.0]])
+        self.assertTrue(sparsity.check_mask_2d(x, 2, 4))
+        self.assertFalse(sparsity.check_mask_2d(x, 3, 4))
+        self.assertTrue(sparsity.check_mask_2d(x, 2, 5))
+        self.assertFalse(sparsity.check_mask_2d(x, 3, 5))
+        self.assertTrue(sparsity.check_mask_2d(x, 3, 6))
+        self.assertFalse(sparsity.check_mask_2d(x, 4, 6))
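+
+    # As the assertions above exercise it, check_mask_1d(x, n, m) accepts a
+    # matrix when every group of m consecutive elements in a row contains at
+    # least n zeros; rows shorter than a multiple of m behave as if padded
+    # with zeros, e.g. [1, 0, 0, 1, 1] grouped with m=6 acts like
+    # [1, 0, 0, 1, 1, 0] with three zeros, so n=3 passes while n=4 fails.
+    # check_mask_2d applies the analogous requirement to the rows and columns
+    # of each m x m block.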
+
+    def test_get_mask_2d_greedy(self):
+        for _ in range(10):
+            x = np.random.randint(10, size=(5, 5))
+            x = sparsity.get_mask_2d_greedy(x, 2, 4)
+            self.assertTrue(sparsity.check_mask_2d(x, 2, 4))
+
+            x = np.random.randn(5, 4)
+            x = sparsity.get_mask_2d_greedy(x, 2, 4)
+            self.assertTrue(sparsity.check_mask_2d(x, 2, 4))
+
+    def test_get_mask_2d_best(self):
+        for _ in range(10):
+            x = np.random.randint(10, size=(5, 5))
+            x = sparsity.get_mask_2d_best(x, 2, 4)
+            self.assertTrue(sparsity.check_mask_2d(x, 2, 4))
+
+            x = np.random.randn(5, 4)
+            x = sparsity.get_mask_2d_best(x, 2, 4)
+            self.assertTrue(sparsity.check_mask_2d(x, 2, 4))
+
+    def test_threadsafe_valid_2d_patterns(self):
+        def get_reference(m=4, n=2):
+            from itertools import permutations
+
+            patterns = np.zeros(m)
+            patterns[:n] = 1
+            patterns = list(set(permutations(patterns.tolist())))
+            patterns = patterns + patterns
+            patterns = np.asarray(list(set(permutations(patterns, m))))
+
+            valid = ((patterns.sum(axis=1) <= n).sum(axis=1) == m
+                     ).nonzero()[0].reshape(-1)
+            valid_patterns = np.empty((valid.shape[0], m, m))
+            valid_patterns[:] = patterns[valid[:]]
+            return valid_patterns
+
+        for _ in range(4):
+            computing_thread = threading.Thread(
+                target=paddle.fluid.contrib.sparsity.utils.
+                compute_valid_2d_patterns,
+                args=(2, 4))
+            computing_thread.start()
+        time.sleep(3)
+        patterns_map = paddle.fluid.contrib.sparsity.utils.valid_2d_patterns
+        reference_patterns = get_reference()
+        reference_key = '4_2'
+
+        self.assertTrue(reference_key in patterns_map)
+        self.assertTrue(len(patterns_map) == 1)
+        self.assertTrue(
+            (reference_patterns == patterns_map[reference_key]).all())
+
+    def test_check_sparsity(self):
+        for _ in range(10):
+            x = np.random.randint(10, size=(5))
+            x_2d = x.reshape(1, x.shape[0])
+            self.__test_1D_2D_sparsity_checking_methods(x_2d)
+
+            x = np.random.randint(10, size=(5, 5))
+            x_2d = x
+            self.__test_1D_2D_sparsity_checking_methods(x_2d)
+
+            x = np.random.randint(10, size=(5, 5, 5))
+            x_2d = x.reshape(x.shape[0] * x.shape[1], x.shape[2])
+            self.__test_1D_2D_sparsity_checking_methods(x_2d)
+
+            x = np.random.randint(10, size=(5, 5, 5, 5))
+            x_2d = x.reshape(x.shape[0], x.shape[1] * x.shape[2] * x.shape[3])
+            self.__test_1D_2D_sparsity_checking_methods(x_2d)
+
+    def test_create_mask(self):
+        for _ in range(10):
+            x = np.random.randint(10, size=(5))
+            self.__test_1D_2D_sparse_mask_generation_methods(x)
+
+            x = np.random.randint(10, size=(5, 5))
+            self.__test_1D_2D_sparse_mask_generation_methods(x)
+
+            x = np.random.randint(10, size=(5, 5, 5))
+            self.__test_1D_2D_sparse_mask_generation_methods(x)
+
+            x = np.random.randint(10, size=(5, 5, 5, 5))
+            self.__test_1D_2D_sparse_mask_generation_methods(x)
+
+    def __test_1D_2D_sparsity_checking_methods(self, x_2d):
+        mask = sparsity.get_mask_1d(x_2d, 2, 4)
+        self.assertEqual(
+            sparsity.check_sparsity(
+                mask, func_name=sparsity.CheckMethod.CHECK_1D, n=2, m=4),
+            sparsity.check_mask_1d(mask, 2, 4))
+        mask = sparsity.get_mask_2d_best(x_2d, 2, 4)
+        self.assertEqual(
+            sparsity.check_sparsity(
+                mask, func_name=sparsity.CheckMethod.CHECK_2D, n=2, m=4),
+            sparsity.check_mask_2d(mask, 2, 4))
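+
+    # CheckMethod.get_checking_method (see test_get_check_method above) pairs
+    # each mask-generation algorithm with its verifier: MASK_1D with CHECK_1D,
+    # and both MASK_2D_GREEDY and MASK_2D_BEST with CHECK_2D. The helper below
+    # therefore hands create_mask a MaskAlgo and check_sparsity the matching
+    # CheckMethod.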
+
+    def __test_1D_2D_sparse_mask_generation_methods(self, x):
+        mask = sparsity.create_mask(
+            x, func_name=sparsity.MaskAlgo.MASK_1D, n=2, m=4)
+        self.assertTrue(
+            sparsity.check_sparsity(
+                mask, func_name=sparsity.CheckMethod.CHECK_1D, n=2, m=4))
+        mask = sparsity.create_mask(
+            x, func_name=sparsity.MaskAlgo.MASK_2D_GREEDY, n=2, m=4)
+        self.assertTrue(
+            sparsity.check_sparsity(
+                mask, func_name=sparsity.CheckMethod.CHECK_2D, n=2, m=4))
+        mask = sparsity.create_mask(
+            x, func_name=sparsity.MaskAlgo.MASK_2D_BEST, n=2, m=4)
+        self.assertTrue(
+            sparsity.check_sparsity(
+                mask, func_name=sparsity.CheckMethod.CHECK_2D, n=2, m=4))
diff --git a/python/paddle/fluid/tests/unittests/test_collective_api_base.py b/python/paddle/fluid/tests/unittests/test_collective_api_base.py
index e6693b676cf643..f0c042eb7e95b6 100644
--- a/python/paddle/fluid/tests/unittests/test_collective_api_base.py
+++ b/python/paddle/fluid/tests/unittests/test_collective_api_base.py
@@ -154,7 +154,10 @@ def _run_cluster(self, model_file, envs):
         #update environment
         env0.update(envs)
         env1.update(envs)
-        tr_cmd = "%s %s"
+        if os.getenv('WITH_COVERAGE', 'OFF') == 'ON':
+            tr_cmd = "%s -m coverage run --branch -p %s"
+        else:
+            tr_cmd = "%s %s"
         tr0_cmd = tr_cmd % (self._python_interp, model_file)
         tr1_cmd = tr_cmd % (self._python_interp, model_file)
         tr0_pipe = open("/tmp/tr0_err_%d.log" % os.getpid(), "w")
diff --git a/python/setup.py.in b/python/setup.py.in
index 0f2e97192c1df1..79c67182f9c791 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -185,6 +185,7 @@ packages=['paddle',
           'paddle.fluid.contrib.mixed_precision',
           'paddle.fluid.contrib.mixed_precision.bf16',
           'paddle.fluid.contrib.layers',
+          'paddle.fluid.contrib.sparsity',
           'paddle.fluid.transpiler',
           'paddle.fluid.transpiler.details',
           'paddle.fluid.incubate',
diff --git a/tools/get_pr_ut.py b/tools/get_pr_ut.py
index 470242da34ddd0..0df3b4914f5df9 100644
--- a/tools/get_pr_ut.py
+++ b/tools/get_pr_ut.py
@@ -233,9 +233,9 @@ def is_only_comment(self, f):
     def get_all_count(self):
         os.system(
-            "cd %s/build && ctest -N|grep 'Total Tests:' | awk -F ': ' '{print $2}' > testCount"
+            "cd %sbuild && ctest -N|grep 'Total Tests:' | awk -F ': ' '{print $2}' > testCount"
             % PADDLE_ROOT)
-        f = open("%s/build/testCount" % PADDLE_ROOT)
+        f = open("%sbuild/testCount" % PADDLE_ROOT)
         testCount = f.read()
         f.close()
         return int(testCount.strip())
diff --git a/tools/parallel_UT_rule.py b/tools/parallel_UT_rule.py
index cb0581d671006e..55b82084f6bc5a 100644
--- a/tools/parallel_UT_rule.py
+++ b/tools/parallel_UT_rule.py
@@ -650,7 +650,6 @@
     'test_collective_wait',
     'test_collective_split_row_linear',
     'test_collective_split_embedding',
-    'test_custom_attrs_jit',
     'float16_gpu_test',
     'test_leaky_relu_grad_grad_functor',
     'test_complex_simplenet',
diff --git a/tools/static_mode_white_list.py b/tools/static_mode_white_list.py
index c5ea8891a21ee2..2c50c4bf9f6207 100644
--- a/tools/static_mode_white_list.py
+++ b/tools/static_mode_white_list.py
@@ -589,6 +589,7 @@
     'test_matmul_op_with_head',
     'test_var_conv_2d',
     'test_batch_norm_mkldnn_op',
+    'test_cast_mkldnn_op',
     'test_concat_int8_mkldnn_op',
     'test_concat_bf16_mkldnn_op',
     'test_concat_mkldnn_op',
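The get_pr_ut.py change above drops the hard-coded path separator, which is only safe if PADDLE_ROOT is guaranteed to end with one; if that invariant ever breaks, "%sbuild" silently produces a wrong path. Joining the pieces is the more defensive pattern. A minimal standalone sketch (not the PR's code; the '/paddle/' default is illustrative):

    import os

    PADDLE_ROOT = os.getenv('PADDLE_ROOT', '/paddle/')
    # os.path.join yields '/paddle/build' whether or not the root
    # carries a trailing separator.
    build_dir = os.path.join(PADDLE_ROOT, 'build')
    count_file = os.path.join(build_dir, 'testCount')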
diff --git a/tools/test_model_benchmark.sh b/tools/test_model_benchmark.sh
index 8f8026b0adcef7..98066d7beeaa77 100644
--- a/tools/test_model_benchmark.sh
+++ b/tools/test_model_benchmark.sh
@@ -24,11 +24,13 @@ function check_whl {
     mkdir -p /tmp/pr && mkdir -p /tmp/develop
 
     unzip -q build/python/dist/*.whl -d /tmp/pr
+    rm -f build/python/dist/*.whl && rm -f build/python/build/.timestamp
 
     git checkout .
     git checkout -b develop_base_pr upstream/$BRANCH
+    bash -x paddle/scripts/paddle_build.sh build
+    [ $? -ne 0 ] && echo "install paddle failed." && exit 1
     cd build
-    make -j `nproc`
     unzip -q python/dist/*.whl -d /tmp/develop
 
     sed -i '/version.py/d' /tmp/pr/*/RECORD
diff --git a/tools/windows/build_compile_environment.bat b/tools/windows/build_compile_environment.bat
index 4a61a99c34fa24..603c9911a44f99 100644
--- a/tools/windows/build_compile_environment.bat
+++ b/tools/windows/build_compile_environment.bat
@@ -132,7 +132,7 @@ goto :eof
 :vs
 echo ">>>>>>>> step [4/7]: Visual Studio 2017 "
 cmd /C "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvars64.bat" > nul 2> nul || call :install_visual_studio
-goto :cuda10
+goto :cuda
 
 :install_visual_studio
 echo There is not Visual Studio in this PC, will install VS2017.
@@ -153,7 +153,7 @@ goto :eof
 :: ===== end step 4: Visual Studio 2017 =====
 
 :: ===== start step 5: CUDA 11 =====
-:cuda10
+:cuda
 echo ">>>>>>>> step [5/7]: CUDA 11.2"
 cmd /C nvcc --version 2> nul | findstr /C:"11.2" > nul 2> nul || call :install_cuda
 goto java-jre