diff --git a/CMakeLists.txt b/CMakeLists.txt
index 30f9e3a3dcdd2c..f30671bd3a87e8 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License
-cmake_minimum_required(VERSION 3.15)
+cmake_minimum_required(VERSION 3.10)
cmake_policy(VERSION 3.10)
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
set(PADDLE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
@@ -22,9 +22,6 @@ include(system)
project(paddle CXX C)
-include(init)
-include(generic) # simplify cmake module
-
# enable language CUDA
# TODO(Shibo Tao): remove find_package(CUDA) completely.
find_package(CUDA QUIET)
@@ -34,10 +31,14 @@ option(WITH_XPU "Compile PaddlePaddle with BAIDU KUNLUN XPU" OFF)
option(WITH_WIN_DUMP_DBG "Compile with windows core dump debug mode" OFF)
option(WITH_ASCEND "Compile PaddlePaddle with ASCEND" OFF)
option(WITH_ROCM "Compile PaddlePaddle with ROCM platform" OFF)
-# NOTE(zhiqiu): WITH_ASCEND_CL can be compile on x86_64, so we can set WITH_ASCEND=OFF and WITH_ASCEND_CL=ON
+# NOTE(zhiqiu): WITH_ASCEND_CL can be compiled on x86_64, so we can set WITH_ASCEND=OFF and WITH_ASCEND_CL=ON
# to develop some acl related functionality on x86
option(WITH_ASCEND_CL "Compile PaddlePaddle with ASCEND CL" ${WITH_ASCEND})
option(WITH_ASCEND_CXX11 "Compile PaddlePaddle with ASCEND and CXX11 ABI" OFF)
+# Note(zhouwei): It uses the options above, so it is placed here
+include(init)
+include(generic) # simplify cmake module
+
if (WITH_GPU AND WITH_XPU)
message(FATAL_ERROR "Error when compile GPU and XPU at the same time")
endif()
@@ -65,7 +66,7 @@ if(WITH_MUSL)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=deprecated-declarations -Wno-deprecated-declarations -Wno-error=pessimizing-move -Wno-error=deprecated-copy")
endif()
-if(WITH_ASCEND AND NOT WITH_ASCEND_CXX11)
+if(WITH_ASCEND_CL AND NOT WITH_ASCEND_CXX11)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0")
endif()
@@ -103,9 +104,9 @@ if(WIN32)
endif()
endforeach(flag_var)
endif()
-
- # NOTE(Avin0323): Less parallel count result in faster compilation.
+
math(EXPR PROCESS_MAX "${CPU_CORES} * 2 / 3")
+
# windows build turn off warnings, use parallel compiling.
foreach(flag_var
CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
@@ -113,7 +114,10 @@ if(WIN32)
CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO)
string(REGEX REPLACE "/W[1-4]" " /W0 " ${flag_var} "${${flag_var}}")
- set(${flag_var} "${${flag_var}} /MP${PROCESS_MAX}")
+ # NOTE(zhouwei25): GPU compilation has too high memory utilization when compiling in parallel
+ if(NOT WITH_GPU)
+ set(${flag_var} "${${flag_var}} /MP${PROCESS_MAX}")
+ endif()
endforeach(flag_var)
foreach(flag_var CMAKE_CXX_FLAGS CMAKE_C_FLAGS)
set(${flag_var} "${${flag_var}} /w")
@@ -133,6 +137,9 @@ if(WIN32)
foreach(flag_var CMAKE_SHARED_LINKER_FLAGS CMAKE_STATIC_LINKER_FLAGS CMAKE_EXE_LINKER_FLAGS CMAKE_LINKER_FLAGS)
set(${flag_var} "${${flag_var}} /ignore:4049 /ignore:4217 /ignore:4006 /ignore:4221")
+ if(MSVC_STATIC_CRT)
+ set(${flag_var} "${${flag_var}} /NODEFAULTLIB:MSVCRT.LIB")
+ endif()
endforeach(flag_var)
if (WITH_WIN_DUMP_DBG)
@@ -182,7 +189,6 @@ option(WITH_PSLIB "Compile with pslib support" OFF)
option(WITH_BOX_PS "Compile with box_ps support" OFF)
option(WITH_XBYAK "Compile with xbyak support" ON)
option(WITH_CONTRIB "Compile the third-party contributation" OFF)
-option(WITH_GRPC "Use grpc as the default rpc framework" ${WITH_DISTRIBUTE})
option(WITH_PSCORE "Compile with parameter server support" ${WITH_DISTRIBUTE})
option(WITH_HETERPS "Compile with heterps" OFF})
option(WITH_INFERENCE_API_TEST "Test fluid inference C++ high-level api interface" OFF)
@@ -199,6 +205,7 @@ option(WITH_SW "Compile PaddlePaddle with sw support" OFF)
option(WITH_MIPS "Compile PaddlePaddle with mips support" OFF)
option(WITH_MUSL "Compile with musl libc instead of gblic" OFF)
option(WITH_UNITY_BUILD "Compile with UnityBuild mode" OFF)
+option(WITH_STRIP "Strip so files of Whl packages" OFF)
# PY_VERSION
if(NOT PY_VERSION)
@@ -259,9 +266,6 @@ endif()
if(WITH_BRPC_RDMA)
message(STATUS "Use brpc with rdma.")
- if(WITH_GRPC)
- message(FATAL_ERROR "Can't use grpc with brpc rdma.")
- endif()
if(NOT WITH_DISTRIBUTE)
message(FATAL_ERROR "Can't use brpc rdma in no distribute env.")
endif()
@@ -349,6 +353,11 @@ if (WITH_MIPS)
add_definitions(-DPADDLE_WITH_MIPS)
endif()
+if (WITH_HETERPS)
+ if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -faligned-new")
+ endif()
+endif()
set(PADDLE_PYTHON_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/python/build")
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
@@ -366,6 +375,13 @@ else()
message(WARNING "On inference mode, will take place some specific optimization. Turn on the ON_INFER flag when building inference_lib only.")
endif()
+if(WITH_STRIP)
+ find_program(STRIP_PATH strip)
+ if(NOT STRIP_PATH OR NOT LINUX)
+ set(WITH_STRIP OFF CACHE STRING "Command strip is only used on Linux when it exists." FORCE)
+ endif()
+endif()
+
add_subdirectory(paddle)
if(WITH_PYTHON)
add_subdirectory(python)
diff --git a/README.md b/README.md
index e8a7013d0b4432..8b437e4115abe8 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,4 @@
-
-
+
diff --git a/cmake/configure.cmake b/cmake/configure.cmake
index bf1352d4e11479..e7f125269be1f5 100644
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -177,10 +177,6 @@ if(WITH_HETERPS)
add_definitions(-DPADDLE_WITH_HETERPS)
endif()
-if(WITH_GRPC)
- add_definitions(-DPADDLE_WITH_GRPC)
-endif(WITH_GRPC)
-
if(WITH_BRPC_RDMA)
add_definitions(-DPADDLE_WITH_BRPC_RDMA)
endif(WITH_BRPC_RDMA)
diff --git a/cmake/external/ascend.cmake b/cmake/external/ascend.cmake
index bddd2023b437b1..414b2a54be0342 100644
--- a/cmake/external/ascend.cmake
+++ b/cmake/external/ascend.cmake
@@ -21,7 +21,13 @@ else()
set(ASCEND_DIR /usr/local/Ascend)
endif()
-if(WITH_ASCEND)
+if(EXISTS ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include/graph/ascend_string.h)
+ # It means CANN 20.2 or later
+ add_definitions(-DPADDLE_WITH_ASCEND_STRING)
+endif()
+
+
+if(WITH_ASCEND OR WITH_ASCEND_CL)
set(ASCEND_DRIVER_DIR ${ASCEND_DIR}/driver/lib64)
set(ASCEND_DRIVER_COMMON_DIR ${ASCEND_DIR}/driver/lib64/common)
set(ASCEND_DRIVER_SHARE_DIR ${ASCEND_DIR}/driver/lib64/share)
@@ -43,9 +49,6 @@ if(WITH_ASCEND)
set(atlas_acl_lib ${ATLAS_RUNTIME_DIR}/libascendcl.so)
INCLUDE_DIRECTORIES(${ATLAS_RUNTIME_INC_DIR})
- if(EXISTS ${ATLAS_RUNTIME_INC_DIR}/graph/ascend_string.h)
- add_definitions(-DPADDLE_WITH_ASCEND_STRING)
- endif()
ADD_LIBRARY(ascend_ge SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET ascend_ge PROPERTY IMPORTED_LOCATION ${atlas_ge_runner_lib})
@@ -62,17 +65,23 @@ endif()
if(WITH_ASCEND_CL)
set(ASCEND_CL_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64)
+ set(ascend_hccl_lib ${ASCEND_CL_DIR}/libhccl.so)
set(ascendcl_lib ${ASCEND_CL_DIR}/libascendcl.so)
set(acl_op_compiler_lib ${ASCEND_CL_DIR}/libacl_op_compiler.so)
- set(ASCEND_CL_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include)
+ set(FWKACLLIB_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include)
+ set(ACLLIB_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/acllib/include)
- message(STATUS "ASCEND_CL_INC_DIR ${ASCEND_CL_INC_DIR}")
+ message(STATUS "FWKACLLIB_INC_DIR ${FWKACLLIB_INC_DIR}")
message(STATUS "ASCEND_CL_DIR ${ASCEND_CL_DIR}")
- INCLUDE_DIRECTORIES(${ASCEND_CL_INC_DIR})
+ INCLUDE_DIRECTORIES(${FWKACLLIB_INC_DIR})
+ INCLUDE_DIRECTORIES(${ACLLIB_INC_DIR})
ADD_LIBRARY(ascendcl SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET ascendcl PROPERTY IMPORTED_LOCATION ${ascendcl_lib})
+ ADD_LIBRARY(ascend_hccl SHARED IMPORTED GLOBAL)
+ SET_PROPERTY(TARGET ascend_hccl PROPERTY IMPORTED_LOCATION ${ascend_hccl_lib})
+
ADD_LIBRARY(acl_op_compiler SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET acl_op_compiler PROPERTY IMPORTED_LOCATION ${acl_op_compiler_lib})
add_custom_target(extern_ascend_cl DEPENDS ascendcl acl_op_compiler)
diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake
index 4619f9f7b7e34c..aa471002eacb6a 100644
--- a/cmake/external/eigen.cmake
+++ b/cmake/external/eigen.cmake
@@ -33,7 +33,9 @@ elseif(LINUX)
# which will cause compiler error of using __host__ funciont in __host__ __device__
file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/Meta.h native_src)
file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/Eigen/src/Core/util/Meta.h native_dst)
- set(EIGEN_PATCH_COMMAND cp ${native_src} ${native_dst})
+ file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/TensorReductionGpu.h native_src1)
+ file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h native_dst1)
+ set(EIGEN_PATCH_COMMAND cp ${native_src} ${native_dst} && cp ${native_src1} ${native_dst1})
endif()
endif()
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index c85654a5674a00..a5c74a46631e9d 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -447,9 +447,20 @@ function(cc_test TARGET_NAME)
cc_test_build(${TARGET_NAME}
SRCS ${cc_test_SRCS}
DEPS ${cc_test_DEPS})
- cc_test_run(${TARGET_NAME}
- COMMAND ${TARGET_NAME}
- ARGS ${cc_test_ARGS})
+ # We don't test hcom ops, because they need a complex configuration
+ # with more than one machine
+ if(NOT ("${TARGET_NAME}" STREQUAL "c_broadcast_op_npu_test" OR
+ "${TARGET_NAME}" STREQUAL "c_allreduce_sum_op_npu_test" OR
+ "${TARGET_NAME}" STREQUAL "c_allreduce_max_op_npu_test" OR
+ "${TARGET_NAME}" STREQUAL "c_reducescatter_op_npu_test" OR
+ "${TARGET_NAME}" STREQUAL "c_allgather_op_npu_test" OR
+ "${TARGET_NAME}" STREQUAL "send_v2_op_npu_test" OR
+ "${TARGET_NAME}" STREQUAL "c_reduce_sum_op_npu_test" OR
+ "${TARGET_NAME}" STREQUAL "recv_v2_op_npu_test"))
+ cc_test_run(${TARGET_NAME}
+ COMMAND ${TARGET_NAME}
+ ARGS ${cc_test_ARGS})
+ endif()
endif()
endfunction(cc_test)
@@ -807,7 +818,7 @@ function(py_test TARGET_NAME)
${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS}
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
endif()
-
+
if (WIN32)
set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 150)
endif()
diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
index 4864e04fa05164..9694a7bc59c12a 100644
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -211,11 +211,11 @@ set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid")
if(WIN32)
set(paddle_inference_c_lib $/paddle_inference_c.*)
else(WIN32)
- set(paddle_inference_c_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/capi/libpaddle_inference_c.*)
+ set(paddle_inference_c_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/capi_exp/libpaddle_inference_c.*)
endif(WIN32)
copy(inference_lib_dist
- SRCS ${src_dir}/inference/capi/paddle_c_api.h ${paddle_inference_c_lib}
+ SRCS ${src_dir}/inference/capi_exp/pd_*.h ${paddle_inference_c_lib}
DSTS ${PADDLE_INFERENCE_C_INSTALL_DIR}/paddle/include ${PADDLE_INFERENCE_C_INSTALL_DIR}/paddle/lib)
# fluid library for both train and inference
diff --git a/cmake/init.cmake b/cmake/init.cmake
index 19fdb6c601a112..b11156d2e9986f 100644
--- a/cmake/init.cmake
+++ b/cmake/init.cmake
@@ -18,10 +18,10 @@ if(NOT WIN32)
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O2 -g -DNDEBUG")
set(CMAKE_CXX_FLAGS_MINSIZEREL "-Os -DNDEBUG")
else()
- # It has not been used now, it can specify CUDA compile flag manualy,
+ # It can specify CUDA compile flags manually,
# its use is to remvoe /Zi to reduce GPU static library size. But it's dangerous
# because CUDA will update by nvidia, then error will occur.
- # Now, it's used in CUDA:[10.0, 10.2]
+ # Now, it's only used in VS2015 + CUDA:[10.0, 10.2]
set(WIN_PROPS ${CMAKE_SOURCE_DIR}/cmake/paddle_win.props)
endif()
diff --git a/cmake/operators.cmake b/cmake/operators.cmake
index 7dac91e531e4cf..75b1100caa915e 100644
--- a/cmake/operators.cmake
+++ b/cmake/operators.cmake
@@ -180,8 +180,8 @@ function(op_library TARGET)
list(REMOVE_ITEM miopen_cu_cc_srcs "affine_grid_cudnn_op.cu.cc")
list(REMOVE_ITEM miopen_cu_cc_srcs "grid_sampler_cudnn_op.cu.cc")
list(REMOVE_ITEM hip_srcs "cholesky_op.cu")
- list(REMOVE_ITEM hip_srcs "correlation_op.cu")
list(REMOVE_ITEM hip_srcs "multinomial_op.cu")
+ list(REMOVE_ITEM hip_srcs "decode_jpeg_op.cu")
hip_library(${TARGET} SRCS ${cc_srcs} ${hip_cc_srcs} ${miopen_cu_cc_srcs} ${miopen_cu_srcs} ${mkldnn_cc_srcs} ${hip_srcs} DEPS ${op_library_DEPS}
${op_common_deps})
else()
diff --git a/cmake/paddle_win.props b/cmake/paddle_win.props
index 296940dc3f50cc..3c069bd2981c43 100644
--- a/cmake/paddle_win.props
+++ b/cmake/paddle_win.props
@@ -88,4 +88,3 @@ set CUDAFE_FLAGS=--sdk_dir "$(WindowsSdkDir)"
-
diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake
index 81fa7d0dfa98f0..f90fa3509d63d4 100644
--- a/cmake/third_party.cmake
+++ b/cmake/third_party.cmake
@@ -29,9 +29,9 @@ set(third_party_deps)
# 2. REPOSITORY: specify git REPOSITORY of 3rd party
# 3. TAG: specify git tag/branch/commitID of 3rd party
# 4. DIR: overwrite the original SOURCE_DIR when cache directory
-#
+#
# The function Return 1 PARENT_SCOPE variables:
-# - ${TARGET}_DOWNLOAD_CMD: Simply place "${TARGET}_DOWNLOAD_CMD" in ExternalProject_Add,
+# - ${TARGET}_DOWNLOAD_CMD: Simply place "${TARGET}_DOWNLOAD_CMD" in ExternalProject_Add,
# and you no longer need to set any donwnload steps in ExternalProject_Add.
# For example:
# Cache_third_party(${TARGET}
@@ -52,7 +52,7 @@ FUNCTION(cache_third_party TARGET)
SET(${TARGET_NAME}_DOWNLOAD_CMD
GIT_REPOSITORY ${cache_third_party_REPOSITORY})
IF(cache_third_party_TAG)
- LIST(APPEND ${TARGET_NAME}_DOWNLOAD_CMD
+ LIST(APPEND ${TARGET_NAME}_DOWNLOAD_CMD
GIT_TAG ${cache_third_party_TAG})
ENDIF()
ELSEIF(cache_third_party_URL)
@@ -130,7 +130,7 @@ ENDFUNCTION()
# Correction of flags on different Platform(WIN/MAC) and Print Warning Message
if (APPLE)
if(WITH_MKL)
- MESSAGE(WARNING
+ MESSAGE(WARNING
"Mac is not supported with MKL in Paddle yet. Force WITH_MKL=OFF.")
set(WITH_MKL OFF CACHE STRING "Disable MKL for building on mac" FORCE)
endif()
@@ -141,7 +141,7 @@ if(WIN32 OR APPLE)
SET(WITH_XBYAK OFF CACHE STRING "Disable XBYAK in Windows and MacOS" FORCE)
if(WITH_LIBXSMM)
- MESSAGE(WARNING
+ MESSAGE(WARNING
"Windows, Mac are not supported with libxsmm in Paddle yet."
"Force WITH_LIBXSMM=OFF")
SET(WITH_LIBXSMM OFF CACHE STRING "Disable LIBXSMM in Windows and MacOS" FORCE)
@@ -276,7 +276,7 @@ endif(WITH_BOX_PS)
if(WITH_ASCEND OR WITH_ASCEND_CL)
include(external/ascend)
- if(WITH_ASCEND)
+ if(WITH_ASCEND OR WITH_ASCEND_CL)
list(APPEND third_party_deps extern_ascend)
endif()
if(WITH_ASCEND_CL)
@@ -290,7 +290,7 @@ if (WITH_PSCORE)
include(external/leveldb)
list(APPEND third_party_deps extern_leveldb)
-
+
include(external/brpc)
list(APPEND third_party_deps extern_brpc)
diff --git a/go/demo/mobilenet_c_exp.cc b/go/demo/mobilenet_c_exp.cc
new file mode 100644
index 00000000000000..b4f42dab6790bf
--- /dev/null
+++ b/go/demo/mobilenet_c_exp.cc
@@ -0,0 +1,84 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include
+#include
+#include
+
+void ReadData(float* data, int size);
+
+int main(int argc, char* argv[]) {
+ PD_Config* config = PD_ConfigCreate();
+ PD_ConfigSetModel(config, "data/model/__model__", "data/model/__params__");
+ PD_ConfigDisableGlogInfo(config);
+
+ PD_Predictor* predictor = PD_PredictorCreate(config);
+ // config has been destroyed in PD_PredictorCreate
+ config = NULL;
+
+ int input_num = PD_PredictorGetInputNum(predictor);
+ printf("Input num: %d\n", input_num);
+ int output_num = PD_PredictorGetOutputNum(predictor);
+ printf("Output num: %d\n", output_num);
+
+ PD_OneDimArrayCstr* input_names = PD_PredictorGetInputNames(predictor);
+ PD_Tensor* input_tensor =
+ PD_PredictorGetInputHandle(predictor, input_names->data[0]);
+ PD_OneDimArrayCstrDestroy(input_names);
+ input_names = NULL;
+
+ int32_t shape[] = {1, 3, 300, 300};
+ float* data = (float*)malloc(sizeof(float) * 1 * 3 * 300 * 300); // NOLINT
+ ReadData(data, 1 * 3 * 300 * 300); // NOLINT
+ PD_TensorReshape(input_tensor, 4, shape);
+ PD_TensorCopyFromCpuFloat(input_tensor, data);
+ free(data);
+ data = NULL;
+ PD_PredictorRun(predictor);
+
+ PD_OneDimArrayCstr* output_names = PD_PredictorGetOutputNames(predictor);
+ PD_Tensor* output_tensor =
+ PD_PredictorGetOutputHandle(predictor, output_names->data[0]);
+ PD_OneDimArrayCstrDestroy(output_names);
+ output_names = nullptr;
+
+ PD_OneDimArrayInt32* out_shape = PD_TensorGetShape(output_tensor);
+ int32_t size = 1;
+ for (size_t index = 0; index < out_shape->size; ++index) {
+ size = size * out_shape->data[index];
+ }
+ PD_OneDimArrayInt32Destroy(out_shape);
+ out_shape = NULL;
+
+ data = (float*)malloc(sizeof(float) * size); // NOLINT
+ PD_TensorCopyToCpuFloat(output_tensor, data);
+ free(data);
+ data = NULL;
+
+ PD_TensorDestroy(output_tensor);
+ output_tensor = NULL;
+ PD_TensorDestroy(input_tensor);
+ input_tensor = NULL;
+ PD_PredictorDestroy(predictor);
+ predictor = NULL;
+
+ return 0;
+}
+
+void ReadData(float* data, int n) {
+ FILE* fp = fopen("data/data.txt", "r");
+ for (int i = 0; i < n; i++) {
+ fscanf(fp, "%f", &data[i]);
+ }
+ fclose(fp);
+}
diff --git a/paddle/extension.h b/paddle/extension.h
index 71469576853a33..98d4bfd0326c5c 100644
--- a/paddle/extension.h
+++ b/paddle/extension.h
@@ -15,4 +15,4 @@ limitations under the License. */
#pragma once
// All paddle apis in C++ frontend
-#include "paddle/fluid/extension/include/ext_all.h"
+#include "paddle/extension/include/ext_all.h"
diff --git a/paddle/fluid/distributed/CMakeLists.txt b/paddle/fluid/distributed/CMakeLists.txt
index a2062d82c8130b..905347d031b35b 100644
--- a/paddle/fluid/distributed/CMakeLists.txt
+++ b/paddle/fluid/distributed/CMakeLists.txt
@@ -11,8 +11,8 @@ if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
"${DISTRIBUTE_COMPILE_FLAGS} -faligned-new")
endif()
-add_subdirectory(table)
add_subdirectory(service)
+add_subdirectory(table)
add_subdirectory(test)
add_subdirectory(index_dataset)
diff --git a/paddle/fluid/distributed/fleet.cc b/paddle/fluid/distributed/fleet.cc
index 9aafdd769ed4a0..dfd55f16e1a065 100644
--- a/paddle/fluid/distributed/fleet.cc
+++ b/paddle/fluid/distributed/fleet.cc
@@ -146,6 +146,44 @@ void FleetWrapper::CreateClient2ClientConnection() {
client2client_max_retry_);
}
+std::future FleetWrapper::PullSparseVarsAsync(
+ const Scope& scope, const uint64_t table_id,
+ const std::vector& var_names, std::vector* fea_keys,
+ std::vector>* fea_values, int fea_value_dim) {
+ fea_keys->clear();
+ fea_keys->resize(0);
+ fea_keys->reserve(MAX_FEASIGN_NUM);
+ for (auto name : var_names) {
+ Variable* var = scope.FindVar(name);
+ if (var == nullptr) {
+ continue;
+ }
+ LoDTensor* tensor = var->GetMutable();
+ CHECK(tensor != nullptr) << "tensor of var " << name << " is null";
+ int64_t* ids = tensor->data();
+ size_t len = tensor->numel();
+ for (auto i = 0u; i < len; ++i) {
+ if (ids[i] == 0u) {
+ continue;
+ }
+ fea_keys->push_back(static_cast(ids[i]));
+ }
+ }
+ fea_values->resize(fea_keys->size() + 1);
+ for (auto& t : *fea_values) {
+ t.resize(fea_value_dim);
+ }
+ std::vector pull_result_ptr;
+ for (auto& t : *fea_values) {
+ pull_result_ptr.push_back(t.data());
+ }
+
+ bool training = true;
+ return pserver_ptr_->_worker_ptr->pull_sparse(pull_result_ptr.data(),
+ table_id, fea_keys->data(),
+ fea_keys->size(), training);
+}
+
void FleetWrapper::PullSparseVarsSync(
const Scope& scope, const uint64_t table_id,
const std::vector& var_names, std::vector* fea_keys,
diff --git a/paddle/fluid/distributed/fleet.h b/paddle/fluid/distributed/fleet.h
index 863440180a808d..0da5d1e2bf987f 100644
--- a/paddle/fluid/distributed/fleet.h
+++ b/paddle/fluid/distributed/fleet.h
@@ -84,6 +84,15 @@ class FleetWrapper {
int fea_dim,
const std::vector& var_emb_names);
+ // Pull sparse variables from the server in async mode.
+ // Param: scope, table_id, var_names, fea_keys, fea_dim
+ // Param: fea_values; returns a std::future to wait on completion
+ std::future PullSparseVarsAsync(
+ const Scope& scope, const uint64_t table_id,
+ const std::vector& var_names,
+ std::vector* fea_keys,
+ std::vector>* fea_values, int fea_dim);
+
// Pull sparse variables from server in sync mode
// pull immediately to tensors
// is_training is true means training, false means inference, the behavior is
diff --git a/paddle/fluid/distributed/index_dataset/index_sampler.cc b/paddle/fluid/distributed/index_dataset/index_sampler.cc
index 58f85d98fb09c6..3e573bbdd2de97 100644
--- a/paddle/fluid/distributed/index_dataset/index_sampler.cc
+++ b/paddle/fluid/distributed/index_dataset/index_sampler.cc
@@ -13,13 +13,10 @@
// limitations under the License.
#include "paddle/fluid/distributed/index_dataset/index_sampler.h"
-#include "paddle/fluid/operators/math/sampler.h"
namespace paddle {
namespace distributed {
-using Sampler = paddle::operators::math::Sampler;
-
std::vector> LayerWiseSampler::sample(
const std::vector>& user_inputs,
const std::vector& target_ids, bool with_hierarchy) {
@@ -30,22 +27,7 @@ std::vector> LayerWiseSampler::sample(
std::vector(user_feature_num + 2));
auto max_layer = tree_->Height();
- std::vector sampler_vec(max_layer - start_sample_layer_);
- std::vector> layer_ids(max_layer -
- start_sample_layer_);
-
- auto layer_index = max_layer - 1;
size_t idx = 0;
- while (layer_index >= start_sample_layer_) {
- auto layer_codes = tree_->GetLayerCodes(layer_index);
- layer_ids[idx] = tree_->GetNodes(layer_codes);
- sampler_vec[idx] = new paddle::operators::math::UniformSampler(
- layer_ids[idx].size() - 1, seed_);
- layer_index--;
- idx++;
- }
-
- idx = 0;
for (size_t i = 0; i < input_num; i++) {
auto travel_codes =
tree_->GetTravelCodes(target_ids[i], start_sample_layer_);
@@ -76,18 +58,15 @@ std::vector> LayerWiseSampler::sample(
for (int idx_offset = 0; idx_offset < layer_counts_[j]; idx_offset++) {
int sample_res = 0;
do {
- sample_res = sampler_vec[j]->Sample();
- } while (layer_ids[j][sample_res].id() == travel_path[j].id());
+ sample_res = sampler_vec_[j]->Sample();
+ } while (layer_ids_[j][sample_res].id() == travel_path[j].id());
outputs[idx + idx_offset][user_feature_num] =
- layer_ids[j][sample_res].id();
+ layer_ids_[j][sample_res].id();
outputs[idx + idx_offset][user_feature_num + 1] = 0;
}
idx += layer_counts_[j];
}
}
- for (size_t i = 0; i < sampler_vec.size(); i++) {
- delete sampler_vec[i];
- }
return outputs;
}
diff --git a/paddle/fluid/distributed/index_dataset/index_sampler.h b/paddle/fluid/distributed/index_dataset/index_sampler.h
index 66882bedc9b765..8813421446a21c 100644
--- a/paddle/fluid/distributed/index_dataset/index_sampler.h
+++ b/paddle/fluid/distributed/index_dataset/index_sampler.h
@@ -16,6 +16,7 @@
#include
#include "paddle/fluid/distributed/index_dataset/index_wrapper.h"
#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/operators/math/sampler.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
@@ -83,6 +84,23 @@ class LayerWiseSampler : public IndexSampler {
}
reverse(layer_counts_.begin(), layer_counts_.end());
VLOG(3) << "sample counts sum: " << layer_counts_sum_;
+
+ auto max_layer = tree_->Height();
+ sampler_vec_.clear();
+ layer_ids_.clear();
+
+ auto layer_index = max_layer - 1;
+ size_t idx = 0;
+ while (layer_index >= start_sample_layer_) {
+ auto layer_codes = tree_->GetLayerCodes(layer_index);
+ layer_ids_.push_back(tree_->GetNodes(layer_codes));
+ auto sampler_temp =
+ std::make_shared(
+ layer_ids_[idx].size() - 1, seed_);
+ sampler_vec_.push_back(sampler_temp);
+ layer_index--;
+ idx++;
+ }
}
std::vector> sample(
const std::vector>& user_inputs,
@@ -94,6 +112,8 @@ class LayerWiseSampler : public IndexSampler {
std::shared_ptr tree_{nullptr};
int seed_{0};
int start_sample_layer_{1};
+ std::vector> sampler_vec_;
+ std::vector> layer_ids_;
};
} // end namespace distributed
diff --git a/paddle/fluid/distributed/service/brpc_ps_server.cc b/paddle/fluid/distributed/service/brpc_ps_server.cc
index a9370561a540be..a1440260bf2e77 100644
--- a/paddle/fluid/distributed/service/brpc_ps_server.cc
+++ b/paddle/fluid/distributed/service/brpc_ps_server.cc
@@ -14,6 +14,7 @@
#include "paddle/fluid/distributed/service/brpc_ps_server.h"
#include // NOLINT
+#include "butil/object_pool.h"
#include "paddle/fluid/distributed/table/depends/sparse_utils.h"
#include "paddle/fluid/distributed/table/table.h"
#include "paddle/fluid/framework/archive.h"
@@ -196,12 +197,13 @@ int32_t BrpcPsService::pull_dense(Table *table, const PsRequestMessage &request,
return 0;
}
- std::vector res_data;
- res_data.resize(num * table->value_accesor()->select_size() / sizeof(float));
- table->pull_dense(res_data.data(), num);
+ auto res_data = butil::get_object>();
+ res_data->resize(num * table->value_accesor()->select_size() / sizeof(float));
+ table->pull_dense(res_data->data(), num);
- cntl->response_attachment().append((char *)res_data.data(),
- res_data.size() * sizeof(float));
+ cntl->response_attachment().append((char *)(res_data->data()),
+ res_data->size() * sizeof(float));
+ butil::return_object(res_data);
return 0;
}
@@ -367,12 +369,13 @@ int32_t BrpcPsService::pull_sparse(Table *table,
value.DeserializeFromBytes(const_cast(data));
- std::vector res_data;
- res_data.resize(num * dim);
- table->pull_sparse(res_data.data(), value);
+ auto res_data = butil::get_object>();
+ res_data->resize(num * dim);
+ table->pull_sparse(res_data->data(), value);
- cntl->response_attachment().append((char *)res_data.data(),
- res_data.size() * sizeof(float));
+ cntl->response_attachment().append((char *)(res_data->data()),
+ res_data->size() * sizeof(float));
+ butil::return_object(res_data);
return 0;
}
diff --git a/paddle/fluid/distributed/service/graph_brpc_client.cc b/paddle/fluid/distributed/service/graph_brpc_client.cc
index a6271cac83c9a9..eafb4d596cc167 100644
--- a/paddle/fluid/distributed/service/graph_brpc_client.cc
+++ b/paddle/fluid/distributed/service/graph_brpc_client.cc
@@ -135,7 +135,8 @@ std::future GraphBrpcClient::get_node_feat(
closure->request(request_idx)
->add_params(joint_feature_name.c_str(), joint_feature_name.size());
- PsService_Stub rpc_stub(get_cmd_channel(server_index));
+ GraphPsService_Stub rpc_stub =
+ getServiceStub(get_cmd_channel(server_index));
closure->cntl(request_idx)->set_log_id(butil::gettimeofday_ms());
rpc_stub.service(closure->cntl(request_idx), closure->request(request_idx),
closure->response(request_idx), closure);
diff --git a/paddle/fluid/distributed/service/graph_py_service.h b/paddle/fluid/distributed/service/graph_py_service.h
index e185f23e3d240f..c6657be96ba446 100644
--- a/paddle/fluid/distributed/service/graph_py_service.h
+++ b/paddle/fluid/distributed/service/graph_py_service.h
@@ -54,19 +54,7 @@ class GraphPyService {
std::vector table_feat_conf_feat_dtype;
std::vector table_feat_conf_feat_shape;
- // std::thread *server_thread, *client_thread;
-
- // std::shared_ptr pserver_ptr;
-
- // std::shared_ptr worker_ptr;
-
public:
- // std::shared_ptr get_ps_server() {
- // return pserver_ptr;
- // }
- // std::shared_ptr get_ps_client() {
- // return worker_ptr;
- // }
int get_shard_num() { return shard_num; }
void set_shard_num(int shard_num) { this->shard_num = shard_num; }
void GetDownpourSparseTableProto(
diff --git a/paddle/fluid/distributed/table/CMakeLists.txt b/paddle/fluid/distributed/table/CMakeLists.txt
index dde1f5ae8ee3a1..dab390958034af 100644
--- a/paddle/fluid/distributed/table/CMakeLists.txt
+++ b/paddle/fluid/distributed/table/CMakeLists.txt
@@ -13,7 +13,11 @@ set_source_files_properties(sparse_geo_table.cc PROPERTIES COMPILE_FLAGS ${DISTR
set_source_files_properties(barrier_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
set_source_files_properties(common_graph_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-cc_library(common_table SRCS common_sparse_table.cc common_dense_table.cc sparse_geo_table.cc barrier_table.cc common_graph_table.cc DEPS ${TABLE_DEPS} graph_edge graph_node device_context string_helper simple_threadpool xxhash generator)
+get_property(RPC_DEPS GLOBAL PROPERTY RPC_DEPS)
+
+cc_library(common_table SRCS common_sparse_table.cc common_dense_table.cc
+sparse_geo_table.cc barrier_table.cc common_graph_table.cc DEPS ${TABLE_DEPS}
+${RPC_DEPS} graph_edge graph_node device_context string_helper simple_threadpool xxhash generator)
set_source_files_properties(tensor_accessor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
set_source_files_properties(tensor_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
diff --git a/paddle/fluid/distributed/table/common_graph_table.cc b/paddle/fluid/distributed/table/common_graph_table.cc
index 020bcdcc52ef4b..0dc99de1bfe82a 100644
--- a/paddle/fluid/distributed/table/common_graph_table.cc
+++ b/paddle/fluid/distributed/table/common_graph_table.cc
@@ -171,7 +171,7 @@ int32_t GraphTable::load_nodes(const std::string &path, std::string node_type) {
int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge) {
auto paths = paddle::string::split_string(path, ";");
- int count = 0;
+ int64_t count = 0;
std::string sample_type = "random";
bool is_weighted = false;
int valid_count = 0;
diff --git a/paddle/fluid/distributed/table/common_graph_table.h b/paddle/fluid/distributed/table/common_graph_table.h
index 8ddf3c8f904a6c..b18da82abe61c9 100644
--- a/paddle/fluid/distributed/table/common_graph_table.h
+++ b/paddle/fluid/distributed/table/common_graph_table.h
@@ -33,26 +33,11 @@ namespace paddle {
namespace distributed {
class GraphShard {
public:
- // static int bucket_low_bound;
- // static int gcd(int s, int t) {
- // if (s % t == 0) return t;
- // return gcd(t, s % t);
- // }
size_t get_size();
GraphShard() {}
- GraphShard(int shard_num) {
- this->shard_num = shard_num;
- // bucket_size = init_bucket_size(shard_num);
- // bucket.resize(bucket_size);
- }
+ GraphShard(int shard_num) { this->shard_num = shard_num; }
std::vector &get_bucket() { return bucket; }
std::vector get_batch(int start, int end, int step);
- // int init_bucket_size(int shard_num) {
- // for (int i = bucket_low_bound;; i++) {
- // if (gcd(i, shard_num) == 1) return i;
- // }
- // return -1;
- // }
std::vector get_ids_by_range(int start, int end) {
std::vector res;
for (int i = start; i < end && i < bucket.size(); i++) {
@@ -64,7 +49,6 @@ class GraphShard {
FeatureNode *add_feature_node(uint64_t id);
Node *find_node(uint64_t id);
void add_neighboor(uint64_t id, uint64_t dst_id, float weight);
- // std::unordered_map::iterator>
std::unordered_map get_node_location() {
return node_location;
}
@@ -131,7 +115,7 @@ class GraphTable : public SparseTable {
protected:
std::vector shards;
size_t shard_start, shard_end, server_num, shard_num_per_table, shard_num;
- const int task_pool_size_ = 11;
+ const int task_pool_size_ = 24;
const int random_sample_nodes_ranges = 3;
std::vector feat_name;
diff --git a/paddle/fluid/distributed/table/common_sparse_table.cc b/paddle/fluid/distributed/table/common_sparse_table.cc
index 2e8c257b6aad47..718fce9950719f 100644
--- a/paddle/fluid/distributed/table/common_sparse_table.cc
+++ b/paddle/fluid/distributed/table/common_sparse_table.cc
@@ -125,34 +125,37 @@ void ProcessALine(const std::vector& columns, const Meta& meta,
int64_t SaveToText(std::ostream* os, std::shared_ptr block,
const int mode) {
- int64_t not_save_num = 0;
- for (auto value : block->values_) {
- if (mode == SaveMode::delta && !value.second->need_save_) {
- not_save_num++;
- continue;
- }
-
- auto* vs = value.second->data_.data();
- std::stringstream ss;
- auto id = value.first;
- ss << id << "\t" << value.second->count_ << "\t"
- << value.second->unseen_days_ << "\t" << value.second->is_entry_ << "\t";
-
- for (int i = 0; i < block->value_length_; i++) {
- ss << vs[i];
- ss << ",";
- }
+ int64_t save_num = 0;
+ for (auto& table : block->values_) {
+ for (auto& value : table) {
+ if (mode == SaveMode::delta && !value.second->need_save_) {
+ continue;
+ }
+ save_num += 1;
+
+ auto* vs = value.second->data_.data();
+ std::stringstream ss;
+ auto id = value.first;
+ ss << id << "\t" << value.second->count_ << "\t"
+ << value.second->unseen_days_ << "\t" << value.second->is_entry_
+ << "\t";
+
+ for (int i = 0; i < block->value_length_; i++) {
+ ss << vs[i];
+ ss << ",";
+ }
- ss << "\n";
+ ss << "\n";
- os->write(ss.str().c_str(), sizeof(char) * ss.str().size());
+ os->write(ss.str().c_str(), sizeof(char) * ss.str().size());
- if (mode == SaveMode::base || mode == SaveMode::delta) {
- value.second->need_save_ = false;
+ if (mode == SaveMode::base || mode == SaveMode::delta) {
+ value.second->need_save_ = false;
+ }
}
}
- return block->values_.size() - not_save_num;
+ return save_num;
}
int64_t LoadFromText(const std::string& valuepath, const std::string& metapath,
@@ -183,7 +186,7 @@ int64_t LoadFromText(const std::string& valuepath, const std::string& metapath,
block->Init(id, false);
- auto value_instant = block->GetValue(id);
+ VALUE* value_instant = block->GetValue(id);
if (values.size() == 5) {
value_instant->count_ = std::stoi(values[1]);
value_instant->unseen_days_ = std::stoi(values[2]);
@@ -373,8 +376,10 @@ std::pair CommonSparseTable::print_table_stat() {
int64_t feasign_size = 0;
int64_t mf_size = 0;
- for (auto& value : shard_values_) {
- feasign_size += value->values_.size();
+ for (auto& shard : shard_values_) {
+ for (auto& table : shard->values_) {
+ feasign_size += table.size();
+ }
}
return {feasign_size, mf_size};
diff --git a/paddle/fluid/distributed/table/depends/large_scale_kv.h b/paddle/fluid/distributed/table/depends/large_scale_kv.h
index 68d252661edd53..5c10fca98cda4d 100644
--- a/paddle/fluid/distributed/table/depends/large_scale_kv.h
+++ b/paddle/fluid/distributed/table/depends/large_scale_kv.h
@@ -26,8 +26,10 @@
#include
#include "gflags/gflags.h"
+#include "butil/object_pool.h"
#include "paddle/fluid/distributed/common/utils.h"
#include "paddle/fluid/distributed/table/depends/initializers.h"
+#include "paddle/fluid/distributed/thirdparty/round_robin.h"
#include "paddle/fluid/framework/generator.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/rw_lock.h"
@@ -47,6 +49,10 @@ namespace distributed {
enum Mode { training, infer };
+static const int SPARSE_SHARD_BUCKET_NUM_BITS = 6;
+static const size_t SPARSE_SHARD_BUCKET_NUM = (size_t)1
+ << SPARSE_SHARD_BUCKET_NUM_BITS;
+
struct VALUE {
explicit VALUE(size_t length)
: length_(length),
@@ -66,11 +72,11 @@ struct VALUE {
bool is_entry_; // whether knock-in
};
-inline bool count_entry(std::shared_ptr value, int threshold) {
+inline bool count_entry(VALUE *value, int threshold) {
return value->count_ >= threshold;
}
-inline bool probility_entry(std::shared_ptr value, float threshold) {
+inline bool probility_entry(VALUE *value, float threshold) {
UniformInitializer uniform = UniformInitializer({"uniform", "0", "0", "1"});
return uniform.GetValue() >= threshold;
}
@@ -145,7 +151,7 @@ class ValueBlock {
const std::vector &value_dims) {
auto pts = std::vector();
pts.reserve(value_names.size());
- auto &values = values_.at(id);
+ auto values = GetValue(id);
for (int i = 0; i < static_cast(value_names.size()); i++) {
PADDLE_ENFORCE_EQ(
value_dims[i], value_dims_[i],
@@ -159,35 +165,48 @@ class ValueBlock {
// pull
float *Init(const uint64_t &id, const bool with_update = true,
const int counter = 1) {
- if (!Has(id)) {
- values_[id] = std::make_shared(value_length_);
- }
+ size_t hash = _hasher(id);
+ size_t bucket = compute_bucket(hash);
+
+ auto &table = values_[bucket];
+ auto res = table.find(id);
- auto &value = values_.at(id);
+ VALUE *value = nullptr;
+ if (res == table.end()) {
+ value = butil::get_object(value_length_);
+
+ table[id] = value;
+
+ } else {
+ value = res->second;
+ }
if (with_update) {
AttrUpdate(value, counter);
}
-
return value->data_.data();
}
VALUE *InitGet(const uint64_t &id, const bool with_update = true,
const int counter = 1) {
- if (!Has(id)) {
- values_[id] = std::make_shared(value_length_);
- }
+ size_t hash = _hasher(id);
+ size_t bucket = compute_bucket(hash);
- auto &value = values_.at(id);
+ auto &table = values_[bucket];
+ auto res = table.find(id);
- if (with_update) {
- AttrUpdate(value, counter);
+ VALUE *value = nullptr;
+ if (res == table.end()) {
+ value = butil::get_object(value_length_);
+ // value = _alloc.acquire(value_length_);
+ table[id] = value;
+ } else {
+ value = (VALUE *)(void *)(res->second);
}
-
- return value.get();
+ return value;
}
- void AttrUpdate(std::shared_ptr value, const int counter) {
+ void AttrUpdate(VALUE *value, const int counter) {
// update state
value->unseen_days_ = 0;
value->count_ += counter;
@@ -211,42 +230,73 @@ class ValueBlock {
// dont jude if (has(id))
float *Get(const uint64_t &id) {
- auto &value = values_.at(id);
+ size_t hash = _hasher(id);
+ size_t bucket = compute_bucket(hash);
+ auto &table = values_[bucket];
+
+ // auto &value = table.at(id);
+ // return value->data_.data();
+ auto res = table.find(id);
+ VALUE *value = res->second;
return value->data_.data();
}
// for load, to reset count, unseen_days
- std::shared_ptr GetValue(const uint64_t &id) { return values_.at(id); }
+ VALUE *GetValue(const uint64_t &id) {
+ size_t hash = _hasher(id);
+ size_t bucket = compute_bucket(hash);
+
+ auto &table = values_[bucket];
+ auto res = table.find(id);
+ return res->second;
+ }
bool GetEntry(const uint64_t &id) {
- auto &value = values_.at(id);
+ auto value = GetValue(id);
return value->is_entry_;
}
void SetEntry(const uint64_t &id, const bool state) {
- auto &value = values_.at(id);
+ auto value = GetValue(id);
value->is_entry_ = state;
}
void Shrink(const int threshold) {
- for (auto iter = values_.begin(); iter != values_.end();) {
- auto &value = iter->second;
- value->unseen_days_++;
- if (value->unseen_days_ >= threshold) {
- iter = values_.erase(iter);
- } else {
- ++iter;
+ for (auto &table : values_) {
+ for (auto iter = table.begin(); iter != table.end();) {
+ // VALUE* value = (VALUE*)(void*)(iter->second);
+ VALUE *value = iter->second;
+ value->unseen_days_++;
+ if (value->unseen_days_ >= threshold) {
+ butil::return_object(iter->second);
+ //_alloc.release(iter->second);
+ //_alloc.release(value);
+ iter = table.erase(iter);
+ } else {
+ ++iter;
+ }
}
}
return;
}
float GetThreshold() { return threshold_; }
+ size_t compute_bucket(size_t hash) {
+ if (SPARSE_SHARD_BUCKET_NUM == 1) {
+ return 0;
+ } else {
+ return hash >> (sizeof(size_t) * 8 - SPARSE_SHARD_BUCKET_NUM_BITS);
+ }
+ }
private:
bool Has(const uint64_t id) {
- auto got = values_.find(id);
- if (got == values_.end()) {
+ size_t hash = _hasher(id);
+ size_t bucket = compute_bucket(hash);
+ auto &table = values_[bucket];
+
+ auto got = table.find(id);
+ if (got == table.end()) {
return false;
} else {
return true;
@@ -254,8 +304,9 @@ class ValueBlock {
}
public:
- std::unordered_map> values_;
+ robin_hood::unordered_map values_[SPARSE_SHARD_BUCKET_NUM];
size_t value_length_ = 0;
+ std::hash _hasher;
private:
const std::vector &value_names_;
@@ -263,7 +314,7 @@ class ValueBlock {
const std::vector &value_offsets_;
const std::unordered_map &value_idx_;
- std::function)> entry_func_;
+ std::function entry_func_;
std::vector> initializers_;
float threshold_;
};
diff --git a/paddle/fluid/distributed/test/CMakeLists.txt b/paddle/fluid/distributed/test/CMakeLists.txt
index b756c740ac764c..af87e1b6cc61d1 100644
--- a/paddle/fluid/distributed/test/CMakeLists.txt
+++ b/paddle/fluid/distributed/test/CMakeLists.txt
@@ -1,8 +1,10 @@
set_source_files_properties(table_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-cc_test(table_test SRCS table_test.cc DEPS common_table table tensor_accessor ps_framework_proto ${COMMON_DEPS})
+cc_test(table_test SRCS table_test.cc DEPS common_table table tensor_accessor
+ps_framework_proto ${COMMON_DEPS} ${RPC_DEPS})
set_source_files_properties(dense_table_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-cc_test(dense_table_test SRCS dense_table_test.cc DEPS common_table table tensor_accessor ps_framework_proto ${COMMON_DEPS})
+cc_test(dense_table_test SRCS dense_table_test.cc DEPS common_table table
+tensor_accessor ps_framework_proto ${COMMON_DEPS} ${RPC_DEPS})
set_source_files_properties(barrier_table_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
cc_test(barrier_table_test SRCS barrier_table_test.cc DEPS common_table table tensor_accessor ps_framework_proto ${COMMON_DEPS})
diff --git a/paddle/fluid/distributed/thirdparty/round_robin.h b/paddle/fluid/distributed/thirdparty/round_robin.h
new file mode 100644
index 00000000000000..f5075b4545af04
--- /dev/null
+++ b/paddle/fluid/distributed/thirdparty/round_robin.h
@@ -0,0 +1,2685 @@
+// ______ _____ ______ _________
+// ______________ ___ /_ ___(_)_______ ___ /_ ______ ______ ______ /
+// __ ___/_ __ \__ __ \__ / __ __ \ __ __ \_ __ \_ __ \_ __ /
+// _ / / /_/ /_ /_/ /_ / _ / / / _ / / // /_/ // /_/ // /_/ /
+// /_/ \____/ /_.___/ /_/ /_/ /_/ ________/_/ /_/ \____/ \____/ \__,_/
+// _/_____/
+//
+// Fast & memory efficient hashtable based on robin hood hashing for
+// C++11/14/17/20
+// https://github.com/martinus/robin-hood-hashing
+//
+// Licensed under the MIT License <https://opensource.org/licenses/MIT>.
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2021 Martin Ankerl
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all
+// copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+#ifndef ROBIN_HOOD_H_INCLUDED
+#define ROBIN_HOOD_H_INCLUDED
+
+// see https://semver.org/
+#define ROBIN_HOOD_VERSION_MAJOR 3 // for incompatible API changes
+#define ROBIN_HOOD_VERSION_MINOR \
+ 11 // for adding functionality in a backwards-compatible manner
+#define ROBIN_HOOD_VERSION_PATCH 1 // for backwards-compatible bug fixes
+
+#include
+#include
+#include
+#include
+#include // only to support hash of smart pointers
+#include
+#include
+#include
+#include
+#if __cplusplus >= 201703L
+#include
+#endif
+
+// #define ROBIN_HOOD_LOG_ENABLED
+#ifdef ROBIN_HOOD_LOG_ENABLED
+#include
+#define ROBIN_HOOD_LOG(...) \
+ std::cout << __FUNCTION__ << "@" << __LINE__ << ": " << __VA_ARGS__ \
+ << std::endl;
+#else
+#define ROBIN_HOOD_LOG(x)
+#endif
+
+// #define ROBIN_HOOD_TRACE_ENABLED
+#ifdef ROBIN_HOOD_TRACE_ENABLED
+#include
+#define ROBIN_HOOD_TRACE(...) \
+ std::cout << __FUNCTION__ << "@" << __LINE__ << ": " << __VA_ARGS__ \
+ << std::endl;
+#else
+#define ROBIN_HOOD_TRACE(x)
+#endif
+
+// #define ROBIN_HOOD_COUNT_ENABLED
+#ifdef ROBIN_HOOD_COUNT_ENABLED
+#include
+#define ROBIN_HOOD_COUNT(x) ++counts().x;
+namespace robin_hood {
+struct Counts {
+ uint64_t shiftUp{};
+ uint64_t shiftDown{};
+};
+inline std::ostream &operator<<(std::ostream &os, Counts const &c) {
+ return os << c.shiftUp << " shiftUp" << std::endl
+ << c.shiftDown << " shiftDown" << std::endl;
+}
+
+static Counts &counts() {
+ static Counts counts{};
+ return counts;
+}
+} // namespace robin_hood
+#else
+#define ROBIN_HOOD_COUNT(x)
+#endif
+
+// all non-argument macros should use this facility. See
+// https://www.fluentcpp.com/2019/05/28/better-macros-better-flags/
+#define ROBIN_HOOD(x) ROBIN_HOOD_PRIVATE_DEFINITION_##x()
+
+// mark unused members with this macro
+#define ROBIN_HOOD_UNUSED(identifier)
+
+// bitness
+#if SIZE_MAX == UINT32_MAX
+#define ROBIN_HOOD_PRIVATE_DEFINITION_BITNESS() 32
+#elif SIZE_MAX == UINT64_MAX
+#define ROBIN_HOOD_PRIVATE_DEFINITION_BITNESS() 64
+#else
+#error Unsupported bitness
+#endif
+
+// endianness
+#ifdef _MSC_VER
+#define ROBIN_HOOD_PRIVATE_DEFINITION_LITTLE_ENDIAN() 1
+#define ROBIN_HOOD_PRIVATE_DEFINITION_BIG_ENDIAN() 0
+#else
+#define ROBIN_HOOD_PRIVATE_DEFINITION_LITTLE_ENDIAN() \
+ (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+#define ROBIN_HOOD_PRIVATE_DEFINITION_BIG_ENDIAN() \
+ (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+#endif
+
+// inline
+#ifdef _MSC_VER
+#define ROBIN_HOOD_PRIVATE_DEFINITION_NOINLINE() __declspec(noinline)
+#else
+#define ROBIN_HOOD_PRIVATE_DEFINITION_NOINLINE() __attribute__((noinline))
+#endif
+
+// exceptions
+#if !defined(__cpp_exceptions) && !defined(__EXCEPTIONS) && !defined(_CPPUNWIND)
+#define ROBIN_HOOD_PRIVATE_DEFINITION_HAS_EXCEPTIONS() 0
+#else
+#define ROBIN_HOOD_PRIVATE_DEFINITION_HAS_EXCEPTIONS() 1
+#endif
+
+// count leading/trailing bits
+#if !defined(ROBIN_HOOD_DISABLE_INTRINSICS)
+#ifdef _MSC_VER
+#if ROBIN_HOOD(BITNESS) == 32
+#define ROBIN_HOOD_PRIVATE_DEFINITION_BITSCANFORWARD() _BitScanForward
+#else
+#define ROBIN_HOOD_PRIVATE_DEFINITION_BITSCANFORWARD() _BitScanForward64
+#endif
+#include
+#pragma intrinsic(ROBIN_HOOD(BITSCANFORWARD))
+#define ROBIN_HOOD_COUNT_TRAILING_ZEROES(x) \
+ [](size_t mask) noexcept->int { \
+ unsigned long index; \
+ return ROBIN_HOOD(BITSCANFORWARD)(&index, mask) ? static_cast(index) \
+ : ROBIN_HOOD(BITNESS); \
+ } \
+ (x)
+#else
+#if ROBIN_HOOD(BITNESS) == 32
+#define ROBIN_HOOD_PRIVATE_DEFINITION_CTZ() __builtin_ctzl
+#define ROBIN_HOOD_PRIVATE_DEFINITION_CLZ() __builtin_clzl
+#else
+#define ROBIN_HOOD_PRIVATE_DEFINITION_CTZ() __builtin_ctzll
+#define ROBIN_HOOD_PRIVATE_DEFINITION_CLZ() __builtin_clzll
+#endif
+#define ROBIN_HOOD_COUNT_LEADING_ZEROES(x) \
+ ((x) ? ROBIN_HOOD(CLZ)(x) : ROBIN_HOOD(BITNESS))
+#define ROBIN_HOOD_COUNT_TRAILING_ZEROES(x) \
+ ((x) ? ROBIN_HOOD(CTZ)(x) : ROBIN_HOOD(BITNESS))
+#endif
+#endif
+
+// fallthrough
+#ifndef __has_cpp_attribute // For backwards compatibility
+#define __has_cpp_attribute(x) 0
+#endif
+#if __has_cpp_attribute(clang::fallthrough)
+#define ROBIN_HOOD_PRIVATE_DEFINITION_FALLTHROUGH() [[clang::fallthrough]]
+#elif __has_cpp_attribute(gnu::fallthrough)
+#define ROBIN_HOOD_PRIVATE_DEFINITION_FALLTHROUGH() [[gnu::fallthrough]]
+#else
+#define ROBIN_HOOD_PRIVATE_DEFINITION_FALLTHROUGH()
+#endif
+
+// likely/unlikely
+#ifdef _MSC_VER
+#define ROBIN_HOOD_LIKELY(condition) condition
+#define ROBIN_HOOD_UNLIKELY(condition) condition
+#else
+#define ROBIN_HOOD_LIKELY(condition) __builtin_expect(condition, 1)
+#define ROBIN_HOOD_UNLIKELY(condition) __builtin_expect(condition, 0)
+#endif
+
+// detect if native wchar_t type is available in MSVC
+#ifdef _MSC_VER
+#ifdef _NATIVE_WCHAR_T_DEFINED
+#define ROBIN_HOOD_PRIVATE_DEFINITION_HAS_NATIVE_WCHART() 1
+#else
+#define ROBIN_HOOD_PRIVATE_DEFINITION_HAS_NATIVE_WCHART() 0
+#endif
+#else
+#define ROBIN_HOOD_PRIVATE_DEFINITION_HAS_NATIVE_WCHART() 1
+#endif
+
+// detect if MSVC supports the pair(std::piecewise_construct_t,...) constructor
+// being constexpr
+#ifdef _MSC_VER
+#if _MSC_VER <= 1900
+#define ROBIN_HOOD_PRIVATE_DEFINITION_BROKEN_CONSTEXPR() 1
+#else
+#define ROBIN_HOOD_PRIVATE_DEFINITION_BROKEN_CONSTEXPR() 0
+#endif
+#else
+#define ROBIN_HOOD_PRIVATE_DEFINITION_BROKEN_CONSTEXPR() 0
+#endif
+
+// workaround missing "is_trivially_copyable" in g++ < 5.0
+// See https://stackoverflow.com/a/31798726/48181
+#if defined(__GNUC__) && __GNUC__ < 5
+#define ROBIN_HOOD_IS_TRIVIALLY_COPYABLE(...) __has_trivial_copy(__VA_ARGS__)
+#else
+#define ROBIN_HOOD_IS_TRIVIALLY_COPYABLE(...) \
+ std::is_trivially_copyable<__VA_ARGS__>::value
+#endif
+
+// helpers for C++ versions, see
+// https://gcc.gnu.org/onlinedocs/cpp/Standard-Predefined-Macros.html
+#define ROBIN_HOOD_PRIVATE_DEFINITION_CXX() __cplusplus
+#define ROBIN_HOOD_PRIVATE_DEFINITION_CXX98() 199711L
+#define ROBIN_HOOD_PRIVATE_DEFINITION_CXX11() 201103L
+#define ROBIN_HOOD_PRIVATE_DEFINITION_CXX14() 201402L
+#define ROBIN_HOOD_PRIVATE_DEFINITION_CXX17() 201703L
+
+#if ROBIN_HOOD(CXX) >= ROBIN_HOOD(CXX17)
+#define ROBIN_HOOD_PRIVATE_DEFINITION_NODISCARD() [[nodiscard]]
+#else
+#define ROBIN_HOOD_PRIVATE_DEFINITION_NODISCARD()
+#endif
+
+namespace robin_hood {
+
+#if ROBIN_HOOD(CXX) >= ROBIN_HOOD(CXX14)
+#define ROBIN_HOOD_STD std
+#else
+
+// c++11 compatibility layer
+namespace ROBIN_HOOD_STD {
+template
+struct alignment_of
+ : std::integral_constant<
+ std::size_t, alignof(typename std::remove_all_extents::type)> {};
+
+template
+class integer_sequence {
+ public:
+ using value_type = T;
+ static_assert(std::is_integral::value, "not integral type");
+ static constexpr std::size_t size() noexcept { return sizeof...(Ints); }
+};
+template
+using index_sequence = integer_sequence;
+
+namespace detail_ {
+template
+struct IntSeqImpl {
+ using TValue = T;
+ static_assert(std::is_integral::value, "not integral type");
+ static_assert(Begin >= 0 && Begin < End,
+ "unexpected argument (Begin<0 || Begin<=End)");
+
+ template
+ struct IntSeqCombiner;
+
+ template
+ struct IntSeqCombiner,
+ integer_sequence> {
+ using TResult = integer_sequence;
+ };
+
+ using TResult = typename IntSeqCombiner<
+ typename IntSeqImpl::TResult,
+ typename IntSeqImpl::TResult>::TResult;
+};
+
+template
+struct IntSeqImpl {
+ using TValue = T;
+ static_assert(std::is_integral::value, "not integral type");
+ static_assert(Begin >= 0, "unexpected argument (Begin<0)");
+ using TResult = integer_sequence;
+};
+
+template
+struct IntSeqImpl {
+ using TValue = T;
+ static_assert(std::is_integral::value, "not integral type");
+ static_assert(Begin >= 0, "unexpected argument (Begin<0)");
+ using TResult = integer_sequence;
+};
+} // namespace detail_
+
+template
+using make_integer_sequence =
+ typename detail_::IntSeqImpl::TResult;
+
+template
+using make_index_sequence = make_integer_sequence;
+
+template
+using index_sequence_for = make_index_sequence;
+
+} // namespace ROBIN_HOOD_STD
+
+#endif
+
+namespace detail {
+
+// make sure we static_cast to the correct type for hash_int
+#if ROBIN_HOOD(BITNESS) == 64
+using SizeT = uint64_t;
+#else
+using SizeT = uint32_t;
+#endif
+
+template
+T rotr(T x, unsigned k) {
+ return (x >> k) | (x << (8U * sizeof(T) - k));
+}
+
+// This cast gets rid of warnings like "cast from 'uint8_t*' {aka 'unsigned
+// char*'} to
+// 'uint64_t*' {aka 'long unsigned int*'} increases required alignment of target
+// type". Use with
+// care!
+template
+inline T reinterpret_cast_no_cast_align_warning(void *ptr) noexcept {
+ return reinterpret_cast(ptr);
+}
+
+template
+inline T reinterpret_cast_no_cast_align_warning(void const *ptr) noexcept {
+ return reinterpret_cast(ptr);
+}
+
+// make sure this is not inlined as it is slow and dramatically enlarges code,
+// thus making other
+// inlinings more difficult. Throws are also generally the slow path.
+template
+[[noreturn]] ROBIN_HOOD(NOINLINE)
+#if ROBIN_HOOD(HAS_EXCEPTIONS)
+ void doThrow(Args &&... args) {
+ // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-array-to-pointer-decay)
+ throw E(std::forward(args)...);
+}
+#else
+ void doThrow(Args &&... ROBIN_HOOD_UNUSED(args) /*unused*/) {
+ abort();
+}
+#endif
+
+template
+T *assertNotNull(T *t, Args &&... args) {
+ if (ROBIN_HOOD_UNLIKELY(nullptr == t)) {
+ doThrow(std::forward(args)...);
+ }
+ return t;
+}
+
+template
+inline T unaligned_load(void const *ptr) noexcept {
+ // using memcpy so we don't get into unaligned load problems.
+ // compiler should optimize this very well anyways.
+ T t;
+ std::memcpy(&t, ptr, sizeof(T));
+ return t;
+}
+
+// Allocates bulks of memory for objects of type T. This deallocates the memory
+// in the destructor,
+// and keeps a linked list of the allocated memory around. Overhead per
+// allocation is the size of a
+// pointer.
+template
+class BulkPoolAllocator {
+ public:
+ BulkPoolAllocator() noexcept = default;
+
+ // does not copy anything, just creates a new allocator.
+ BulkPoolAllocator(const BulkPoolAllocator &ROBIN_HOOD_UNUSED(
+ o) /*unused*/) noexcept : mHead(nullptr),
+ mListForFree(nullptr) {}
+
+ BulkPoolAllocator(BulkPoolAllocator &&o) noexcept
+ : mHead(o.mHead),
+ mListForFree(o.mListForFree) {
+ o.mListForFree = nullptr;
+ o.mHead = nullptr;
+ }
+
+ BulkPoolAllocator &operator=(BulkPoolAllocator &&o) noexcept {
+ reset();
+ mHead = o.mHead;
+ mListForFree = o.mListForFree;
+ o.mListForFree = nullptr;
+ o.mHead = nullptr;
+ return *this;
+ }
+
+ BulkPoolAllocator &
+ // NOLINTNEXTLINE(bugprone-unhandled-self-assignment,cert-oop54-cpp)
+ operator=(const BulkPoolAllocator &ROBIN_HOOD_UNUSED(o) /*unused*/) noexcept {
+ // does not do anything
+ return *this;
+ }
+
+ ~BulkPoolAllocator() noexcept { reset(); }
+
+ // Deallocates all allocated memory.
+ void reset() noexcept {
+ while (mListForFree) {
+ T *tmp = *mListForFree;
+ ROBIN_HOOD_LOG("std::free")
+ std::free(mListForFree);
+ mListForFree = reinterpret_cast_no_cast_align_warning(tmp);
+ }
+ mHead = nullptr;
+ }
+
+ // allocates, but does NOT initialize. Use in-place new constructor, e.g.
+ // T* obj = pool.allocate();
+ // ::new (static_cast(obj)) T();
+ T *allocate() {
+ T *tmp = mHead;
+ if (!tmp) {
+ tmp = performAllocation();
+ }
+
+ mHead = *reinterpret_cast_no_cast_align_warning(tmp);
+ return tmp;
+ }
+
+ // does not actually deallocate but puts it in store.
+ // make sure you have already called the destructor! e.g. with
+ // obj->~T();
+ // pool.deallocate(obj);
+ void deallocate(T *obj) noexcept {
+ *reinterpret_cast_no_cast_align_warning(obj) = mHead;
+ mHead = obj;
+ }
+
+ // Adds an already allocated block of memory to the allocator. This allocator
+ // is from now on
+ // responsible for freeing the data (with free()). If the provided data is not
+ // large enough to
+ // make use of, it is immediately freed. Otherwise it is reused and freed in
+ // the destructor.
+ void addOrFree(void *ptr, const size_t numBytes) noexcept {
+ // calculate number of available elements in ptr
+ if (numBytes < ALIGNMENT + ALIGNED_SIZE) {
+ // not enough data for at least one element. Free and return.
+ ROBIN_HOOD_LOG("std::free")
+ std::free(ptr);
+ } else {
+ ROBIN_HOOD_LOG("add to buffer")
+ add(ptr, numBytes);
+ }
+ }
+
+ void swap(BulkPoolAllocator &other) noexcept {
+ using std::swap;
+ swap(mHead, other.mHead);
+ swap(mListForFree, other.mListForFree);
+ }
+
+ private:
+ // iterates the list of allocated memory to calculate how many to alloc next.
+ // Recalculating this each time saves us a size_t member.
+ // This ignores the fact that memory blocks might have been added manually
+ // with addOrFree. In
+ // practice, this should not matter much.
+ ROBIN_HOOD(NODISCARD) size_t calcNumElementsToAlloc() const noexcept {
+ auto tmp = mListForFree;
+ size_t numAllocs = MinNumAllocs;
+
+ while (numAllocs * 2 <= MaxNumAllocs && tmp) {
+ auto x = reinterpret_cast(tmp);
+ tmp = *x;
+ numAllocs *= 2;
+ }
+
+ return numAllocs;
+ }
+
+ // WARNING: Underflow if numBytes < ALIGNMENT! This is guarded in addOrFree().
+ void add(void *ptr, const size_t numBytes) noexcept {
+ const size_t numElements = (numBytes - ALIGNMENT) / ALIGNED_SIZE;
+
+ auto data = reinterpret_cast(ptr);
+
+ // link free list
+ auto x = reinterpret_cast(data);
+ *x = mListForFree;
+ mListForFree = data;
+
+ // create linked list for newly allocated data
+ auto *const headT = reinterpret_cast_no_cast_align_warning(
+ reinterpret_cast(ptr) + ALIGNMENT);
+
+ auto *const head = reinterpret_cast(headT);
+
+ // Visual Studio compiler automatically unrolls this loop, which is pretty
+ // cool
+ for (size_t i = 0; i < numElements; ++i) {
+ *reinterpret_cast_no_cast_align_warning(
+ head + i * ALIGNED_SIZE) = head + (i + 1) * ALIGNED_SIZE;
+ }
+
+ // last one points to 0
+ *reinterpret_cast_no_cast_align_warning(
+ head + (numElements - 1) * ALIGNED_SIZE) = mHead;
+ mHead = headT;
+ }
+
+ // Called when no memory is available (mHead == 0).
+ // Don't inline this slow path.
+ ROBIN_HOOD(NOINLINE) T *performAllocation() {
+ size_t const numElementsToAlloc = calcNumElementsToAlloc();
+
+ // alloc new memory: [prev |T, T, ... T]
+ size_t const bytes = ALIGNMENT + ALIGNED_SIZE * numElementsToAlloc;
+ ROBIN_HOOD_LOG("std::malloc " << bytes << " = " << ALIGNMENT << " + "
+ << ALIGNED_SIZE << " * "
+ << numElementsToAlloc)
+ add(assertNotNull(std::malloc(bytes)), bytes);
+ return mHead;
+ }
+
+// enforce byte alignment of the T's
+#if ROBIN_HOOD(CXX) >= ROBIN_HOOD(CXX14)
+ static constexpr size_t ALIGNMENT =
+ (std::max)(std::alignment_of::value, std::alignment_of::value);
+#else
+ static const size_t ALIGNMENT =
+ (ROBIN_HOOD_STD::alignment_of::value >
+ ROBIN_HOOD_STD::alignment_of::value)
+ ? ROBIN_HOOD_STD::alignment_of::value
+ : +ROBIN_HOOD_STD::alignment_of::value; // the + is for
+ // workaround
+#endif
+
+ static constexpr size_t ALIGNED_SIZE =
+ ((sizeof(T) - 1) / ALIGNMENT + 1) * ALIGNMENT;
+
+ static_assert(MinNumAllocs >= 1, "MinNumAllocs");
+ static_assert(MaxNumAllocs >= MinNumAllocs, "MaxNumAllocs");
+ static_assert(ALIGNED_SIZE >= sizeof(T *), "ALIGNED_SIZE");
+ static_assert(0 == (ALIGNED_SIZE % sizeof(T *)), "ALIGNED_SIZE mod");
+ static_assert(ALIGNMENT >= sizeof(T *), "ALIGNMENT");
+
+ T *mHead{nullptr};
+ T **mListForFree{nullptr};
+};
+
+template
+struct NodeAllocator;
+
+// dummy allocator that does nothing
+template
+struct NodeAllocator {
+ // we are not using the data, so just free it.
+ void addOrFree(void *ptr,
+ size_t ROBIN_HOOD_UNUSED(numBytes) /*unused*/) noexcept {
+ ROBIN_HOOD_LOG("std::free")
+ std::free(ptr);
+ }
+};
+
+template
+struct NodeAllocator
+ : public BulkPoolAllocator {};
+
+// c++14 doesn't have is_nothrow_swappable, and clang++ 6.0.1 doesn't like it
+// either, so I'm making
+// my own here.
+namespace swappable {
+#if ROBIN_HOOD(CXX) < ROBIN_HOOD(CXX17)
+using std::swap;
+template
+struct nothrow {
+ static const bool value =
+ noexcept(swap(std::declval(), std::declval()));
+};
+#else
+template
+struct nothrow {
+ static const bool value = std::is_nothrow_swappable::value;
+};
+#endif
+} // namespace swappable
+
+} // namespace detail
+
+struct is_transparent_tag {};
+
+// A custom pair implementation is used in the map because std::pair is not
+// is_trivially_copyable,
+// which means it would not be allowed to be used in std::memcpy. This struct
+// is copyable, which is
+// also tested.
+template
+struct pair {
+ using first_type = T1;
+ using second_type = T2;
+
+ template ::value &&
+ std::is_default_constructible::value>::type>
+ constexpr pair() noexcept(noexcept(U1()) && noexcept(U2()))
+ : first(), second() {}
+
+ // pair constructors are explicit so we don't accidentally call this ctor when
+ // we don't have to.
+ explicit constexpr pair(std::pair const &o) noexcept(
+ noexcept(T1(std::declval())) &&
+ noexcept(T2(std::declval())))
+ : first(o.first), second(o.second) {}
+
+ // pair constructors are explicit so we don't accidentally call this ctor when
+ // we don't have to.
+ explicit constexpr pair(std::pair &&o) noexcept(
+ noexcept(T1(std::move(std::declval()))) &&
+ noexcept(T2(std::move(std::declval()))))
+ : first(std::move(o.first)), second(std::move(o.second)) {}
+
+ constexpr pair(T1 &&a, T2 &&b) noexcept(
+ noexcept(T1(std::move(std::declval()))) &&
+ noexcept(T2(std::move(std::declval()))))
+ : first(std::move(a)), second(std::move(b)) {}
+
+ template
+ constexpr pair(U1 &&a, U2 &&b) noexcept(
+ noexcept(T1(std::forward(std::declval()))) &&
+ noexcept(T2(std::forward(std::declval()))))
+ : first(std::forward(a)), second(std::forward(b)) {}
+
+ template
+// MSVC 2015 produces error "C2476: ‘constexpr’ constructor does not initialize
+// all members"
+// if this constructor is constexpr
+#if !ROBIN_HOOD(BROKEN_CONSTEXPR)
+ constexpr
+#endif
+ pair(std::piecewise_construct_t /*unused*/, std::tuple a,
+ std::tuple
+ b) noexcept(noexcept(pair(std::declval &>(),
+ std::declval &>(),
+ ROBIN_HOOD_STD::index_sequence_for<
+ U1...>(),
+ ROBIN_HOOD_STD::index_sequence_for<
+ U2...>())))
+ : pair(a, b, ROBIN_HOOD_STD::index_sequence_for(),
+ ROBIN_HOOD_STD::index_sequence_for()) {
+ }
+
+ // constructor called from the std::piecewise_construct_t ctor
+ template
+ pair(
+ std::tuple &a, std::tuple &b,
+ ROBIN_HOOD_STD::index_sequence /*unused*/,
+ ROBIN_HOOD_STD::index_sequence<
+ I2...> /*unused*/) noexcept(noexcept(T1(std::
+ forward(std::get(
+ std::declval<
+ std::tuple
+ &>()))...)) &&
+ noexcept(T2(std::forward(std::get(
+ std::declval<
+ std::tuple &>()))...)))
+ : first(std::forward(std::get(a))...),
+ second(std::forward(std::get(b))...) {
+ // make visual studio compiler happy about warning about unused a & b.
+ // Visual studio's pair implementation disables warning 4100.
+ (void)a;
+ (void)b;
+ }
+
+ void swap(pair &o) noexcept((detail::swappable::nothrow::value) &&
+ (detail::swappable::nothrow::value)) {
+ using std::swap;
+ swap(first, o.first);
+ swap(second, o.second);
+ }
+
+ T1 first; // NOLINT(misc-non-private-member-variables-in-classes)
+ T2 second; // NOLINT(misc-non-private-member-variables-in-classes)
+};
+
+template
+inline void swap(pair &a, pair &b) noexcept(
+ noexcept(std::declval &>().swap(std::declval &>()))) {
+ a.swap(b);
+}
+
+template
+inline constexpr bool operator==(pair const &x, pair const &y) {
+ return (x.first == y.first) && (x.second == y.second);
+}
+template
+inline constexpr bool operator!=(pair const &x, pair const &y) {
+ return !(x == y);
+}
+template
+inline constexpr bool
+operator<(pair const &x, pair const &y) noexcept(
+ noexcept(std::declval() < std::declval()) &&
+ noexcept(std::declval() < std::declval())) {
+ return x.first < y.first || (!(y.first < x.first) && x.second < y.second);
+}
+template
+inline constexpr bool operator>(pair const &x, pair const &y) {
+ return y < x;
+}
+template
+inline constexpr bool operator<=(pair const &x, pair const &y) {
+ return !(x > y);
+}
+template
+inline constexpr bool operator>=(pair const &x, pair const &y) {
+ return !(x < y);
+}
+
+inline size_t hash_bytes(void const *ptr, size_t len) noexcept {
+ static constexpr uint64_t m = UINT64_C(0xc6a4a7935bd1e995);
+ static constexpr uint64_t seed = UINT64_C(0xe17a1465);
+ static constexpr unsigned int r = 47;
+
+ auto const *const data64 = static_cast(ptr);
+ uint64_t h = seed ^ (len * m);
+
+ size_t const n_blocks = len / 8;
+ for (size_t i = 0; i < n_blocks; ++i) {
+ auto k = detail::unaligned_load(data64 + i);
+
+ k *= m;
+ k ^= k >> r;
+ k *= m;
+
+ h ^= k;
+ h *= m;
+ }
+
+ auto const *const data8 =
+ reinterpret_cast(data64 + n_blocks);
+ switch (len & 7U) {
+ case 7:
+ h ^= static_cast(data8[6]) << 48U;
+ ROBIN_HOOD(FALLTHROUGH); // FALLTHROUGH
+ case 6:
+ h ^= static_cast(data8[5]) << 40U;
+ ROBIN_HOOD(FALLTHROUGH); // FALLTHROUGH
+ case 5:
+ h ^= static_cast(data8[4]) << 32U;
+ ROBIN_HOOD(FALLTHROUGH); // FALLTHROUGH
+ case 4:
+ h ^= static_cast(data8[3]) << 24U;
+ ROBIN_HOOD(FALLTHROUGH); // FALLTHROUGH
+ case 3:
+ h ^= static_cast(data8[2]) << 16U;
+ ROBIN_HOOD(FALLTHROUGH); // FALLTHROUGH
+ case 2:
+ h ^= static_cast(data8[1]) << 8U;
+ ROBIN_HOOD(FALLTHROUGH); // FALLTHROUGH
+ case 1:
+ h ^= static_cast(data8[0]);
+ h *= m;
+ ROBIN_HOOD(FALLTHROUGH); // FALLTHROUGH
+ default:
+ break;
+ }
+
+ h ^= h >> r;
+
+ // not doing the final step here, because this will be done by keyToIdx
+ // anyways
+ // h *= m;
+ // h ^= h >> r;
+ return static_cast(h);
+}
+
+inline size_t hash_int(uint64_t x) noexcept {
+ // tried lots of different hashes, let's stick with murmurhash3. It's simple,
+ // fast, well tested,
+ // and doesn't need any special 128bit operations.
+ x ^= x >> 33U;
+ x *= UINT64_C(0xff51afd7ed558ccd);
+ x ^= x >> 33U;
+
+ // not doing the final step here, because this will be done by keyToIdx
+ // anyways
+ // x *= UINT64_C(0xc4ceb9fe1a85ec53);
+ // x ^= x >> 33U;
+ return static_cast(x);
+}
+
+// A thin wrapper around std::hash, performing an additional simple mixing step
+// of the result.
+template
+struct hash : public std::hash {
+ size_t operator()(T const &obj) const noexcept(noexcept(
+ std::declval>().operator()(std::declval()))) {
+ // call base hash
+ auto result = std::hash::operator()(obj);
+ // return mixed of that, to be safe against identity hash
+ return hash_int(static_cast(result));
+ }
+};
+
+template
+struct hash> {
+ size_t operator()(std::basic_string const &str) const noexcept {
+ return hash_bytes(str.data(), sizeof(CharT) * str.size());
+ }
+};
+
+#if ROBIN_HOOD(CXX) >= ROBIN_HOOD(CXX17)
+template