
Commit af4943f

merge develop

2 parents: c130cf2 + 85642a0

157 files changed: 3731 additions & 1153 deletions


CMakeLists.txt

Lines changed: 15 additions & 7 deletions
@@ -12,8 +12,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License

-cmake_minimum_required(VERSION 3.10)
-cmake_policy(VERSION 3.10)
+if(APPLE AND WITH_ARM)
+    # cmake 3.19.2 version starts to support M1
+    cmake_minimum_required(VERSION 3.19.2)
+    cmake_policy(VERSION 3.19.2)
+else(APPLE AND WITH_ARM)
+    cmake_minimum_required(VERSION 3.10)
+    cmake_policy(VERSION 3.10)
+endif(APPLE AND WITH_ARM)
 set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
 set(PADDLE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
 set(PADDLE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
@@ -73,6 +79,11 @@ if(WITH_MUSL)
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=deprecated-declarations -Wno-deprecated-declarations -Wno-error=pessimizing-move -Wno-error=deprecated-copy")
 endif()

+if(APPLE AND WITH_ARM)
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -target arm64-apple-darwin")
+    set(CMAKE_CXX_FLAGS "${CMAKE_C_FLAGS} -target arm64-apple-darwin")
+endif()
+
 if(WITH_ASCEND_CL AND NOT WITH_ASCEND_CXX11)
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0")
 endif()
@@ -97,10 +108,6 @@ if(WIN32)

     if (MSVC_STATIC_CRT)
         message(STATUS "Use static C runtime time, refer to https://docs.microsoft.com/en-us/cpp/c-runtime-library/crt-library-features?view=vs-2019")
-        set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /MTd")
-        set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /MT")
-        set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /MTd")
-        set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /MT")
         foreach(flag_var
             CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
             CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO
@@ -336,8 +343,9 @@ endif()
 if(WITH_ARM)
     set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fPIC")
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC")
-    set(WITH_XBYAK OFF CACHE STRING "Disable XBYAK when compiling WITH_ARM=ON" FORCE)
+    set(WITH_XBYAK OFF CACHE STRING "Disable XBYAK when compiling WITH_ARM=ON." FORCE)
     set(WITH_MKL OFF CACHE STRING "Disable MKL when compiling WITH_ARM=ON." FORCE)
+    set(WITH_AVX OFF CACHE STRING "Disable AVX when compiling WITH_AVX=OFF." FORCE)
     add_definitions(-DPADDLE_WITH_ARM)
 endif()
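The WITH_ARM block above leans on a standard CMake idiom: when one option implies another, the dependent entry is written to the cache with FORCE so a value left over from an earlier configure run cannot silently re-enable it. A minimal, self-contained sketch of that idiom (the project name is scaffolding; the variable names come from the diff):

    # Sketch of the cache-forcing idiom used in the WITH_ARM block above.
    cmake_minimum_required(VERSION 3.10)
    project(force_demo NONE)
    option(WITH_ARM "Compile with ARM support" OFF)
    if(WITH_ARM)
        # FORCE overwrites whatever CMakeCache.txt recorded on a previous run,
        # so WITH_MKL cannot stay ON once the target platform rules it out.
        set(WITH_MKL OFF CACHE STRING "Disable MKL when compiling WITH_ARM=ON." FORCE)
    endif()
    message(STATUS "WITH_MKL = ${WITH_MKL}")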

cmake/ccache.cmake

Lines changed: 26 additions & 11 deletions
@@ -1,14 +1,29 @@
 # Use ccache if found ccache program

-find_program(CCACHE_PATH ccache)
+if(NOT WIN32)
+    find_program(CCACHE_PATH ccache)
+    if(CCACHE_PATH)
+        execute_process(COMMAND ccache -V OUTPUT_VARIABLE ccache_output)
+        execute_process(COMMAND ccache -s cache directory OUTPUT_VARIABLE cache_directory)
+        string(REGEX MATCH "[0-9]+.[0-9]+" ccache_version ${ccache_output})
+        message(STATUS "ccache is founded, use ccache to speed up compile on Unix.")
+        # show statistics summary of ccache
+        message("ccache version\t\t\t " ${ccache_version} "\n" ${cache_directory})
+        set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ${CCACHE_PATH})
+        set_property(GLOBAL PROPERTY RULE_LAUNCH_LINK ${CCACHE_PATH})
+    endif(CCACHE_PATH)
+elseif("${CMAKE_GENERATOR}" STREQUAL "Ninja")
+    # (Note:zhouwei25) Only Ninja Generator can support sccache now
+    find_program(SCCACHE_PATH sccache)

-if(CCACHE_PATH)
-    execute_process(COMMAND ccache -V OUTPUT_VARIABLE ccache_output)
-    execute_process(COMMAND ccache -s cache directory OUTPUT_VARIABLE cache_directory)
-    string(REGEX MATCH "[0-9]+.[0-9]+" ccache_version ${ccache_output})
-    message(STATUS "Ccache is founded, use ccache to speed up compile.")
-    # show statistics summary of ccache
-    message("ccache version\t\t\t " ${ccache_version} "\n" ${cache_directory})
-    set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ${CCACHE_PATH})
-    set_property(GLOBAL PROPERTY RULE_LAUNCH_LINK ${CCACHE_PATH})
-endif(CCACHE_PATH)
+    if(SCCACHE_PATH)
+        execute_process(COMMAND sccache -V OUTPUT_VARIABLE sccache_version)
+        message(STATUS "${sccache_version} is founded, use [${SCCACHE_PATH}] to speed up compile on Windows.")
+
+        set(CMAKE_C_COMPILER_LAUNCHER ${SCCACHE_PATH})
+        set(CMAKE_CXX_COMPILER_LAUNCHER ${SCCACHE_PATH})
+        # (Note:zhouwei25) sccache for cuda compiler has bug so that it can't be hit
+        # refer to https://github.com/mozilla/sccache/issues/1017, so we fix it
+        set(CMAKE_CUDA_COMPILER_LAUNCHER ${SCCACHE_PATH})
+    endif(SCCACHE_PATH)
+endif()
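The platform split above reflects how the two caching tools hook into CMake differently: ccache is attached through the global RULE_LAUNCH_COMPILE/RULE_LAUNCH_LINK properties, while sccache is attached through the per-language CMAKE_<LANG>_COMPILER_LAUNCHER variables, which is why the in-code note restricts it to the Ninja generator. A condensed sketch of both hooks, assuming one of the tools is on PATH:

    # Condensed sketch of the two launcher mechanisms used above.
    find_program(CACHE_TOOL NAMES ccache sccache)
    if(CACHE_TOOL)
        # Makefile-style generators: wrap every compile and link rule globally.
        set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ${CACHE_TOOL})
        set_property(GLOBAL PROPERTY RULE_LAUNCH_LINK ${CACHE_TOOL})
        # Ninja: the per-language launcher variables are the supported route.
        set(CMAKE_C_COMPILER_LAUNCHER ${CACHE_TOOL})
        set(CMAKE_CXX_COMPILER_LAUNCHER ${CACHE_TOOL})
    endif()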

cmake/cuda.cmake

Lines changed: 0 additions & 2 deletions
@@ -218,8 +218,6 @@ if(WIN32)
     set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler \"/wd4244 /wd4267 /wd4819 \"")
     set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler /bigobj")
     if(MSVC_STATIC_CRT)
-        set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -Xcompiler /MTd")
-        set(CMAKE_CUDA_FLAGS_RELEASE "${CMAKE_CUDA_FLAGS_RELEASE} -Xcompiler /MT")
         foreach(flag_var
             CMAKE_CUDA_FLAGS CMAKE_CUDA_FLAGS_DEBUG CMAKE_CUDA_FLAGS_RELEASE
             CMAKE_CUDA_FLAGS_MINSIZEREL CMAKE_CUDA_FLAGS_RELWITHDEBINFO)

cmake/external/cryptopp.cmake

Lines changed: 4 additions & 0 deletions
@@ -33,6 +33,10 @@ ELSE(WIN32)
   SET(CRYPTOPP_LIBRARIES "${CRYPTOPP_INSTALL_DIR}/lib/libcryptopp.a" CACHE FILEPATH "cryptopp library." FORCE)
 ENDIF(WIN32)

+IF(APPLE AND WITH_ARM)
+  SET(CMAKE_CXX_FLAGS "-DCRYPTOPP_ARM_CRC32_AVAILABLE=0")
+ENDIF()
+
 set(CRYPTOPP_CMAKE_ARGS ${COMMON_CMAKE_ARGS}
     -DBUILD_SHARED=ON
     -DBUILD_STATIC=ON

cmake/external/cub.cmake

Lines changed: 12 additions & 12 deletions
@@ -14,27 +14,27 @@

 include(ExternalProject)

-set(CUB_PREFIX_DIR ${THIRD_PARTY_PATH}/cub)
-set(CUB_SOURCE_DIR ${THIRD_PARTY_PATH}/cub/src/extern_cub)
-set(CUB_REPOSITORY ${GIT_URL}/NVlabs/cub.git)
-set(CUB_TAG 1.8.0)
+# Note(zhouwei): extern_cub has code __FILE_, If the path of extern_cub is changed,
+# it will effect about 30+ cu files sccache hit and slow compile speed on windows.
+# Therefore, a fixed CUB_PATH will be input to increase the sccache hit rate.
+set(CUB_PATH "${THIRD_PARTY_PATH}/cub" CACHE STRING "A path setting for external_cub path.")
+set(CUB_PREFIX_DIR ${CUB_PATH})

-cache_third_party(extern_cub
-    REPOSITORY ${CUB_REPOSITORY}
-    TAG ${CUB_TAG}
-    DIR CUB_SOURCE_DIR)
+set(CUB_REPOSITORY ${GIT_URL}/NVlabs/cub.git)
+set(CUB_TAG 1.8.0)

-SET(CUB_INCLUDE_DIR ${CUB_SOURCE_DIR})
+SET(CUB_INCLUDE_DIR ${CUB_PREFIX_DIR}/src/extern_cub)
+message("CUB_INCLUDE_DIR is ${CUB_INCLUDE_DIR}")
 include_directories(${CUB_INCLUDE_DIR})

 ExternalProject_Add(
     extern_cub
     ${EXTERNAL_PROJECT_LOG_ARGS}
     ${SHALLOW_CLONE}
-    "${CUB_DOWNLOAD_CMD}"
+    GIT_REPOSITORY ${CUB_REPOSITORY}
+    GIT_TAG ${CUB_TAG}
     PREFIX ${CUB_PREFIX_DIR}
-    SOURCE_DIR ${CUB_SOURCE_DIR}
-    UPDATE_COMMAND ""
+    UPDATE_COMMAND ""
     CONFIGURE_COMMAND ""
     BUILD_COMMAND ""
     INSTALL_COMMAND ""
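Because CUB_PATH is now a cache entry rather than a derived path, it can be pinned once and reused across build directories, keeping the __FILE__ strings baked into the affected .cu files byte-identical so sccache keeps hitting. A hypothetical cache-initialization snippet (the directory is illustrative; loaded with cmake -C init.cmake):

    # init.cmake -- pin the CUB checkout location before the first configure.
    set(CUB_PATH "C:/paddle_third_party/cub" CACHE STRING "A path setting for external_cub path.")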

cmake/external/openblas.cmake

Lines changed: 4 additions & 0 deletions
@@ -19,6 +19,10 @@ SET(CBLAS_SOURCE_DIR ${THIRD_PARTY_PATH}/openblas/src/extern_openblas)
 SET(CBLAS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/openblas)
 SET(CBLAS_REPOSITORY ${GIT_URL}/xianyi/OpenBLAS.git)
 SET(CBLAS_TAG v0.3.7)
+if(APPLE AND WITH_ARM)
+    SET(CBLAS_TAG v0.3.13)
+endif()
+
 if(WITH_MIPS)
     SET(CBLAS_TAG v0.3.13)
 endif()

cmake/flags.cmake

Lines changed: 5 additions & 2 deletions
@@ -186,8 +186,11 @@ endif()
 endif(NOT WIN32)

 if (APPLE)
-    # On Mac OS X build fat binaries with x86_64 architectures by default.
-    set (CMAKE_OSX_ARCHITECTURES "x86_64" CACHE STRING "Build architectures for OSX" FORCE)
+    if(WITH_ARM)
+        set (CMAKE_OSX_ARCHITECTURES "arm64" CACHE STRING "Build architectures for OSX" FORCE)
+    else(WITH_ARM)
+        set (CMAKE_OSX_ARCHITECTURES "x86_64" CACHE STRING "Build architectures for OSX" FORCE)
+    endif(WITH_ARM)
     # On Mac OS X register class specifier is deprecated and will cause warning error on latest clang 10.0
     set (COMMON_FLAGS -Wno-deprecated-register)
 endif(APPLE)
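CMAKE_OSX_ARCHITECTURES is what clang ultimately receives as -arch, so forcing it to arm64 also lets an x86_64 host cross-build for Apple silicon without a separate toolchain file. A minimal sketch under that assumption (requires a recent Xcode SDK):

    # Sketch: request an arm64 slice explicitly; clang on macOS cross-compiles
    # given the matching SDK, mirroring the WITH_ARM branch above.
    set(CMAKE_OSX_ARCHITECTURES "arm64" CACHE STRING "Build architectures for OSX" FORCE)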

paddle/fluid/extension/include/ext_place.h

Lines changed: 1 addition & 1 deletion
@@ -17,6 +17,6 @@ limitations under the License. */
 namespace paddle {

 // TODO(yangjiabin): Add other place support in next PR
-enum class PlaceType { kUNK = -1, kCPU, kGPU };
+enum class PlaceType { kUNK = -1, kCPU, kGPU, kHIP };

 }  // namespace paddle

paddle/fluid/extension/include/ext_tensor.h

Lines changed: 3 additions & 1 deletion
@@ -116,9 +116,11 @@ class PD_DLL_DECL Tensor {
   /// \brief Check Tensor is initialized
   bool is_initialized() const;

-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA)
   /// \bref Get current stream of Tensor
   cudaStream_t stream() const;
+#elif defined(PADDLE_WITH_HIP)
+  hipStream_t stream() const;
 #endif

  private:
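With the conditional signature above, a caller can keep a single call site and let the preprocessor choose the stream type. A hypothetical fragment (the umbrella header name follows the custom-operator convention; nothing here beyond stream() comes from the diff):

    #include "paddle/extension.h"

    #if defined(PADDLE_WITH_CUDA)
    using device_stream_t = cudaStream_t;  // CUDA build
    #elif defined(PADDLE_WITH_HIP)
    using device_stream_t = hipStream_t;   // ROCm build
    #endif

    #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
    void EnqueueOn(const paddle::Tensor &t) {
      // stream() throws unless the framework has set a stream on this tensor.
      device_stream_t s = t.stream();
      // ... launch device work on s ...
    }
    #endif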

paddle/fluid/extension/src/ext_tensor.cc

Lines changed: 71 additions & 25 deletions
@@ -53,7 +53,7 @@ struct CastDataType {
       auto *context = static_cast<const platform::CPUDeviceContext *>(ctx_);
       trans(*context, in_begin, in_end, out_begin,
             CastDataTypeFunctor<InType, OutType>());
-#ifdef __NVCC__
+#if defined(__NVCC__) || defined(__HIPCC__)
     } else if (platform::is_gpu_place(in_.place())) {
       platform::Transform<platform::CUDADeviceContext> trans;
       auto *context = static_cast<const platform::CUDADeviceContext *>(ctx_);
@@ -67,10 +67,11 @@
     }
   }
 };
+
 template <typename T>
-void GpuCopy(T *src, T *dst, PlaceType src_plc, PlaceType dst_plc,
-             int64_t ele_size) {
-#ifdef PADDLE_WITH_CUDA
+void DeviceCopy(T *src, T *dst, PlaceType src_plc, PlaceType dst_plc,
+                int64_t ele_size) {
+#if defined(PADDLE_WITH_CUDA)
   platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
   int device_num = paddle::platform::GetCurrentDeviceId();
   platform::CUDAPlace gpu_place(device_num);
@@ -90,6 +91,30 @@ void GpuCopy(T *src, T *dst, PlaceType src_plc, PlaceType dst_plc,
         "Only GPU related Copy can reach this func."));
   }
   cudaStreamSynchronize(dev_ctx->stream());
+#elif defined(PADDLE_WITH_HIP)
+  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+  int device_num = paddle::platform::GetCurrentDeviceId();
+  platform::CUDAPlace gpu_place(device_num);
+  auto *dev_ctx =
+      static_cast<const platform::CUDADeviceContext *>(pool.Get(gpu_place));
+  if ((src_plc == PlaceType::kHIP) && (dst_plc == PlaceType::kCPU)) {
+    memory::Copy(platform::CPUPlace(), static_cast<void *>(dst), gpu_place, src,
+                 ele_size, dev_ctx->stream());
+  } else if ((src_plc == PlaceType::kHIP) && (dst_plc == PlaceType::kHIP)) {
+    memory::Copy(gpu_place, static_cast<void *>(dst), gpu_place, src, ele_size,
+                 dev_ctx->stream());
+  } else if ((src_plc == PlaceType::kCPU) && (dst_plc == PlaceType::kHIP)) {
+    memory::Copy(gpu_place, static_cast<void *>(dst), platform::CPUPlace(), src,
+                 ele_size, dev_ctx->stream());
+  } else {
+    PADDLE_THROW(platform::errors::Unavailable(
+        "Only GPU related Copy can reach this func."));
+  }
+  hipStreamSynchronize(dev_ctx->stream());
+#else
+  PADDLE_THROW(platform::errors::Unavailable(
+      "This function can only be used if compiled with"
+      "either -DWITH_ROCM=ON or -DWITH_GPU=ON"));
 #endif
 }

@@ -137,11 +162,16 @@ T *Tensor::mutable_data() {
     case static_cast<int>(PlaceType::kCPU): {
       return tensor->mutable_data<T>(platform::CPUPlace());
     }
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA)
     case static_cast<int>(PlaceType::kGPU): {
      int device_num = platform::GetCurrentDeviceId();
      return tensor->mutable_data<T>(platform::CUDAPlace(device_num));
    }
+#elif defined(PADDLE_WITH_HIP)
+    case static_cast<int>(PlaceType::kHIP): {
+      int device_num = platform::GetCurrentDeviceId();
+      return tensor->mutable_data<T>(platform::CUDAPlace(device_num));
+    }
 #endif
     default:
       PADDLE_THROW(platform::errors::Unavailable(
@@ -202,17 +232,23 @@ Tensor Tensor::copy_to(const PlaceType &target_place) const {
   target.reshape(shape());
   auto *p_target_data = target.template mutable_data<T>();

+  bool supported_gpu_transform = false;
+#if defined(PADDLE_WITH_CUDA)
+  supported_gpu_transform =
+      (src_place == PlaceType::kGPU && target_place == PlaceType::kCPU) ||
+      (src_place == PlaceType::kCPU && target_place == PlaceType::kGPU) ||
+      (src_place == PlaceType::kGPU && target_place == PlaceType::kGPU);
+#elif defined(PADDLE_WITH_HIP)
+  supported_gpu_transform =
+      (src_place == PlaceType::kHIP && target_place == PlaceType::kCPU) ||
+      (src_place == PlaceType::kCPU && target_place == PlaceType::kHIP) ||
+      (src_place == PlaceType::kHIP && target_place == PlaceType::kHIP);
+#endif
+
   if ((src_place == PlaceType::kCPU) && (target_place == PlaceType::kCPU)) {
     std::memcpy(static_cast<void *>(p_target_data), p_src_data, ele_size);
-  } else if ((src_place == PlaceType::kGPU) &&
-             (target_place == PlaceType::kCPU)) {
-    GpuCopy<T>(p_src_data, p_target_data, src_place, target_place, ele_size);
-  } else if ((src_place == PlaceType::kCPU) &&
-             (target_place == PlaceType::kGPU)) {
-    GpuCopy<T>(p_src_data, p_target_data, src_place, target_place, ele_size);
-  } else if ((src_place == PlaceType::kGPU) &&
-             (target_place == PlaceType::kGPU)) {
-    GpuCopy<T>(p_src_data, p_target_data, src_place, target_place, ele_size);
+  } else if (supported_gpu_transform) {
+    DeviceCopy<T>(p_src_data, p_target_data, src_place, target_place, ele_size);
   } else {
     PADDLE_THROW(platform::errors::Unavailable(
         "Not supported place transform of place: %d to place: %d",
@@ -304,13 +340,18 @@ const PlaceType &Tensor::place() const {
   GET_CASTED_TENSOR;
   if (platform::is_cpu_place(tensor->place())) {
     place_ = PlaceType::kCPU;
+#if defined(PADDLE_WITH_CUDA)
   } else if (platform::is_gpu_place(tensor->place())) {
     place_ = PlaceType::kGPU;
+#elif defined(PADDLE_WITH_HIP)
+  } else if (platform::is_gpu_place(tensor->place())) {
+    place_ = PlaceType::kHIP;
+#endif
   } else {
     PADDLE_THROW(platform::errors::Unimplemented(
         "Current Tensor hold unsupported Place Type, Please Init it"
-        "using Tensor::mutable_data<T>(PaddlePlace) which T is"
-        "either Place::kCPU or Place::kGPU"));
+        "using Tensor::mutable_data<T>(PaddlePlace) with T among:"
+        "Place::kCPU or Place::kGPU or Place::kHIP"));
   }
   return place_;
 }
@@ -392,16 +433,21 @@ bool Tensor::is_initialized() const {
   }
 }

-#ifdef PADDLE_WITH_CUDA
-cudaStream_t Tensor::stream() const {
-  if (!stream_.IsStreamSet()) {
-    PADDLE_THROW(platform::errors::PreconditionNotMet(
-        "Stream is not Set, only input tensor will have "
-        "stream which is set by framework "));
-  } else {
-    return reinterpret_cast<cudaStream_t>(stream_.GetStream());
+#define DEFINE_STREAM(_stream_t_)                                  \
+  _stream_t_ Tensor::stream() const {                              \
+    if (!stream_.IsStreamSet()) {                                  \
+      PADDLE_THROW(platform::errors::PreconditionNotMet(           \
+          "Stream is not Set, only input tensor will have "        \
+          "stream which is set by framework "));                   \
+    } else {                                                       \
+      return reinterpret_cast<_stream_t_>(stream_.GetStream());    \
+    }                                                              \
   }
-}
+
+#if defined(PADDLE_WITH_CUDA)
+DEFINE_STREAM(cudaStream_t)
+#elif defined(PADDLE_WITH_HIP)
+DEFINE_STREAM(hipStream_t)
 #endif

 namespace framework {
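Taken together, the .cc changes mean a custom operator compiled against a ROCm build goes through the same copy_to entry point: the place check resolves to PlaceType::kHIP, DeviceCopy takes the PADDLE_WITH_HIP branch, and synchronization happens via hipStreamSynchronize instead of cudaStreamSynchronize. A hypothetical caller (error handling elided; the include follows the custom-operator convention):

    #include "paddle/extension.h"

    // Bring a device tensor back to host memory. On a CUDA build the source
    // place is PlaceType::kGPU, on a ROCm build PlaceType::kHIP; both routes
    // now funnel into DeviceCopy<T>.
    paddle::Tensor ToHost(const paddle::Tensor &x) {
      return x.copy_to<float>(paddle::PlaceType::kCPU);
    }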
