Skip to content

Commit 5f2a054

Browse files
caizhi-mtmt-robot
authored andcommitted
Merge pull request PaddlePaddle#33 from mthreads/fix_MUSAAA
improve cmakelist.txt
2 parents 172dc98 + 68f25aa commit 5f2a054

30 files changed

Lines changed: 218 additions & 112 deletions

cmake/generic.cmake

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -425,10 +425,6 @@ function(cc_binary TARGET_NAME)
425425
if(WITH_ROCM)
426426
target_link_libraries(${TARGET_NAME} ${ROCM_HIPRTC_LIB})
427427
endif()
428-
# TODO(@caizhi): enable target_link_libraries for MUSA
429-
#if(WITH_MUSA)
430-
# target_link_libraries(${TARGET_NAME} ${MUSA_LIB})
431-
#endif()
432428

433429
check_coverage_opt(${TARGET_NAME} ${cc_binary_SRCS})
434430

cmake/mccl.cmake

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@ if(WIN32)
77
return()
88
endif()
99

10-
# FIXME(MTAI): please make sure that we can find MCCL successfully
1110
if(WITH_MCCL)
1211
set(MCCL_ROOT
1312
"/usr/local/musa/"

cmake/musa.cmake

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -17,14 +17,14 @@ find_package(MUSA REQUIRED)
1717
include_directories(${MUSA_PATH}/include)
1818

1919
# set openmp include directory
20-
set(llvm_search_list)
20+
set(llvm_openmp_search_list)
2121
foreach(item RANGE 6 20 1)
22-
list(APPEND llvm_search_list /usr/lib/llvm-${item}/include/openmp/)
22+
list(APPEND llvm_openmp_search_list /usr/lib/llvm-${item}/include/openmp/)
2323
endforeach()
2424

2525
find_path(
2626
OPENMP_INCLUDE_DIR omp.h
27-
PATHS ${llvm_search_list}
27+
PATHS ${llvm_openmp_search_list}
2828
REQUIRED
2929
NO_DEFAULT_PATH)
3030
include_directories(${OPENMP_INCLUDE_DIR})
@@ -77,9 +77,7 @@ macro(find_musa_version musa_version_file)
7777
endmacro()
7878
find_musa_version(${MUSA_PATH}/version.json)
7979

80-
list(APPEND MUSA_MCC_FLAGS -Wno-unknown-warning-option)
8180
list(APPEND MUSA_MCC_FLAGS -Wno-macro-redefined)
82-
list(APPEND MUSA_MCC_FLAGS -Wno-unused-variable)
8381
list(APPEND MUSA_MCC_FLAGS -Wno-deprecated-copy-with-user-provided-copy)
8482
list(APPEND MUSA_MCC_FLAGS -Wno-pragma-once-outside-header)
8583
list(APPEND MUSA_MCC_FLAGS -Wno-return-type)
@@ -89,7 +87,12 @@ list(APPEND MUSA_MCC_FLAGS -Wno-pessimizing-move)
8987
list(APPEND MUSA_MCC_FLAGS -Wno-unused-but-set-variable)
9088
list(APPEND MUSA_MCC_FLAGS -Wno-bitwise-instead-of-logical)
9189
list(APPEND MUSA_MCC_FLAGS -Wno-format)
90+
list(APPEND MUSA_MCC_FLAGS -Wno-self-assign)
91+
list(APPEND MUSA_MCC_FLAGS -Wno-literal-conversion)
92+
list(APPEND MUSA_MCC_FLAGS -Wno-unknown-warning-option)
93+
list(APPEND MUSA_MCC_FLAGS -Wno-unused-variable)
9294
list(APPEND MUSA_MCC_FLAGS -Wno-unused-local-typedef)
95+
list(APPEND MUSA_MCC_FLAGS -Wno-unused-lambda-capture)
9396
list(APPEND MUSA_MCC_FLAGS -Wno-reorder-ctor)
9497
list(APPEND MUSA_MCC_FLAGS -Wno-braced-scalar-init)
9598
list(APPEND MUSA_MCC_FLAGS -Wno-pass-failed)

paddle/phi/kernels/CMakeLists.txt

Lines changed: 84 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -26,62 +26,95 @@ file(GLOB kernel_impl_h "impl/*.h" "selected_rows/impl/*.h")
2626
file(GLOB kernel_primitive_h "primitive/*.h")
2727

2828
# fusion ops would be included here
29-
#file(
30-
# GLOB kernel_cu
31-
# RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
32-
# "gpu/*.cu"
33-
# "gpu/*.cu.cc"
34-
# "gpudnn/*.cu"
35-
# "kps/*.cu"
36-
# "legacy/kps/*.cu"
37-
# "legacy/gpu/*.cu"
38-
# "selected_rows/gpu/*.cu"
39-
# "sparse/gpu/*.cu"
40-
# "strings/gpu/*.cu"
41-
# "fusion/gpu/*.cu")
4229
file(
4330
GLOB kernel_cu
4431
RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
45-
"gpu/s*.cu.cc"
46-
"gpu/c*.cu"
47-
"gpu/s*.cu"
48-
"gpu/abs_kernel.cu"
49-
"gpu/uniform_kernel.cu"
50-
"gpu/activation_kernel.cu"
51-
"gpu/full_kernel.cu"
52-
"gpu/cholesky_kernel.cu"
53-
"gpu/cholesky_solve_kernel.cu"
54-
"gpu/svd_kernel.cu"
55-
"gpu/p_norm_grad_kernel.cu"
56-
"gpu/matmul_kernel.cu"
57-
"gpu/expand_kernel.cu"
58-
"gpu/isfinite_kernel.cu"
32+
"gpu/*.cu"
33+
"gpu/*.cu.cc"
34+
"gpudnn/*.cu"
5935
"kps/*.cu"
60-
"legacy/gpu/uniform_kernel.cu"
61-
"sparse/gpu/mask_kernel.cu"
6236
"legacy/kps/*.cu"
63-
)
64-
list(REMOVE_ITEM kernel_cu
65-
"gpu/check_numerics_kernel.cu"
66-
"gpu/cross_entropy_grad_kernel.cu"
67-
"gpu/instance_norm_grad_kernel.cu"
68-
"gpu/cross_entropy_kernel.cu"
69-
"gpu/cholesky_grad_kernel.cu"
70-
"gpu/cholesky_solve_grad_kernel.cu"
71-
"gpu/conv_transpose_kernel.cu"
72-
"gpu/conv_grad_kernel.cu"
73-
"gpu/solve_kernel.cu"
74-
"gpu/solve_grad_kernel.cu"
75-
"gpu/stft_kernel.cu"
76-
"gpu/conv_kernel.cu"
77-
"gpu/cudnn_lstm_grad_kernel.cu"
78-
"gpu/cudnn_lstm_kernel.cu"
79-
"gpu/softmax_kernel.cu"
80-
"gpu/slogdeterminant_grad_kernel.cu"
81-
"gpu/spectral_norm_grad_kernel.cu"
82-
"gpu/spectral_norm_kernel.cu"
83-
"gpu/svd_grad_kernel.cu"
84-
"gpu/conv_transpose_grad_kernel.cu")
37+
"legacy/gpu/*.cu"
38+
"selected_rows/gpu/*.cu"
39+
"sparse/gpu/*.cu"
40+
"strings/gpu/*.cu"
41+
"fusion/gpu/*.cu")
42+
43+
# FIXME(@MTAI): compilation error will occur when compiling the following files.
44+
# This need to be fixed later.
45+
if(WITH_MUSA)
46+
list(REMOVE_ITEM kernel_cu
47+
"fusion/gpu/fused_softmax_mask_grad_kernel.cu"
48+
"fusion/gpu/fused_softmax_mask_kernel.cu"
49+
"gpu/batch_norm_grad_kernel.cu"
50+
"gpu/batch_norm_kernel.cu"
51+
"gpu/check_numerics_kernel.cu"
52+
"gpu/cholesky_grad_kernel.cu"
53+
"gpu/cholesky_solve_grad_kernel.cu"
54+
"gpu/conv_grad_kernel.cu"
55+
"gpu/conv_kernel.cu"
56+
"gpu/cross_entropy_grad_kernel.cu"
57+
"gpu/cross_entropy_kernel.cu"
58+
"gpu/conv_transpose_grad_kernel.cu"
59+
"gpu/conv_transpose_kernel.cu"
60+
"gpu/cudnn_lstm_grad_kernel.cu"
61+
"gpu/cudnn_lstm_kernel.cu"
62+
"gpu/depthwise_conv_grad_kernel.cu"
63+
"gpu/depthwise_conv_kernel.cu"
64+
"gpu/dist_kernel.cu"
65+
"gpu/elementwise_divide_grad_kernel.cu"
66+
"gpu/elementwise_grad_kernel.cu"
67+
"gpu/elementwise_multiply_grad_kernel.cu"
68+
"gpu/erfinv_kernel.cu"
69+
"gpu/exponential_kernel.cu"
70+
"gpu/fft_grad_kernel.cu"
71+
"gpu/fft_kernel.cu"
72+
"gpu/fused_softmax_mask_grad_kernel.cu"
73+
"gpu/gaussian_kernel.cu"
74+
"gpu/gelu_grad_kernel.cu"
75+
"gpu/gelu_kernel.cu"
76+
"gpu/histogram_kernel.cu"
77+
"gpu/instance_norm_grad_kernel.cu"
78+
"gpu/instance_norm_kernel.cu"
79+
"gpu/interpolate_grad_kernel.cu"
80+
"gpu/kthvalue_grad_kernel.cu"
81+
"gpu/kthvalue_kernel.cu"
82+
"gpu/layer_norm_grad_kernel.cu"
83+
"gpu/layer_norm_kernel.cu"
84+
"gpu/llm_int8_mat_mul_kernel.cu"
85+
"gpu/log_softmax_grad_kernel.cu"
86+
"gpu/log_softmax_kernel.cu"
87+
"gpu/lstsq_kernel.cu"
88+
"gpu/nanmedian_kernel.cu"
89+
"gpu/rnn_grad_kernel.cu.cc"
90+
"gpu/rnn_kernel.cu.cc"
91+
"gpu/slogdeterminant_grad_kernel.cu"
92+
"gpu/solve_grad_kernel.cu"
93+
"gpu/solve_kernel.cu"
94+
"gpu/spectral_norm_grad_kernel.cu"
95+
"gpu/spectral_norm_kernel.cu"
96+
"gpu/stft_kernel.cu"
97+
"gpu/svd_grad_kernel.cu"
98+
"gpu/top_k_grad_kernel.cu"
99+
"gpu/top_k_kernel.cu"
100+
"gpu/truncated_gaussian_random_kernel.cu"
101+
"gpudnn/affine_grid_grad_kernel.cu"
102+
"gpudnn/affine_grid_kernel.cu"
103+
"gpudnn/softmax_grad_kernel.cu"
104+
"gpudnn/softmax_kernel.cu"
105+
"gpudnn/conv_grad_kernel.cu"
106+
"gpudnn/conv_kernel.cu"
107+
"gpudnn/conv_transpose_grad_kernel.cu"
108+
"gpudnn/conv_transpose_kernel.cu"
109+
"gpudnn/pool_grad_kernel.cu"
110+
"gpudnn/pool_kernel.cu"
111+
"sparse/gpu/softmax_grad_kernel.cu"
112+
"sparse/gpu/softmax_kernel.cu"
113+
"sparse/gpu/conv_kernel.cu"
114+
"sparse/gpu/pool_kernel.cu"
115+
"strings/gpu/strings_copy_kernel.cu"
116+
"strings/gpu/strings_lower_upper_kernel.cu")
117+
endif()
85118

86119
if(APPLE OR WIN32)
87120
list(REMOVE_ITEM kernel_cu "fusion/gpu/fusion_group_kernel.cu")

paddle/phi/kernels/batch_norm_kernel.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ PD_REGISTER_KERNEL(batch_norm_infer,
9797
}
9898
#endif
9999
#endif
100-
#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSAAA)
100+
#if defined(PADDLE_WITH_HIP)
101101
PD_REGISTER_KERNEL(batch_norm_infer,
102102
GPU,
103103
ALL_LAYOUT,

paddle/phi/kernels/funcs/CMakeLists.txt

Lines changed: 9 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -8,26 +8,19 @@ file(
88
GLOB func_cc_srcs
99
RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
1010
"*.cc")
11-
# TODO(@caizhi): enable compiling all cu kernels
12-
#if(WITH_GPU OR WITH_ROCM OR WITH_MUSA)
13-
# file(
14-
# GLOB func_cu_srcs
15-
# RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
16-
# "*.cu")
17-
#endif()
1811
if(WITH_GPU OR WITH_ROCM OR WITH_MUSA)
1912
file(
2013
GLOB func_cu_srcs
2114
RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
22-
"concat_and_split_functor.cu"
23-
"math_function.cu"
24-
"segment_pooling.cu"
25-
"sequence_pooling.cu"
26-
"softmax.cu"
27-
"matrix_inverse.cu"
28-
"im2col.cu"
29-
"selected_rows_functor.cu"
30-
"gather_scatter_functor.cu")
15+
"*.cu")
16+
endif()
17+
18+
# TODO(@MTAI): compilation error will occur when compiling the following files.
19+
# Compiler mcc need fix this bug.
20+
if(WITH_MUSA)
21+
list(REMOVE_ITEM func_cu_srcs
22+
"cross_entropy.cu"
23+
"gru_compute.cu")
3124
endif()
3225

3326
collect_srcs(kernels_srcs SRCS ${func_cc_srcs} ${func_cu_srcs})

paddle/phi/kernels/funcs/top_k_function_cuda.h

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1129,6 +1129,15 @@ bool SortTopk(const phi::GPUContext& ctx,
11291129
<< hipGetErrorString(err);
11301130
return false;
11311131
}
1132+
#elif defined(__MUSACC__)
1133+
if (err != musaSuccess) {
1134+
LOG(ERROR) << "TopKOP failed as could not launch "
1135+
"cub::DeviceSegmentedRadixSort::SortPairsDescending to "
1136+
"calculate "
1137+
"temp_storage_bytes, status: "
1138+
<< musaGetErrorString(err);
1139+
return false;
1140+
}
11321141
#else
11331142
if (err != cudaSuccess) {
11341143
LOG(ERROR)
@@ -1162,6 +1171,14 @@ bool SortTopk(const phi::GPUContext& ctx,
11621171
<< hipGetErrorString(err);
11631172
return false;
11641173
}
1174+
#elif defined(__MUSACC__)
1175+
if (err != musaSuccess) {
1176+
LOG(ERROR) << "TopKOP failed as could not launch "
1177+
"cub::DeviceSegmentedRadixSort::SortPairs to calculate "
1178+
"temp_storage_bytes, status: "
1179+
<< musaGetErrorString(err);
1180+
return false;
1181+
}
11651182
#else
11661183
if (err != cudaSuccess) {
11671184
LOG(ERROR) << "TopKOP failed as could not launch "
@@ -1200,6 +1217,16 @@ bool SortTopk(const phi::GPUContext& ctx,
12001217
<< ", status: " << hipGetErrorString(err);
12011218
return false;
12021219
}
1220+
#elif defined(__MUSACC__)
1221+
if (err != musaSuccess) {
1222+
LOG(ERROR) << "TopKOP failed as could not launch "
1223+
"cub::DeviceSegmentedRadixSort::SortPairsDescending to "
1224+
"sort input, "
1225+
"temp_storage_bytes: "
1226+
<< temp_storage_bytes
1227+
<< ", status: " << musaGetErrorString(err);
1228+
return false;
1229+
}
12031230
#else
12041231
if (err != cudaSuccess) {
12051232
LOG(ERROR) << "TopKOP failed as could not launch "
@@ -1236,6 +1263,16 @@ bool SortTopk(const phi::GPUContext& ctx,
12361263
<< ", status: " << hipGetErrorString(err);
12371264
return false;
12381265
}
1266+
#elif defined(__MUSACC__)
1267+
if (err != musaSuccess) {
1268+
LOG(ERROR) << "TopKOP failed as could not launch "
1269+
"cub::DeviceSegmentedRadixSort::SortPairs to "
1270+
"sort input, "
1271+
"temp_storage_bytes: "
1272+
<< temp_storage_bytes
1273+
<< ", status: " << musaGetErrorString(err);
1274+
return false;
1275+
}
12391276
#else
12401277
if (err != cudaSuccess) {
12411278
LOG(ERROR) << "TopKOP failed as could not launch "

paddle/phi/kernels/fusion/gpu/fused_dropout_add_grad_kernel.cu

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,10 @@ __global__ void VectorizedDropoutBackward(const size_t n,
105105
hiprandStatePhilox4_32_10_t state;
106106
hiprand_init(seed, idx + THREAD_ID_X, increment, &state);
107107
using SType = hiprandStatePhilox4_32_10_t;
108+
#elif defined(PADDLE_WITH_MUSA)
109+
murand_state_philox4x32_10 state;
110+
murand_init(seed, idx + THREAD_ID_X, increment, &state);
111+
using SType = murand_state_philox4x32_10;
108112
#else
109113
curandStatePhilox4_32_10_t state;
110114
curand_init(seed, idx + THREAD_ID_X, increment, &state);

paddle/phi/kernels/gpu/bernoulli_kernel.cu

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,9 +46,12 @@ __global__ void bernoulli_cuda_kernel(
4646
#if defined(__NVCC__)
4747
curandStatePhilox4_32_10_t state;
4848
curand_init(seed, thread_idx, offset, &state);
49-
#else
49+
#elif defined(__HIPCC__)
5050
hiprandStatePhilox4_32_10_t state;
5151
hiprand_init(seed, thread_idx, offset, &state);
52+
#elif defined(__MUSACC__)
53+
murand_state_philox4x32_10 state;
54+
murand_init(seed, thread_idx, offset, &state);
5255
#endif
5356

5457
size_t total_thread = gridDim.x * blockDim.x;

paddle/phi/kernels/gpu/dirichlet_kernel.cu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ using COMPAT_RANDSTATEPHILOX4_32_10_T = hiprandStatePhilox4_32_10_t;
4343
#define COMPAT_RAND_UNIFORM hiprand_uniform
4444
#define COMPAT_RAND_NORMAL hiprand_normal
4545
#elif defined(PADDLE_WITH_MUSA)
46-
using COMPAT_RANDSTATEPHILOX4_32_10_T = murand_state_philox4x32_10_t;
46+
using COMPAT_RANDSTATEPHILOX4_32_10_T = murand_state_philox4x32_10;
4747
#define COMPAT_RAND_INIT murand_init
4848
#define COMPAT_RAND_UNIFORM murand_uniform
4949
#define COMPAT_RAND_NORMAL murand_normal

0 commit comments

Comments
 (0)