@@ -53,7 +53,7 @@ struct CastDataType {
       auto *context = static_cast<const platform::CPUDeviceContext *>(ctx_);
       trans(*context, in_begin, in_end, out_begin,
             CastDataTypeFunctor<InType, OutType>());
-#ifdef __NVCC__
+#if defined(__NVCC__) || defined(__HIPCC__)
     } else if (platform::is_gpu_place(in_.place())) {
       platform::Transform<platform::CUDADeviceContext> trans;
       auto *context = static_cast<const platform::CUDADeviceContext *>(ctx_);
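The widened guard is what makes this cast path portable: `__NVCC__` is predefined by NVIDIA's nvcc and `__HIPCC__` by AMD's hipcc, so a single `#if` now covers both device compilers. A minimal sketch of the pattern (the comments are illustrative, not from the patch):

#if defined(__NVCC__) || defined(__HIPCC__)
// Device-capable branch: compiled when nvcc or hipcc processes this file.
#else
// Host-only compilers (plain gcc/clang) never see the GPU transform code.
#endif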
@@ -67,10 +67,11 @@ struct CastDataType {
     }
   }
 };
+
 template <typename T>
-void GpuCopy(T *src, T *dst, PlaceType src_plc, PlaceType dst_plc,
-             int64_t ele_size) {
-#ifdef PADDLE_WITH_CUDA
+void DeviceCopy(T *src, T *dst, PlaceType src_plc, PlaceType dst_plc,
+                int64_t ele_size) {
+#if defined(PADDLE_WITH_CUDA)
   platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
   int device_num = paddle::platform::GetCurrentDeviceId();
   platform::CUDAPlace gpu_place(device_num);
@@ -90,6 +91,30 @@ void GpuCopy(T *src, T *dst, PlaceType src_plc, PlaceType dst_plc,
9091 " Only GPU related Copy can reach this func." ));
9192 }
9293 cudaStreamSynchronize (dev_ctx->stream ());
94+ #elif defined(PADDLE_WITH_HIP)
95+ platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance ();
96+ int device_num = paddle::platform::GetCurrentDeviceId ();
97+ platform::CUDAPlace gpu_place (device_num);
98+ auto *dev_ctx =
99+ static_cast <const platform::CUDADeviceContext *>(pool.Get (gpu_place));
100+ if ((src_plc == PlaceType::kHIP ) && (dst_plc == PlaceType::kCPU )) {
101+ memory::Copy (platform::CPUPlace (), static_cast <void *>(dst), gpu_place, src,
102+ ele_size, dev_ctx->stream ());
103+ } else if ((src_plc == PlaceType::kHIP ) && (dst_plc == PlaceType::kHIP )) {
104+ memory::Copy (gpu_place, static_cast <void *>(dst), gpu_place, src, ele_size,
105+ dev_ctx->stream ());
106+ } else if ((src_plc == PlaceType::kCPU ) && (dst_plc == PlaceType::kHIP )) {
107+ memory::Copy (gpu_place, static_cast <void *>(dst), platform::CPUPlace (), src,
108+ ele_size, dev_ctx->stream ());
109+ } else {
110+ PADDLE_THROW (platform::errors::Unavailable (
111+ " Only GPU related Copy can reach this func." ));
112+ }
113+ hipStreamSynchronize (dev_ctx->stream ());
114+ #else
115+ PADDLE_THROW (platform::errors::Unavailable (
116+ " This function can only be used if compiled with"
117+ " either -DWITH_ROCM=ON or -DWITH_GPU=ON" ));
93118#endif
94119}
95120
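For comparison, here is a self-contained sketch of the same three-way dispatch written against the raw runtime APIs instead of Paddle's `memory::Copy`; `Place` and `kDev` are hypothetical stand-ins for this sketch, not names from the patch. Note the patch issues asynchronous stream copies and then synchronizes the stream, whereas plain `cudaMemcpy`/`hipMemcpy` are already blocking:

#if defined(__HIPCC__)
#include <hip/hip_runtime.h>
#elif defined(__NVCC__)
#include <cuda_runtime.h>
#endif
#include <cstdint>

enum class Place { kCPU, kDev };  // hypothetical stand-in for PlaceType

template <typename T>
void DeviceCopySketch(const T *src, T *dst, Place s, Place d, int64_t bytes) {
  // CPU-to-CPU and other unsupported combinations are excluded here,
  // mirroring the "Only GPU related Copy" throw in the patch.
#if defined(__HIPCC__)
  hipMemcpyKind kind = (s == Place::kDev && d == Place::kCPU)
                           ? hipMemcpyDeviceToHost
                           : (s == Place::kDev) ? hipMemcpyDeviceToDevice
                                                : hipMemcpyHostToDevice;
  hipMemcpy(dst, src, bytes, kind);   // blocking copy, no explicit sync needed
#elif defined(__NVCC__)
  cudaMemcpyKind kind = (s == Place::kDev && d == Place::kCPU)
                            ? cudaMemcpyDeviceToHost
                            : (s == Place::kDev) ? cudaMemcpyDeviceToDevice
                                                 : cudaMemcpyHostToDevice;
  cudaMemcpy(dst, src, bytes, kind);  // blocking copy, no explicit sync needed
#endif
}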
@@ -137,11 +162,16 @@ T *Tensor::mutable_data() {
     case static_cast<int>(PlaceType::kCPU): {
       return tensor->mutable_data<T>(platform::CPUPlace());
     }
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA)
     case static_cast<int>(PlaceType::kGPU): {
       int device_num = platform::GetCurrentDeviceId();
       return tensor->mutable_data<T>(platform::CUDAPlace(device_num));
     }
+#elif defined(PADDLE_WITH_HIP)
+    case static_cast<int>(PlaceType::kHIP): {
+      int device_num = platform::GetCurrentDeviceId();
+      return tensor->mutable_data<T>(platform::CUDAPlace(device_num));
+    }
 #endif
     default:
       PADDLE_THROW(platform::errors::Unavailable(
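The new `kHIP` case allocates through `platform::CUDAPlace`, exactly like the `kGPU` case above it: on ROCm builds Paddle reuses its CUDA place and device-context abstractions, so only the public `PlaceType` enum differs. A hedged usage sketch of the custom-op Tensor API, assuming a `-DWITH_ROCM=ON` build:

// Assumes a ROCm build; PlaceType::kHIP is only meaningful there.
paddle::Tensor t(paddle::PlaceType::kHIP);
t.reshape({2, 3});
float *buf = t.mutable_data<float>();  // backed by a CUDAPlace allocation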
@@ -202,17 +232,23 @@ Tensor Tensor::copy_to(const PlaceType &target_place) const {
   target.reshape(shape());
   auto *p_target_data = target.template mutable_data<T>();
 
+  bool supported_gpu_transform = false;
+#if defined(PADDLE_WITH_CUDA)
+  supported_gpu_transform =
+      (src_place == PlaceType::kGPU && target_place == PlaceType::kCPU) ||
+      (src_place == PlaceType::kCPU && target_place == PlaceType::kGPU) ||
+      (src_place == PlaceType::kGPU && target_place == PlaceType::kGPU);
+#elif defined(PADDLE_WITH_HIP)
+  supported_gpu_transform =
+      (src_place == PlaceType::kHIP && target_place == PlaceType::kCPU) ||
+      (src_place == PlaceType::kCPU && target_place == PlaceType::kHIP) ||
+      (src_place == PlaceType::kHIP && target_place == PlaceType::kHIP);
+#endif
+
   if ((src_place == PlaceType::kCPU) && (target_place == PlaceType::kCPU)) {
     std::memcpy(static_cast<void *>(p_target_data), p_src_data, ele_size);
-  } else if ((src_place == PlaceType::kGPU) &&
-             (target_place == PlaceType::kCPU)) {
-    GpuCopy<T>(p_src_data, p_target_data, src_place, target_place, ele_size);
-  } else if ((src_place == PlaceType::kCPU) &&
-             (target_place == PlaceType::kGPU)) {
-    GpuCopy<T>(p_src_data, p_target_data, src_place, target_place, ele_size);
-  } else if ((src_place == PlaceType::kGPU) &&
-             (target_place == PlaceType::kGPU)) {
-    GpuCopy<T>(p_src_data, p_target_data, src_place, target_place, ele_size);
+  } else if (supported_gpu_transform) {
+    DeviceCopy<T>(p_src_data, p_target_data, src_place, target_place, ele_size);
   } else {
     PADDLE_THROW(platform::errors::Unavailable(
         "Not supported place transform of place: %d to place: %d",
@@ -304,13 +340,18 @@ const PlaceType &Tensor::place() const {
   GET_CASTED_TENSOR;
   if (platform::is_cpu_place(tensor->place())) {
     place_ = PlaceType::kCPU;
+#if defined(PADDLE_WITH_CUDA)
   } else if (platform::is_gpu_place(tensor->place())) {
     place_ = PlaceType::kGPU;
+#elif defined(PADDLE_WITH_HIP)
+  } else if (platform::is_gpu_place(tensor->place())) {
+    place_ = PlaceType::kHIP;
+#endif
   } else {
     PADDLE_THROW(platform::errors::Unimplemented(
         "Current Tensor hold unsupported Place Type, Please Init it"
-        " using Tensor::mutable_data<T>(PaddlePlace) which T is "
-        "either Place::kCPU or Place::kGPU"));
+        " using Tensor::mutable_data<T>(PaddlePlace) with T among: "
+        "Place::kCPU or Place::kGPU or Place::kHIP"));
   }
   return place_;
 }
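Both preprocessor branches test the same `platform::is_gpu_place` predicate, since HIP memory still lives in a `CUDAPlace` internally; the `#if`/`#elif` only changes which enum value is reported back. What each build flavor effectively compiles to:

// Under PADDLE_WITH_CUDA:
//   } else if (platform::is_gpu_place(tensor->place())) {
//     place_ = PlaceType::kGPU;
// Under PADDLE_WITH_HIP:
//   } else if (platform::is_gpu_place(tensor->place())) {
//     place_ = PlaceType::kHIP;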
@@ -392,16 +433,21 @@ bool Tensor::is_initialized() const {
   }
 }
 
-#ifdef PADDLE_WITH_CUDA
-cudaStream_t Tensor::stream() const {
-  if (!stream_.IsStreamSet()) {
-    PADDLE_THROW(platform::errors::PreconditionNotMet(
-        "Stream is not Set, only input tensor will have "
-        "stream which is set by framework "));
-  } else {
-    return reinterpret_cast<cudaStream_t>(stream_.GetStream());
+#define DEFINE_STREAM(_stream_t_)                              \
+  _stream_t_ Tensor::stream() const {                          \
+    if (!stream_.IsStreamSet()) {                              \
+      PADDLE_THROW(platform::errors::PreconditionNotMet(       \
+          "Stream is not Set, only input tensor will have "    \
+          "stream which is set by framework "));               \
+    } else {                                                   \
+      return reinterpret_cast<_stream_t_>(stream_.GetStream()); \
+    }                                                          \
   }
-}
+
+#if defined(PADDLE_WITH_CUDA)
+DEFINE_STREAM(cudaStream_t)
+#elif defined(PADDLE_WITH_HIP)
+DEFINE_STREAM(hipStream_t)
 #endif
 
 namespace framework {
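Defining the accessor once via `DEFINE_STREAM` keeps the CUDA and HIP bodies from drifting apart; only the returned handle type changes between builds. For reference, `DEFINE_STREAM(hipStream_t)` expands (modulo line continuations) to:

hipStream_t Tensor::stream() const {
  if (!stream_.IsStreamSet()) {
    PADDLE_THROW(platform::errors::PreconditionNotMet(
        "Stream is not Set, only input tensor will have "
        "stream which is set by framework "));
  } else {
    return reinterpret_cast<hipStream_t>(stream_.GetStream());
  }
}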