diff --git a/sycl/include/CL/sycl/detail/type_traits.hpp b/sycl/include/CL/sycl/detail/type_traits.hpp index df480f58f99ff..ee2a3bd9a327d 100644 --- a/sycl/include/CL/sycl/detail/type_traits.hpp +++ b/sycl/include/CL/sycl/detail/type_traits.hpp @@ -207,6 +207,19 @@ template struct is_vector_arithmetic : bool_constant::value && is_arithmetic::value> {}; +// is_bool +template +struct is_scalar_bool + : bool_constant, bool>::value> {}; + +template +struct is_vector_bool + : bool_constant::value && + is_scalar_bool>::value> {}; + +template +struct is_bool : bool_constant>::value> {}; + // is_pointer template struct is_pointer_impl : std::false_type {}; diff --git a/sycl/include/CL/sycl/group.hpp b/sycl/include/CL/sycl/group.hpp index 9692f33b094ec..52b5e61d1da75 100644 --- a/sycl/include/CL/sycl/group.hpp +++ b/sycl/include/CL/sycl/group.hpp @@ -274,58 +274,99 @@ template class group { __spirv_MemoryBarrier(__spv::Scope::Workgroup, flags); } + /// Asynchronously copies a number of elements specified by \p numElements + /// from the source pointed by \p src to destination pointed by \p dest + /// with a source stride specified by \p srcStride, and returns a SYCL + /// device_event which can be used to wait on the completion of the copy. + /// Permitted types for dataT are all scalar and vector types, except boolean. template - device_event async_work_group_copy(local_ptr dest, - global_ptr src, - size_t numElements) const { + detail::enable_if_t::value, device_event> + async_work_group_copy(local_ptr dest, global_ptr src, + size_t numElements, size_t srcStride) const { using DestT = detail::ConvertToOpenCLType_t; using SrcT = detail::ConvertToOpenCLType_t; - __ocl_event_t e = OpGroupAsyncCopyGlobalToLocal( + __ocl_event_t E = OpGroupAsyncCopyGlobalToLocal( __spv::Scope::Workgroup, DestT(dest.get()), SrcT(src.get()), - numElements, 1, 0); - return device_event(&e); + numElements, srcStride, 0); + return device_event(&E); } + /// Asynchronously copies a number of elements specified by \p numElements + /// from the source pointed by \p src to destination pointed by \p dest with + /// the destination stride specified by \p destStride, and returns a SYCL + /// device_event which can be used to wait on the completion of the copy. + /// Permitted types for dataT are all scalar and vector types, except boolean. template - device_event async_work_group_copy(global_ptr dest, - local_ptr src, - size_t numElements) const { + detail::enable_if_t::value, device_event> + async_work_group_copy(global_ptr dest, local_ptr src, + size_t numElements, size_t destStride) const { using DestT = detail::ConvertToOpenCLType_t; using SrcT = detail::ConvertToOpenCLType_t; - __ocl_event_t e = OpGroupAsyncCopyLocalToGlobal( + __ocl_event_t E = OpGroupAsyncCopyLocalToGlobal( __spv::Scope::Workgroup, DestT(dest.get()), SrcT(src.get()), - numElements, 1, 0); - return device_event(&e); + numElements, destStride, 0); + return device_event(&E); + } + + /// Specialization for scalar bool type. + /// Asynchronously copies a number of elements specified by \p NumElements + /// from the source pointed by \p Src to destination pointed by \p Dest + /// with a stride specified by \p Stride, and returns a SYCL device_event + /// which can be used to wait on the completion of the copy. + template + detail::enable_if_t::value, device_event> + async_work_group_copy(multi_ptr Dest, multi_ptr Src, + size_t NumElements, size_t Stride) const { + static_assert(sizeof(bool) == sizeof(uint8_t), + "Async copy to/from bool memory is not supported."); + auto DestP = + multi_ptr(reinterpret_cast(Dest.get())); + auto SrcP = + multi_ptr(reinterpret_cast(Src.get())); + return async_work_group_copy(DestP, SrcP, NumElements, Stride); + } + + /// Specialization for vector bool type. + /// Asynchronously copies a number of elements specified by \p NumElements + /// from the source pointed by \p Src to destination pointed by \p Dest + /// with a stride specified by \p Stride, and returns a SYCL device_event + /// which can be used to wait on the completion of the copy. + template + detail::enable_if_t::value, device_event> + async_work_group_copy(multi_ptr Dest, multi_ptr Src, + size_t NumElements, size_t Stride) const { + static_assert(sizeof(bool) == sizeof(uint8_t), + "Async copy to/from bool memory is not supported."); + using VecT = detail::change_base_type_t; + auto DestP = multi_ptr(reinterpret_cast(Dest.get())); + auto SrcP = multi_ptr(reinterpret_cast(Src.get())); + return async_work_group_copy(DestP, SrcP, NumElements, Stride); } + /// Asynchronously copies a number of elements specified by \p numElements + /// from the source pointed by \p src to destination pointed by \p dest and + /// returns a SYCL device_event which can be used to wait on the completion + /// of the copy. + /// Permitted types for dataT are all scalar and vector types. template device_event async_work_group_copy(local_ptr dest, global_ptr src, - size_t numElements, - size_t srcStride) const { - using DestT = detail::ConvertToOpenCLType_t; - using SrcT = detail::ConvertToOpenCLType_t; - - __ocl_event_t e = OpGroupAsyncCopyGlobalToLocal( - __spv::Scope::Workgroup, DestT(dest.get()), SrcT(src.get()), - numElements, srcStride, 0); - return device_event(&e); + size_t numElements) const { + return async_work_group_copy(dest, src, numElements, 1); } + /// Asynchronously copies a number of elements specified by \p numElements + /// from the source pointed by \p src to destination pointed by \p dest and + /// returns a SYCL device_event which can be used to wait on the completion + /// of the copy. + /// Permitted types for dataT are all scalar and vector types. template device_event async_work_group_copy(global_ptr dest, local_ptr src, - size_t numElements, - size_t destStride) const { - using DestT = detail::ConvertToOpenCLType_t; - using SrcT = detail::ConvertToOpenCLType_t; - - __ocl_event_t e = OpGroupAsyncCopyLocalToGlobal( - __spv::Scope::Workgroup, DestT(dest.get()), SrcT(src.get()), - numElements, destStride, 0); - return device_event(&e); + size_t numElements) const { + return async_work_group_copy(dest, src, numElements, 1); } template diff --git a/sycl/test/basic_tests/group_async_copy.cpp b/sycl/test/basic_tests/group_async_copy.cpp new file mode 100644 index 0000000000000..e41b7414a19cb --- /dev/null +++ b/sycl/test/basic_tests/group_async_copy.cpp @@ -0,0 +1,160 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.run +// RUN: %GPU_RUN_PLACEHOLDER %t.run +// RUN: %CPU_RUN_PLACEHOLDER %t.run +// RUN: %ACC_RUN_PLACEHOLDER %t.run +// RUN: env SYCL_DEVICE_FILTER=host %t.run + +#include +#include +#include + +using namespace cl::sycl; + +template class KernelName; + +// Define the number of work items to enqueue. +const size_t NElems = 32; +const size_t WorkGroupSize = 8; +const size_t NWorkGroups = NElems / WorkGroupSize; + +template void initInputBuffer(buffer &Buf, size_t Stride) { + auto Acc = Buf.template get_access(); + for (size_t I = 0; I < Buf.get_count(); I += WorkGroupSize) { + for (size_t J = 0; J < WorkGroupSize; J++) + Acc[I + J] = I + J + ((J % Stride == 0) ? 100 : 0); + } +} + +template void initOutputBuffer(buffer &Buf) { + auto Acc = Buf.template get_access(); + for (size_t I = 0; I < Buf.get_count(); I++) + Acc[I] = 0; +} + +template struct is_vec : std::false_type {}; +template struct is_vec> : std::true_type {}; + +template bool checkEqual(vec A, size_t B) { + T TB = B; + return A.s0() == TB; +} + +template bool checkEqual(vec A, size_t B) { + T TB = B; + return A.x() == TB && A.y() == TB && A.z() == TB && A.w() == TB; +} + +template +typename std::enable_if::value, bool>::type checkEqual(T A, + size_t B) { + T TB = B; + return A == TB; +} + +template std::string toString(vec A) { + std::string R("("); + return R + std::to_string(A.s0()) + ")"; +} + +template std::string toString(vec A) { + std::string R("("); + R += std::to_string(A.x()) + "," + std::to_string(A.y()) + "," + + std::to_string(A.z()) + "," + std::to_string(A.w()) + ")"; + return R; +} + +template +typename std::enable_if::value, std::string>::type toString(T A) { + return std::to_string(A); +} + +template int checkResults(buffer &OutBuf, size_t Stride) { + auto Out = OutBuf.template get_access(); + int EarlyFailout = 20; + + for (size_t I = 0; I < OutBuf.get_count(); I += WorkGroupSize) { + for (size_t J = 0; J < WorkGroupSize; J++) { + size_t ExpectedVal = (J % Stride == 0) ? (100 + I + J) : 0; + if (!checkEqual(Out[I + J], ExpectedVal)) { + std::cerr << std::string(typeid(T).name()) + ": Stride=" << Stride + << " : Incorrect value at index " << I + J + << " : Expected: " << toString(ExpectedVal) + << ", Computed: " << toString(Out[I + J]) << "\n"; + if (--EarlyFailout == 0) + return 1; + } + } + } + return EarlyFailout - 20; +} + +template int test(size_t Stride) { + queue Q; + + buffer InBuf(NElems); + buffer OutBuf(NElems); + + initInputBuffer(InBuf, Stride); + initOutputBuffer(OutBuf); + + Q.submit([&](handler &CGH) { + auto In = InBuf.template get_access(CGH); + auto Out = OutBuf.template get_access(CGH); + accessor Local( + range<1>{WorkGroupSize}, CGH); + + nd_range<1> NDR{range<1>(NElems), range<1>(WorkGroupSize)}; + CGH.parallel_for>(NDR, [=](nd_item<1> NDId) { + auto GrId = NDId.get_group_linear_id(); + auto Group = NDId.get_group(); + size_t NElemsToCopy = + WorkGroupSize / Stride + ((WorkGroupSize % Stride) ? 1 : 0); + size_t Offset = GrId * WorkGroupSize; + if (Stride == 1) { // Check the version without stride arg. + auto E = NDId.async_work_group_copy( + Local.get_pointer(), In.get_pointer() + Offset, NElemsToCopy); + E.wait(); + } else { + auto E = NDId.async_work_group_copy(Local.get_pointer(), + In.get_pointer() + Offset, + NElemsToCopy, Stride); + E.wait(); + } + + if (Stride == 1) { // Check the version without stride arg. + auto E = Group.async_work_group_copy( + Out.get_pointer() + Offset, Local.get_pointer(), NElemsToCopy); + Group.wait_for(E); + } else { + auto E = Group.async_work_group_copy(Out.get_pointer() + Offset, + Local.get_pointer(), NElemsToCopy, + Stride); + Group.wait_for(E); + } + }); + }).wait(); + + return checkResults(OutBuf, Stride); +} + +int main() { + for (int Stride = 1; Stride < WorkGroupSize; Stride++) { + if (test(Stride)) + return 1; + if (test>(Stride)) + return 1; + if (test(Stride)) + return 1; + if (test(Stride)) + return 1; + if (test>(Stride)) + return 1; + if (test>(Stride)) + return 1; + if (test(Stride)) + return 1; + } + + std::cout << "Test passed.\n"; + return 0; +}