diff --git a/sycl/doc/extensions/GroupAlgorithms/SYCL_INTEL_group_algorithms.asciidoc b/sycl/doc/extensions/GroupAlgorithms/SYCL_INTEL_group_algorithms.asciidoc index 31392cd740a82..4c25bdc7b5b20 100755 --- a/sycl/doc/extensions/GroupAlgorithms/SYCL_INTEL_group_algorithms.asciidoc +++ b/sycl/doc/extensions/GroupAlgorithms/SYCL_INTEL_group_algorithms.asciidoc @@ -51,7 +51,9 @@ John Pennycook, Intel (john 'dot' pennycook 'at' intel 'dot' com) == Dependencies -This extension is written against the SYCL 1.2.1 specification, Revision v1.2.1-6. +This extension is written against the SYCL 1.2.1 specification, Revision v1.2.1-6 and the following extensions: + +- +SYCL_INTEL_extended_atomics+ == Overview @@ -67,6 +69,10 @@ The extension introduces the following functions: - +reduce+ - +exclusive_scan+ - +inclusive_scan+ +- +barrier+ + +The definitions and behavior of the following functions are based on equivalents in the SYCL 2020 provisional specification: +- +barrier+ === Alignment with OpenCL vs C++ @@ -252,6 +258,21 @@ The return types of the collective functions in {cpp}17 are not deduced from the |Perform an inclusive scan over the values in the range [_first_, _last_) using the operator _binary_op_, which must be one of the group algorithms library function objects. The value written to +result + i+ is the inclusive scan of the first +i+ values in the range and an initial value specified by _init_. Returns a pointer to the end of the output range. _first_, _last_, _result_, _binary_op_ and _init_ must be the same for all work-items in the group. _binary_op(init, *first)_ must return a value of type _T_. |=== +==== Synchronization + +The behavior of memory fences in this section is aligned with the single happens-before relationship defined by the +SYCL_INTEL_extended_atomics+ extension. + +|=== +|Function|Description + +|+template void barrier(Group g);+ +|Synchronize all work-items in the group, and ensure that all memory accesses to any address space prior to the barrier are visible to all work-items in the group after the barrier. The scope of the group memory fences implied by this barrier is the narrowest scope including all work-items in the group. + +|+template void barrier(Group g, memory_scope scope);+ +|Synchronize all work-items in the group, and ensure that all memory accesses to any address space prior to the barrier are visible to all work-items specified by _scope_ after the barrier. The scope of the group memory fences implied by this barrier is controlled by _scope_ and must be broader than the narrowest scope including all work-items in the group. If the specified _scope_ is narrower than the narrowest scope including all work-items in the group, the _scope_ argument is ignored. + +|=== + == Issues None. @@ -270,6 +291,7 @@ None. |======================================== |Rev|Date|Author|Changes |1|2020-01-30|John Pennycook|*Initial public working draft* +|2|2020-07-28|John Pennycook|*Add group barrier* |======================================== //************************************************************************ diff --git a/sycl/doc/extensions/SubGroupAlgorithms/SYCL_INTEL_sub_group_algorithms.asciidoc b/sycl/doc/extensions/SubGroupAlgorithms/SYCL_INTEL_sub_group_algorithms.asciidoc index af7ed1771f393..6bb38abf05f91 100755 --- a/sycl/doc/extensions/SubGroupAlgorithms/SYCL_INTEL_sub_group_algorithms.asciidoc +++ b/sycl/doc/extensions/SubGroupAlgorithms/SYCL_INTEL_sub_group_algorithms.asciidoc @@ -70,6 +70,7 @@ This extension adds sub-group support to all of the functions from +SYCL_INTEL_g - +reduce+ - +exclusive_scan+ - +inclusive_scan+ +- +barrier+ It additionally introduces a number of functions that are currently specific to sub-groups: @@ -165,6 +166,7 @@ None. |======================================== |Rev|Date|Author|Changes |1|2020-03-16|John Pennycook|*Initial public working draft* +|2|2020-07-28|John Pennycook|*Add group barrier* |======================================== //************************************************************************ diff --git a/sycl/include/CL/sycl/intel/group_algorithm.hpp b/sycl/include/CL/sycl/intel/group_algorithm.hpp index 932a53ba07675..e6fdd057d9679 100644 --- a/sycl/include/CL/sycl/intel/group_algorithm.hpp +++ b/sycl/include/CL/sycl/intel/group_algorithm.hpp @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -77,6 +78,15 @@ template <> inline id<3> linear_id_to_id(range<3> r, size_t linear_id) { return result; } +// TODO: Replace with Group::fence_scope from SYCL 2020 provisional +template struct FenceScope { + static constexpr intel::memory_scope value = intel::memory_scope::work_group; +}; + +template <> struct FenceScope { + static constexpr intel::memory_scope value = intel::memory_scope::sub_group; +}; + template struct identity {}; template struct identity> { @@ -890,6 +900,42 @@ template bool leader(Group g) { #endif } +template void barrier(Group, memory_scope scope) { + static_assert(sycl::detail::is_generic_group::value, + "Group algorithms only support the sycl::group and " + "intel::sub_group class."); +#ifdef __SYCL_DEVICE_ONLY__ + // MemoryScope must be broader than Group scope for correctness + auto GroupScope = detail::FenceScope::value; + auto BroadestScope = (scope > GroupScope) ? scope : GroupScope; + auto MemoryScope = sycl::detail::spirv::getScope(BroadestScope); + auto ExecutionScope = sycl::detail::spirv::group_scope::value; + __spirv_ControlBarrier(ExecutionScope, MemoryScope, + __spv::MemorySemanticsMask::AcquireRelease | + __spv::MemorySemanticsMask::SubgroupMemory | + __spv::MemorySemanticsMask::WorkgroupMemory | + __spv::MemorySemanticsMask::CrossWorkgroupMemory); +#else + (void)scope; + throw runtime_error("Group algorithms are not supported on host device.", + PI_INVALID_DEVICE); +#endif +} + +template void barrier(Group g) { + static_assert(sycl::detail::is_generic_group::value, + "Group algorithms only support the sycl::group and " + "intel::sub_group class."); +#ifdef __SYCL_DEVICE_ONLY__ + auto MemoryScope = detail::FenceScope::value; + barrier(g, MemoryScope); +#else + (void)g; + throw runtime_error("Group algorithms are not supported on host device.", + PI_INVALID_DEVICE); +#endif +} + } // namespace intel } // namespace sycl } // __SYCL_INLINE_NAMESPACE(cl) diff --git a/sycl/test/group-algorithm/barrier.cpp b/sycl/test/group-algorithm/barrier.cpp new file mode 100644 index 0000000000000..e77398983d8e7 --- /dev/null +++ b/sycl/test/group-algorithm/barrier.cpp @@ -0,0 +1,58 @@ +// UNSUPPORTED: cuda +// +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +#include +#include +using namespace sycl; +using namespace sycl::intel; + +class barrier_kernel; + +void test(queue q) { + + constexpr size_t N = 32; + constexpr size_t L = 16; + std::array out; + std::fill(out.begin(), out.end(), 0); + { + buffer out_buf(out.data(), range<1>{N}); + q.submit([&](handler &cgh) { + auto tmp = + accessor( + L, cgh); + auto out = out_buf.get_access(cgh); + cgh.parallel_for( + nd_range<1>(N, L), [=](nd_item<1> it) { + group<1> g = it.get_group(); + tmp[it.get_local_linear_id()] = it.get_global_linear_id() + 1; + barrier(g); + int result = 0; + for (int i = 0; i < L; ++i) { + result += tmp[i]; + } + out[it.get_global_linear_id()] = result; + }); + }); + } + + // Each work-item should see writes from all other work-items in its group + for (int g = 0; g < N / L; ++g) { + int sum = 0; + for (int wi = 0; wi < L; ++wi) { + sum += g * L + wi + 1; + } + for (int wi = 0; wi < L; ++wi) { + assert(out[g * L + wi] == sum); + } + } +} + +int main() { + queue q; + test(q); + std::cout << "Test passed." << std::endl; +}