From 09b4fe3a5e3b71f59f7a3a2c5a8eb8f798efce93 Mon Sep 17 00:00:00 2001 From: John Pennycook Date: Mon, 4 Jan 2021 16:13:24 -0500 Subject: [PATCH 1/6] [SYCL][CUDA] Add missing barrier to collectives SYCL sub-group and group functions should act as synchronization points. Group collectives need a barrier at the end to ensure that back-to-back collectives do not lead to a race condition. Note that the barrier at the beginning of each collective occurs after each work-item writes its partial results to the scratch space. This is assumed safe because only the collective functions can access the space, and collective functions must be encountered in uniform control flow; any work-item encountering a collective function can assume it is safe to use the scratch space, because all work-items in the same work-group must have either executed no collective functions or the barrier at the end of a previous collective function. Signed-off-by: John Pennycook --- libclc/ptx-nvidiacl/libspirv/group/collectives.cl | 1 + 1 file changed, 1 insertion(+) diff --git a/libclc/ptx-nvidiacl/libspirv/group/collectives.cl b/libclc/ptx-nvidiacl/libspirv/group/collectives.cl index fba9ad72d8a52..96e6ba9e20632 100644 --- a/libclc/ptx-nvidiacl/libspirv/group/collectives.cl +++ b/libclc/ptx-nvidiacl/libspirv/group/collectives.cl @@ -264,6 +264,7 @@ __CLC_SUBGROUP_COLLECTIVE(FMax, __CLC_MAX, double, -DBL_MAX) result = OP(sg_x, scratch[sg_id - 1]); \ } \ } \ + __spirv_ControlBarrier(Workgroup, 0, 0); \ return result; \ } From 14438de7d3ce50968ebd9f0bd43f325c0b57966b Mon Sep 17 00:00:00 2001 From: John Pennycook Date: Mon, 11 Jan 2021 12:27:29 -0500 Subject: [PATCH 2/6] [SYCL] Add regression test for collective barrier Calls reduce, exclusive scan and inclusive scan multiple times back-to-back. Note that since we are testing for a race condition, it is possible for this test to pass even with an incorrect implementation. 
Signed-off-by: John Pennycook --- .../regression/back_to_back_collectives.cpp | 64 +++++++++++++++++++ 1 file changed, 64 insertions(+) create mode 100644 sycl/test/regression/back_to_back_collectives.cpp diff --git a/sycl/test/regression/back_to_back_collectives.cpp b/sycl/test/regression/back_to_back_collectives.cpp new file mode 100644 index 0000000000000..8276f6d20a562 --- /dev/null +++ b/sycl/test/regression/back_to_back_collectives.cpp @@ -0,0 +1,64 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +#include <CL/sycl.hpp> +#include <iostream> +#include <numeric> +using namespace cl::sycl; +using namespace cl::sycl::ONEAPI; + +int main() { + queue q; + if (q.get_device().is_host()) { + std::cout << "Skipping test\n"; + return 0; + } + + // Use max work-group size to maximize chance of race + int N = q.get_device().get_info<info::device::max_work_group_size>(); + + std::vector<int> Input(N), Sum(N), EScan(N), IScan(N); + std::iota(Input.begin(), Input.end(), 0); + std::fill(Sum.begin(), Sum.end(), 0); + std::fill(EScan.begin(), EScan.end(), 0); + std::fill(IScan.begin(), IScan.end(), 0); + + { + buffer<int> InputBuf(Input.data(), N); + buffer<int> SumBuf(Sum.data(), N); + buffer<int> EScanBuf(EScan.data(), N); + buffer<int> IScanBuf(IScan.data(), N); + q.submit([&](handler &h) { + auto Input = InputBuf.get_access<access::mode::read>(h); + auto Sum = SumBuf.get_access<access::mode::write>(h); + auto EScan = EScanBuf.get_access<access::mode::write>(h); + auto IScan = IScanBuf.get_access<access::mode::write>(h); + h.parallel_for(nd_range<1>(N, N), [=](nd_item<1> it) { + size_t i = it.get_global_id(0); + auto g = it.get_group(); + // Loop to increase number of back-to-back calls + for (int r = 0; r < 10; ++r) { + Sum[i] = reduce(g, Input[i], plus<>()); + EScan[i] = exclusive_scan(g, Input[i], plus<>()); + IScan[i] = inclusive_scan(g, Input[i], plus<>()); + } + }); + }); + } + + int sum = 0; + bool passed = true; + for (int i = 0; i < N; ++i) { + passed &= (sum ==
EScan[i]); + sum += i; + passed &= (sum == IScan[i]); + } + for (int i = 0; i < N; ++i) { + passed &= (sum == Sum[i]); + } + std::cout << "Test passed." << std::endl; + return 0; +} From d7145ab57c2590d0e4057ced0ec5dc00f79d3d10 Mon Sep 17 00:00:00 2001 From: John Pennycook Date: Mon, 11 Jan 2021 14:15:06 -0500 Subject: [PATCH 3/6] [SYCL][NFC] Move test from regression to on-device Signed-off-by: John Pennycook --- sycl/test/{regression => on-device}/back_to_back_collectives.cpp | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename sycl/test/{regression => on-device}/back_to_back_collectives.cpp (100%) diff --git a/sycl/test/regression/back_to_back_collectives.cpp b/sycl/test/on-device/back_to_back_collectives.cpp similarity index 100% rename from sycl/test/regression/back_to_back_collectives.cpp rename to sycl/test/on-device/back_to_back_collectives.cpp From 73a8788091e9844cd7fa6387e4829bb4771d8b3a Mon Sep 17 00:00:00 2001 From: Alexey Bader Date: Wed, 13 Jan 2021 12:13:30 +0300 Subject: [PATCH 4/6] Update sycl/test/on-device/back_to_back_collectives.cpp --- sycl/test/on-device/back_to_back_collectives.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/test/on-device/back_to_back_collectives.cpp b/sycl/test/on-device/back_to_back_collectives.cpp index 8276f6d20a562..8a6e729b93c59 100644 --- a/sycl/test/on-device/back_to_back_collectives.cpp +++ b/sycl/test/on-device/back_to_back_collectives.cpp @@ -1,4 +1,4 @@ -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out // RUN: %HOST_RUN_PLACEHOLDER %t.out // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out From 036e712aa4a430388d98eddb83dc5b3af574f6c0 Mon Sep 17 00:00:00 2001 From: Alexey Bader Date: Wed, 13 Jan 2021 12:57:27 +0300 Subject: [PATCH 5/6] Update sycl/test/on-device/back_to_back_collectives.cpp --- sycl/test/on-device/back_to_back_collectives.cpp | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/test/on-device/back_to_back_collectives.cpp b/sycl/test/on-device/back_to_back_collectives.cpp index 8a6e729b93c59..165182c248cac 100644 --- a/sycl/test/on-device/back_to_back_collectives.cpp +++ b/sycl/test/on-device/back_to_back_collectives.cpp @@ -1,5 +1,5 @@ // RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %RUN_ON_HOST %t.out // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out // RUN: %ACC_RUN_PLACEHOLDER %t.out From 6d9918b07eed9b58749014d6af16f8b41c86392e Mon Sep 17 00:00:00 2001 From: John Pennycook Date: Wed, 13 Jan 2021 10:27:00 -0500 Subject: [PATCH 6/6] [SYCL] Derive max WG size from kernel query Device query may return a value too large for a specific kernel; kernel query is required in order to respect local memory usage. Signed-off-by: John Pennycook --- sycl/test/on-device/back_to_back_collectives.cpp | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/sycl/test/on-device/back_to_back_collectives.cpp b/sycl/test/on-device/back_to_back_collectives.cpp index 165182c248cac..492ca0b6a157e 100644 --- a/sycl/test/on-device/back_to_back_collectives.cpp +++ b/sycl/test/on-device/back_to_back_collectives.cpp @@ -1,4 +1,4 @@ -// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out // RUN: %RUN_ON_HOST %t.out // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out @@ -10,6 +10,8 @@ using namespace cl::sycl; using namespace cl::sycl::ONEAPI; +class back_to_back; + int main() { queue q; if (q.get_device().is_host()) { @@ -18,7 +20,11 @@ int main() { } // Use max work-group size to maximize chance of race - int N = q.get_device().get_info<info::device::max_work_group_size>(); + program prog(q.get_context()); + prog.build_with_kernel_type<back_to_back>(); + kernel k = prog.get_kernel<back_to_back>(); + device d = q.get_device(); + 
int N = k.get_info<info::kernel_device_specific::work_group_size>(d); std::vector<int> Input(N), Sum(N), EScan(N), IScan(N); std::iota(Input.begin(), Input.end(), 0); @@ -36,7 +42,7 @@ auto Sum = SumBuf.get_access<access::mode::write>(h); auto EScan = EScanBuf.get_access<access::mode::write>(h); auto IScan = IScanBuf.get_access<access::mode::write>(h); - h.parallel_for(nd_range<1>(N, N), [=](nd_item<1> it) { + h.parallel_for<back_to_back>(nd_range<1>(N, N), [=](nd_item<1> it) { size_t i = it.get_global_id(0); auto g = it.get_group(); // Loop to increase number of back-to-back calls