From 09b4fe3a5e3b71f59f7a3a2c5a8eb8f798efce93 Mon Sep 17 00:00:00 2001 From: John Pennycook Date: Mon, 4 Jan 2021 16:13:24 -0500 Subject: [PATCH 1/6] [SYCL][CUDA] Add missing barrier to collectives SYCL sub-group and group functions should act as synchronization points. Group collectives need a barrier at the end to ensure that back-to-back collectives do not lead to a race condition. Note that the barrier at the beginning of each collective occurs after each work-item writes its partial results to the scratch space. This is assumed safe because only the collective functions can access the space, and collective functions must be encountered in uniform control flow; any work-item encountering a collective function can assume it is safe to use the scratch space, because all work-items in the same work-group must have either executed no collective functions or the barrier at the end of a previous collective function. Signed-off-by: John Pennycook --- libclc/ptx-nvidiacl/libspirv/group/collectives.cl | 1 + 1 file changed, 1 insertion(+) diff --git a/libclc/ptx-nvidiacl/libspirv/group/collectives.cl b/libclc/ptx-nvidiacl/libspirv/group/collectives.cl index fba9ad72d8a52..96e6ba9e20632 100644 --- a/libclc/ptx-nvidiacl/libspirv/group/collectives.cl +++ b/libclc/ptx-nvidiacl/libspirv/group/collectives.cl @@ -264,6 +264,7 @@ __CLC_SUBGROUP_COLLECTIVE(FMax, __CLC_MAX, double, -DBL_MAX) result = OP(sg_x, scratch[sg_id - 1]); \ } \ } \ + __spirv_ControlBarrier(Workgroup, 0, 0); \ return result; \ } From 14438de7d3ce50968ebd9f0bd43f325c0b57966b Mon Sep 17 00:00:00 2001 From: John Pennycook Date: Mon, 11 Jan 2021 12:27:29 -0500 Subject: [PATCH 2/6] [SYCL] Add regression test for collective barrier Calls reduce, exclusive scan and inclusive scan multiple times back-to-back. Note that since we are testing for a race condition, it is possible for this test to pass even with an incorrect implementation. 
Signed-off-by: John Pennycook --- .../regression/back_to_back_collectives.cpp | 64 +++++++++++++++++++ 1 file changed, 64 insertions(+) create mode 100644 sycl/test/regression/back_to_back_collectives.cpp diff --git a/sycl/test/regression/back_to_back_collectives.cpp b/sycl/test/regression/back_to_back_collectives.cpp new file mode 100644 index 0000000000000..8276f6d20a562 --- /dev/null +++ b/sycl/test/regression/back_to_back_collectives.cpp @@ -0,0 +1,64 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +#include <CL/sycl.hpp> +#include <iostream> +#include <numeric> +using namespace cl::sycl; +using namespace cl::sycl::ONEAPI; + +int main() { + queue q; + if (q.get_device().is_host()) { + std::cout << "Skipping test\n"; + return 0; + } + + // Use max work-group size to maximize chance of race + int N = q.get_device().get_info<info::device::max_work_group_size>(); + + std::vector<int> Input(N), Sum(N), EScan(N), IScan(N); + std::iota(Input.begin(), Input.end(), 0); + std::fill(Sum.begin(), Sum.end(), 0); + std::fill(EScan.begin(), EScan.end(), 0); + std::fill(IScan.begin(), IScan.end(), 0); + + { + buffer<int> InputBuf(Input.data(), N); + buffer<int> SumBuf(Sum.data(), N); + buffer<int> EScanBuf(EScan.data(), N); + buffer<int> IScanBuf(IScan.data(), N); + q.submit([&](handler &h) { + auto Input = InputBuf.get_access<access::mode::read>(h); + auto Sum = SumBuf.get_access<access::mode::write>(h); + auto EScan = EScanBuf.get_access<access::mode::write>(h); + auto IScan = IScanBuf.get_access<access::mode::write>(h); + h.parallel_for(nd_range<1>(N, N), [=](nd_item<1> it) { + size_t i = it.get_global_id(0); + auto g = it.get_group(); + // Loop to increase number of back-to-back calls + for (int r = 0; r < 10; ++r) { + Sum[i] = reduce(g, Input[i], plus<>()); + EScan[i] = exclusive_scan(g, Input[i], plus<>()); + IScan[i] = inclusive_scan(g, Input[i], plus<>()); + } + }); + }); + } + + int sum = 0; + bool passed = true; + for (int i = 0; i < N; ++i) { + passed &= (sum ==
EScan[i]); + sum += i; + passed &= (sum == IScan[i]); + } + for (int i = 0; i < N; ++i) { + passed &= (sum == Sum[i]); + } + std::cout << "Test passed." << std::endl; + return 0; +} From d7145ab57c2590d0e4057ced0ec5dc00f79d3d10 Mon Sep 17 00:00:00 2001 From: John Pennycook Date: Mon, 11 Jan 2021 14:15:06 -0500 Subject: [PATCH 3/6] [SYCL][NFC] Move test from regression to on-device Signed-off-by: John Pennycook --- sycl/test/{regression => on-device}/back_to_back_collectives.cpp | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename sycl/test/{regression => on-device}/back_to_back_collectives.cpp (100%) diff --git a/sycl/test/regression/back_to_back_collectives.cpp b/sycl/test/on-device/back_to_back_collectives.cpp similarity index 100% rename from sycl/test/regression/back_to_back_collectives.cpp rename to sycl/test/on-device/back_to_back_collectives.cpp From 73a8788091e9844cd7fa6387e4829bb4771d8b3a Mon Sep 17 00:00:00 2001 From: Alexey Bader Date: Wed, 13 Jan 2021 12:13:30 +0300 Subject: [PATCH 4/6] Update sycl/test/on-device/back_to_back_collectives.cpp --- sycl/test/on-device/back_to_back_collectives.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/test/on-device/back_to_back_collectives.cpp b/sycl/test/on-device/back_to_back_collectives.cpp index 8276f6d20a562..8a6e729b93c59 100644 --- a/sycl/test/on-device/back_to_back_collectives.cpp +++ b/sycl/test/on-device/back_to_back_collectives.cpp @@ -1,4 +1,4 @@ -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out // RUN: %HOST_RUN_PLACEHOLDER %t.out // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out From 036e712aa4a430388d98eddb83dc5b3af574f6c0 Mon Sep 17 00:00:00 2001 From: Alexey Bader Date: Wed, 13 Jan 2021 12:57:27 +0300 Subject: [PATCH 5/6] Update sycl/test/on-device/back_to_back_collectives.cpp --- sycl/test/on-device/back_to_back_collectives.cpp | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/test/on-device/back_to_back_collectives.cpp b/sycl/test/on-device/back_to_back_collectives.cpp index 8a6e729b93c59..165182c248cac 100644 --- a/sycl/test/on-device/back_to_back_collectives.cpp +++ b/sycl/test/on-device/back_to_back_collectives.cpp @@ -1,5 +1,5 @@ // RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out -// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %RUN_ON_HOST %t.out // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out // RUN: %ACC_RUN_PLACEHOLDER %t.out From 6d9918b07eed9b58749014d6af16f8b41c86392e Mon Sep 17 00:00:00 2001 From: John Pennycook Date: Wed, 13 Jan 2021 10:27:00 -0500 Subject: [PATCH 6/6] [SYCL] Derive max WG size from kernel query Device query may return a value too large for a specific kernel; kernel query is required in order to respect local memory usage. Signed-off-by: John Pennycook --- sycl/test/on-device/back_to_back_collectives.cpp | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/sycl/test/on-device/back_to_back_collectives.cpp b/sycl/test/on-device/back_to_back_collectives.cpp index 165182c248cac..492ca0b6a157e 100644 --- a/sycl/test/on-device/back_to_back_collectives.cpp +++ b/sycl/test/on-device/back_to_back_collectives.cpp @@ -1,4 +1,4 @@ -// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out // RUN: %RUN_ON_HOST %t.out // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out @@ -10,6 +10,8 @@ using namespace cl::sycl; using namespace cl::sycl::ONEAPI; +class back_to_back; + int main() { queue q; if (q.get_device().is_host()) { @@ -18,7 +20,11 @@ int main() { } // Use max work-group size to maximize chance of race - int N = q.get_device().get_info<info::device::max_work_group_size>(); + program prog(q.get_context()); + prog.build_with_kernel_type<back_to_back>(); + kernel k = prog.get_kernel<back_to_back>(); + device d = q.get_device(); + 
int N = k.get_info<info::kernel_device_specific::work_group_size>(d); std::vector<int> Input(N), Sum(N), EScan(N), IScan(N); std::iota(Input.begin(), Input.end(), 0); @@ -36,7 +42,7 @@ auto Sum = SumBuf.get_access<access::mode::write>(h); auto EScan = EScanBuf.get_access<access::mode::write>(h); auto IScan = IScanBuf.get_access<access::mode::write>(h); - h.parallel_for(nd_range<1>(N, N), [=](nd_item<1> it) { + h.parallel_for<back_to_back>(nd_range<1>(N, N), [=](nd_item<1> it) { size_t i = it.get_global_id(0); auto g = it.get_group(); // Loop to increase number of back-to-back calls