This repository was archived by the owner on Mar 28, 2023. It is now read-only.
Merged
Changes from 3 commits
6 changes: 3 additions & 3 deletions SYCL/SubGroup/helper.hpp
@@ -154,7 +154,7 @@ void exit_if_not_equal_vec(vec<T, N> val, vec<T, N> ref, const char *name) {
}

bool core_sg_supported(const device &Device) {
return (Device.has_extension("cl_khr_subgroups") ||
Device.get_info<info::device::version>().find(" 2.1") !=
string_class::npos);
if (Device.has_extension("cl_khr_subgroups"))
return true;
return Device.get_info<info::device::version>() >= "2.1";
}
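For context, a minimal standalone model of the new check, assuming the version string is a bare "<major>.<minor>" value (the exact format is backend-dependent, so treat this as an illustration rather than the runtime's contract):

#include <string>

// Standalone model of core_sg_supported: sub-groups are available either
// through the cl_khr_subgroups extension or on OpenCL 2.1+ devices. Note
// that the comparison is lexicographic, so it is only meaningful when the
// version string is a bare "<major>.<minor>" value.
bool core_sg_supported_model(bool has_khr_subgroups,
                             const std::string &version) {
  if (has_khr_subgroups)
    return true;
  return version >= "2.1";
}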
51 changes: 49 additions & 2 deletions SYCL/SubGroup/reduce.hpp
@@ -88,7 +88,7 @@ void check(queue &Queue, size_t G = 256, size_t L = 64) {
check_op<sycl_subgr<SpecializationKernelName, class KernelName_bPPlfvdGShi>,
T>(Queue, T(0), ONEAPI::maximum<T>(), true, G, L);

#if __cplusplus >= 201402L
// Transparent operator functors.
check_op<sycl_subgr<SpecializationKernelName,
class KernelName_fkOyLRYirfMnvBcnbRFy>,
T>(Queue, T(L), ONEAPI::plus<>(), false, G, L);
@@ -107,5 +107,52 @@ void check(queue &Queue, size_t G = 256, size_t L = 64) {
check_op<
sycl_subgr<SpecializationKernelName, class KernelName_BaCGaWDMFeMFqvotbk>,
T>(Queue, T(0), ONEAPI::maximum<>(), true, G, L);
#endif
}
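The #if __cplusplus >= 201402L guard above is replaced with a plain comment because the transparent (empty-angle-bracket) functors it protected are a C++14 feature the suite now assumes unconditionally. A quick compile-time sketch of the typed/transparent distinction, using the std:: counterparts:

#include <functional>

// std::plus<int> fixes the operand type up front; std::plus<> (C++14,
// "transparent") deduces operand types at each call site. The same
// distinction applies to ONEAPI::plus<T> vs ONEAPI::plus<> above.
static_assert(std::plus<int>{}(2, 3) == 5, "typed functor");
static_assert(std::plus<>{}(2, 3) == 5, "transparent functor");

int main() { return 0; }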

template <typename SpecializationKernelName, typename T>
void check_mul(queue &Queue, size_t G = 256, size_t L = 4) {
check_op<sycl_subgr<SpecializationKernelName, class KernelName_MulF>, T>(
Queue, T(G), ONEAPI::multiplies<T>(), false, G, L);
check_op<sycl_subgr<SpecializationKernelName, class KernelName_MulT>, T>(
Queue, T(1), ONEAPI::multiplies<T>(), true, G, L);

// Transparent operator functors.
check_op<sycl_subgr<SpecializationKernelName, class KernelName_MulFV>, T>(
Queue, T(G), ONEAPI::multiplies<>(), false, G, L);
check_op<sycl_subgr<SpecializationKernelName, class KernelName_MulTV>, T>(
Queue, T(1), ONEAPI::multiplies<>(), true, G, L);
}

template <typename SpecializationKernelName, typename T>
void check_bit_ops(queue &Queue, size_t G = 256, size_t L = 4) {
check_op<sycl_subgr<SpecializationKernelName, class KernelName_ORF>, T>(
Queue, T(G), ONEAPI::bit_or<T>(), false, G, L);
check_op<sycl_subgr<SpecializationKernelName, class KernelName_ORT>, T>(
Queue, T(0), ONEAPI::bit_or<T>(), true, G, L);

check_op<sycl_subgr<SpecializationKernelName, class KernelName_XORF>, T>(
Queue, T(G), ONEAPI::bit_xor<T>(), false, G, L);
check_op<sycl_subgr<SpecializationKernelName, class KernelName_XORT>, T>(
Queue, T(0), ONEAPI::bit_xor<T>(), true, G, L);

check_op<sycl_subgr<SpecializationKernelName, class KernelName_ANDF>, T>(
Queue, T(G), ONEAPI::bit_and<T>(), false, G, L);
check_op<sycl_subgr<SpecializationKernelName, class KernelName_ANDT>, T>(
Queue, ~T(0), ONEAPI::bit_and<T>(), true, G, L);

// Transparent operator functors.
check_op<sycl_subgr<SpecializationKernelName, class KernelName_ORFV>, T>(
Queue, T(G), ONEAPI::bit_or<>(), false, G, L);
check_op<sycl_subgr<SpecializationKernelName, class KernelName_ORTV>, T>(
Queue, T(0), ONEAPI::bit_or<>(), true, G, L);

check_op<sycl_subgr<SpecializationKernelName, class KernelName_XORFV>, T>(
Queue, T(G), ONEAPI::bit_xor<>(), false, G, L);
check_op<sycl_subgr<SpecializationKernelName, class KernelName_XORTV>, T>(
Queue, T(0), ONEAPI::bit_xor<>(), true, G, L);

check_op<sycl_subgr<SpecializationKernelName, class KernelName_ANDFV>, T>(
Queue, T(G), ONEAPI::bit_and<>(), false, G, L);
check_op<sycl_subgr<SpecializationKernelName, class KernelName_ANDTV>, T>(
Queue, ~T(0), ONEAPI::bit_and<>(), true, G, L);
}
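The expected values passed for the true cases line up with the identity elements of the bitwise operations (x | 0 == x, x ^ 0 == x, x & ~0 == x), which is why bit_and uses ~T(0) where the others use T(0). A compile-time sanity sketch:

// Identity elements exercised by check_bit_ops: combining a value with
// the identity of each operation leaves it unchanged.
static_assert((0xA5u | 0u) == 0xA5u, "0 is the identity for bit_or");
static_assert((0xA5u ^ 0u) == 0xA5u, "0 is the identity for bit_xor");
static_assert((0xA5u & ~0u) == 0xA5u, "~0 is the identity for bit_and");

int main() { return 0; }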
39 changes: 39 additions & 0 deletions SYCL/SubGroup/reduce_spirv13.cpp
@@ -0,0 +1,39 @@
// UNSUPPORTED: cpu
// #2252 Disable until all variants of built-ins are available in OpenCL CPU
// runtime for every supported ISA

// UNSUPPORTED: cuda

// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
// RUN: %HOST_RUN_PLACEHOLDER %t.out
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out
// RUN: %ACC_RUN_PLACEHOLDER %t.out

// This test verifies correct operation of the SPIR-V 1.3 reduce algorithm
// used with the MUL, bitwise OR, XOR and AND operations.

#include "reduce.hpp"

int main() {
queue Queue;
if (!core_sg_supported(Queue.get_device())) {
std::cout << "Skipping test\n";
return 0;
}

check_mul<class MulA, int>(Queue);
check_mul<class MulB, unsigned int>(Queue);
check_mul<class MulC, long>(Queue);
check_mul<class MulD, unsigned long>(Queue);
check_mul<class MulE, float>(Queue);

check_bit_ops<class A, int>(Queue);
check_bit_ops<class B, unsigned int>(Queue);
check_bit_ops<class C, unsigned>(Queue);
check_bit_ops<class D, long>(Queue);
check_bit_ops<class E, unsigned long>(Queue);
check_bit_ops<class F, long long>(Queue);
check_bit_ops<class G, unsigned long long>(Queue);
std::cout << "Test passed." << std::endl;
return 0;
}
20 changes: 20 additions & 0 deletions SYCL/SubGroup/reduce_spirv13_fp16.cpp
@@ -0,0 +1,20 @@
// UNSUPPORTED: cuda

// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out

// This test verifies correct operation of the SPIR-V 1.3 reduce algorithm
// used with the MUL operation.

#include "reduce.hpp"

int main() {
queue Queue;
if (!core_sg_supported(Queue.get_device())) {
std::cout << "Skipping test\n";
return 0;
}
check_mul<class MulHalf, cl::sycl::half>(Queue);
std::cout << "Test passed." << std::endl;
return 0;
}
27 changes: 27 additions & 0 deletions SYCL/SubGroup/reduce_spirv13_fp64.cpp
@@ -0,0 +1,27 @@
// UNSUPPORTED: cpu
// #2252 Disable until all variants of built-ins are available in OpenCL CPU
// runtime for every supported ISA

// UNSUPPORTED: cuda

// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
// RUN: %HOST_RUN_PLACEHOLDER %t.out
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out
// RUN: %ACC_RUN_PLACEHOLDER %t.out

// This test verifies correct operation of the SPIR-V 1.3 reduce algorithm
// used with the MUL operation.

#include "reduce.hpp"

int main() {
queue Queue;
if (!core_sg_supported(Queue.get_device())) {
std::cout << "Skipping test\n";
return 0;
}
check_mul<class MulDouble, double>(Queue);
std::cout << "Test passed." << std::endl;
return 0;
}
50 changes: 48 additions & 2 deletions SYCL/SubGroup/scan.hpp
@@ -115,7 +115,7 @@ void check(queue &Queue, size_t G = 256, size_t L = 64) {
Queue, std::numeric_limits<T>::min(), ONEAPI::maximum<T>(), true, G, L);
}

#if __cplusplus >= 201402L
// Transparent operator functors.
check_op<sycl_subgr<SpecializationKernelName, class KernelName_TPWS>, T>(
Queue, T(L), ONEAPI::plus<>(), false, G, L);
check_op<sycl_subgr<SpecializationKernelName, class KernelName_hWZv>, T>(
@@ -150,5 +150,51 @@ void check(queue &Queue, size_t G = 256, size_t L = 64) {
T>(Queue, std::numeric_limits<T>::min(), ONEAPI::maximum<>(), true, G,
L);
}
#endif
}

template <typename SpecializationKernelName, typename T>
void check_mul(queue &Queue, size_t G = 256, size_t L = 4) {
check_op<sycl_subgr<SpecializationKernelName, class KernelName_MulF>, T>(
Queue, T(L), ONEAPI::multiplies<T>(), false, G, L);
check_op<sycl_subgr<SpecializationKernelName, class KernelName_MulT>, T>(
Queue, T(1), ONEAPI::multiplies<T>(), true, G, L);

// Transparent operator functors.
check_op<sycl_subgr<SpecializationKernelName, class KernelName_MulFV>, T>(
Queue, T(L), ONEAPI::multiplies<>(), false, G, L);
check_op<sycl_subgr<SpecializationKernelName, class KernelName_MulTV>, T>(
Queue, T(1), ONEAPI::multiplies<>(), true, G, L);
}

template <typename SpecializationKernelName, typename T>
void check_bit_ops(queue &Queue, size_t G = 256, size_t L = 4) {
check_op<sycl_subgr<SpecializationKernelName, class KernelName_ORF>, T>(
Queue, T(L), ONEAPI::bit_or<T>(), false, G, L);
check_op<sycl_subgr<SpecializationKernelName, class KernelName_ORT>, T>(
Queue, T(0), ONEAPI::bit_or<T>(), true, G, L);

check_op<sycl_subgr<SpecializationKernelName, class KernelName_XORF>, T>(
Queue, T(L), ONEAPI::bit_xor<T>(), false, G, L);
check_op<sycl_subgr<SpecializationKernelName, class KernelName_XORT>, T>(
Queue, T(0), ONEAPI::bit_xor<T>(), true, G, L);

check_op<sycl_subgr<SpecializationKernelName, class KernelName_ANDF>, T>(
Queue, T(L), ONEAPI::bit_and<T>(), false, G, L);
check_op<sycl_subgr<SpecializationKernelName, class KernelName_ANDT>, T>(
Queue, ~T(0), ONEAPI::bit_and<T>(), true, G, L);

// Transparent operator functors.
check_op<sycl_subgr<SpecializationKernelName, class KernelName_ORFV>, T>(
Queue, T(L), ONEAPI::bit_or<>(), false, G, L);
check_op<sycl_subgr<SpecializationKernelName, class KernelName_ORTV>, T>(
Queue, T(0), ONEAPI::bit_or<>(), true, G, L);

check_op<sycl_subgr<SpecializationKernelName, class KernelName_XORFV>, T>(
Queue, T(L), ONEAPI::bit_xor<>(), false, G, L);
check_op<sycl_subgr<SpecializationKernelName, class KernelName_XORTV>, T>(
Queue, T(0), ONEAPI::bit_xor<>(), true, G, L);

check_op<sycl_subgr<SpecializationKernelName, class KernelName_ANDFV>, T>(
Queue, T(L), ONEAPI::bit_and<>(), false, G, L);
check_op<sycl_subgr<SpecializationKernelName, class KernelName_ANDTV>, T>(
Queue, ~T(0), ONEAPI::bit_and<>(), true, G, L);
}
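For reference, a host-side sketch of what the two scan flavors compute, written with C++14 standard algorithms; how check_op's boolean argument maps onto exclusive versus inclusive is internal to this header and not visible in the hunk, so that mapping is an assumption here:

#include <cassert>
#include <cstddef>
#include <functional>
#include <numeric>
#include <vector>

int main() {
  std::vector<int> in{1, 2, 3, 4};

  // Inclusive scan with multiplies: out[i] = in[0] * ... * in[i].
  std::vector<int> inc(in.size());
  std::partial_sum(in.begin(), in.end(), inc.begin(), std::multiplies<int>());
  assert((inc == std::vector<int>{1, 2, 6, 24}));

  // Exclusive scan: out[i] = identity * in[0] * ... * in[i-1].
  std::vector<int> exc(in.size());
  int acc = 1; // identity element for multiplication
  for (std::size_t i = 0; i < in.size(); ++i) {
    exc[i] = acc;
    acc *= in[i];
  }
  assert((exc == std::vector<int>{1, 1, 2, 6}));
  return 0;
}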
39 changes: 39 additions & 0 deletions SYCL/SubGroup/scan_spirv13.cpp
@@ -0,0 +1,39 @@
// UNSUPPORTED: cpu
// #2252 Disable until all variants of built-ins are available in OpenCL CPU
// runtime for every supported ISA

// UNSUPPORTED: cuda

// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
// RUN: %HOST_RUN_PLACEHOLDER %t.out
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out
// RUN: %ACC_RUN_PLACEHOLDER %t.out

// This test verifies correct operation of the SPIR-V 1.3 exclusive_scan() and
// inclusive_scan() algorithms used with the MUL, bitwise OR, XOR and AND
// operations.

#include "scan.hpp"

int main() {
queue Queue;
if (!core_sg_supported(Queue.get_device())) {
std::cout << "Skipping test\n";
return 0;
}
check_mul<class MulA, int>(Queue);
check_mul<class MulB, unsigned int>(Queue);
check_mul<class MulC, long>(Queue);
check_mul<class MulD, unsigned long>(Queue);
check_mul<class MulE, float>(Queue);

check_bit_ops<class A, int>(Queue);
check_bit_ops<class B, unsigned int>(Queue);
check_bit_ops<class C, unsigned>(Queue);
check_bit_ops<class D, long>(Queue);
check_bit_ops<class E, unsigned long>(Queue);
check_bit_ops<class F, long long>(Queue);
check_bit_ops<class G, unsigned long long>(Queue);
std::cout << "Test passed." << std::endl;
return 0;
}
20 changes: 20 additions & 0 deletions SYCL/SubGroup/scan_spirv13_fp16.cpp
@@ -0,0 +1,20 @@
// UNSUPPORTED: cuda

// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out

// This test verifies correct operation of the SPIR-V 1.3 exclusive_scan() and
// inclusive_scan() algorithms used with the MUL operation.

#include "scan.hpp"

int main() {
queue Queue;
if (!core_sg_supported(Queue.get_device())) {
std::cout << "Skipping test\n";
return 0;
}
check_mul<class MulHalf, cl::sycl::half>(Queue);
std::cout << "Test passed." << std::endl;
return 0;
}
27 changes: 27 additions & 0 deletions SYCL/SubGroup/scan_spirv13_fp64.cpp
@@ -0,0 +1,27 @@
// UNSUPPORTED: cpu
// #2252 Disable until all variants of built-ins are available in OpenCL CPU
// runtime for every supported ISA

// UNSUPPORTED: cuda

// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
// RUN: %HOST_RUN_PLACEHOLDER %t.out
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out
// RUN: %ACC_RUN_PLACEHOLDER %t.out

// This test verifies correct operation of the SPIR-V 1.3 exclusive_scan() and
// inclusive_scan() algorithms used with the MUL operation.

#include "scan.hpp"

int main() {
queue Queue;
if (!core_sg_supported(Queue.get_device())) {
std::cout << "Skipping test\n";
return 0;
}
check_mul<class MulDouble, double>(Queue);
std::cout << "Test passed." << std::endl;
return 0;
}