intel · vladimirlaz · Oct 5, 2021 · Sep 28, 2021 · Sep 28, 2021 · Sep 28, 2021
@@ -23,6 +23,7 @@ SYCL/DeviceCodeSplit @AlexeySachkov  @Fznamznon
 
 # Device library
 SYCL/DeviceLib @vzakhari
+SYCL/DeviceLib/ITTAnnotations @vzahkari @MrSidims @AGindinson
 
 # dot_product API
 SYCL/DotProduct @rdeodhar

@@ -0,0 +1,48 @@
+// UNSUPPORTED: cuda
+
+// RUN: %clangxx -fsycl -fsycl-instrument-device-code %s -o %t.out
+// RUN: %HOST_RUN_PLACEHOLDER %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+
+// RUN: %clangxx -fsycl -fsycl-instrument-device-code %s -o %t.cpu.out	\
+// RUN: -fsycl-targets=spir64_x86_64-unknown-unknown
+// RUN: %CPU_RUN_PLACEHOLDER %t.cpu.out
+
+#include "CL/sycl.hpp"
+
+using namespace sycl;
+
+int main() {
+  queue q{};
+
+  int source = 42;
+  int target = 0;
+  {
+    buffer<int> source_buf(&source, 1);
+    buffer<int> target_buf(&target, 1);
+
+    // Ensure that a simple kernel gets run when instrumented with
+    // ITT start/finish annotations and ITT atomic start/finish annotations.
+    q.submit([&](handler &cgh) {
+      auto source_acc =
+          source_buf.template get_access<access::mode::read_write>(cgh);
+      auto target_acc =
+          target_buf.template get_access<access::mode::discard_write>(cgh);
+      cgh.single_task<class simple_atomic_kernel>([=]() {
+        auto source_atomic =
+            ext::oneapi::atomic_ref<int, memory_order::relaxed,
+                                    memory_scope::device,
+                                    access::address_space::global_space>(
+                source_acc[0]);
+        // Store source value into target
+        target_acc[0] = source_atomic.load();
+        // Nullify source
+        source_atomic.store(0);
+      });
+    });
+  }
+
+  return 0;
+}
@@ -0,0 +1,45 @@
+// UNSUPPORTED: cuda
+
+// RUN: %clangxx -fsycl -fsycl-instrument-device-code %s -o %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+
+// RUN: %clangxx -fsycl -fsycl-instrument-device-code %s -o %t.cpu.out	\
+// RUN: -fsycl-targets=spir64_x86_64-unknown-unknown
+// RUN: %CPU_RUN_PLACEHOLDER %t.cpu.out
+
+#include "CL/sycl.hpp"
+#include <vector>
+
+using namespace sycl;
+
+int main() {
+  queue q{};
+
+  std::vector<int> data_vec(/*size*/ 10, /*value*/ 0);
+  {
+    range<1> num_items(data_vec.size());
+    buffer<int> buf(data_vec.data(), num_items);
+    range<1> local_range(2);
+
+    // Ensure that a simple kernel gets run when instrumented with
+    // ITT start/finish annotations and ITT wg_barrier/wi_resume annotations.
+    q.submit([&](handler &cgh) {
+      auto acc = buf.get_access<access::mode::read_write>(cgh);
+      accessor<int, 1, access::mode::read_write, access::target::local>
+          local_acc(local_range, cgh);
+      cgh.parallel_for<class simple_barrier_kernel>(
+          nd_range<1>(num_items, local_range), [=](nd_item<1> item) {
+            size_t idx = item.get_global_linear_id();
+            int pos = idx & 1;
+            int opp = pos ^ 1;
+            local_acc[pos] = acc[idx];
+            item.barrier(access::fence_space::local_space);
+            acc[idx] = local_acc[opp];
+          });
+    });
+  }
+
+  return 0;
+}