This repository was archived by the owner on Mar 28, 2023. It is now read-only.
Merged
33 commits
6260333
[SYCL] Added tests for atomics with various memory orders and scopes
t4c1 Oct 26, 2021
a900c8f
[SYCL] merged add tests into one file
t4c1 Nov 18, 2021
d7f7e34
[SYCL] merged tests for other operations
t4c1 Nov 19, 2021
0e272ec
Merge branch 'intel' into atomic_orders_scopes2
t4c1 Nov 19, 2021
0375249
[SYCL] format
t4c1 Nov 19, 2021
c215e68
[SYCL] add testing for both AtomicRef implementations
t4c1 Nov 19, 2021
8a185e1
Merge branch 'atomic_orders_scopes2' of https://github.com/t4c1/llvm-…
t4c1 Nov 19, 2021
d18ca34
[SYCL] format
t4c1 Nov 19, 2021
f3e6079
[SYCL] fixed add test
t4c1 Nov 19, 2021
94b90b7
[SYCL] format
t4c1 Nov 19, 2021
0ff5fe0
tests for remaining atomics
t4c1 Dec 9, 2021
5351b6d
format
t4c1 Dec 21, 2021
a8fb5f8
enabled add test for pointers with orders and scopes
t4c1 Jan 4, 2022
db06775
Merge branch 'intel' into atomic_orders_scopes2
t4c1 Jan 6, 2022
840c89d
Merge branch 'intel' into atomic_orders_scopes2
t4c1 Jan 6, 2022
138a98a
Merge branch 'intel' into atomic_orders_scopes2
t4c1 Jan 27, 2022
996581a
fixed missing newlines at the end of files
t4c1 Jan 27, 2022
d4af22d
Merge branch 'intel' into atomic_orders_scopes2
t4c1 Feb 4, 2022
ccd5690
simplified RUN commands and added requirement for cuda backend
t4c1 Feb 10, 2022
81abc0d
fix typo
t4c1 Feb 10, 2022
8ba8f1a
fix another typo
t4c1 Feb 10, 2022
ed4ecdb
changed how cuda arguments are passed and removed cuda requirement
t4c1 Feb 10, 2022
220b722
restored all RUN lines
t4c1 Feb 10, 2022
94e763f
marked FP tests XFAIL for other backends
t4c1 Feb 10, 2022
e8c2553
correctly set XFAILs and fix reduction tests for other backends
t4c1 Feb 11, 2022
d1e7591
fixed generic tests
t4c1 Feb 15, 2022
9708521
split tests
t4c1 Feb 15, 2022
55795d3
fix copy paste error in sub
t4c1 Feb 16, 2022
9a23a34
split native floating point tests
t4c1 Feb 18, 2022
ad77010
fixed sub_generic tests and XFAILS for native fp tests
t4c1 Feb 21, 2022
8bae70f
format
t4c1 Feb 21, 2022
19c3856
disable event_profiling_info for CUDA
t4c1 Feb 22, 2022
6a4fa77
Update SYCL/Basic/event_profiling_info.cpp
t4c1 Feb 22, 2022
6 changes: 0 additions & 6 deletions SYCL/AtomicRef/add.cpp
@@ -9,12 +9,6 @@
#include <iostream>
using namespace sycl;

// Floating-point types do not support pre- or post-increment
template <> void add_test<float>(queue q, size_t N) {
add_fetch_test<float>(q, N);
add_plus_equal_test<float>(q, N);
}

int main() {
queue q;

127 changes: 100 additions & 27 deletions SYCL/AtomicRef/add.h
@@ -4,31 +4,84 @@
#include <algorithm>
#include <cassert>
#include <numeric>
#include <type_traits>
#include <vector>

using namespace sycl;
using namespace sycl::ext::oneapi;

template <typename T, typename Difference = T>
template <typename T, typename Difference = T,
memory_order order = memory_order::relaxed,
memory_scope scope = memory_scope::device>
void add_fetch_local_test(queue q, size_t N) {
T sum = 0;
std::vector<T> output(N);
std::fill(output.begin(), output.end(), T(123456));
{
buffer<T> sum_buf(&sum, 1);
buffer<T> output_buf(output.data(), output.size());
q.submit([&](handler &cgh) {
auto sum = sum_buf.template get_access<access::mode::read_write>(cgh);
auto out =
output_buf.template get_access<access::mode::discard_write>(cgh);
accessor<T, 1, access::mode::read_write, access::target::local> loc(1,
cgh);

cgh.parallel_for(nd_range<1>(N, N), [=](nd_item<1> it) {
int gid = it.get_global_id(0);
if (gid == 0)
loc[0] = 0;
it.barrier(access::fence_space::local_space);
auto atm = atomic_ref < T,
(order == memory_order::acquire || order == memory_order::release)
? memory_order::relaxed
: order,
scope, access::address_space::local_space > (loc[0]);
out[gid] = atm.fetch_add(Difference(1), order);
it.barrier(access::fence_space::local_space);
if (gid == 0)
sum[0] = loc[0];
});
}).wait_and_throw();
}

// All work-items increment by 1, so final value should be equal to N
assert(sum == T(N));

// Fetch returns original value: will be in [0, N-1]
auto min_e = std::min_element(output.begin(), output.end());
auto max_e = std::max_element(output.begin(), output.end());
assert(*min_e == 0 && *max_e == T(N - 1));

// Intermediate values should be unique
std::sort(output.begin(), output.end());
assert(std::unique(output.begin(), output.end()) == output.end());
}

template <typename T, typename Difference = T,
memory_order order = memory_order::relaxed,
memory_scope scope = memory_scope::device>
void add_fetch_test(queue q, size_t N) {
T sum = 0;
std::vector<T> output(N);
std::fill(output.begin(), output.end(), T(0));
{
buffer<T> sum_buf(&sum, 1);
buffer<T> output_buf(output.data(), output.size());

q.submit([&](handler &cgh) {
auto sum = sum_buf.template get_access<access::mode::read_write>(cgh);
auto out =
output_buf.template get_access<access::mode::discard_write>(cgh);
cgh.parallel_for(range<1>(N), [=](item<1> it) {
int gid = it.get_id(0);
auto atm = atomic_ref<T, memory_order::relaxed, memory_scope::device,
access::address_space::global_space>(sum[0]);
out[gid] = atm.fetch_add(Difference(1));
});
});
auto sum = sum_buf.template get_access<access::mode::read_write>(cgh);
auto out =
output_buf.template get_access<access::mode::discard_write>(cgh);
cgh.parallel_for(range<1>(N), [=](item<1> it) {
int gid = it.get_id(0);
auto atm = atomic_ref < T,
(order == memory_order::acquire || order == memory_order::release)
? memory_order::relaxed
: order,
scope, access::address_space::global_space > (sum[0]);
out[gid] = atm.fetch_add(Difference(1), order);
});
}).wait_and_throw();
}

// All work-items increment by 1, so final value should be equal to N
@@ -37,14 +90,16 @@ void add_fetch_test(queue q, size_t N) {
// Fetch returns original value: will be in [0, N-1]
auto min_e = std::min_element(output.begin(), output.end());
auto max_e = std::max_element(output.begin(), output.end());
assert(*min_e == T(0) && *max_e == T(N - 1));
assert(*min_e == 0 && *max_e == T(N - 1));

// Intermediate values should be unique
std::sort(output.begin(), output.end());
assert(std::unique(output.begin(), output.end()) == output.end());
}

template <typename T, typename Difference = T>
template <typename T, typename Difference = T,
memory_order order = memory_order::relaxed,
memory_scope scope = memory_scope::device>
void add_plus_equal_test(queue q, size_t N) {
T sum = 0;
std::vector<T> output(N);
@@ -59,8 +114,11 @@ void add_plus_equal_test(queue q, size_t N) {
output_buf.template get_access<access::mode::discard_write>(cgh);
cgh.parallel_for(range<1>(N), [=](item<1> it) {
int gid = it.get_id(0);
auto atm = atomic_ref<T, memory_order::relaxed, memory_scope::device,
access::address_space::global_space>(sum[0]);
auto atm = atomic_ref < T,
(order == memory_order::acquire || order == memory_order::release)
? memory_order::relaxed
: order,
scope, access::address_space::global_space > (sum[0]);
out[gid] = atm += Difference(1);
});
});
@@ -79,7 +137,9 @@ void add_plus_equal_test(queue q, size_t N) {
assert(std::unique(output.begin(), output.end()) == output.end());
}

template <typename T, typename Difference = T>
template <typename T, typename Difference = T,
memory_order order = memory_order::relaxed,
memory_scope scope = memory_scope::device>
void add_pre_inc_test(queue q, size_t N) {
T sum = 0;
std::vector<T> output(N);
@@ -94,8 +154,11 @@ void add_pre_inc_test(queue q, size_t N) {
output_buf.template get_access<access::mode::discard_write>(cgh);
cgh.parallel_for(range<1>(N), [=](item<1> it) {
int gid = it.get_id(0);
auto atm = atomic_ref<T, memory_order::relaxed, memory_scope::device,
access::address_space::global_space>(sum[0]);
auto atm = atomic_ref < T,
(order == memory_order::acquire || order == memory_order::release)
? memory_order::relaxed
: order,
scope, access::address_space::global_space > (sum[0]);
out[gid] = ++atm;
});
});
@@ -114,7 +177,9 @@ void add_pre_inc_test(queue q, size_t N) {
assert(std::unique(output.begin(), output.end()) == output.end());
}

template <typename T, typename Difference = T>
template <typename T, typename Difference = T,
memory_order order = memory_order::relaxed,
memory_scope scope = memory_scope::device>
void add_post_inc_test(queue q, size_t N) {
T sum = 0;
std::vector<T> output(N);
@@ -129,8 +194,11 @@ void add_post_inc_test(queue q, size_t N) {
output_buf.template get_access<access::mode::discard_write>(cgh);
cgh.parallel_for(range<1>(N), [=](item<1> it) {
int gid = it.get_id(0);
auto atm = atomic_ref<T, memory_order::relaxed, memory_scope::device,
access::address_space::global_space>(sum[0]);
auto atm = atomic_ref < T,
(order == memory_order::acquire || order == memory_order::release)
? memory_order::relaxed
: order,
scope, access::address_space::global_space > (sum[0]);
out[gid] = atm++;
});
});
@@ -149,10 +217,15 @@ void add_post_inc_test(queue q, size_t N) {
assert(std::unique(output.begin(), output.end()) == output.end());
}

template <typename T, typename Difference = T>
template <typename T, typename Difference = T,
memory_order order = memory_order::relaxed,
memory_scope scope = memory_scope::device>
void add_test(queue q, size_t N) {
add_fetch_test<T, Difference>(q, N);
add_plus_equal_test<T, Difference>(q, N);
add_pre_inc_test<T, Difference>(q, N);
add_post_inc_test<T, Difference>(q, N);
add_fetch_local_test<T, Difference, order, scope>(q, N);
add_fetch_test<T, Difference, order, scope>(q, N);
add_plus_equal_test<T, Difference, order, scope>(q, N);
if constexpr (!std::is_floating_point_v<T>) {
add_pre_inc_test<T, Difference, order, scope>(q, N);
add_post_inc_test<T, Difference, order, scope>(q, N);
}
}
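
A note on the recurring atomic_ref construction in these tests: the template-level default memory order must be valid for every operation on the reference, plain loads and stores included, which is presumably why pure acquire and release orders are demoted to relaxed in the template argument while the order under test is still passed explicitly to each call. A standalone sketch of the idiom, mirroring the style of add.h:

// Kernel-side sketch of the default-order idiom used throughout add.h.
template <memory_order order>
void fetch_add_with_order(int &data) {
  // The default order must be usable by both loads and stores, so
  // acquire/release are mapped to relaxed at the type level.
  constexpr memory_order default_order =
      (order == memory_order::acquire || order == memory_order::release)
          ? memory_order::relaxed
          : order;
  auto atm = atomic_ref<int, default_order, memory_scope::device,
                        access::address_space::global_space>(data);
  atm.fetch_add(1, order); // the per-call order overrides the default
}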
6 changes: 0 additions & 6 deletions SYCL/AtomicRef/add_atomic64.cpp
@@ -9,12 +9,6 @@
#include <iostream>
using namespace sycl;

// Floating-point types do not support pre- or post-increment
template <> void add_test<double>(queue q, size_t N) {
add_fetch_test<double>(q, N);
add_plus_equal_test<double>(q, N);
}

int main() {
queue q;

49 changes: 49 additions & 0 deletions SYCL/AtomicRef/add_orders_scopes.cpp
@@ -0,0 +1,49 @@
// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out \
// RUN: -Xsycl-target-backend --cuda-gpu-arch=sm_70
// RUN: %HOST_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %ACC_RUN_PLACEHOLDER %t.out

#define SYCL_USE_NATIVE_FP_ATOMICS

#include "add.h"
#include <iostream>
using namespace sycl;

template <typename T, typename Difference = T,
memory_order order = memory_order::relaxed>
void add_test_scopes(queue q, size_t N) {
add_test<T, Difference, order, memory_scope::system>(q, N);
add_test<T, Difference, order, memory_scope::device>(q, N);
add_test<T, Difference, order, memory_scope::work_group>(q, N);
add_test<T, Difference, order, memory_scope::sub_group>(q, N);
}

template <typename T, typename Difference = T>
void add_test_orders_scopes(queue q, size_t N) {
add_test_scopes<T, Difference, memory_order::relaxed>(q, N);
add_test_scopes<T, Difference, memory_order::acquire>(q, N);
add_test_scopes<T, Difference, memory_order::release>(q, N);
add_test_scopes<T, Difference, memory_order::acq_rel>(q, N);
}

int main() {
queue q;

constexpr int N = 32;
add_test_orders_scopes<int>(q, N);
add_test_orders_scopes<float>(q, N);
add_test_orders_scopes<unsigned int>(q, N);
add_test_orders_scopes<double>(q, N);
add_test_orders_scopes<long>(q, N);
add_test_orders_scopes<unsigned long>(q, N);

// Include long long tests if they are 64 bits wide
if constexpr (sizeof(long long) == 8) {
add_test_orders_scopes<long long>(q, N);
add_test_orders_scopes<unsigned long long>(q, N);
}

Reviewer:
The device associated with the queue must support aspect::atomic64 for these tests to be valid. However, even if you check for that here, the compiler may still try to generate the kernels for the 64-bit tests and build them together with the other kernels, even though it never intends to run them. This is the reason why many other tests have an _atomic64 test variant.
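
For reference, the runtime guard alluded to here would look roughly like the sketch below (run_64bit_tests is a hypothetical helper); the reviewer's point is that this check alone does not stop the compiler from building the 64-bit kernels:

// Sketch of the runtime aspect guard; run_64bit_tests is hypothetical.
queue q;
if (q.get_device().has(aspect::atomic64)) {
  run_64bit_tests(q); // double, long long, unsigned long long variants
}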

Author (t4c1):
To handle all possible combinations of supported aspects, each of these tests would have to be split into many files (one for every combination of atomic64 support, 3 memory scopes, and 4 memory orders). That would be a bit too many files, in my opinion.

In practice, memory orders are currently only supported on NVIDIA devices with sm_70 or higher. These also support all memory scopes and atomic64. That is why I decided to put all the new tests into a single file that requires sm_70.

If you have a better suggestion for how to handle this, I am happy to change it.

Reviewer:
I agree, the number of combinations becomes quite a headache, but the limitations on order/scope/atomic64 may apply to devices other than just CUDA ones.

An option would be to guard the different combinations with macros that you then define in separate // RUN commands of the tests. This makes sure that kernels are only generated for the guarded test cases, so no invalid kernels are compiled, and each case only runs if the device supports that combination. Does that make sense? A sketch of the idea follows below.
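
A minimal sketch of the macro-guarded approach, assuming the add_test signature from add.h; the macro names and flag spellings are illustrative, not taken from the final patch:

// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -DTEST_ACQ_REL_DEVICE %s -o %t1.out
// RUN: %GPU_RUN_PLACEHOLDER %t1.out
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -DTEST_RELAXED_WORK_GROUP %s -o %t2.out
// RUN: %GPU_RUN_PLACEHOLDER %t2.out

#include "add.h"

int main() {
  queue q;
  constexpr int N = 32;
#ifdef TEST_ACQ_REL_DEVICE
  add_test<int, int, memory_order::acq_rel, memory_scope::device>(q, N);
#endif
#ifdef TEST_RELAXED_WORK_GROUP
  add_test<int, int, memory_order::relaxed, memory_scope::work_group>(q, N);
#endif
}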

Author (t4c1):
That would still result in 24 RUN commands per file, but it does allow merging all the test files for each operation into just one.

Although I have to say I do not completely understand what mechanism skips the RUN lines for aspects that are unsupported by the hardware the test is running on.

Reviewer:
That is a drawback of this approach; you cannot have e.g. an XFAIL for each combination, but since you have to check for the features at runtime anyway, the tests would have to dispatch dynamically regardless.

The benefit of splitting them up (even via the macro alternative) is that the tests will not load binaries with unsupported instructions for a given device. That is, each individual RUN would start by checking that the corresponding order/scope/atomic64 is supported by the given device, and if not, skip the test (with a print, but not an error). A sketch of such a check is below.
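
A sketch of that runtime skip, assuming the SYCL 2020 capability queries that intel/llvm#4853 implements (std::find is from <algorithm>):

// Skip the acq_rel cases when the device does not report support for them.
queue q;
auto orders = q.get_device()
                  .get_info<info::device::atomic_memory_order_capabilities>();
if (std::find(orders.begin(), orders.end(), memory_order::acq_rel) ==
    orders.end()) {
  std::cout << "Skipping: acq_rel not supported on this device." << std::endl;
  return 0; // skip with a print, not an error
}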

Author (t4c1):
I still have to do runtime checks? That would result in these tests not being run until intel/llvm#4853 is merged.

Reviewer:
You have to do the runtime checks to make sure the device actually supports the features. CUDA is special in that regard, as the SM version controls whether or not a feature is supported, but that is not the case for all devices.

> That would result in these tests not being run until intel/llvm#4853 is merged

Since the patch is up I don't see why we wouldn't wait for it. It would make the tests more robust from the get-go.

Author (t4c1):
I applied this kind of change to the tests for add. Can you check that this is what you had in mind before I do the rest?

Reviewer:
That is exactly what I had in mind! It is quite a lot of RUN lines, but I suppose I got what was coming. Maybe it would make sense, just for good measure, to add a short comment above each RUN grouping noting which test configuration it runs, along the lines of the sketch below.
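
One possible shape for the commented groupings (configuration names are illustrative):

// -- acq_rel order, device scope --
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -DTEST_ACQ_REL_DEVICE %s -o %t1.out
// RUN: %GPU_RUN_PLACEHOLDER %t1.out
//
// -- relaxed order, work_group scope --
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -DTEST_RELAXED_WORK_GROUP %s -o %t2.out
// RUN: %GPU_RUN_PLACEHOLDER %t2.out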


std::cout << "Test passed." << std::endl;
}
96 changes: 96 additions & 0 deletions SYCL/AtomicRef/and.h
@@ -0,0 +1,96 @@
#pragma once

#include <CL/sycl.hpp>
#include <algorithm>
#include <cassert>
#include <numeric>
#include <vector>

using namespace sycl;
using namespace sycl::ext::oneapi;

template <typename T, memory_order order = memory_order::relaxed,
memory_scope scope = memory_scope::device>
void and_local_test(queue q) {
const size_t N = 32;
T cum = 0;
std::vector<T> output(N);
std::fill(output.begin(), output.end(), T(123456));
{
buffer<T> cum_buf(&cum, 1);
buffer<T> output_buf(output.data(), output.size());
q.submit([&](handler &cgh) {
auto cum = cum_buf.template get_access<access::mode::read_write>(cgh);
auto out =
output_buf.template get_access<access::mode::discard_write>(cgh);
accessor<T, 1, access::mode::read_write, access::target::local> loc(1,
cgh);

cgh.parallel_for(nd_range<1>(N, N), [=](nd_item<1> it) {
int gid = it.get_global_id(0);
if (gid == 0)
loc[0] = T((1ll << N) - 1);
it.barrier(access::fence_space::local_space);
auto atm = atomic_ref < T,
(order == memory_order::acquire || order == memory_order::release)
? memory_order::relaxed
: order,
scope, access::address_space::local_space > (loc[0]);
out[gid] = atm.fetch_and(~T(1ll << gid), order);
it.barrier(access::fence_space::local_space);
if (gid == 0)
cum[0] = loc[0];
});
}).wait_and_throw();
}

// Final value should be equal to 0
assert(cum == 0);

// All other values should be unique; each work-item sets one bit to 0
std::sort(output.begin(), output.end());
assert(std::unique(output.begin(), output.end()) == output.end());
}

template <typename T, memory_order order = memory_order::relaxed,
memory_scope scope = memory_scope::device>
void and_global_test(queue q) {
const size_t N = 32;
const T initial = T((1ll << N) - 1);
T cum = initial;
std::vector<T> output(N);
std::fill(output.begin(), output.end(), T(0));
{
buffer<T> cum_buf(&cum, 1);
buffer<T> output_buf(output.data(), output.size());

q.submit([&](handler &cgh) {
auto cum = cum_buf.template get_access<access::mode::read_write>(cgh);
auto out =
output_buf.template get_access<access::mode::discard_write>(cgh);
cgh.parallel_for(range<1>(N), [=](item<1> it) {
size_t gid = it.get_id(0);
auto atm = atomic_ref < T,
(order == memory_order::acquire || order == memory_order::release)
? memory_order::relaxed
: order,
scope, access::address_space::global_space > (cum[0]);
out[gid] = atm.fetch_and(~T(1ll << gid), order);
});
});
}

// Final value should be equal to 0
assert(cum == 0);

// All other values should be unique; each work-item sets one bit to 0
std::sort(output.begin(), output.end());
assert(std::unique(output.begin(), output.end()) == output.end());
}

template <typename T, memory_order order = memory_order::relaxed,
memory_scope scope = memory_scope::device>
void and_test(queue q) {
and_local_test<T, order, scope>(q);
and_global_test<T, order, scope>(q);
}