Skip to content
This repository was archived by the owner on Mar 28, 2023. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 15 additions & 4 deletions SYCL/SubGroup/helper.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,17 @@ template <typename T2> struct utils<T2, 2> {
std::to_string((T2)v.s1()) + " )";
}
};
template <typename T2> struct utils<T2, 3> {
static T2 add_vec(const vec<T2, 3> &v) { return v.s0() + v.s1() + v.s2(); }
static bool cmp_vec(const vec<T2, 3> &v, const vec<T2, 3> &r) {
return v.s0() == r.s0() && v.s1() == r.s1() && v.s2() == r.s2();
}
static std::string stringify_vec(const vec<T2, 2> &v) {
return std::string("(") + std::to_string((T2)v.s0()) + ", " +
std::to_string((T2)v.s1()) + ", " + std::to_string((T2)v.s3()) +
" )";
}
};
template <typename T2> struct utils<T2, 4> {
static T2 add_vec(const vec<T2, 4> &v) {
return v.s0() + v.s1() + v.s2() + v.s3();
Expand Down Expand Up @@ -98,7 +109,7 @@ template <typename T2> struct utils<T2, 16> {

template <typename T> void exit_if_not_equal(T val, T ref, const char *name) {
if (std::is_floating_point<T>::value) {
if (std::fabs(val - ref) > 0.01) {
if (std::fabs(val - ref) > 0.02) {

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What was the reason to increase the threshold?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What was the reason to increase the threshold?

During data verification all elements of vectors are added and this value is compared with reference one. If we increase number of added elements twice potential cumulative error is increased twice.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Won't it be better to switch to validating relative error instead of absolute one?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The test does not target specific accuracy goals. It checks that the return values are not something completely different from expected.
Does it make sense to invest in tuning accuracy of the test?

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I suggest we use input data, so that results have no error at all.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@bader, could you, please, describe a bit? I think I didn't get it.

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

During data verification all elements of vectors are added and this value is compared with reference one.

When you do FP addition, the computed result is rounded to fit into resulting data type. The error occurs only if add result can't be exactly preserved and has to be rounded. E.g. T == float, 2^{20} + 2^{-10} can't be represented "exactly", but 2^{20} + 2^{21} can. I suggest using input values, so that rounding error will be 0, so you can always use exact match.

BTW, using std::fabs here reduces accuracy for T == double.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Implemented bitwise comparison from floating point type except half.
Valid range for half type is too narrow and will require to rework data for several tests. This is out of scope for current PR. But aligning input data for current test allow to revert threshold increase.

std::cout << "Unexpected result for " << name << ": " << (double)val
<< " expected value: " << (double)ref << std::endl;
exit(1);
Expand All @@ -115,8 +126,8 @@ template <typename T> void exit_if_not_equal(T val, T ref, const char *name) {
template <typename T>
void exit_if_not_equal(std::complex<T> val, std::complex<T> ref,
const char *name) {
if (std::fabs(val.real() - ref.real()) > 0.01 ||
std::fabs(val.imag() - ref.imag()) > 0.01) {
if (std::fabs(val.real() - ref.real()) > 0.02 ||
std::fabs(val.imag() - ref.imag()) > 0.02) {
std::cout << "Unexpected result for " << name << ": " << val
<< " expected value: " << ref << std::endl;
exit(1);
Expand All @@ -134,7 +145,7 @@ template <typename T> void exit_if_not_equal(T *val, T *ref, const char *name) {
template <> void exit_if_not_equal(half val, half ref, const char *name) {
int16_t cmp_val = reinterpret_cast<int16_t &>(val);
int16_t cmp_ref = reinterpret_cast<int16_t &>(ref);
if (std::abs(cmp_val - cmp_ref) > 1) {
if (std::abs(cmp_val - cmp_ref) > 2) {
std::cout << "Unexpected result for " << name << ": " << (float)val
<< " expected value: " << (float)ref << std::endl;
exit(1);
Expand Down
31 changes: 22 additions & 9 deletions SYCL/SubGroup/load_store.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,15 +23,10 @@ template <typename T, int N> class sycl_subgr;
using namespace cl::sycl;

template <typename T, int N> void check(queue &Queue) {
const int G = 1024, L = 128;
const int G = 1024, L = 256;

// Pad arrays based on sub-group size to ensure no out-of-bounds accesses
// Workaround for info::device::sub_group_sizes support on some devices
size_t max_sg_size = 128;
#if 0
auto sg_sizes = Queue.get_device().get_info<info::device::sub_group_sizes>();
size_t max_sg_size = *std::max_element(sg_sizes.begin(), sg_sizes.end());
#endif

try {
nd_range<1> NdRange(G, L);
Expand All @@ -51,7 +46,9 @@ template <typename T, int N> void check(queue &Queue) {
{L + max_sg_size * N}, cgh);
cgh.parallel_for<sycl_subgr<T, N>>(NdRange, [=](nd_item<1> NdItem) {
ONEAPI::sub_group SG = NdItem.get_sub_group();
if (SG.get_group_id().get(0) % N == 0) {
auto SGid = SG.get_group_id().get(0);
/* Avoid overlapping data ranges inside and between local groups */
if (SGid % N == 0 && (SGid + N) * SG.get_local_range()[0] <= L) {
size_t SGOffset =
SG.get_group_id().get(0) * SG.get_max_local_range().get(0);
size_t WGSGoffset = NdItem.get_group(0) * L + SGOffset;
Expand All @@ -66,7 +63,7 @@ template <typename T, int N> void check(queue &Queue) {
SG.store<N, T>(mp, t);
}
if (NdItem.get_global_id(0) == 0)
sgsizeacc[0] = SG.get_max_local_range()[0];
sgsizeacc[0] = SG.get_local_range()[0];
});
});
auto acc = syclbuf.template get_access<access::mode::read_write>();
Expand All @@ -91,7 +88,7 @@ template <typename T, int N> void check(queue &Queue) {
ref *= N;
}
/* There is no defined out-of-range behavior for these functions. */
if ((SGid + N) * sg_size < L) {
if ((SGid + N) * sg_size <= L) {
std::string s("Vector<");
s += std::string(typeid(ref).name()) + std::string(",") +
std::to_string(N) + std::string(">[") + std::to_string(j) +
Expand Down Expand Up @@ -181,37 +178,47 @@ int main() {
check<aligned_int>(Queue);
check<aligned_int, 1>(Queue);
check<aligned_int, 2>(Queue);
check<aligned_int, 3>(Queue);
check<aligned_int, 4>(Queue);
check<aligned_int, 8>(Queue);
check<aligned_int, 16>(Queue);
typedef unsigned int aligned_uint __attribute__((aligned(16)));
check<aligned_uint>(Queue);
check<aligned_uint, 1>(Queue);
check<aligned_uint, 2>(Queue);
check<aligned_uint, 3>(Queue);
check<aligned_uint, 4>(Queue);
check<aligned_uint, 8>(Queue);
check<aligned_uint, 16>(Queue);
typedef float aligned_float __attribute__((aligned(16)));
check<aligned_float>(Queue);
check<aligned_float, 1>(Queue);
check<aligned_float, 2>(Queue);
check<aligned_float, 3>(Queue);
check<aligned_float, 4>(Queue);
check<aligned_float, 8>(Queue);
check<aligned_float, 16>(Queue);
}
if (Queue.get_device().has_extension("cl_intel_subgroups_short") ||
PlatformName.find("CUDA") != std::string::npos) {
typedef short aligned_short __attribute__((aligned(16)));
check<aligned_short>(Queue);
check<aligned_short, 1>(Queue);
check<aligned_short, 2>(Queue);
check<aligned_short, 3>(Queue);
check<aligned_short, 4>(Queue);
check<aligned_short, 8>(Queue);
check<aligned_short, 16>(Queue);
if (Queue.get_device().has_extension("cl_khr_fp16") ||
PlatformName.find("CUDA") != std::string::npos) {
typedef half aligned_half __attribute__((aligned(16)));
check<aligned_half>(Queue);
check<aligned_half, 1>(Queue);
check<aligned_half, 2>(Queue);
check<aligned_half, 3>(Queue);
check<aligned_half, 4>(Queue);
check<aligned_half, 8>(Queue);
check<aligned_half, 16>(Queue);
}
}
if (Queue.get_device().has_extension("cl_intel_subgroups_long") ||
Expand All @@ -220,20 +227,26 @@ int main() {
check<aligned_long>(Queue);
check<aligned_long, 1>(Queue);
check<aligned_long, 2>(Queue);
check<aligned_long, 3>(Queue);
check<aligned_long, 4>(Queue);
check<aligned_long, 8>(Queue);
check<aligned_long, 16>(Queue);
typedef unsigned long aligned_ulong __attribute__((aligned(16)));
check<aligned_ulong>(Queue);
check<aligned_ulong, 1>(Queue);
check<aligned_ulong, 2>(Queue);
check<aligned_ulong, 3>(Queue);
check<aligned_ulong, 4>(Queue);
check<aligned_ulong, 8>(Queue);
check<aligned_ulong, 16>(Queue);
typedef double aligned_double __attribute__((aligned(16)));
check<aligned_double>(Queue);
check<aligned_double, 1>(Queue);
check<aligned_double, 2>(Queue);
check<aligned_double, 3>(Queue);
check<aligned_double, 4>(Queue);
check<aligned_double, 8>(Queue);
check<aligned_double, 16>(Queue);
}
std::cout << "Test passed." << std::endl;
return 0;
Expand Down