diff --git a/SYCL/ESIMD/PrefixSum.cpp b/SYCL/ESIMD/PrefixSum.cpp index 099d50fe8b..4c5420b7a7 100644 --- a/SYCL/ESIMD/PrefixSum.cpp +++ b/SYCL/ESIMD/PrefixSum.cpp @@ -184,7 +184,8 @@ void cmk_acum_final(unsigned *buf, unsigned h_pos, unsigned int stride_elems, simd p = elm32 < remaining; - S = gather4(buf, element_offset, p); + S = gather_rgba(buf, element_offset, + p); auto cnt_table = S.bit_cast_view(); cnt_table.column(0) += prev; @@ -214,7 +215,8 @@ void cmk_acum_final(unsigned *buf, unsigned h_pos, unsigned int stride_elems, cnt_table.select<1, 1, 16, 1>(j, 16) += cnt_table.replicate<1, 0, 16, 0>(j, 15); } - scatter4(buf, S, element_offset, p); + scatter_rgba(buf, S, element_offset, + p); elm32 += 32; element_offset += stride_elems * TUPLE_SZ * sizeof(unsigned) * 32; prev = cnt_table.column(31); @@ -252,7 +254,7 @@ void cmk_prefix_iterative(unsigned *buf, unsigned h_pos, unsigned n_iter = n_entries / 32; for (unsigned i = 0; i < n_iter; i++) { - S = gather4(buf, element_offset); + S = gather_rgba(buf, element_offset); auto cnt_table = S.bit_cast_view(); cnt_table.column(0) += prev; @@ -288,7 +290,7 @@ void cmk_prefix_iterative(unsigned *buf, unsigned h_pos, if (i == n_iter - 1) cnt_table.column(31) -= cnt_table.column(30); - scatter4(buf, S, element_offset); + scatter_rgba(buf, S, element_offset); element_offset += stride_elems * TUPLE_SZ * sizeof(unsigned) * 32; prev = cnt_table.column(31); diff --git a/SYCL/ESIMD/Prefix_Local_sum2.cpp b/SYCL/ESIMD/Prefix_Local_sum2.cpp index feaa9fc904..36bcf46afa 100644 --- a/SYCL/ESIMD/Prefix_Local_sum2.cpp +++ b/SYCL/ESIMD/Prefix_Local_sum2.cpp @@ -73,13 +73,13 @@ void cmk_acum_iterative(unsigned *buf, unsigned h_pos, simd S, T; - S = gather4(buf, element_offset); + S = gather_rgba(buf, element_offset); #pragma unroll for (int i = 1; i < PREFIX_ENTRIES / 32; i++) { element_offset += (stride_elems * 32 * TUPLE_SZ) * sizeof(unsigned); // scattered read, each inst reads 16 entries - T = gather4(buf, element_offset); + T = gather_rgba(buf, element_offset); S += T; } diff --git a/SYCL/ESIMD/Prefix_Local_sum3.cpp b/SYCL/ESIMD/Prefix_Local_sum3.cpp index e2df24c2a8..fba3e11006 100644 --- a/SYCL/ESIMD/Prefix_Local_sum3.cpp +++ b/SYCL/ESIMD/Prefix_Local_sum3.cpp @@ -197,7 +197,8 @@ void cmk_acum_final(unsigned *buf, unsigned h_pos, unsigned int stride_elems, simd p = elm32 < remaining; - S = gather4(buf, element_offset, p); + S = gather_rgba(buf, element_offset, + p); auto cnt_table = S.bit_cast_view(); cnt_table.column(0) += prev; @@ -226,7 +227,8 @@ void cmk_acum_final(unsigned *buf, unsigned h_pos, unsigned int stride_elems, cnt_table.select<1, 1, 16, 1>(j, 16) += cnt_table.replicate<1, 0, 16, 0>(j, 15); } - scatter4(buf, S, element_offset, p); + scatter_rgba(buf, S, element_offset, + p); elm32 += 32; element_offset += stride_elems * TUPLE_SZ * sizeof(unsigned) * 32; prev = cnt_table.column(31); diff --git a/SYCL/ESIMD/accessor_gather_scatter.cpp b/SYCL/ESIMD/accessor_gather_scatter.cpp index 6231bf64c5..bc97087280 100644 --- a/SYCL/ESIMD/accessor_gather_scatter.cpp +++ b/SYCL/ESIMD/accessor_gather_scatter.cpp @@ -67,6 +67,7 @@ template bool test(queue q) { Kernel kernel(acc); cgh.parallel_for(glob_range, kernel); }); + e.wait(); } catch (cl::sycl::exception const &e) { std::cout << "SYCL exception caught: " << e.what() << '\n'; delete[] A; diff --git a/SYCL/ESIMD/usm_gather_scatter_rgba.cpp b/SYCL/ESIMD/usm_gather_scatter_rgba.cpp new file mode 100644 index 0000000000..50f73e38c6 --- /dev/null +++ b/SYCL/ESIMD/usm_gather_scatter_rgba.cpp @@ -0,0 +1,179 @@ +//==-------- usm_gather_scatter_rgba.cpp - DPC++ ESIMD on-device test -----==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: gpu +// UNSUPPORTED: cuda +// RUN: %clangxx -fsycl %s -o %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// +// The test checks functionality of the gather_rgba/scatter_rgba USM-based ESIMD +// intrinsics. + +#include "esimd_test_utils.hpp" + +#include +#include +#include + +using namespace cl::sycl; + +constexpr int MASKED_LANE_NUM_REV = 1; +constexpr int NUM_RGBA_CHANNELS = get_num_channels_enabled( + sycl::ext::intel::experimental::esimd::rgba_channel_mask::ABGR); + +template +struct Kernel { + T *bufIn; + T *bufOut; + Kernel(T *bufIn, T *bufOut) : bufIn(bufIn), bufOut(bufOut) {} + + void operator()(id<1> i) const SYCL_ESIMD_KERNEL { + using namespace sycl::ext::intel::experimental::esimd; + constexpr int numChannels = get_num_channels_enabled(CH_MASK); + + // every workitem accesses contiguous block of VL * STRIDE elements, + // where each element consists of RGBA channels. + uint32_t global_offset = i * VL * STRIDE * NUM_RGBA_CHANNELS; + + simd byteOffsets(0, STRIDE * sizeof(T) * NUM_RGBA_CHANNELS); + simd v = + gather_rgba(bufIn + global_offset, byteOffsets); + v += i; + + simd pred = 1; + pred[VL - MASKED_LANE_NUM_REV] = 0; // mask out the last lane + scatter_rgba(bufOut + global_offset, v, byteOffsets, pred); + } +}; + +std::string convertMaskToStr( + sycl::ext::intel::experimental::esimd::rgba_channel_mask mask) { + using namespace sycl::ext::intel::experimental::esimd; + switch (mask) { + case rgba_channel_mask::R: + return "R"; + case rgba_channel_mask::GR: + return "GR"; + case rgba_channel_mask::ABGR: + return "ABGR"; + default: + return ""; + } + return ""; +} + +template +bool test(queue q) { + size_t numWorkItems = 2; + size_t size = VL * STRIDE * NUM_RGBA_CHANNELS * numWorkItems; + using namespace sycl::ext::intel::experimental::esimd; + constexpr int numChannels = get_num_channels_enabled(CH_MASK); + + std::cout << "Testing T=" << typeid(T).name() << " VL=" << VL + << " STRIDE=" << STRIDE << " MASK=" << convertMaskToStr(CH_MASK) + << "...\n"; + + auto dev = q.get_device(); + auto ctxt = q.get_context(); + T *A = static_cast(malloc_shared(size * sizeof(T), dev, ctxt)); + T *B = static_cast(malloc_shared(size * sizeof(T), dev, ctxt)); + T *gold = new T[size]; + + for (int i = 0; i < size; ++i) { + A[i] = (T)i; + B[i] = (T)-i; + gold[i] = (T)-i; + } + + // Fill out the array with gold values. The kernel only writes the elements + // that are not masked. For example, + // for STRIDE=1 and MASK=R, we have the following indices written: + // 0, 4, 8, 12 ... + // for STRIDE=2 and MASK=RG, we have the following indices written: + // 0, 1, 8, 9, 16, 17 ... + // All the other elements will be equal to '-A[i]'. + auto blockSize = VL * STRIDE * NUM_RGBA_CHANNELS; + for (unsigned i = 0; i < size; i += NUM_RGBA_CHANNELS * STRIDE) + for (unsigned j = 0; j < numChannels; j++) + gold[i + j] = A[i + j] + (i / (blockSize)); + + // Account for masked out last lanes (with pred argument to scatter_rgba). + auto maskedElementOffset = (VL - 1) * STRIDE * NUM_RGBA_CHANNELS; + for (unsigned i = maskedElementOffset; i < size; i += blockSize) + for (unsigned j = 0; j < numChannels; j++) + gold[i + j] = -A[i + j]; + + try { + range<1> glob_range{numWorkItems}; + auto e = q.submit([&](handler &cgh) { + Kernel kernel(A, B); + cgh.parallel_for(glob_range, kernel); + }); + e.wait(); + } catch (cl::sycl::exception const &e) { + std::cout << "SYCL exception caught: " << e.what() << '\n'; + free(A, ctxt); + free(B, ctxt); + delete[] gold; + return e.get_cl_code(); + } + + int err_cnt = 0; + for (unsigned i = 0; i < size; ++i) { + if (B[i] != gold[i]) { + if (++err_cnt < 35) { + std::cout << "failed at index " << i << ": " << B[i] + << " != " << gold[i] << " (gold)\n"; + } + } + } + + if (err_cnt > 0) { + std::cout << " pass rate: " + << ((float)(size - err_cnt) / (float)size) * 100.0f << "% (" + << (size - err_cnt) << "/" << size << ")\n"; + } + + free(A, ctxt); + free(B, ctxt); + delete[] gold; + + std::cout << (err_cnt > 0 ? " FAILED\n" : " Passed\n"); + return err_cnt > 0 ? false : true; +} + +template bool test(queue q) { + using namespace sycl::ext::intel::experimental::esimd; + bool passed = true; + passed &= test(q); + passed &= test(q); + passed &= test(q); + return passed; +} + +int main(void) { + queue q(esimd_test::ESIMDSelector{}, esimd_test::createExceptionHandler()); + + auto dev = q.get_device(); + std::cout << "Running on " << dev.get_info() << "\n"; + + bool passed = true; + passed &= test(q); + passed &= test(q); + passed &= test(q); + passed &= test(q); + passed &= test(q); + passed &= test(q); + passed &= test(q); + passed &= test(q); + passed &= test(q); + passed &= test(q); + passed &= test(q); + passed &= test(q); + + return passed ? 0 : 1; +}