diff --git a/sycl/test-e2e/Basic/kernel_max_wg_size.cpp b/sycl/test-e2e/Basic/kernel_max_wg_size.cpp
index 55c993734df6a..eb0ff5483bae4 100644
--- a/sycl/test-e2e/Basic/kernel_max_wg_size.cpp
+++ b/sycl/test-e2e/Basic/kernel_max_wg_size.cpp
@@ -29,6 +29,16 @@ __attribute__((noinline)) void f(int *result, nd_item<1> &index) {
   result[index.get_global_id()] = index.get_global_id();
 }
 
+struct KernelFunctor {
+  int *mResult;
+  KernelFunctor(int *result) : mResult(result) {}
+
+  void operator()(nd_item<1> index) const { f(mResult, index); }
+  auto get(syclex::properties_tag) const {
+    return syclex::properties{intelex::grf_size<256>};
+  }
+};
+
 int main() {
   queue myQueue;
   auto myContext = myQueue.get_context();
@@ -46,11 +56,9 @@ int main() {
   nd_range myRange{range{maxWgSize}, range{maxWgSize}};
 
   int *result = sycl::malloc_shared(maxWgSize, myQueue);
-  syclex::properties kernelProperties{intelex::grf_size<256>};
 
   myQueue.submit([&](handler &cgh) {
     cgh.use_kernel_bundle(myBundle);
-    cgh.parallel_for(myRange, kernelProperties,
-                     ([=](nd_item<1> index) { f(result, index); }));
+    cgh.parallel_for(myRange, KernelFunctor(result));
   });
 
   myQueue.wait();
diff --git a/sycl/test-e2e/Basic/sub_group_size_prop.cpp b/sycl/test-e2e/Basic/sub_group_size_prop.cpp
index 6da86acd09c45..ae8281903a92b 100644
--- a/sycl/test-e2e/Basic/sub_group_size_prop.cpp
+++ b/sycl/test-e2e/Basic/sub_group_size_prop.cpp
@@ -44,33 +44,12 @@ void test(queue &Queue, const std::vector SupportedSGSizes) {
     return;
   }
 
-  auto Props = ext::oneapi::experimental::properties{
-      ext::oneapi::experimental::sub_group_size};
-
   nd_range<1> NdRange(SGSize * 4, SGSize * 2);
 
   size_t ReadSubGroupSize = 0;
   {
     buffer ReadSubGroupSizeBuf(&ReadSubGroupSize, range(1));
 
-    Queue.submit([&](handler &CGH) {
-      accessor ReadSubGroupSizeBufAcc{ReadSubGroupSizeBuf, CGH,
-                                      sycl::write_only, sycl::no_init};
-
-      CGH.parallel_for>(
-          NdRange, Props, [=](nd_item<1> NdItem) {
-            auto SG = NdItem.get_sub_group();
-            if (NdItem.get_global_linear_id() == 0)
-              ReadSubGroupSizeBufAcc[0] = SG.get_local_linear_range();
-          });
-    });
-  }
-  assert(ReadSubGroupSize == SGSize && "Failed check for function.");
-
-  ReadSubGroupSize = 0;
-  {
-    buffer ReadSubGroupSizeBuf(&ReadSubGroupSize, range(1));
-
     Queue.submit([&](handler &CGH) {
       accessor ReadSubGroupSizeBufAcc{ReadSubGroupSizeBuf, CGH,
                                       sycl::write_only, sycl::no_init};
@@ -81,22 +60,6 @@ void test(queue &Queue, const std::vector SupportedSGSizes) {
     });
   }
   assert(ReadSubGroupSize == SGSize && "Failed check for functor.");
-
-  ReadSubGroupSize = 0;
-  {
-    buffer ReadSubGroupSizeBuf(&ReadSubGroupSize, range(1));
-
-    Queue.submit([&](handler &CGH) {
-      accessor ReadSubGroupSizeBufAcc{ReadSubGroupSizeBuf, CGH,
-                                      sycl::write_only, sycl::no_init};
-      KernelFunctorWithSGSizeProp KernelFunctor{ReadSubGroupSizeBufAcc};
-
-      CGH.parallel_for>(NdRange, Props,
-                        KernelFunctor);
-    });
-  }
-  assert(ReadSubGroupSize == SGSize &&
-         "Failed check for functor and properties.");
 }
 
 int main() {
diff --git a/sycl/test-e2e/ClusterLaunch/cluster_launch_parallel_for.cpp b/sycl/test-e2e/ClusterLaunch/cluster_launch_parallel_for.cpp
index e37d4ea1f1fb3..01db70b11464a 100644
--- a/sycl/test-e2e/ClusterLaunch/cluster_launch_parallel_for.cpp
+++ b/sycl/test-e2e/ClusterLaunch/cluster_launch_parallel_for.cpp
@@ -10,6 +10,49 @@
 
 #include
 
+template struct KernelFunctor {
+  int *mCorrectResultFlag;
+  T mClusterLaunchProperty;
+  sycl::range mClusterRange;
+  KernelFunctor(int *CorrectResultFlag, T ClusterLaunchProperty,
+                sycl::range ClusterRange)
+      : mCorrectResultFlag(CorrectResultFlag),
+        mClusterLaunchProperty(ClusterLaunchProperty),
+        mClusterRange(ClusterRange) {}
+
+  void operator()(sycl::nd_item It) const {
+    uint32_t ClusterDimX, ClusterDimY, ClusterDimZ;
+// Temporary solution till cluster group class is implemented
+#if defined(__SYCL_DEVICE_ONLY__) && defined(__SYCL_CUDA_ARCH__) &&           \
+    (__SYCL_CUDA_ARCH__ >= 900)
+    asm volatile("\n\t"
+                 "mov.u32 %0, %%cluster_nctaid.x; \n\t"
+                 "mov.u32 %1, %%cluster_nctaid.y; \n\t"
+                 "mov.u32 %2, %%cluster_nctaid.z; \n\t"
+                 : "=r"(ClusterDimZ), "=r"(ClusterDimY), "=r"(ClusterDimX));
+#endif
+    if constexpr (Dim == 1) {
+      if (ClusterDimZ == mClusterRange[0] && ClusterDimY == 1 &&
+          ClusterDimX == 1) {
+        *mCorrectResultFlag = 1;
+      }
+    } else if constexpr (Dim == 2) {
+      if (ClusterDimZ == mClusterRange[1] && ClusterDimY == mClusterRange[0] &&
+          ClusterDimX == 1) {
+        *mCorrectResultFlag = 1;
+      }
+    } else {
+      if (ClusterDimZ == mClusterRange[2] && ClusterDimY == mClusterRange[1] &&
+          ClusterDimX == mClusterRange[0]) {
+        *mCorrectResultFlag = 1;
+      }
+    }
+  }
+  auto get(sycl::ext::oneapi::experimental::properties_tag) const {
+    return mClusterLaunchProperty;
+  }
+};
+
 template
 int test_cluster_launch_parallel_for(sycl::queue &Queue,
                                      sycl::range GlobalRange,
@@ -25,38 +68,10 @@ int test_cluster_launch_parallel_for(sycl::queue &Queue,
 
   Queue
       .submit([&](sycl::handler &CGH) {
-        CGH.parallel_for(sycl::nd_range(GlobalRange, LocalRange),
-                         ClusterLaunchProperty, [=](sycl::nd_item It) {
-                           uint32_t ClusterDimX, ClusterDimY, ClusterDimZ;
-// Temporary solution till cluster group class is implemented
-#if defined(__SYCL_DEVICE_ONLY__) && defined(__SYCL_CUDA_ARCH__) &&           \
-    (__SYCL_CUDA_ARCH__ >= 900)
-                           asm volatile("\n\t"
-                                        "mov.u32 %0, %%cluster_nctaid.x; \n\t"
-                                        "mov.u32 %1, %%cluster_nctaid.y; \n\t"
-                                        "mov.u32 %2, %%cluster_nctaid.z; \n\t"
-                                        : "=r"(ClusterDimZ), "=r"(ClusterDimY),
-                                          "=r"(ClusterDimX));
-#endif
-                           if constexpr (Dim == 1) {
-                             if (ClusterDimZ == ClusterRange[0] &&
-                                 ClusterDimY == 1 && ClusterDimX == 1) {
-                               *CorrectResultFlag = 1;
-                             }
-                           } else if constexpr (Dim == 2) {
-                             if (ClusterDimZ == ClusterRange[1] &&
-                                 ClusterDimY == ClusterRange[0] &&
-                                 ClusterDimX == 1) {
-                               *CorrectResultFlag = 1;
-                             }
-                           } else {
-                             if (ClusterDimZ == ClusterRange[2] &&
-                                 ClusterDimY == ClusterRange[1] &&
-                                 ClusterDimX == ClusterRange[0]) {
-                               *CorrectResultFlag = 1;
-                             }
-                           }
-                         });
+        CGH.parallel_for(
+            sycl::nd_range(GlobalRange, LocalRange),
+            KernelFunctor(
+                CorrectResultFlag, ClusterLaunchProperty, ClusterRange));
       })
       .wait_and_throw();
diff --git a/sycl/test-e2e/ClusterLaunch/enqueueLaunchCustom_check_event_deps.cpp b/sycl/test-e2e/ClusterLaunch/enqueueLaunchCustom_check_event_deps.cpp
index 8900d10328871..0460defa72104 100644
--- a/sycl/test-e2e/ClusterLaunch/enqueueLaunchCustom_check_event_deps.cpp
+++ b/sycl/test-e2e/ClusterLaunch/enqueueLaunchCustom_check_event_deps.cpp
@@ -24,6 +24,22 @@ template void dummy_kernel(T *Input, int N, sycl::nd_item<1> It) {
 #endif
 }
 
+template struct KernelFunctor {
+  T1 mAcc;
+  T2 mClusterLaunchProperty;
+  KernelFunctor(T2 ClusterLaunchProperty, T1 Acc)
+      : mClusterLaunchProperty(ClusterLaunchProperty), mAcc(Acc) {}
+
+  void operator()(sycl::nd_item<1> It) const {
+    dummy_kernel(
+        mAcc.template get_multi_ptr().get(), 4096,
+        It);
+  }
+  auto get(sycl::ext::oneapi::experimental::properties_tag) const {
+    return mClusterLaunchProperty;
+  }
+};
+
 int main() {
 
   std::vector HostArray(4096, -20);
@@ -46,13 +62,8 @@ int main() {
       cuda::cluster_size ClusterDims(sycl::range{2});
       properties ClusterLaunchProperty{ClusterDims};
       auto Acc = Buff.template get_access(CGH);
-      CGH.parallel_for(
-          sycl::nd_range({4096}, {32}), ClusterLaunchProperty,
-          [=](sycl::nd_item<1> It) {
-            dummy_kernel(
-                Acc.get_multi_ptr().get(), 4096,
-                It);
-          });
+      CGH.parallel_for(sycl::nd_range({4096}, {32}),
+                       KernelFunctor(ClusterLaunchProperty, Acc));
     });
     Queue.submit([&](sycl::handler &CGH) {
       auto Acc = Buff.template get_access(CGH);
diff --git a/sycl/test-e2e/DeviceCodeSplit/grf.cpp b/sycl/test-e2e/DeviceCodeSplit/grf.cpp
index 62f1a76a5f017..4080049f665af 100644
--- a/sycl/test-e2e/DeviceCodeSplit/grf.cpp
+++ b/sycl/test-e2e/DeviceCodeSplit/grf.cpp
@@ -67,6 +67,15 @@ bool checkResult(const std::vector &A, int Inc) {
   return true;
 }
 
+template struct KernelFunctor {
+  T1 mPA;
+  T2 mProp;
+  KernelFunctor(T1 PA, T2 Prop) : mPA(PA), mProp(Prop) {}
+
+  void operator()(id<1> i) const { mPA[i] += 2; }
+  auto get(properties_tag) const { return mProp; }
+};
+
 int main(void) {
   constexpr unsigned Size = 32;
   constexpr unsigned VL = 16;
@@ -122,8 +131,8 @@ int main(void) {
 
     auto e = q.submit([&](handler &cgh) {
       auto PA = bufa.get_access(cgh);
-      cgh.parallel_for(
-          Size, prop, [=](id<1> i) { PA[i] += 2; });
+      cgh.parallel_for(Size,
+                       KernelFunctor(PA, prop));
     });
     e.wait();
   } catch (sycl::exception const &e) {
diff --git a/sycl/test-e2e/Graph/Inputs/sub_group_prop.cpp b/sycl/test-e2e/Graph/Inputs/sub_group_prop.cpp
index 7c0bfe5161530..adaf6e1977ea4 100644
--- a/sycl/test-e2e/Graph/Inputs/sub_group_prop.cpp
+++ b/sycl/test-e2e/Graph/Inputs/sub_group_prop.cpp
@@ -39,9 +39,6 @@ void test(queue &Queue, const std::vector SupportedSGSizes) {
     return;
   }
 
-  auto Props = ext::oneapi::experimental::properties{
-      ext::oneapi::experimental::sub_group_size};
-
   nd_range<1> NdRange(SGSize * 4, SGSize * 2);
 
   size_t ReadSubGroupSize = 0;
@@ -49,39 +46,6 @@ void test(queue &Queue, const std::vector SupportedSGSizes) {
     buffer ReadSubGroupSizeBuf(&ReadSubGroupSize, range(1));
     ReadSubGroupSizeBuf.set_write_back(false);
 
-    {
-      exp_ext::command_graph Graph{
-          Queue.get_context(),
-          Queue.get_device(),
-          {exp_ext::property::graph::assume_buffer_outlives_graph{}}};
-
-      add_node(Graph, Queue, [&](handler &CGH) {
-        accessor ReadSubGroupSizeBufAcc{ReadSubGroupSizeBuf, CGH,
-                                        sycl::write_only, sycl::no_init};
-
-        CGH.parallel_for>(
-            NdRange, Props, [=](nd_item<1> NdItem) {
-              auto SG = NdItem.get_sub_group();
-              if (NdItem.get_global_linear_id() == 0)
-                ReadSubGroupSizeBufAcc[0] = SG.get_local_linear_range();
-            });
-      });
-
-      auto ExecGraph = Graph.finalize();
-      Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(ExecGraph); });
-      Queue.wait_and_throw();
-    }
-
-    host_accessor HostAcc(ReadSubGroupSizeBuf);
-    ReadSubGroupSize = HostAcc[0];
-  }
-  assert(ReadSubGroupSize == SGSize && "Failed check for function.");
-
-  ReadSubGroupSize = 0;
-  {
-    buffer ReadSubGroupSizeBuf(&ReadSubGroupSize, range(1));
-    ReadSubGroupSizeBuf.set_write_back(false);
-
     {
       exp_ext::command_graph Graph{
           Queue.get_context(),
          Queue.get_device(),
          {exp_ext::property::graph::assume_buffer_outlives_graph{}}};
@@ -107,38 +71,6 @@ void test(queue &Queue, const std::vector SupportedSGSizes) {
     ReadSubGroupSize = HostAcc[0];
   }
   assert(ReadSubGroupSize == SGSize && "Failed check for functor.");
-
-  ReadSubGroupSize = 0;
-  {
-    buffer ReadSubGroupSizeBuf(&ReadSubGroupSize, range(1));
-    ReadSubGroupSizeBuf.set_write_back(false);
-
-    {
-      exp_ext::command_graph Graph{
-          Queue.get_context(),
-          Queue.get_device(),
-          {exp_ext::property::graph::assume_buffer_outlives_graph{}}};
-
-      add_node(Graph, Queue, [&](handler &CGH) {
-        accessor ReadSubGroupSizeBufAcc{ReadSubGroupSizeBuf, CGH,
-                                        sycl::write_only, sycl::no_init};
-        KernelFunctorWithSGSizeProp KernelFunctor{
-            ReadSubGroupSizeBufAcc};
-
-        CGH.parallel_for>(
-            NdRange, Props, KernelFunctor);
-      });
-
-      auto ExecGraph = Graph.finalize();
-      Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(ExecGraph); });
-      Queue.wait_and_throw();
-    }
-
-    host_accessor HostAcc(ReadSubGroupSizeBuf);
-    ReadSubGroupSize = HostAcc[0];
-  }
-  assert(ReadSubGroupSize == SGSize &&
-         "Failed check for functor and properties.");
 }
 
 int main() {
diff --git a/sycl/test-e2e/VirtualFunctions/misc/group-barrier.cpp b/sycl/test-e2e/VirtualFunctions/misc/group-barrier.cpp
index f01a25d4179f4..48db619d94081 100644
--- a/sycl/test-e2e/VirtualFunctions/misc/group-barrier.cpp
+++ b/sycl/test-e2e/VirtualFunctions/misc/group-barrier.cpp
@@ -101,6 +101,24 @@ class MultiplyOp : public BaseOp {
   }
 };
 
+template struct KernelFunctor {
+  T1 mDeviceStorage;
+  T2 mDataAcc;
+  T3 mLocalAcc;
+  KernelFunctor(T1 DeviceStorage, T2 DataAcc, T3 LocalAcc)
+      : mDeviceStorage(DeviceStorage), mDataAcc(DataAcc), mLocalAcc(LocalAcc) {}
+
+  void operator()(sycl::nd_item<1> It) const {
+    auto *Ptr = mDeviceStorage->template getAs();
+    mDataAcc[It.get_global_id()] = Ptr->apply(
+        mLocalAcc.template get_multi_ptr().get(),
+        It.get_group());
+  }
+  auto get(oneapi::properties_tag) const {
+    return oneapi::properties{oneapi::assume_indirect_calls};
+  }
+};
+
 int main() try {
   using storage_t = obj_storage_t;
@@ -113,7 +131,6 @@ int main() try {
 
   sycl::range G{16};
   sycl::range L{4};
-  constexpr oneapi::properties props{oneapi::assume_indirect_calls};
 
   for (unsigned TestCase = 0; TestCase < 2; ++TestCase) {
     sycl::buffer DataStorage(G);
@@ -126,12 +143,8 @@ int main() try {
     q.submit([&](sycl::handler &CGH) {
        sycl::accessor DataAcc(DataStorage, CGH, sycl::read_write);
        sycl::local_accessor LocalAcc(L, CGH);
-       CGH.parallel_for(sycl::nd_range{G, L}, props, [=](auto It) {
-         auto *Ptr = DeviceStorage->getAs();
-         DataAcc[It.get_global_id()] = Ptr->apply(
-             LocalAcc.get_multi_ptr().get(),
-             It.get_group());
-       });
+       CGH.parallel_for(sycl::nd_range{G, L},
+                        KernelFunctor(DeviceStorage, DataAcc, LocalAcc));
      }).wait_and_throw();
 
     auto *Ptr = HostStorage.construct(TestCase);
diff --git a/sycl/test-e2e/VirtualFunctions/misc/range-non-uniform-vf-2.cpp b/sycl/test-e2e/VirtualFunctions/misc/range-non-uniform-vf-2.cpp
index 45b56916a5c1d..5bb34f99fd5f1 100644
--- a/sycl/test-e2e/VirtualFunctions/misc/range-non-uniform-vf-2.cpp
+++ b/sycl/test-e2e/VirtualFunctions/misc/range-non-uniform-vf-2.cpp
@@ -4,7 +4,10 @@
 // kernels when different work-items perform calls to different virtual
 // functions using the same object.
 //
-// RUN: %{build} -o %t.out %helper-includes
+// TODO: Currently using the -Wno-deprecated-declarations flag due to issue
+// https://github.com/intel/llvm/issues/16839. Remove the flag as well as the
+// variable 'props' once the issue is resolved.
+// RUN: %{build} -o %t.out -Wno-deprecated-declarations %helper-includes
 // RUN: %{run} %t.out
 
 #include
@@ -44,6 +47,25 @@ class OpB : public BaseOp {
   virtual int bar(int V) { return V / 2; }
 };
 
+template struct KernelFunctor {
+  T1 mDeviceStorage;
+  T2 mDataAcc;
+  KernelFunctor(T1 DeviceStorage, T2 DataAcc)
+      : mDeviceStorage(DeviceStorage), mDataAcc(DataAcc) {}
+
+  template void operator()(T It) const {
+    // Select method that corresponds to this work-item
+    auto *Ptr = mDeviceStorage->template getAs();
+    if (It % 2)
+      mDataAcc[It] = Ptr->foo(mDataAcc[It]);
+    else
+      mDataAcc[It] = Ptr->bar(mDataAcc[It]);
+  }
+  auto get(oneapi::properties_tag) const {
+    return oneapi::properties{oneapi::assume_indirect_calls};
+  }
+};
+
 int main() try {
   using storage_t = obj_storage_t;
@@ -69,14 +91,7 @@ int main() try {
 
     q.submit([&](sycl::handler &CGH) {
       sycl::accessor DataAcc(DataStorage, CGH, sycl::read_write);
-      CGH.parallel_for(R, props, [=](auto It) {
-        // Select method that corresponds to this work-item
-        auto *Ptr = DeviceStorage->template getAs();
-        if (It % 2)
-          DataAcc[It] = Ptr->foo(DataAcc[It]);
-        else
-          DataAcc[It] = Ptr->bar(DataAcc[It]);
-      });
+      CGH.parallel_for(R, props, KernelFunctor(DeviceStorage, DataAcc));
     });
 
     BaseOp *Ptr = HostStorage.construct(TestCase);
diff --git a/sycl/test-e2e/VirtualFunctions/misc/range-non-uniform-vf.cpp b/sycl/test-e2e/VirtualFunctions/misc/range-non-uniform-vf.cpp
index 453a3aee81fa6..7051c8c081c7b 100644
--- a/sycl/test-e2e/VirtualFunctions/misc/range-non-uniform-vf.cpp
+++ b/sycl/test-e2e/VirtualFunctions/misc/range-non-uniform-vf.cpp
@@ -4,7 +4,10 @@
 // kernels when different work-items perform a virtual function calls using
 // different objects.
 //
-// RUN: %{build} -o %t.out %helper-includes
+// TODO: Currently using the -Wno-deprecated-declarations flag due to issue
+// https://github.com/intel/llvm/issues/16839. Remove the flag as well as the
+// variable 'props' once the issue is resolved.
+// RUN: %{build} -o %t.out -Wno-deprecated-declarations %helper-includes
 // RUN: %{run} %t.out
 
 #include
@@ -41,6 +44,23 @@ class RoundOp : public BaseOp {
   virtual float apply(float V) { return sycl::round(V); }
 };
 
+template struct KernelFunctor {
+  T1 mDeviceStorage;
+  T2 mDataAcc;
+  KernelFunctor(T1 DeviceStorage, T2 DataAcc)
+      : mDeviceStorage(DeviceStorage), mDataAcc(DataAcc) {}
+
+  void operator()(sycl::item<1> It) const {
+    // Select an object that corresponds to this work-item
+    auto Ind = It % 3;
+    auto *Ptr = mDeviceStorage[Ind].template getAs();
+    mDataAcc[It] = Ptr->apply(mDataAcc[It]);
+  }
+  auto get(oneapi::properties_tag) const {
+    return oneapi::properties{oneapi::assume_indirect_calls};
+  }
+};
+
 int main() try {
   using storage_t = obj_storage_t;
@@ -69,12 +89,7 @@ int main() try {
 
     q.submit([&](sycl::handler &CGH) {
       sycl::accessor DataAcc(DataStorage, CGH, sycl::read_write);
-      CGH.parallel_for(R, props, [=](auto it) {
-        // Select an object that corresponds to this work-item
-        auto Ind = it % 3;
-        auto *Ptr = DeviceStorage[Ind].template getAs();
-        DataAcc[it] = Ptr->apply(DataAcc[it]);
-      });
+      CGH.parallel_for(R, props, KernelFunctor(DeviceStorage, DataAcc));
     });
 
     BaseOp *Ptr[] = {HostStorage[0].construct(0),
diff --git a/sycl/test-e2e/VirtualFunctions/misc/range-uniform-vf.cpp b/sycl/test-e2e/VirtualFunctions/misc/range-uniform-vf.cpp
index 66db6a0c5af7a..219e637ae1dca 100644
--- a/sycl/test-e2e/VirtualFunctions/misc/range-uniform-vf.cpp
+++ b/sycl/test-e2e/VirtualFunctions/misc/range-uniform-vf.cpp
@@ -4,7 +4,10 @@
 // kernels when every work-item calls the same virtual function on the same
 // object.
 //
-// RUN: %{build} -o %t.out %helper-includes
+// TODO: Currently using the -Wno-deprecated-declarations flag due to issue
+// https://github.com/intel/llvm/issues/16839. Remove the flag as well as the
+// variable 'props' once the issue is resolved.
+// RUN: %{build} -o %t.out -Wno-deprecated-declarations %helper-includes
 // RUN: %{run} %t.out
 
 #include
@@ -41,6 +44,21 @@ class RoundOp : public BaseOp {
   virtual float apply(float V) { return sycl::round(V); }
 };
 
+template struct KernelFunctor {
+  T1 mDeviceStorage;
+  T2 mDataAcc;
+  KernelFunctor(T1 DeviceStorage, T2 DataAcc)
+      : mDeviceStorage(DeviceStorage), mDataAcc(DataAcc) {}
+
+  void operator()(sycl::id<1> It) const {
+    auto *Ptr = mDeviceStorage->template getAs();
+    mDataAcc[It] = Ptr->apply(mDataAcc[It]);
+  }
+  auto get(oneapi::properties_tag) const {
+    return oneapi::properties{oneapi::assume_indirect_calls};
+  }
+};
+
 int main() try {
   using storage_t = obj_storage_t;
@@ -67,10 +85,7 @@ int main() try {
 
     q.submit([&](sycl::handler &CGH) {
       sycl::accessor DataAcc(DataStorage, CGH, sycl::read_write);
-      CGH.parallel_for(R, props, [=](auto it) {
-        auto *Ptr = DeviceStorage->getAs();
-        DataAcc[it] = Ptr->apply(DataAcc[it]);
-      });
+      CGH.parallel_for(R, props, KernelFunctor(DeviceStorage, DataAcc));
     });
 
     auto *Ptr = HostStorage.construct(TestCase);
diff --git a/sycl/test-e2e/WorkGroupScratchMemory/copy_dynamic_size.cpp b/sycl/test-e2e/WorkGroupScratchMemory/copy_dynamic_size.cpp
index 1f61653efc44e..e1716cff85c67 100644
--- a/sycl/test-e2e/WorkGroupScratchMemory/copy_dynamic_size.cpp
+++ b/sycl/test-e2e/WorkGroupScratchMemory/copy_dynamic_size.cpp
@@ -29,6 +29,17 @@ void copy_via_smem(DataType *a, DataType *b, sycl::nd_item<1> it) {
   b[threadIdx_x] = smem_ptr[threadIdx_x];
 }
 
+template struct KernelFunctor {
+  T m_props;
+  DataType *m_a;
+  DataType *m_b;
+  KernelFunctor(T props, DataType *a, DataType *b)
+      : m_props(props), m_a(a), m_b(b) {}
+
+  void operator()(sycl::nd_item<1> it) const { copy_via_smem(m_a, m_b, it); }
+  auto get(sycl_ext::properties_tag) const { return m_props; }
+};
+
 int main() {
   sycl::queue queue;
   DataType *a = sycl::malloc_device(Size, queue);
@@ -40,10 +51,12 @@ int main() {
 
   queue
       .submit([&](sycl::handler &cgh) {
-        cgh.parallel_for(sycl::nd_range<1>({Size}, {Size}),
-                         sycl_ext::properties{sycl_ext::work_group_scratch_size(
-                             Size * sizeof(DataType))},
-                         [=](sycl::nd_item<1> it) { copy_via_smem(a, b, it); });
+        cgh.parallel_for(
+            sycl::nd_range<1>({Size}, {Size}),
+            KernelFunctor(
+                sycl_ext::properties{
+                    sycl_ext::work_group_scratch_size(Size * sizeof(DataType))},
+                a, b));
       })
       .wait_and_throw();
diff --git a/sycl/test-e2e/WorkGroupScratchMemory/dynamic_alloc_local_accessor.cpp b/sycl/test-e2e/WorkGroupScratchMemory/dynamic_alloc_local_accessor.cpp
index 04d8a85a808ff..ebcc17855cbf1 100644
--- a/sycl/test-e2e/WorkGroupScratchMemory/dynamic_alloc_local_accessor.cpp
+++ b/sycl/test-e2e/WorkGroupScratchMemory/dynamic_alloc_local_accessor.cpp
@@ -23,6 +23,41 @@ using namespace sycl;
 
 namespace sycl_ext = sycl::ext::oneapi::experimental;
 
+template struct KernelFunctor {
+  T1 m_props;
+  T2 mLocalAccessor;
+  T3 mAcc;
+  KernelFunctor(T1 props, T2 LocalAccessor, T3 Acc)
+      : m_props(props), mLocalAccessor(LocalAccessor), mAcc(Acc) {}
+
+  void operator()(nd_item<1> Item) const {
+    int *Ptr =
+        reinterpret_cast(sycl_ext::get_work_group_scratch_memory());
+    size_t GroupOffset = Item.get_group_linear_id() * ElemPerWG;
+    for (size_t I = 0; I < RepeatWG; ++I) {
+      Ptr[WgSize * I + Item.get_local_linear_id()] =
+          Item.get_local_linear_id();
+    }
+    Item.barrier();
+
+    for (size_t I = 0; I < RepeatWG; ++I) {
+      // Check that the local accessor works.
+      size_t LocalIdx = Item.get_local_linear_id() ^ 1;
+      mLocalAccessor[WgSize * I + LocalIdx] = Ptr[WgSize * I + LocalIdx] + 1;
+    }
+    Item.barrier();
+
+    for (size_t I = 0; I < RepeatWG; ++I) {
+      // Check that the memory is accessible from other
+      // work-items
+      size_t BaseIdx = GroupOffset + (I * WgSize);
+      size_t LocalIdx = Item.get_local_linear_id();
+      size_t GlobalIdx = BaseIdx + LocalIdx;
+      mAcc[GlobalIdx] = mLocalAccessor[WgSize * I + LocalIdx];
+    }
+  }
+  auto get(sycl_ext::properties_tag) const { return m_props; }
+};
+
 int main() {
   queue Q;
   std::vector Vec(Size, 0);
@@ -36,34 +71,7 @@ int main() {
     auto LocalAccessor =
         sycl::local_accessor(WgSize * RepeatWG * sizeof(int), Cgh);
     Cgh.parallel_for(nd_range<1>(range<1>(WgSize * WgCount), range<1>(WgSize)),
-                     properties, [=](nd_item<1> Item) {
-                       int *Ptr = reinterpret_cast(
-                           sycl_ext::get_work_group_scratch_memory());
-                       size_t GroupOffset =
-                           Item.get_group_linear_id() * ElemPerWG;
-                       for (size_t I = 0; I < RepeatWG; ++I) {
-                         Ptr[WgSize * I + Item.get_local_linear_id()] =
-                             Item.get_local_linear_id();
-                       }
-                       Item.barrier();
-
-                       for (size_t I = 0; I < RepeatWG; ++I) {
-                         // Check that the local accessor works.
-                         size_t LocalIdx = Item.get_local_linear_id() ^ 1;
-                         LocalAccessor[WgSize * I + LocalIdx] =
-                             Ptr[WgSize * I + LocalIdx] + 1;
-                       }
-                       Item.barrier();
-
-                       for (size_t I = 0; I < RepeatWG; ++I) {
-                         // Check that the memory is accessible from other
-                         // work-items
-                         size_t BaseIdx = GroupOffset + (I * WgSize);
-                         size_t LocalIdx = Item.get_local_linear_id();
-                         size_t GlobalIdx = BaseIdx + LocalIdx;
-                         Acc[GlobalIdx] = LocalAccessor[WgSize * I + LocalIdx];
-                       }
-                     });
+                     KernelFunctor(properties, LocalAccessor, Acc));
   });
 
   host_accessor Acc(Buf, read_only);
diff --git a/sycl/test-e2e/WorkGroupScratchMemory/dynamic_alloc_ptr_alias.cpp b/sycl/test-e2e/WorkGroupScratchMemory/dynamic_alloc_ptr_alias.cpp
index 46346d5f2ee85..2aba3369ada2f 100644
--- a/sycl/test-e2e/WorkGroupScratchMemory/dynamic_alloc_ptr_alias.cpp
+++ b/sycl/test-e2e/WorkGroupScratchMemory/dynamic_alloc_ptr_alias.cpp
@@ -23,6 +23,36 @@ using namespace sycl;
 
 namespace sycl_ext = sycl::ext::oneapi::experimental;
 
+template struct KernelFunctor {
+  T1 m_props;
+  T2 mAcc;
+  KernelFunctor(T1 props, T2 Acc) : m_props(props), mAcc(Acc) {}
+
+  void operator()(nd_item<1> Item) const {
+    int *Ptr =
+        reinterpret_cast(sycl_ext::get_work_group_scratch_memory());
+    size_t GroupOffset = Item.get_group_linear_id() * ElemPerWG;
+    for (size_t I = 0; I < RepeatWG; ++I) {
+      Ptr[WgSize * I + Item.get_local_linear_id()] =
+          Item.get_local_linear_id();
+    }
+
+    Item.barrier();
+    // Check that multiple calls return the same pointer.
+    unsigned int *PtrAlias = reinterpret_cast(
+        sycl_ext::get_work_group_scratch_memory());
+
+    for (size_t I = 0; I < RepeatWG; ++I) {
+      // Check that the memory is accessible from other
+      // work-items
+      size_t BaseIdx = GroupOffset + (I * WgSize);
+      size_t LocalIdx = Item.get_local_linear_id() ^ 1;
+      size_t GlobalIdx = BaseIdx + LocalIdx;
+      mAcc[GlobalIdx] = PtrAlias[WgSize * I + LocalIdx];
+    }
+  }
+  auto get(sycl_ext::properties_tag) const { return m_props; }
+};
+
 int main() {
   queue Q;
   std::vector Vec(Size, 0);
@@ -34,31 +64,7 @@ int main() {
                                                            sizeof(int));
     sycl_ext::properties properties{static_size};
     Cgh.parallel_for(nd_range<1>(range<1>(WgSize * WgCount), range<1>(WgSize)),
-                     properties, [=](nd_item<1> Item) {
-                       int *Ptr = reinterpret_cast(
-                           sycl_ext::get_work_group_scratch_memory());
-                       size_t GroupOffset =
-                           Item.get_group_linear_id() * ElemPerWG;
-                       for (size_t I = 0; I < RepeatWG; ++I) {
-                         Ptr[WgSize * I + Item.get_local_linear_id()] =
-                             Item.get_local_linear_id();
-                       }
-
-                       Item.barrier();
-                       // Check that multiple calls return the same pointer.
-                       unsigned int *PtrAlias =
-                           reinterpret_cast(
-                               sycl_ext::get_work_group_scratch_memory());
-
-                       for (size_t I = 0; I < RepeatWG; ++I) {
-                         // Check that the memory is accessible from other
-                         // work-items
-                         size_t BaseIdx = GroupOffset + (I * WgSize);
-                         size_t LocalIdx = Item.get_local_linear_id() ^ 1;
-                         size_t GlobalIdx = BaseIdx + LocalIdx;
-                         Acc[GlobalIdx] = PtrAlias[WgSize * I + LocalIdx];
-                       }
-                     });
+                     KernelFunctor(properties, Acc));
   });
 
   host_accessor Acc(Buf, read_only);
diff --git a/sycl/test-e2e/WorkGroupScratchMemory/dynamic_allocation.cpp b/sycl/test-e2e/WorkGroupScratchMemory/dynamic_allocation.cpp
index 224bf2607f772..bf61ddd51a4b3 100644
--- a/sycl/test-e2e/WorkGroupScratchMemory/dynamic_allocation.cpp
+++ b/sycl/test-e2e/WorkGroupScratchMemory/dynamic_allocation.cpp
@@ -22,6 +22,32 @@ using namespace sycl;
 
 namespace sycl_ext = sycl::ext::oneapi::experimental;
 
+template struct KernelFunctor {
+  T1 m_props;
+  T2 mAcc;
+  KernelFunctor(T1 props, T2 Acc) : m_props(props), mAcc(Acc) {}
+
+  void operator()(nd_item<1> Item) const {
+    int *Ptr =
+        reinterpret_cast(sycl_ext::get_work_group_scratch_memory());
+    size_t GroupOffset = Item.get_group_linear_id() * ElemPerWG;
+    for (size_t I = 0; I < RepeatWG; ++I) {
+      Ptr[WgSize * I + Item.get_local_linear_id()] =
+          Item.get_local_linear_id();
+    }
+
+    Item.barrier();
+    for (size_t I = 0; I < RepeatWG; ++I) {
+      // Check that the memory is accessible from other
+      // work-items
+      size_t BaseIdx = GroupOffset + (I * WgSize);
+      size_t LocalIdx = Item.get_local_linear_id() ^ 1;
+      size_t GlobalIdx = BaseIdx + LocalIdx;
+      mAcc[GlobalIdx] = Ptr[WgSize * I + LocalIdx];
+    }
+  }
+  auto get(sycl_ext::properties_tag) const { return m_props; }
+};
+
 int main() {
   queue Q;
   std::vector Vec(Size, 0);
@@ -33,26 +59,7 @@ int main() {
                                                            sizeof(int));
     sycl_ext::properties properties{static_size};
     Cgh.parallel_for(nd_range<1>(range<1>(WgSize * WgCount), range<1>(WgSize)),
-                     properties, [=](nd_item<1> Item) {
-                       int *Ptr = reinterpret_cast(
-                           sycl_ext::get_work_group_scratch_memory());
-                       size_t GroupOffset =
-                           Item.get_group_linear_id() * ElemPerWG;
-                       for (size_t I = 0; I < RepeatWG; ++I) {
-                         Ptr[WgSize * I + Item.get_local_linear_id()] =
-                             Item.get_local_linear_id();
-                       }
-
-                       Item.barrier();
-                       for (size_t I = 0; I < RepeatWG; ++I) {
-                         // Check that the memory is accessible from other
-                         // work-items
-                         size_t BaseIdx = GroupOffset + (I * WgSize);
-                         size_t LocalIdx = Item.get_local_linear_id() ^ 1;
-                         size_t GlobalIdx = BaseIdx + LocalIdx;
-                         Acc[GlobalIdx] = Ptr[WgSize * I + LocalIdx];
-                       }
-                     });
+                     KernelFunctor(properties, Acc));
   });
 
   host_accessor Acc(Buf, read_only);
diff --git a/sycl/test-e2e/WorkGroupScratchMemory/dynamic_unused.cpp b/sycl/test-e2e/WorkGroupScratchMemory/dynamic_unused.cpp
index e427305c18ed3..6608eed567633 100644
--- a/sycl/test-e2e/WorkGroupScratchMemory/dynamic_unused.cpp
+++ b/sycl/test-e2e/WorkGroupScratchMemory/dynamic_unused.cpp
@@ -14,6 +14,19 @@
 using DataType = int;
 
 namespace sycl_ext = sycl::ext::oneapi::experimental;
 
+template struct KernelFunctor {
+  T m_props;
+  DataType *m_a;
+  DataType *m_b;
+  KernelFunctor(T props, DataType *a, DataType *b)
+      : m_props(props), m_a(a), m_b(b) {}
+
+  void operator()(sycl::nd_item<1> it) const {
+    m_b[it.get_local_linear_id()] = m_a[it.get_local_linear_id()];
+  }
+  auto get(sycl_ext::properties_tag) const { return m_props; }
+};
+
 int main() {
   sycl::queue queue;
   DataType *a = sycl::malloc_device(Size, queue);
@@ -25,13 +38,12 @@ int main() {
 
   queue
       .submit([&](sycl::handler &cgh) {
-        cgh.parallel_for(sycl::nd_range<1>({Size}, {Size}),
-                         sycl_ext::properties{sycl_ext::work_group_scratch_size(
-                             Size * sizeof(DataType))},
-                         [=](sycl::nd_item<1> it) {
-                           b[it.get_local_linear_id()] =
-                               a[it.get_local_linear_id()];
-                         });
+        cgh.parallel_for(
+            sycl::nd_range<1>({Size}, {Size}),
+            KernelFunctor(
+                sycl_ext::properties{
+                    sycl_ext::work_group_scratch_size(Size * sizeof(DataType))},
+                a, b));
       })
       .wait_and_throw();