diff --git a/sycl/include/CL/sycl/handler.hpp b/sycl/include/CL/sycl/handler.hpp index 7d2dbea3ad279..2e531898b4b5f 100644 --- a/sycl/include/CL/sycl/handler.hpp +++ b/sycl/include/CL/sycl/handler.hpp @@ -248,37 +248,36 @@ class reduction_impl_algo; using cl::sycl::detail::enable_if_t; using cl::sycl::detail::queue_impl; -template -void reduCGFunc(handler &CGH, KernelType KernelFunc, const range &Range, - size_t MaxWGSize, uint32_t NumConcurrentWorkGroups, - Reduction &Redu); +// Kernels with single reduction +/// If we are given sycl::range and not sycl::nd_range we have more freedom in +/// how to split the iteration space. template -enable_if_t -reduCGFuncAtomic64(handler &CGH, KernelType KernelFunc, - const nd_range &Range, Reduction &Redu); +void reduCGFuncForRange(handler &CGH, KernelType KernelFunc, + const range &Range, size_t MaxWGSize, + uint32_t NumConcurrentWorkGroups, Reduction &Redu); template -enable_if_t -reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range &Range, - Reduction &Redu); +void reduCGFuncAtomic64(handler &CGH, KernelType KernelFunc, + const nd_range &Range, Reduction &Redu); template -enable_if_t -reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range &Range, - Reduction &Redu); +void reduCGFunc(handler &CGH, KernelType KernelFunc, + const nd_range &Range, Reduction &Redu); -template -enable_if_t -reduAuxCGFunc(handler &CGH, size_t NWorkItems, size_t MaxWGSize, - Reduction &Redu); +// Kernels with multiple reductions +// sycl::nd_range version template -void reduCGFunc(handler &CGH, KernelType KernelFunc, - const nd_range &Range, - std::tuple &ReduTuple, - std::index_sequence); +void reduCGFuncMulti(handler &CGH, KernelType KernelFunc, + const nd_range &Range, + std::tuple &ReduTuple, + std::index_sequence); + +template +size_t reduAuxCGFunc(handler &CGH, size_t NWorkItems, size_t MaxWGSize, + Reduction &Redu); template @@ -300,12 +299,6 @@ reduSaveFinalResultToUserMem(std::shared_ptr Queue, bool IsHost, std::tuple &ReduTuple, std::index_sequence); -template -std::enable_if_t -reduSaveFinalResultToUserMemHelper(std::vector &Events, - std::shared_ptr Queue, - bool IsHost, Reduction &Redu, RestT... Rest); - __SYCL_EXPORT uint32_t reduGetMaxNumConcurrentWorkGroups(std::shared_ptr Queue); @@ -470,6 +463,27 @@ class __SYCL_EXPORT handler { MStreamStorage.push_back(Stream); } + /// Helper utility for operation widely used through different reduction + /// implementations. + /// @{ + template + event withAuxHandler(std::shared_ptr Queue, + FunctorTy Func) { + handler AuxHandler(Queue, MIsHost); + AuxHandler.saveCodeLoc(MCodeLoc); + Func(AuxHandler); + return AuxHandler.finalize(); + } + + template + static event withAuxHandler(std::shared_ptr Queue, + bool IsHost, FunctorTy Func) { + handler AuxHandler(Queue, IsHost); + Func(AuxHandler); + return AuxHandler.finalize(); + } + /// }@ + /// Saves buffers created by handling reduction feature in handler. /// They are then forwarded to command group and destroyed only after /// the command group finishes the work on device/host. @@ -1587,6 +1601,9 @@ class __SYCL_EXPORT handler { #endif } +// "if constexpr" simplifies implementation/increases readability in comparison +// with SFINAE-based approach. +#if __cplusplus >= 201703L /// Defines and invokes a SYCL kernel function for the specified nd_range. /// /// The SYCL kernel function is defined as a lambda function or a named @@ -1618,123 +1635,76 @@ class __SYCL_EXPORT handler { // for the device. size_t MaxWGSize = ext::oneapi::detail::reduGetMaxWGSize(MQueue, OneElemSize); - ext::oneapi::detail::reduCGFunc( + ext::oneapi::detail::reduCGFuncForRange( *this, KernelFunc, Range, MaxWGSize, NumConcurrentWorkGroups, Redu); if (Reduction::is_usm || (Reduction::has_fast_atomics && Redu.initializeToIdentity()) || (!Reduction::has_fast_atomics && Redu.hasUserDiscardWriteAccessor())) { this->finalize(); - handler CopyHandler(QueueCopy, MIsHost); - CopyHandler.saveCodeLoc(MCodeLoc); - ext::oneapi::detail::reduSaveFinalResultToUserMem(CopyHandler, - Redu); - MLastEvent = CopyHandler.finalize(); - } - } - - /// Implements parallel_for() accepting nd_range \p Range and one reduction - /// object. This version uses fast sycl::atomic operations to update reduction - /// variable at the end of each work-group work. - // - // If the reduction variable must be initialized with the identity value - // before the kernel run, then an additional working accessor is created, - // initialized with the identity value and used in the kernel. That working - // accessor is then copied to user's accessor or USM pointer after - // the kernel run. - // For USM pointers without initialize_to_identity properties the same scheme - // with working accessor is used as re-using user's USM pointer in the kernel - // would require creation of another variant of user's kernel, which does not - // seem efficient. - template - detail::enable_if_t - parallel_for(nd_range Range, Reduction Redu, - _KERNELFUNCPARAM(KernelFunc)) { - std::shared_ptr QueueCopy = MQueue; - ext::oneapi::detail::reduCGFunc(*this, KernelFunc, Range, Redu); - - if (Reduction::is_usm || Redu.initializeToIdentity()) { - this->finalize(); - handler CopyHandler(QueueCopy, MIsHost); - CopyHandler.saveCodeLoc(MCodeLoc); - ext::oneapi::detail::reduSaveFinalResultToUserMem(CopyHandler, - Redu); - MLastEvent = CopyHandler.finalize(); + MLastEvent = withAuxHandler(QueueCopy, [&](handler &CopyHandler) { + ext::oneapi::detail::reduSaveFinalResultToUserMem( + CopyHandler, Redu); + }); } } - /// Implements parallel_for() accepting nd_range \p Range and one reduction - /// object. This version is a specialization for the add operator. - /// It performs runtime checks for device aspect "atomic64"; if found, fast - /// sycl::atomic_ref operations are used to update the reduction at the - /// end of each work-group work. Otherwise the default implementation is - /// used. - // - // If the reduction variable must be initialized with the identity value - // before the kernel run, then an additional working accessor is created, - // initialized with the identity value and used in the kernel. That working - // accessor is then copied to user's accessor or USM pointer after - // the kernel run. - // For USM pointers without initialize_to_identity properties the same scheme - // with working accessor is used as re-using user's USM pointer in the kernel - // would require creation of another variant of user's kernel, which does not - // seem efficient. template - detail::enable_if_t - parallel_for(nd_range Range, Reduction Redu, - _KERNELFUNCPARAM(KernelFunc)) { - - std::shared_ptr QueueCopy = MQueue; - device D = detail::getDeviceFromHandler(*this); - - if (D.has(aspect::atomic64)) { - - ext::oneapi::detail::reduCGFuncAtomic64(*this, KernelFunc, - Range, Redu); - + void parallel_for(nd_range Range, Reduction Redu, + _KERNELFUNCPARAM(KernelFunc)) { + if constexpr (!Reduction::has_fast_atomics && + !Reduction::has_atomic_add_float64) { + // The most basic implementation. + parallel_for_impl(Range, Redu, KernelFunc); + return; + } else { // Can't "early" return for "if constexpr". + std::shared_ptr QueueCopy = MQueue; + if constexpr (Reduction::has_atomic_add_float64) { + /// This version is a specialization for the add + /// operator. It performs runtime checks for device aspect "atomic64"; + /// if found, fast sycl::atomic_ref operations are used to update the + /// reduction at the end of each work-group work. Otherwise the + /// default implementation is used. + device D = detail::getDeviceFromHandler(*this); + + if (D.has(aspect::atomic64)) { + + ext::oneapi::detail::reduCGFuncAtomic64(*this, KernelFunc, + Range, Redu); + } else { + // Resort to basic implementation as well. + parallel_for_impl(Range, Redu, KernelFunc); + return; + } + } else { + // Use fast sycl::atomic operations to update reduction variable at the + // end of each work-group work. + ext::oneapi::detail::reduCGFunc(*this, KernelFunc, Range, + Redu); + } + // If the reduction variable must be initialized with the identity value + // before the kernel run, then an additional working accessor is created, + // initialized with the identity value and used in the kernel. That + // working accessor is then copied to user's accessor or USM pointer after + // the kernel run. + // For USM pointers without initialize_to_identity properties the same + // scheme with working accessor is used as re-using user's USM pointer in + // the kernel would require creation of another variant of user's kernel, + // which does not seem efficient. if (Reduction::is_usm || Redu.initializeToIdentity()) { this->finalize(); - handler CopyHandler(QueueCopy, MIsHost); - CopyHandler.saveCodeLoc(MCodeLoc); - ext::oneapi::detail::reduSaveFinalResultToUserMem( - CopyHandler, Redu); - MLastEvent = CopyHandler.finalize(); + MLastEvent = withAuxHandler(QueueCopy, [&](handler &CopyHandler) { + ext::oneapi::detail::reduSaveFinalResultToUserMem( + CopyHandler, Redu); + }); } - } else { - parallel_for_Impl(Range, Redu, KernelFunc); } } - /// Defines and invokes a SYCL kernel function for the specified nd_range. - /// Performs reduction operation specified in \p Redu. - /// - /// The SYCL kernel function is defined as a lambda function or a named - /// function object type and given an id or item for indexing in the indexing - /// space defined by \p Range. - /// If it is a named function object and the function object type is - /// globally visible, there is no need for the developer to provide - /// a kernel name for it. - /// - /// TODO: Support HOST. The kernels called by this parallel_for() may use - /// some functionality that is not yet supported on HOST such as: - /// barrier(), and ext::oneapi::reduce() that also may be used in more - /// optimized implementations waiting for their turn of code-review. - template - detail::enable_if_t - parallel_for(nd_range Range, Reduction Redu, - _KERNELFUNCPARAM(KernelFunc)) { - - parallel_for_Impl(Range, Redu, KernelFunc); - } - template - detail::enable_if_t - parallel_for_Impl(nd_range Range, Reduction Redu, - KernelType KernelFunc) { + void parallel_for_impl(nd_range Range, Reduction Redu, + KernelType KernelFunc) { // This parallel_for() is lowered to the following sequence: // 1) Call a kernel that a) call user's lambda function and b) performs // one iteration of reduction, storing the partial reductions/sums @@ -1790,20 +1760,17 @@ class __SYCL_EXPORT handler { PI_ERROR_INVALID_WORK_GROUP_SIZE); size_t NWorkItems = Range.get_group_range().size(); while (NWorkItems > 1) { - handler AuxHandler(QueueCopy, MIsHost); - AuxHandler.saveCodeLoc(MCodeLoc); - - NWorkItems = ext::oneapi::detail::reduAuxCGFunc( - AuxHandler, NWorkItems, MaxWGSize, Redu); - MLastEvent = AuxHandler.finalize(); + MLastEvent = withAuxHandler(QueueCopy, [&](handler &AuxHandler) { + NWorkItems = ext::oneapi::detail::reduAuxCGFunc( + AuxHandler, NWorkItems, MaxWGSize, Redu); + }); } // end while (NWorkItems > 1) if (Reduction::is_usm || Redu.hasUserDiscardWriteAccessor()) { - handler CopyHandler(QueueCopy, MIsHost); - CopyHandler.saveCodeLoc(MCodeLoc); - ext::oneapi::detail::reduSaveFinalResultToUserMem(CopyHandler, - Redu); - MLastEvent = CopyHandler.finalize(); + MLastEvent = withAuxHandler(QueueCopy, [&](handler &CopyHandler) { + ext::oneapi::detail::reduSaveFinalResultToUserMem( + CopyHandler, Redu); + }); } } @@ -1868,20 +1835,18 @@ class __SYCL_EXPORT handler { std::to_string(MaxWGSize), PI_ERROR_INVALID_WORK_GROUP_SIZE); - ext::oneapi::detail::reduCGFunc(*this, KernelFunc, Range, - ReduTuple, ReduIndices); + ext::oneapi::detail::reduCGFuncMulti(*this, KernelFunc, Range, + ReduTuple, ReduIndices); std::shared_ptr QueueCopy = MQueue; this->finalize(); size_t NWorkItems = Range.get_group_range().size(); while (NWorkItems > 1) { - handler AuxHandler(QueueCopy, MIsHost); - AuxHandler.saveCodeLoc(MCodeLoc); - - NWorkItems = - ext::oneapi::detail::reduAuxCGFunc( - AuxHandler, NWorkItems, MaxWGSize, ReduTuple, ReduIndices); - MLastEvent = AuxHandler.finalize(); + MLastEvent = withAuxHandler(QueueCopy, [&](handler &AuxHandler) { + NWorkItems = ext::oneapi::detail::reduAuxCGFunc( + AuxHandler, NWorkItems, MaxWGSize, ReduTuple, ReduIndices); + }); } // end while (NWorkItems > 1) auto CopyEvent = ext::oneapi::detail::reduSaveFinalResultToUserMem( @@ -1889,6 +1854,7 @@ class __SYCL_EXPORT handler { if (CopyEvent) MLastEvent = *CopyEvent; } +#endif // __cplusplus >= 201703L /// Hierarchical kernel invocation method of a kernel defined as a lambda /// encoding the body of each work-group to launch. @@ -2689,14 +2655,6 @@ class __SYCL_EXPORT handler { class Algorithm> friend class ext::oneapi::detail::reduction_impl_algo; - // This method needs to call the method finalize() and also access to private - // ctor/dtor. - template - std::enable_if_t friend ext::oneapi::detail:: - reduSaveFinalResultToUserMemHelper( - std::vector &Events, std::shared_ptr Queue, - bool IsHost, Reduction &, RestT...); - friend void detail::associateWithHandler(handler &, detail::AccessorBaseHost *, access::target); diff --git a/sycl/include/CL/sycl/reduction.hpp b/sycl/include/CL/sycl/reduction.hpp index 57f4d25db19ca..826335105bb0c 100644 --- a/sycl/include/CL/sycl/reduction.hpp +++ b/sycl/include/CL/sycl/reduction.hpp @@ -8,6 +8,9 @@ #pragma once +#if __cplusplus >= 201703L +// Entire feature is dependent on C++17. + #include #include "sycl/ext/oneapi/reduction.hpp" @@ -171,3 +174,5 @@ reduction(span Span, const T &Identity, BinaryOperation Combiner, } // namespace sycl } // __SYCL_INLINE_NAMESPACE(cl) + +#endif // __cplusplus >= 201703L diff --git a/sycl/include/sycl/ext/oneapi/reduction.hpp b/sycl/include/sycl/ext/oneapi/reduction.hpp index e2c463862d9b4..bcb7fccfb61d0 100644 --- a/sycl/include/sycl/ext/oneapi/reduction.hpp +++ b/sycl/include/sycl/ext/oneapi/reduction.hpp @@ -8,6 +8,9 @@ #pragma once +#if __cplusplus >= 201703L +// Entire feature is dependent on C++17. + #include #include #include @@ -954,7 +957,6 @@ class reduction_impl bool InitializeToIdentity = false) : algo(Identity, BOp, InitializeToIdentity, VarPtr) {} -#if __cplusplus >= 201703L /// Constructs reduction_impl when the identity value is statically known template < typename _T = T, @@ -980,39 +982,14 @@ class reduction_impl reduction_impl(span Span, const T &Identity, BinaryOperation BOp, bool InitializeToIdentity = false) : algo(Identity, BOp, InitializeToIdentity, Span.data()) {} -#endif }; -/// These are the forward declaration for the classes that help to create -/// names for additional kernels. It is used only when there are -/// more then 1 kernels in one parallel_for() implementing SYCL reduction. -template -class __sycl_reduction_main_kernel; -template -class __sycl_reduction_aux_kernel; - -/// Helper structs to get additional kernel name types based on given -/// \c Name and additional template parameters helping to distinguish kernels. -/// If \c Name is undefined (is \c auto_name) leave it that way to take -/// advantage of unnamed kernels being named after their functor. -template -struct get_reduction_main_kernel_name_t { - using name = __sycl_reduction_main_kernel; -}; -template -struct get_reduction_main_kernel_name_t { - using name = sycl::detail::auto_name; -}; -template -struct get_reduction_aux_kernel_name_t { - using name = __sycl_reduction_aux_kernel; -}; -template -struct get_reduction_aux_kernel_name_t { - using name = sycl::detail::auto_name; -}; +/// A helper to pass undefined (sycl::detail::auto_name) names unmodified. We +/// must do that to avoid name collisions. +template