266 changes: 112 additions & 154 deletions sycl/include/CL/sycl/handler.hpp
@@ -248,37 +248,36 @@ class reduction_impl_algo;
using cl::sycl::detail::enable_if_t;
using cl::sycl::detail::queue_impl;

template <typename KernelName, typename KernelType, int Dims, class Reduction>
void reduCGFunc(handler &CGH, KernelType KernelFunc, const range<Dims> &Range,
size_t MaxWGSize, uint32_t NumConcurrentWorkGroups,
Reduction &Redu);
// Kernels with a single reduction

/// If we are given sycl::range and not sycl::nd_range, we have more freedom
/// in how to split the iteration space.
template <typename KernelName, typename KernelType, int Dims, class Reduction>
enable_if_t<Reduction::has_atomic_add_float64>
reduCGFuncAtomic64(handler &CGH, KernelType KernelFunc,
const nd_range<Dims> &Range, Reduction &Redu);
void reduCGFuncForRange(handler &CGH, KernelType KernelFunc,
const range<Dims> &Range, size_t MaxWGSize,
uint32_t NumConcurrentWorkGroups, Reduction &Redu);
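
To make the extra freedom concrete, one possible splitting heuristic for the range overload might look like the following sketch; the ceil-div and concurrency cap are illustrative assumptions, not the PR's actual logic:

```cpp
#include <algorithm>
#include <cstddef>
#include <cstdint>

// Hypothetical splitting heuristic: pick a work-group size no larger than
// the device maximum, then cap the number of groups at the concurrency
// limit; remaining items would be covered by a global-stride loop.
struct LaunchShape {
  std::size_t WGSize;
  std::size_t NumWGs;
};

LaunchShape chooseShape(std::size_t NWorkItems, std::size_t MaxWGSize,
                        std::uint32_t NumConcurrentWorkGroups) {
  std::size_t WGSize = std::min(NWorkItems, MaxWGSize);
  std::size_t NumWGs = (NWorkItems + WGSize - 1) / WGSize; // ceil-div
  NumWGs = std::min<std::size_t>(NumWGs, NumConcurrentWorkGroups);
  return {WGSize, NumWGs};
}

int main() {
  LaunchShape S = chooseShape(10000, 256, 16);
  (void)S; // e.g. 256-item groups, at most 16 in flight
}
```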

template <typename KernelName, typename KernelType, int Dims, class Reduction>
enable_if_t<Reduction::has_fast_atomics>
reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
Reduction &Redu);
void reduCGFuncAtomic64(handler &CGH, KernelType KernelFunc,
const nd_range<Dims> &Range, Reduction &Redu);

template <typename KernelName, typename KernelType, int Dims, class Reduction>
enable_if_t<!Reduction::has_fast_atomics>
reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
Reduction &Redu);
void reduCGFunc(handler &CGH, KernelType KernelFunc,
const nd_range<Dims> &Range, Reduction &Redu);

template <typename KernelName, typename KernelType, class Reduction>
enable_if_t<!Reduction::has_fast_atomics, size_t>
reduAuxCGFunc(handler &CGH, size_t NWorkItems, size_t MaxWGSize,
Reduction &Redu);
// Kernels with multiple reductions

// sycl::nd_range version
template <typename KernelName, typename KernelType, int Dims,
typename... Reductions, size_t... Is>
void reduCGFunc(handler &CGH, KernelType KernelFunc,
const nd_range<Dims> &Range,
std::tuple<Reductions...> &ReduTuple,
std::index_sequence<Is...>);
void reduCGFuncMulti(handler &CGH, KernelType KernelFunc,
const nd_range<Dims> &Range,
std::tuple<Reductions...> &ReduTuple,
std::index_sequence<Is...>);

template <typename KernelName, typename KernelType, class Reduction>
size_t reduAuxCGFunc(handler &CGH, size_t NWorkItems, size_t MaxWGSize,
Reduction &Redu);

template <typename KernelName, typename KernelType, typename... Reductions,
size_t... Is>
@@ -300,12 +299,6 @@ reduSaveFinalResultToUserMem(std::shared_ptr<detail::queue_impl> Queue,
bool IsHost, std::tuple<Reduction...> &ReduTuple,
std::index_sequence<Is...>);

template <typename Reduction, typename... RestT>
std::enable_if_t<!Reduction::is_usm>
reduSaveFinalResultToUserMemHelper(std::vector<event> &Events,
std::shared_ptr<detail::queue_impl> Queue,
bool IsHost, Reduction &Redu, RestT... Rest);

__SYCL_EXPORT uint32_t
reduGetMaxNumConcurrentWorkGroups(std::shared_ptr<queue_impl> Queue);

@@ -470,6 +463,27 @@ class __SYCL_EXPORT handler {
MStreamStorage.push_back(Stream);
}

/// Helper utility for an operation widely used across different reduction
/// implementations.
/// @{
template <class FunctorTy>
event withAuxHandler(std::shared_ptr<detail::queue_impl> Queue,
FunctorTy Func) {
handler AuxHandler(Queue, MIsHost);
AuxHandler.saveCodeLoc(MCodeLoc);
Func(AuxHandler);
return AuxHandler.finalize();
}

template <class FunctorTy>
static event withAuxHandler(std::shared_ptr<detail::queue_impl> Queue,
[Review comment, Contributor] This version is not used, if I am not missing anything.
bool IsHost, FunctorTy Func) {
handler AuxHandler(Queue, IsHost);
Func(AuxHandler);
return AuxHandler.finalize();
}
/// @}
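
A standalone sketch of the lifetime pattern this helper consolidates; Handler, Event, and withAux are toy stand-ins for sycl::handler, sycl::event, and withAuxHandler, not library code:

```cpp
// Toy analogue of withAuxHandler: create the auxiliary handler, let the
// caller fill in the command group, then finalize once, in one place.
struct Event {};
struct Handler {
  Event finalize() { return Event{}; }
};

template <class FunctorTy> Event withAux(FunctorTy Func) {
  Handler Aux;           // was: handler AuxHandler(Queue, MIsHost);
  Func(Aux);             // build the command group
  return Aux.finalize(); // submit, hand back the event
}

int main() {
  Event E = withAux([](Handler &H) { (void)H; /* enqueue work via H */ });
  (void)E;
}
```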

/// Saves buffers created while handling the reduction feature in the handler.
/// They are then forwarded to the command group and destroyed only after
/// the command group finishes its work on the device/host.
@@ -1587,6 +1601,9 @@ class __SYCL_EXPORT handler {
#endif
}

// "if constexpr" simplifies implementation/increases readability in comparison
// with SFINAE-based approach.
#if __cplusplus >= 201703L
/// Defines and invokes a SYCL kernel function for the specified nd_range.
///
/// The SYCL kernel function is defined as a lambda function or a named
@@ -1618,123 +1635,76 @@ class __SYCL_EXPORT handler {
// for the device.
size_t MaxWGSize =
ext::oneapi::detail::reduGetMaxWGSize(MQueue, OneElemSize);
ext::oneapi::detail::reduCGFunc<KernelName>(
ext::oneapi::detail::reduCGFuncForRange<KernelName>(
*this, KernelFunc, Range, MaxWGSize, NumConcurrentWorkGroups, Redu);
if (Reduction::is_usm ||
(Reduction::has_fast_atomics && Redu.initializeToIdentity()) ||
(!Reduction::has_fast_atomics && Redu.hasUserDiscardWriteAccessor())) {
this->finalize();
handler CopyHandler(QueueCopy, MIsHost);
CopyHandler.saveCodeLoc(MCodeLoc);
ext::oneapi::detail::reduSaveFinalResultToUserMem<KernelName>(CopyHandler,
Redu);
MLastEvent = CopyHandler.finalize();
}
}
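
For reference, the guard above can be read as a single predicate. A hypothetical restatement (needsCopyBack does not exist in the library; the traits mirror the ones used in the condition):

```cpp
// Hypothetical restatement of the copy-back condition; Reduction is any
// type exposing the is_usm / has_fast_atomics traits used above.
template <class Reduction>
constexpr bool needsCopyBack(bool InitializeToIdentity,
                             bool HasUserDiscardWriteAccessor) {
  return Reduction::is_usm ||
         (Reduction::has_fast_atomics && InitializeToIdentity) ||
         (!Reduction::has_fast_atomics && HasUserDiscardWriteAccessor);
}
```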

/// Implements parallel_for() accepting nd_range \p Range and one reduction
/// object. This version uses fast sycl::atomic operations to update reduction
/// variable at the end of each work-group work.
//
// If the reduction variable must be initialized with the identity value
// before the kernel run, then an additional working accessor is created,
// initialized with the identity value and used in the kernel. That working
// accessor is then copied to user's accessor or USM pointer after
// the kernel run.
// For USM pointers without initialize_to_identity properties the same scheme
// with working accessor is used as re-using user's USM pointer in the kernel
// would require creation of another variant of user's kernel, which does not
// seem efficient.
template <typename KernelName = detail::auto_name, typename KernelType,
int Dims, typename Reduction>
detail::enable_if_t<Reduction::has_fast_atomics>
parallel_for(nd_range<Dims> Range, Reduction Redu,
_KERNELFUNCPARAM(KernelFunc)) {
std::shared_ptr<detail::queue_impl> QueueCopy = MQueue;
ext::oneapi::detail::reduCGFunc<KernelName>(*this, KernelFunc, Range, Redu);

if (Reduction::is_usm || Redu.initializeToIdentity()) {
this->finalize();
handler CopyHandler(QueueCopy, MIsHost);
CopyHandler.saveCodeLoc(MCodeLoc);
ext::oneapi::detail::reduSaveFinalResultToUserMem<KernelName>(CopyHandler,
Redu);
MLastEvent = CopyHandler.finalize();
MLastEvent = withAuxHandler(QueueCopy, [&](handler &CopyHandler) {
ext::oneapi::detail::reduSaveFinalResultToUserMem<KernelName>(
CopyHandler, Redu);
});
}
}

/// Implements parallel_for() accepting nd_range \p Range and one reduction
/// object. This version is a specialization for the add operator.
/// It performs runtime checks for device aspect "atomic64"; if found, fast
/// sycl::atomic_ref operations are used to update the reduction at the
/// end of each work-group work. Otherwise the default implementation is
/// used.
//
// If the reduction variable must be initialized with the identity value
// before the kernel run, then an additional working accessor is created,
// initialized with the identity value and used in the kernel. That working
// accessor is then copied to user's accessor or USM pointer after
// the kernel run.
// For USM pointers without initialize_to_identity properties the same scheme
// with working accessor is used as re-using user's USM pointer in the kernel
// would require creation of another variant of user's kernel, which does not
// seem efficient.
template <typename KernelName = detail::auto_name, typename KernelType,
int Dims, typename Reduction>
detail::enable_if_t<Reduction::has_atomic_add_float64>
parallel_for(nd_range<Dims> Range, Reduction Redu,
_KERNELFUNCPARAM(KernelFunc)) {

std::shared_ptr<detail::queue_impl> QueueCopy = MQueue;
device D = detail::getDeviceFromHandler(*this);

if (D.has(aspect::atomic64)) {

ext::oneapi::detail::reduCGFuncAtomic64<KernelName>(*this, KernelFunc,
Range, Redu);

void parallel_for(nd_range<Dims> Range, Reduction Redu,
_KERNELFUNCPARAM(KernelFunc)) {
if constexpr (!Reduction::has_fast_atomics &&
!Reduction::has_atomic_add_float64) {
// The most basic implementation.
parallel_for_impl<KernelName>(Range, Redu, KernelFunc);
return;
} else { // No early return: "if constexpr" only discards the untaken branch.
std::shared_ptr<detail::queue_impl> QueueCopy = MQueue;
if constexpr (Reduction::has_atomic_add_float64) {
/// This version is a specialization for the add
/// operator. It performs runtime checks for device aspect "atomic64";
/// if found, fast sycl::atomic_ref operations are used to update the
/// reduction at the end of each work-group's work. Otherwise the
/// default implementation is used.
device D = detail::getDeviceFromHandler(*this);

if (D.has(aspect::atomic64)) {

ext::oneapi::detail::reduCGFuncAtomic64<KernelName>(*this, KernelFunc,
Range, Redu);
} else {
// Fall back to the basic implementation as well.
parallel_for_impl<KernelName>(Range, Redu, KernelFunc);
return;
}
} else {
// Use fast sycl::atomic operations to update the reduction variable at
// the end of each work-group's work.
ext::oneapi::detail::reduCGFunc<KernelName>(*this, KernelFunc, Range,
Redu);
}
// If the reduction variable must be initialized with the identity value
// before the kernel run, then an additional working accessor is created,
// initialized with the identity value and used in the kernel. That
// working accessor is then copied to the user's accessor or USM pointer
// after the kernel run.
// For USM pointers without the initialize_to_identity property, the same
// scheme with a working accessor is used, as re-using the user's USM
// pointer in the kernel would require creating another variant of the
// user's kernel, which does not seem efficient.
if (Reduction::is_usm || Redu.initializeToIdentity()) {
this->finalize();
handler CopyHandler(QueueCopy, MIsHost);
CopyHandler.saveCodeLoc(MCodeLoc);
ext::oneapi::detail::reduSaveFinalResultToUserMem<KernelName>(
CopyHandler, Redu);
MLastEvent = CopyHandler.finalize();
MLastEvent = withAuxHandler(QueueCopy, [&](handler &CopyHandler) {
ext::oneapi::detail::reduSaveFinalResultToUserMem<KernelName>(
CopyHandler, Redu);
});
}
} else {
parallel_for_Impl<KernelName>(Range, Redu, KernelFunc);
}
}
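
To illustrate the comment above about "if constexpr" versus SFINAE, here is a toy dispatch written both ways; the types and paths are illustrative only:

```cpp
#include <type_traits>

// Toy traits standing in for Reduction::has_fast_atomics.
struct FastAtomics { static constexpr bool has_fast_atomics = true; };
struct TreeReduce  { static constexpr bool has_fast_atomics = false; };

// Pre-C++17: one SFINAE-constrained overload per case.
template <class R>
std::enable_if_t<R::has_fast_atomics> runReduction(R) { /* atomic path */ }
template <class R>
std::enable_if_t<!R::has_fast_atomics> runReduction(R) { /* tree path */ }

// C++17: a single body; the discarded branch is never instantiated.
template <class R> void runReduction17(R) {
  if constexpr (R::has_fast_atomics) {
    // atomic path
  } else {
    // tree path
  }
}

int main() {
  runReduction(FastAtomics{});
  runReduction17(TreeReduce{});
}
```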

/// Defines and invokes a SYCL kernel function for the specified nd_range.
/// Performs reduction operation specified in \p Redu.
///
/// The SYCL kernel function is defined as a lambda function or a named
/// function object type and given an id or item for indexing in the indexing
/// space defined by \p Range.
/// If it is a named function object and the function object type is
/// globally visible, there is no need for the developer to provide
/// a kernel name for it.
///
/// TODO: Support HOST. The kernels called by this parallel_for() may use
/// some functionality that is not yet supported on HOST such as:
/// barrier(), and ext::oneapi::reduce() that also may be used in more
/// optimized implementations waiting for their turn of code-review.
template <typename KernelName = detail::auto_name, typename KernelType,
int Dims, typename Reduction>
detail::enable_if_t<!Reduction::has_fast_atomics &&
!Reduction::has_atomic_add_float64>
parallel_for(nd_range<Dims> Range, Reduction Redu,
_KERNELFUNCPARAM(KernelFunc)) {

parallel_for_Impl<KernelName>(Range, Redu, KernelFunc);
}

template <typename KernelName, typename KernelType, int Dims,
typename Reduction>
detail::enable_if_t<!Reduction::has_fast_atomics>
parallel_for_Impl(nd_range<Dims> Range, Reduction Redu,
KernelType KernelFunc) {
void parallel_for_impl(nd_range<Dims> Range, Reduction Redu,
KernelType KernelFunc) {
// This parallel_for() is lowered to the following sequence:
// 1) Call a kernel that a) calls the user's lambda function and b) performs
// one iteration of reduction, storing the partial reductions/sums
@@ -1790,20 +1760,17 @@ class __SYCL_EXPORT handler {
PI_ERROR_INVALID_WORK_GROUP_SIZE);
size_t NWorkItems = Range.get_group_range().size();
while (NWorkItems > 1) {
handler AuxHandler(QueueCopy, MIsHost);
AuxHandler.saveCodeLoc(MCodeLoc);

NWorkItems = ext::oneapi::detail::reduAuxCGFunc<KernelName, KernelType>(
AuxHandler, NWorkItems, MaxWGSize, Redu);
MLastEvent = AuxHandler.finalize();
MLastEvent = withAuxHandler(QueueCopy, [&](handler &AuxHandler) {
NWorkItems = ext::oneapi::detail::reduAuxCGFunc<KernelName, KernelType>(
AuxHandler, NWorkItems, MaxWGSize, Redu);
});
} // end while (NWorkItems > 1)

if (Reduction::is_usm || Redu.hasUserDiscardWriteAccessor()) {
handler CopyHandler(QueueCopy, MIsHost);
CopyHandler.saveCodeLoc(MCodeLoc);
ext::oneapi::detail::reduSaveFinalResultToUserMem<KernelName>(CopyHandler,
Redu);
MLastEvent = CopyHandler.finalize();
MLastEvent = withAuxHandler(QueueCopy, [&](handler &CopyHandler) {
ext::oneapi::detail::reduSaveFinalResultToUserMem<KernelName>(
CopyHandler, Redu);
});
}
}
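
A host-only model of the multi-pass sequence this function lowers to, with plain loops standing in for the device kernels (a sketch, not the library's code):

```cpp
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <numeric>
#include <vector>

// Each pass writes one partial sum per "work-group" back into the front of
// the buffer, then shrinks the problem to the number of groups, mirroring
// the while (NWorkItems > 1) loop above.
std::size_t treeReduceSum(std::vector<std::size_t> Data,
                          std::size_t MaxWGSize) {
  std::size_t NWorkItems = Data.size();
  while (NWorkItems > 1) {
    std::size_t NumGroups = (NWorkItems + MaxWGSize - 1) / MaxWGSize;
    for (std::size_t G = 0; G < NumGroups; ++G) {
      std::size_t Begin = G * MaxWGSize;
      std::size_t End = std::min(Begin + MaxWGSize, NWorkItems);
      Data[G] = std::accumulate(Data.begin() + Begin, Data.begin() + End,
                                std::size_t{0}); // one group's partial sum
    }
    NWorkItems = NumGroups; // next pass reduces the partial sums
  }
  return Data[0];
}

int main() {
  std::vector<std::size_t> V(1000, 1);
  std::cout << treeReduceSum(V, 64) << '\n'; // prints 1000
}
```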

@@ -1868,27 +1835,26 @@ class __SYCL_EXPORT handler {
std::to_string(MaxWGSize),
PI_ERROR_INVALID_WORK_GROUP_SIZE);

ext::oneapi::detail::reduCGFunc<KernelName>(*this, KernelFunc, Range,
ReduTuple, ReduIndices);
ext::oneapi::detail::reduCGFuncMulti<KernelName>(*this, KernelFunc, Range,
ReduTuple, ReduIndices);
std::shared_ptr<detail::queue_impl> QueueCopy = MQueue;
this->finalize();

size_t NWorkItems = Range.get_group_range().size();
while (NWorkItems > 1) {
handler AuxHandler(QueueCopy, MIsHost);
AuxHandler.saveCodeLoc(MCodeLoc);

NWorkItems =
ext::oneapi::detail::reduAuxCGFunc<KernelName, decltype(KernelFunc)>(
AuxHandler, NWorkItems, MaxWGSize, ReduTuple, ReduIndices);
MLastEvent = AuxHandler.finalize();
MLastEvent = withAuxHandler(QueueCopy, [&](handler &AuxHandler) {
NWorkItems = ext::oneapi::detail::reduAuxCGFunc<KernelName,
decltype(KernelFunc)>(
AuxHandler, NWorkItems, MaxWGSize, ReduTuple, ReduIndices);
});
} // end while (NWorkItems > 1)

auto CopyEvent = ext::oneapi::detail::reduSaveFinalResultToUserMem(
QueueCopy, MIsHost, ReduTuple, ReduIndices);
if (CopyEvent)
MLastEvent = *CopyEvent;
}
#endif // __cplusplus >= 201703L
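
The tuple-plus-index_sequence expansion used by reduCGFuncMulti and reduAuxCGFunc boils down to this toy pattern (ints stand in for the reduction objects):

```cpp
#include <cstddef>
#include <iostream>
#include <tuple>
#include <utility>

// Toy version of the pack expansion: visit every element of a tuple of
// "reductions" with one fold expression over an index_sequence.
template <typename... Rs, std::size_t... Is>
void visitAll(std::tuple<Rs...> &ReduTuple, std::index_sequence<Is...>) {
  ((std::cout << "reduction #" << Is << " = " << std::get<Is>(ReduTuple)
              << '\n'),
   ...);
}

int main() {
  std::tuple<int, int, int> Redus{1, 2, 3};
  visitAll(Redus, std::make_index_sequence<3>{});
}
```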

/// Hierarchical kernel invocation method of a kernel defined as a lambda
/// encoding the body of each work-group to launch.
@@ -2689,14 +2655,6 @@ class __SYCL_EXPORT handler {
class Algorithm>
friend class ext::oneapi::detail::reduction_impl_algo;

// This method needs to call the method finalize() and also access to private
// ctor/dtor.
template <typename Reduction, typename... RestT>
std::enable_if_t<!Reduction::is_usm> friend ext::oneapi::detail::
reduSaveFinalResultToUserMemHelper(
std::vector<event> &Events, std::shared_ptr<detail::queue_impl> Queue,
bool IsHost, Reduction &, RestT...);

friend void detail::associateWithHandler(handler &,
detail::AccessorBaseHost *,
access::target);
5 changes: 5 additions & 0 deletions sycl/include/CL/sycl/reduction.hpp
@@ -8,6 +8,9 @@

#pragma once

#if __cplusplus >= 201703L
// The entire feature depends on C++17.

#include <CL/sycl/known_identity.hpp>

#include "sycl/ext/oneapi/reduction.hpp"
@@ -171,3 +174,5 @@ reduction(span<T, Extent> Span, const T &Identity, BinaryOperation Combiner,

} // namespace sycl
} // __SYCL_INLINE_NAMESPACE(cl)

#endif // __cplusplus >= 201703L