Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 37 additions & 35 deletions libcudacxx/include/cuda/__barrier/barrier_block_scope.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@

#include <cuda/__fwd/barrier.h>
#if _CCCL_CUDA_COMPILATION()
# include <cuda/__memory/address_space.h>
# include <cuda/__ptx/instructions/get_sreg.h>
# include <cuda/__ptx/instructions/mbarrier_arrive.h>
# include <cuda/__ptx/instructions/mbarrier_init.h>
Expand All @@ -31,7 +32,6 @@
# include <cuda/__ptx/ptx_dot_variants.h>
# include <cuda/__ptx/ptx_helper_functions.h>
#endif // _CCCL_CUDA_COMPILATION()
#include <cuda/__memory/address_space.h>
#include <cuda/std/__atomic/scopes.h>
#include <cuda/std/__barrier/barrier.h>
#include <cuda/std/__barrier/empty_completion.h>
Expand All @@ -40,6 +40,7 @@
#include <cuda/std/__chrono/duration.h>
#include <cuda/std/__chrono/high_resolution_clock.h>
#include <cuda/std/__chrono/time_point.h>
#include <cuda/std/__cstddef/types.h>
#include <cuda/std/__new/device_new.h>
#include <cuda/std/cstdint>

Expand Down Expand Up @@ -69,10 +70,10 @@ class barrier<thread_scope_block, ::cuda::std::__empty_completion> : public __bl
using __barrier_base = ::cuda::std::__barrier_base<::cuda::std::__empty_completion, thread_scope_block>;
__barrier_base __barrier;

_CCCL_DEVICE friend inline ::cuda::std::uint64_t* ::cuda::device::_LIBCUDACXX_ABI_NAMESPACE::barrier_native_handle(
_CCCL_DEVICE friend ::cuda::std::uint64_t* ::cuda::device::_LIBCUDACXX_ABI_NAMESPACE::barrier_native_handle(
barrier<thread_scope_block>& __b);

_CCCL_DEVICE ::cuda::std::uint64_t* __native_handle() const
// Device-only accessor for the raw 64-bit handle of this barrier.
// Forwards to ::cuda::device::barrier_native_handle, which takes a non-const
// reference, so constness is cast away here to allow calls from const members.
[[nodiscard]] _CCCL_DEVICE ::cuda::std::uint64_t* __native_handle() const
{
return ::cuda::device::barrier_native_handle(const_cast<barrier&>(*this));
}
Expand All @@ -89,15 +90,15 @@ class barrier<thread_scope_block, ::cuda::std::__empty_completion> : public __bl
barrier(const barrier&) = delete;
barrier& operator=(const barrier&) = delete;

_CCCL_API inline barrier(::cuda::std::ptrdiff_t __expected,
::cuda::std::__empty_completion __completion = ::cuda::std::__empty_completion())
// Constructs the barrier for `__expected` participants by delegating to the
// friend function init(). `__completion` is forwarded to init() unchanged.
_CCCL_API barrier(::cuda::std::ptrdiff_t __expected,
::cuda::std::__empty_completion __completion = ::cuda::std::__empty_completion())
{
// Compile-time layout check: the __barrier member must sit at offset zero of this class.
static_assert(_LIBCUDACXX_OFFSET_IS_ZERO(barrier<thread_scope_block>, __barrier),
"fatal error: bad barrier layout");
init(this, __expected, __completion);
}

_CCCL_API inline ~barrier()
_CCCL_API ~barrier()
{
NV_IF_TARGET(NV_PROVIDES_SM_80,
(if (::cuda::device::is_object_from(__barrier, ::cuda::device::address_space::shared)) {
Expand All @@ -111,7 +112,7 @@ class barrier<thread_scope_block, ::cuda::std::__empty_completion> : public __bl
"barrier must not be in cluster shared memory");))
}

_CCCL_API inline friend void init(barrier* __b,
_CCCL_API friend inline void init(barrier* __b,
::cuda::std::ptrdiff_t __expected,
::cuda::std::__empty_completion = ::cuda::std::__empty_completion())
{
Expand All @@ -131,7 +132,7 @@ class barrier<thread_scope_block, ::cuda::std::__empty_completion> : public __bl

private:
#if _CCCL_CUDA_COMPILATION()
_CCCL_DEVICE_API _CCCL_FORCEINLINE arrival_token __arrive_sm90(::cuda::std::ptrdiff_t __update)
[[nodiscard]] _CCCL_DEVICE_API _CCCL_FORCEINLINE arrival_token __arrive_sm90(::cuda::std::ptrdiff_t __update)
{
if (!::cuda::device::is_object_from(__barrier, ::cuda::device::address_space::cluster_shared))
{
Expand All @@ -142,7 +143,7 @@ class barrier<thread_scope_block, ::cuda::std::__empty_completion> : public __bl
return ::cuda::ptx::mbarrier_arrive(__native_handle(), __update);
}

_CCCL_DEVICE_API _CCCL_FORCEINLINE arrival_token __arrive_sm80(::cuda::std::ptrdiff_t __update)
[[nodiscard]] _CCCL_DEVICE_API _CCCL_FORCEINLINE arrival_token __arrive_sm80(::cuda::std::ptrdiff_t __update)
{
if (!::cuda::device::is_object_from(__barrier, ::cuda::device::address_space::shared))
{
Expand All @@ -156,7 +157,7 @@ class barrier<thread_scope_block, ::cuda::std::__empty_completion> : public __bl
return ::cuda::ptx::mbarrier_arrive(__native_handle());
}

_CCCL_DEVICE_API _CCCL_FORCEINLINE arrival_token __arrive_sm70(::cuda::std::ptrdiff_t __update)
[[nodiscard]] _CCCL_DEVICE_API _CCCL_FORCEINLINE arrival_token __arrive_sm70(::cuda::std::ptrdiff_t __update)
{
if (!::cuda::device::is_object_from(__barrier, ::cuda::device::address_space::shared))
{
Expand All @@ -182,7 +183,7 @@ class barrier<thread_scope_block, ::cuda::std::__empty_completion> : public __bl
#endif // _CCCL_CUDA_COMPILATION()

public:
[[nodiscard]] _CCCL_API inline arrival_token arrive(::cuda::std::ptrdiff_t __update = 1)
/*discard*/ _CCCL_API arrival_token arrive(::cuda::std::ptrdiff_t __update = 1)
{
_CCCL_ASSERT(__update >= 0, "Arrival count update must be non-negative.");
NV_DISPATCH_TARGET(
Expand All @@ -198,7 +199,7 @@ class barrier<thread_scope_block, ::cuda::std::__empty_completion> : public __bl

private:
#if _CCCL_CUDA_COMPILATION()
_CCCL_DEVICE_API _CCCL_FORCEINLINE bool __try_wait_sm90(arrival_token __token) const
[[nodiscard]] _CCCL_DEVICE_API _CCCL_FORCEINLINE bool __try_wait_sm90(arrival_token __token) const
{
if (!::cuda::device::is_object_from(__barrier, ::cuda::device::address_space::cluster_shared))
{
Expand All @@ -209,7 +210,7 @@ class barrier<thread_scope_block, ::cuda::std::__empty_completion> : public __bl
return ::cuda::ptx::mbarrier_try_wait(__native_handle(), __token);
}

_CCCL_DEVICE_API _CCCL_FORCEINLINE bool __try_wait_sm80(arrival_token __token) const
[[nodiscard]] _CCCL_DEVICE_API _CCCL_FORCEINLINE bool __try_wait_sm80(arrival_token __token) const
{
if (!::cuda::device::is_object_from(__barrier, ::cuda::device::address_space::shared))
{
Expand All @@ -219,7 +220,7 @@ class barrier<thread_scope_block, ::cuda::std::__empty_completion> : public __bl
}
#endif // _CCCL_CUDA_COMPILATION()

_CCCL_API inline bool __try_wait([[maybe_unused]] arrival_token __token) const
[[nodiscard]] _CCCL_API bool __try_wait(arrival_token __token) const
{
NV_DISPATCH_TARGET(
NV_PROVIDES_SM_90,
Expand All @@ -231,7 +232,7 @@ class barrier<thread_scope_block, ::cuda::std::__empty_completion> : public __bl
}

#if _CCCL_CUDA_COMPILATION()
_CCCL_DEVICE_API _CCCL_FORCEINLINE bool
[[nodiscard]] _CCCL_DEVICE_API _CCCL_FORCEINLINE bool
__try_wait_sm90(arrival_token __token, ::cuda::std::chrono::nanoseconds __nanosec) const
{
if (!::cuda::device::is_object_from(__barrier, ::cuda::device::address_space::cluster_shared))
Expand All @@ -255,7 +256,7 @@ class barrier<thread_scope_block, ::cuda::std::__empty_completion> : public __bl
return __ready;
}

_CCCL_DEVICE_API _CCCL_FORCEINLINE bool
[[nodiscard]] _CCCL_DEVICE_API _CCCL_FORCEINLINE bool
__try_wait_sm80(arrival_token __token, ::cuda::std::chrono::nanoseconds __nanosec) const
{
if (!::cuda::device::is_object_from(__barrier, ::cuda::device::address_space::shared))
Expand All @@ -276,7 +277,7 @@ class barrier<thread_scope_block, ::cuda::std::__empty_completion> : public __bl
#endif // _CCCL_CUDA_COMPILATION()

// Document the drop of > uint32_t values for __nanosec on the public APIs
_CCCL_API inline bool __try_wait(arrival_token __token, ::cuda::std::chrono::nanoseconds __nanosec) const
[[nodiscard]] _CCCL_API bool __try_wait(arrival_token __token, ::cuda::std::chrono::nanoseconds __nanosec) const
{
if (__nanosec.count() < 1)
{
Expand All @@ -295,7 +296,7 @@ class barrier<thread_scope_block, ::cuda::std::__empty_completion> : public __bl
}

#if _CCCL_CUDA_COMPILATION()
_CCCL_DEVICE_API _CCCL_FORCEINLINE bool __try_wait_parity_sm90(bool __phase_parity) const
[[nodiscard]] _CCCL_DEVICE_API _CCCL_FORCEINLINE bool __try_wait_parity_sm90(bool __phase_parity) const
{
if (!::cuda::device::is_object_from(__barrier, ::cuda::device::address_space::cluster_shared))
{
Expand All @@ -307,7 +308,7 @@ class barrier<thread_scope_block, ::cuda::std::__empty_completion> : public __bl
return ::cuda::ptx::mbarrier_try_wait_parity(__native_handle(), __phase_parity);
}

_CCCL_DEVICE_API _CCCL_FORCEINLINE bool __try_wait_parity_sm80(bool __phase_parity) const
[[nodiscard]] _CCCL_DEVICE_API _CCCL_FORCEINLINE bool __try_wait_parity_sm80(bool __phase_parity) const
{
if (!::cuda::device::is_object_from(__barrier, ::cuda::device::address_space::shared))
{
Expand All @@ -317,7 +318,7 @@ class barrier<thread_scope_block, ::cuda::std::__empty_completion> : public __bl
}
#endif // _CCCL_CUDA_COMPILATION()

_CCCL_API inline bool __try_wait_parity(bool __phase_parity) const
[[nodiscard]] _CCCL_API bool __try_wait_parity(bool __phase_parity) const
{
NV_DISPATCH_TARGET(
NV_PROVIDES_SM_90,
Expand All @@ -329,7 +330,7 @@ class barrier<thread_scope_block, ::cuda::std::__empty_completion> : public __bl
}

#if _CCCL_CUDA_COMPILATION()
_CCCL_DEVICE_API _CCCL_FORCEINLINE bool
[[nodiscard]] _CCCL_DEVICE_API _CCCL_FORCEINLINE bool
__try_wait_parity_sm90(bool __phase_parity, ::cuda::std::chrono::nanoseconds __nanosec) const
{
if (!::cuda::device::is_object_from(__barrier, ::cuda::device::address_space::cluster_shared))
Expand All @@ -354,7 +355,7 @@ class barrier<thread_scope_block, ::cuda::std::__empty_completion> : public __bl
return __ready;
}

_CCCL_DEVICE_API _CCCL_FORCEINLINE bool
[[nodiscard]] _CCCL_DEVICE_API _CCCL_FORCEINLINE bool
__try_wait_parity_sm80(bool __phase_parity, ::cuda::std::chrono::nanoseconds __nanosec) const
{
if (!::cuda::device::is_object_from(__barrier, ::cuda::device::address_space::shared))
Expand All @@ -375,7 +376,7 @@ class barrier<thread_scope_block, ::cuda::std::__empty_completion> : public __bl
}
#endif // _CCCL_CUDA_COMPILATION()

_CCCL_API inline bool __try_wait_parity(bool __phase_parity, ::cuda::std::chrono::nanoseconds __nanosec) const
[[nodiscard]] _CCCL_API bool __try_wait_parity(bool __phase_parity, ::cuda::std::chrono::nanoseconds __nanosec) const
{
if (__nanosec.count() < 1)
{
Expand All @@ -384,17 +385,17 @@ class barrier<thread_scope_block, ::cuda::std::__empty_completion> : public __bl

NV_DISPATCH_TARGET(
NV_PROVIDES_SM_90,
(__try_wait_parity_sm90(__phase_parity, __nanosec);),
(return __try_wait_parity_sm90(__phase_parity, __nanosec);),
NV_PROVIDES_SM_80,
(__try_wait_parity_sm80(__phase_parity, __nanosec);),
(return __try_wait_parity_sm80(__phase_parity, __nanosec);),
NV_ANY_TARGET,
(return ::cuda::std::__cccl_thread_poll_with_backoff(
::cuda::std::__barrier_poll_tester_parity<barrier>(this, __phase_parity), __nanosec);))
_CCCL_UNREACHABLE();
}

public:
_CCCL_API inline void wait(arrival_token&& __phase) const
_CCCL_API void wait(arrival_token&& __phase) const
{
// no need to back off on a barrier in SMEM on SM90+, SYNCS unit is taking care of this
NV_IF_TARGET(NV_PROVIDES_SM_90,
Expand All @@ -409,7 +410,7 @@ class barrier<thread_scope_block, ::cuda::std::__empty_completion> : public __bl
::cuda::std::__barrier_poll_tester_phase<barrier>(this, ::cuda::std::move(__phase)));
}

_CCCL_API inline void wait_parity(bool __phase_parity) const
_CCCL_API void wait_parity(bool __phase_parity) const
{
// no need to back off on a barrier in SMEM on SM90+, SYNCS unit is taking care of this
NV_IF_TARGET(NV_PROVIDES_SM_90,
Expand All @@ -424,7 +425,7 @@ class barrier<thread_scope_block, ::cuda::std::__empty_completion> : public __bl
::cuda::std::__barrier_poll_tester_parity<barrier>(this, __phase_parity));
}

_CCCL_API inline void arrive_and_wait()
// Arrives at the barrier using arrive()'s default update count and then
// waits on the arrival token that arrive() returned.
_CCCL_API void arrive_and_wait()
{
wait(arrive());
}
Expand Down Expand Up @@ -463,7 +464,7 @@ class barrier<thread_scope_block, ::cuda::std::__empty_completion> : public __bl
#endif // _CCCL_CUDA_COMPILATION()

public:
_CCCL_API inline void arrive_and_drop()
_CCCL_API void arrive_and_drop()
{
NV_DISPATCH_TARGET(
NV_PROVIDES_SM_90,
Expand All @@ -475,13 +476,13 @@ class barrier<thread_scope_block, ::cuda::std::__empty_completion> : public __bl
(__barrier.arrive_and_drop();))
}

_CCCL_API static constexpr ptrdiff_t max() noexcept
// Returns the compile-time constant (1 << 20) - 1, i.e. 1048575.
// NOTE(review): presumably the largest expected-arrival count this barrier
// supports - confirm against the hardware barrier limits.
[[nodiscard]] _CCCL_API static constexpr ::cuda::std::ptrdiff_t max() noexcept
{
return (1 << 20) - 1;
}

template <class _Rep, class _Period>
[[nodiscard]] _CCCL_API inline bool
[[nodiscard]] _CCCL_API bool
try_wait_for(arrival_token&& __token, const ::cuda::std::chrono::duration<_Rep, _Period>& __dur)
{
auto __nanosec = ::cuda::std::chrono::duration_cast<::cuda::std::chrono::nanoseconds>(__dur);
Expand All @@ -490,14 +491,14 @@ class barrier<thread_scope_block, ::cuda::std::__empty_completion> : public __bl
}

// Deadline-based variant of try_wait_for: converts the absolute time point
// `__time` into the duration remaining from `_Clock::now()` and forwards that,
// together with the moved `__token`, to try_wait_for. Returns whatever
// try_wait_for returns.
template <class _Clock, class _Duration>
[[nodiscard]] _CCCL_API inline bool
[[nodiscard]] _CCCL_API bool
try_wait_until(arrival_token&& __token, const ::cuda::std::chrono::time_point<_Clock, _Duration>& __time)
{
return try_wait_for(::cuda::std::move(__token), (__time - _Clock::now()));
}

template <class _Rep, class _Period>
[[nodiscard]] _CCCL_API inline bool
[[nodiscard]] _CCCL_API bool
try_wait_parity_for(bool __phase_parity, const ::cuda::std::chrono::duration<_Rep, _Period>& __dur)
{
auto __nanosec = ::cuda::std::chrono::duration_cast<::cuda::std::chrono::nanoseconds>(__dur);
Expand All @@ -506,7 +507,7 @@ class barrier<thread_scope_block, ::cuda::std::__empty_completion> : public __bl
}

template <class _Clock, class _Duration>
[[nodiscard]] _CCCL_API inline bool
[[nodiscard]] _CCCL_API bool
try_wait_parity_until(bool __phase_parity, const ::cuda::std::chrono::time_point<_Clock, _Duration>& __time)
{
return try_wait_parity_for(__phase_parity, (__time - _Clock::now()));
Expand All @@ -516,7 +517,8 @@ class barrier<thread_scope_block, ::cuda::std::__empty_completion> : public __bl
_CCCL_END_NAMESPACE_CUDA

_CCCL_BEGIN_NAMESPACE_CUDA_DEVICE
_CCCL_DEVICE inline ::cuda::std::uint64_t* barrier_native_handle(barrier<thread_scope_block>& __b)

// Device-only free function that exposes the barrier's internal state as a
// raw pointer to a 64-bit word: it reinterprets the address of the private
// `__barrier` member of `__b` as `uint64_t*`.
[[nodiscard]] _CCCL_DEVICE inline ::cuda::std::uint64_t* barrier_native_handle(barrier<thread_scope_block>& __b)
{
return reinterpret_cast<::cuda::std::uint64_t*>(&__b.__barrier);
}
Expand Down
Loading
Loading