Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
122 changes: 65 additions & 57 deletions libcudacxx/include/cuda/__barrier/barrier_block_scope.h
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ class barrier<thread_scope_block, ::cuda::std::__empty_completion> : public __bl
NV_IF_TARGET(
NV_PROVIDES_SM_90,
(_CCCL_ASSERT(!::cuda::device::is_object_from(__barrier, ::cuda::device::address_space::cluster_shared),
"barrier must not be in cluster shared memory");))
"barrier must not be in other's block shared memory in the cluster");))
}

_CCCL_API inline friend void init(barrier* __b,
Expand All @@ -125,7 +125,7 @@ class barrier<thread_scope_block, ::cuda::std::__empty_completion> : public __bl
NV_IF_TARGET(
NV_PROVIDES_SM_90,
(_CCCL_ASSERT(!::cuda::device::is_object_from(__b->__barrier, ::cuda::device::address_space::cluster_shared),
"barrier must not be in cluster shared memory");))
"barrier must not be in other's block shared memory in the cluster");))

new (&__b->__barrier) __barrier_base(__expected);
}
Expand All @@ -134,13 +134,14 @@ class barrier<thread_scope_block, ::cuda::std::__empty_completion> : public __bl
#if _CCCL_CUDA_COMPILATION()
//! @brief Arrive on the barrier (SM90+ code path).
//!
//! @param __update Number of arrivals to register at once.
//! @return The arrival token for the current phase.
//!
//! Fast path: when the barrier object lives in the calling CTA's shared
//! memory, issue the hardware mbarrier arrive instruction directly.
//! Otherwise the barrier must be in cta shared or global memory — never in
//! another block's shared memory within the cluster — and we fall back to
//! the generic software barrier implementation.
[[nodiscard]] _CCCL_DEVICE_API _CCCL_FORCEINLINE arrival_token __arrive_sm90(::cuda::std::ptrdiff_t __update)
{
  if (::cuda::device::is_object_from(__barrier, ::cuda::device::address_space::shared))
  {
    return ::cuda::ptx::mbarrier_arrive(__native_handle(), __update);
  }

  _CCCL_ASSERT(!::cuda::device::is_object_from(__barrier, ::cuda::device::address_space::cluster_shared),
               "barrier must be in cta shared or global memory, not other's block shared memory in the cluster");
  return __barrier.arrive(__update);
}

[[nodiscard]] _CCCL_DEVICE_API _CCCL_FORCEINLINE arrival_token __arrive_sm80(::cuda::std::ptrdiff_t __update)
Expand Down Expand Up @@ -201,13 +202,15 @@ class barrier<thread_scope_block, ::cuda::std::__empty_completion> : public __bl
#if _CCCL_CUDA_COMPILATION()
//! @brief Single (non-timed) test of barrier phase completion (SM90+ path).
//!
//! @param __token Arrival token identifying the phase to test.
//! @return true if the phase identified by @p __token has completed.
//!
//! Fast path: a barrier in the calling CTA's shared memory can be tested with
//! the hardware mbarrier.try_wait instruction. Otherwise the barrier must be
//! in cta shared or global memory (not another block's shared memory in the
//! cluster) and we defer to the generic software wait test.
[[nodiscard]] _CCCL_DEVICE_API _CCCL_FORCEINLINE bool __try_wait_sm90(arrival_token __token) const
{
  if (::cuda::device::is_object_from(__barrier, ::cuda::device::address_space::shared))
  {
    return ::cuda::ptx::mbarrier_try_wait(__native_handle(), __token);
  }

  _CCCL_ASSERT(!::cuda::device::is_object_from(__barrier, ::cuda::device::address_space::cluster_shared),
               "barrier must be in cta shared or global memory, not other's block shared memory in the cluster");

  return ::cuda::std::__call_try_wait(__barrier, ::cuda::std::move(__token));
}

[[nodiscard]] _CCCL_DEVICE_API _CCCL_FORCEINLINE bool __try_wait_sm80(arrival_token __token) const
Expand Down Expand Up @@ -235,25 +238,26 @@ class barrier<thread_scope_block, ::cuda::std::__empty_completion> : public __bl
//! @brief Timed test of barrier phase completion (SM90+ path).
//!
//! @param __token   Arrival token identifying the phase to wait on.
//! @param __nanosec Maximum time budget to wait.
//! @return true if the phase completed within the time budget, false on timeout.
//!
//! Fast path: for a barrier in the calling CTA's shared memory, loop on the
//! hardware mbarrier.try_wait instruction, passing it the remaining budget each
//! iteration, until the phase completes or the budget is exhausted. Otherwise
//! the barrier must be in cta shared or global memory (not another block's
//! shared memory in the cluster) and we poll with software backoff.
[[nodiscard]] _CCCL_DEVICE_API _CCCL_FORCEINLINE bool
__try_wait_sm90(arrival_token __token, ::cuda::std::chrono::nanoseconds __nanosec) const
{
  if (::cuda::device::is_object_from(__barrier, ::cuda::device::address_space::shared))
  {
    bool __ready = false;
    ::cuda::std::chrono::high_resolution_clock::time_point const __start =
      ::cuda::std::chrono::high_resolution_clock::now();
    // Start at zero so the first iteration waits with the full budget
    // (reading an uninitialized duration here would be indeterminate).
    ::cuda::std::chrono::nanoseconds __elapsed{0};
    do
    {
      const ::cuda::std::uint32_t __wait_nsec = static_cast<::cuda::std::uint32_t>((__nanosec - __elapsed).count());
      // Capture the instruction's result; otherwise the loop can never
      // observe completion and would always spin until timeout.
      __ready   = ::cuda::ptx::mbarrier_try_wait(__native_handle(), __token, __wait_nsec);
      __elapsed = ::cuda::std::chrono::high_resolution_clock::now() - __start;
    } while (!__ready && (__nanosec > __elapsed));
    return __ready;
  }

  _CCCL_ASSERT(!::cuda::device::is_object_from(__barrier, ::cuda::device::address_space::cluster_shared),
               "barrier must be in cta shared or global memory, not other's block shared memory in the cluster");

  return ::cuda::std::__cccl_thread_poll_with_backoff(
    ::cuda::std::__barrier_poll_tester_phase<barrier>(this, ::cuda::std::move(__token)), __nanosec);
}

[[nodiscard]] _CCCL_DEVICE_API _CCCL_FORCEINLINE bool
Expand Down Expand Up @@ -298,14 +302,15 @@ class barrier<thread_scope_block, ::cuda::std::__empty_completion> : public __bl
#if _CCCL_CUDA_COMPILATION()
//! @brief Single (non-timed) test of barrier phase-parity completion (SM90+ path).
//!
//! @param __phase_parity Parity (0/1) of the phase to test.
//! @return true if the phase with the given parity has completed.
//!
//! Fast path: a barrier in the calling CTA's shared memory is tested via the
//! hardware mbarrier.try_wait.parity instruction. Otherwise the barrier must
//! be in cta shared or global memory (not another block's shared memory in the
//! cluster) and we defer to the generic software parity test.
[[nodiscard]] _CCCL_DEVICE_API _CCCL_FORCEINLINE bool __try_wait_parity_sm90(bool __phase_parity) const
{
  if (::cuda::device::is_object_from(__barrier, ::cuda::device::address_space::shared))
  {
    return ::cuda::ptx::mbarrier_try_wait_parity(__native_handle(), __phase_parity);
  }

  _CCCL_ASSERT(!::cuda::device::is_object_from(__barrier, ::cuda::device::address_space::cluster_shared),
               "barrier must be in cta shared or global memory, not other's block shared memory in the cluster");

  return ::cuda::std::__call_try_wait_parity(__barrier, __phase_parity);
}

[[nodiscard]] _CCCL_DEVICE_API _CCCL_FORCEINLINE bool __try_wait_parity_sm80(bool __phase_parity) const
Expand Down Expand Up @@ -333,26 +338,27 @@ class barrier<thread_scope_block, ::cuda::std::__empty_completion> : public __bl
//! @brief Timed test of barrier phase-parity completion (SM90+ path).
//!
//! @param __phase_parity Parity (0/1) of the phase to wait on.
//! @param __nanosec      Maximum time budget to wait.
//! @return true if the phase completed within the time budget, false on timeout.
//!
//! Fast path: for a barrier in the calling CTA's shared memory, loop on the
//! hardware mbarrier.try_wait.parity instruction with the remaining budget
//! until the phase completes or the budget is exhausted. Otherwise the barrier
//! must be in cta shared or global memory (not another block's shared memory
//! in the cluster) and we poll with software backoff.
[[nodiscard]] _CCCL_DEVICE_API _CCCL_FORCEINLINE bool
__try_wait_parity_sm90(bool __phase_parity, ::cuda::std::chrono::nanoseconds __nanosec) const
{
  if (::cuda::device::is_object_from(__barrier, ::cuda::device::address_space::shared))
  {
    bool __ready = false;
    ::cuda::std::chrono::high_resolution_clock::time_point const __start =
      ::cuda::std::chrono::high_resolution_clock::now();
    // Start at zero so the first iteration waits with the full budget
    // (reading an uninitialized duration here would be indeterminate).
    ::cuda::std::chrono::nanoseconds __elapsed{0};
    do
    {
      const ::cuda::std::uint32_t __wait_nsec = static_cast<::cuda::std::uint32_t>((__nanosec - __elapsed).count());
      // Capture the instruction's result; otherwise the loop can never
      // observe completion and would always spin until timeout.
      __ready   = ::cuda::ptx::mbarrier_try_wait_parity(__native_handle(), __phase_parity, __wait_nsec);
      __elapsed = ::cuda::std::chrono::high_resolution_clock::now() - __start;
    } while (!__ready && (__nanosec > __elapsed));

    return __ready;
  }

  _CCCL_ASSERT(!::cuda::device::is_object_from(__barrier, ::cuda::device::address_space::cluster_shared),
               "barrier must be in cta shared or global memory, not other's block shared memory in the cluster");

  return ::cuda::std::__cccl_thread_poll_with_backoff(
    ::cuda::std::__barrier_poll_tester_parity<barrier>(this, __phase_parity), __nanosec);
}

[[nodiscard]] _CCCL_DEVICE_API _CCCL_FORCEINLINE bool
Expand Down Expand Up @@ -433,17 +439,19 @@ class barrier<thread_scope_block, ::cuda::std::__empty_completion> : public __bl
#if _CCCL_CUDA_COMPILATION()
//! @brief Arrive on the barrier and drop the expected-arrival count (SM90+ path).
//!
//! Fast path: when the barrier object lives in the calling CTA's shared
//! memory, issue the hardware mbarrier.arrive_drop instruction via inline PTX.
//! Otherwise the barrier must be in cta shared or global memory — never in
//! another block's shared memory within the cluster — and we fall back to the
//! generic software barrier implementation.
_CCCL_DEVICE_API _CCCL_FORCEINLINE void __arrive_and_drop_sm90()
{
  if (::cuda::device::is_object_from(__barrier, ::cuda::device::address_space::shared))
  {
    // TODO(bgruber): expose mbarrier.arrive_drop.shared in cuda::ptx
    asm volatile("mbarrier.arrive_drop.shared.b64 _, [%0];" ::"r"(static_cast<::cuda::std::uint32_t>(
                   ::__cvta_generic_to_shared(&__barrier)))
                 : "memory");
    return;
  }

  _CCCL_ASSERT(!::cuda::device::is_object_from(__barrier, ::cuda::device::address_space::cluster_shared),
               "barrier must be in cta shared or global memory, not other's block shared memory in the cluster");

  __barrier.arrive_and_drop();
}

_CCCL_DEVICE_API _CCCL_FORCEINLINE void __arrive_and_drop_sm80()
Expand Down