NVIDIA · fbusato · Feb 5, 2026 · Jan 15, 2026 · Jan 15, 2026 · Jan 15, 2026
@@ -2,25 +2,61 @@
 // SPDX-License-Identifier: BSD-3-Clause
 #pragma once
 
-#include <cub/thread/thread_operators.cuh>
-
 #include <cuda/functional>
 #include <cuda/std/functional>
 #include <cuda/std/limits>
+#include <cuda/type_traits>
 
 #include <c2h/custom_type.h>
+#include <c2h/extended_types.h>
 #include <c2h/test_util_vec.h>
 
 /***********************************************************************************************************************
  * CUB operator to identity
  **********************************************************************************************************************/
 
-template <typename Operator, typename T>
-inline constexpr T identity_v = cub::detail::identity_v<Operator, T>;
+template <typename Operator, typename T, typename = void> // primary template; the unnamed third parameter defaults to void (presumably a hook for constrained specializations -- TODO confirm)
+inline constexpr T identity_v = cuda::identity_element<Operator, T>(); // default value comes from cuda::identity_element; the specializations in this header override it for specific Operator/T pairs
 
 template <typename T>
 inline const T identity_v<cuda::std::plus<>, T> = T{}; // e.g. short2, float2, complex<__half> etc.
 
+/***********************************************************************************************************************
+ * half_t specializations
+ **********************************************************************************************************************/
+
+template <> // declared const rather than constexpr -- presumably half_t cannot be built in a constant expression; TODO confirm
+inline const half_t identity_v<cuda::std::plus<>, half_t> = half_t{0.0f}; // identity for addition: zero
+
+template <>
+inline const half_t identity_v<cuda::std::multiplies<>, half_t> = half_t{1.0f}; // identity for multiplication: one
+
+template <>
+inline const half_t identity_v<cuda::minimum<>, half_t> = cuda::std::numeric_limits<half_t>::max(); // identity for minimum: numeric_limits max()
+
+template <>
+inline const half_t identity_v<cuda::maximum<>, half_t> = cuda::std::numeric_limits<half_t>::lowest(); // identity for maximum: lowest(), not min() (min() is conventionally the smallest positive value for floating-point types)
+
+/***********************************************************************************************************************
+ * bfloat16_t specializations
+ **********************************************************************************************************************/
+
+template <> // declared const rather than constexpr -- presumably bfloat16_t cannot be built in a constant expression; TODO confirm
+inline const bfloat16_t identity_v<cuda::std::plus<>, bfloat16_t> = bfloat16_t{0.0f}; // identity for addition: zero
+
+template <>
+inline const bfloat16_t identity_v<cuda::std::multiplies<>, bfloat16_t> = bfloat16_t{1.0f}; // identity for multiplication: one
+
+template <>
+inline const bfloat16_t identity_v<cuda::minimum<>, bfloat16_t> = cuda::std::numeric_limits<bfloat16_t>::max(); // identity for minimum: numeric_limits max()
+
+template <>
+inline const bfloat16_t identity_v<cuda::maximum<>, bfloat16_t> = cuda::std::numeric_limits<bfloat16_t>::lowest(); // identity for maximum: lowest(), not min() (min() is conventionally the smallest positive value for floating-point types)
+
+/***********************************************************************************************************************
+ * short2, ushort2, float2 specializations
+ **********************************************************************************************************************/
+
 template <>
 inline constexpr short2 identity_v<cuda::maximum<>, short2> =
   short2{cuda::std::numeric_limits<int16_t>::lowest(), cuda::std::numeric_limits<int16_t>::lowest()};

@@ -579,84 +579,6 @@ template <typename Operator>
     return op;
   }
 }
-
-//----------------------------------------------------------------------------------------------------------------------
-// Identity
-
-template <typename Op, typename T = void>
-inline constexpr T identity_v;
-
-template <typename T>
-inline constexpr T identity_v<::cuda::minimum<>, T> = ::cuda::std::numeric_limits<T>::max();
-
-template <typename T>
-inline constexpr T identity_v<::cuda::minimum<T>, T> = ::cuda::std::numeric_limits<T>::max();
-
-template <typename T>
-inline constexpr T identity_v<::cuda::minimum<T>, void> = ::cuda::std::numeric_limits<T>::max();
-
-template <typename T>
-inline constexpr T identity_v<::cuda::maximum<>, T> = ::cuda::std::numeric_limits<T>::lowest();
-
-template <typename T>
-inline constexpr T identity_v<::cuda::maximum<T>, T> = ::cuda::std::numeric_limits<T>::lowest();
-
-template <typename T>
-inline constexpr T identity_v<::cuda::maximum<T>, void> = ::cuda::std::numeric_limits<T>::lowest();
-
-template <typename T>
-inline constexpr T identity_v<::cuda::std::plus<T>, T> = T{};
-
-template <typename T>
-inline constexpr T identity_v<::cuda::std::plus<>, T> = T{};
-
-template <typename T>
-inline constexpr T identity_v<::cuda::std::plus<T>, void> = T{};
-
-template <typename T>
-inline constexpr T identity_v<::cuda::std::bit_and<>, T> = static_cast<T>(~T{});
-
-template <typename T>
-inline constexpr T identity_v<::cuda::std::bit_and<T>, T> = static_cast<T>(~T{});
-
-template <typename T>
-inline constexpr T identity_v<::cuda::std::bit_and<T>, void> = static_cast<T>(~T{});
-
-template <typename T>
-inline constexpr T identity_v<::cuda::std::bit_or<>, T> = T{};
-
-template <typename T>
-inline constexpr T identity_v<::cuda::std::bit_or<T>, T> = T{};
-
-template <typename T>
-inline constexpr T identity_v<::cuda::std::bit_or<T>, void> = T{};
-
-template <typename T>
-inline constexpr T identity_v<::cuda::std::bit_xor<>, T> = T{};
-
-template <typename T>
-inline constexpr T identity_v<::cuda::std::bit_xor<T>, T> = T{};
-
-template <typename T>
-inline constexpr T identity_v<::cuda::std::bit_xor<T>, void> = T{};
-
-template <typename T>
-inline constexpr T identity_v<::cuda::std::logical_and<>, T> = true;
-
-template <typename T>
-inline constexpr T identity_v<::cuda::std::logical_and<T>, T> = true;
-
-template <typename T>
-inline constexpr T identity_v<::cuda::std::logical_and<T>, void> = true;
-
-template <typename T>
-inline constexpr T identity_v<::cuda::std::logical_or<>, T> = false;
-
-template <typename T>
-inline constexpr T identity_v<::cuda::std::logical_or<T>, T> = false;
-
-template <typename T>
-inline constexpr T identity_v<::cuda::std::logical_or<T>, void> = false;
 } // namespace detail
 
 #endif // !_CCCL_DOXYGEN_INVOKED

@@ -14,6 +14,7 @@
 #include <c2h/catch2_test_helper.h>
 #include <c2h/extended_types.h>
 #include <c2h/generators.h>
+#include <c2h/operator.cuh>
 
 constexpr int max_size  = 16;
 constexpr int num_seeds = 3;
@@ -158,7 +159,7 @@ C2H_TEST("ThreadScanExclusive Integral Type Tests",
   using dist_param                 = dist_interval<value_t, op_t, num_items, accum_t, output_t>;
   using filler_dist_param          = dist_interval<accum_t, op_t, num_items, accum_t, output_t>;
   constexpr auto scan_op           = op_t{};
-  constexpr auto operator_identity = cub_operator_to_identity<accum_t, op_t>::value();
+  constexpr auto operator_identity = cuda::identity_element<op_t, accum_t>();
   const int valid_items            = GENERATE_COPY(
     take(1, random(2, cuda::std::max(2, num_items - 1))),
     take(1, random(num_items + 2, cuda::std::numeric_limits<int>::max())),
@@ -215,7 +216,7 @@ C2H_TEST("ThreadScanExclusive Floating-Point Type Tests",
   using dist_param             = dist_interval<value_t, op_t, num_items, accum_t, output_t>;
   using filler_dist_param      = dist_interval<accum_t, op_t, num_items, accum_t, output_t>;
   constexpr auto scan_op       = op_t{};
-  const auto operator_identity = cub_operator_to_identity<accum_t, op_t>::value();
+  const auto operator_identity = cuda::identity_element<op_t, accum_t>();
   const int valid_items        = GENERATE_COPY(
     take(1, random(2, cuda::std::max(2, num_items - 1))),
     take(1, random(num_items + 2, cuda::std::numeric_limits<int>::max())),
@@ -274,7 +275,7 @@ C2H_TEST("ThreadScanExclusive Narrow PrecisionType Tests",
   using dist_param             = dist_interval<value_t, op_t, num_items, accum_t, output_t>;
   using filler_dist_param      = dist_interval<accum_t, op_t, num_items, accum_t, output_t>;
   constexpr auto scan_op       = unwrap_op(std::true_type{}, op_t{});
-  const auto operator_identity = cub_operator_to_identity<accum_t, op_t>::value();
+  const auto operator_identity = identity_v<op_t, accum_t>;
   const int valid_items        = GENERATE_COPY(
     take(1, random(2, cuda::std::max(2, num_items - 1))),
     take(1, random(num_items + 2, cuda::std::numeric_limits<int>::max())),

@@ -14,6 +14,7 @@
 #include <c2h/catch2_test_helper.h>
 #include <c2h/extended_types.h>
 #include <c2h/generators.h>
+#include <c2h/operator.cuh>
 
 constexpr int max_size  = 16;
 constexpr int num_seeds = 3;
@@ -158,7 +159,7 @@ C2H_TEST("ThreadScanInclusive Integral Type Tests",
   using dist_param                 = dist_interval<value_t, op_t, num_items, accum_t, output_t>;
   using filler_dist_param          = dist_interval<accum_t, op_t, num_items, accum_t, output_t>;
   constexpr auto scan_op           = op_t{};
-  constexpr auto operator_identity = cub_operator_to_identity<accum_t, op_t>::value();
+  constexpr auto operator_identity = cuda::identity_element<op_t, accum_t>();
   const int valid_items            = GENERATE_COPY(
     take(1, random(2, cuda::std::max(2, num_items - 1))),
     take(1, random(num_items + 2, cuda::std::numeric_limits<int>::max())),
@@ -218,7 +219,7 @@ C2H_TEST("ThreadScanInclusive Floating-Point Type Tests",
   using dist_param             = dist_interval<value_t, op_t, num_items, accum_t, output_t>;
   using filler_dist_param      = dist_interval<accum_t, op_t, num_items, accum_t, output_t>;
   constexpr auto scan_op       = op_t{};
-  const auto operator_identity = cub_operator_to_identity<accum_t, op_t>::value();
+  const auto operator_identity = cuda::identity_element<op_t, accum_t>();
   const int valid_items        = GENERATE_COPY(
     take(1, random(2, cuda::std::max(2, num_items - 1))),
     take(1, random(num_items + 2, cuda::std::numeric_limits<int>::max())),
@@ -280,7 +281,7 @@ C2H_TEST("ThreadScanInclusive Narrow PrecisionType Tests",
   using dist_param             = dist_interval<value_t, op_t, num_items, accum_t, output_t>;
   using filler_dist_param      = dist_interval<accum_t, op_t, num_items, accum_t, output_t>;
   constexpr auto scan_op       = unwrap_op(std::true_type{}, op_t{});
-  const auto operator_identity = cub_operator_to_identity<accum_t, op_t>::value();
+  const auto operator_identity = identity_v<op_t, accum_t>;
   const int valid_items        = GENERATE_COPY(
     take(1, random(2, cuda::std::max(2, num_items - 1))),
     take(1, random(num_items + 2, cuda::std::numeric_limits<int>::max())),

@@ -136,76 +136,6 @@ struct cub_operator_to_std<T, cuda::maximum<>>
 template <typename T, typename Operator>
 using cub_operator_to_std_t = typename cub_operator_to_std<T, Operator>::type;
 
-/***********************************************************************************************************************
- * CUB operator to identity
- **********************************************************************************************************************/
-
-template <typename T, typename Operator, typename = void>
-struct cub_operator_to_identity;
-
-template <typename T>
-struct cub_operator_to_identity<T, cuda::std::plus<>>
-{
-  static constexpr T value()
-  {
-    return T{};
-  }
-};
-
-template <typename T>
-struct cub_operator_to_identity<T, cuda::std::multiplies<>>
-{
-  static constexpr T value()
-  {
-    return T{1};
-  }
-};
-
-template <typename T>
-struct cub_operator_to_identity<T, cuda::std::bit_and<>>
-{
-  static constexpr T value()
-  {
-    return static_cast<T>(~T{0});
-  }
-};
-
-template <typename T>
-struct cub_operator_to_identity<T, cuda::std::bit_or<>>
-{
-  static constexpr T value()
-  {
-    return T{0};
-  }
-};
-
-template <typename T>
-struct cub_operator_to_identity<T, cuda::std::bit_xor<>>
-{
-  static constexpr T value()
-  {
-    return T{0};
-  }
-};
-
-template <typename T>
-struct cub_operator_to_identity<T, cuda::minimum<>>
-{
-  static constexpr T value()
-  {
-    return ::std::numeric_limits<T>::max();
-  }
-};
-
-template <typename T>
-struct cub_operator_to_identity<T, cuda::maximum<>>
-{
-  static constexpr T value()
-  {
-    return ::std::numeric_limits<T>::min();
-  }
-};
-
 /***********************************************************************************************************************
  * Type list definition
  **********************************************************************************************************************/
@@ -341,9 +271,10 @@ constexpr int num_seeds = 10;
 C2H_TEST("ThreadReduce Integral Type Tests", "[reduce][thread]", integral_type_list, cub_operator_integral_list)
 {
   using value_t                    = c2h::get<0, TestType>;
-  constexpr auto reduce_op         = c2h::get<1, TestType>{};
-  constexpr auto std_reduce_op     = cub_operator_to_std_t<value_t, c2h::get<1, TestType>>{};
-  constexpr auto operator_identity = cub_operator_to_identity<value_t, c2h::get<1, TestType>>::value();
+  using op_t                       = c2h::get<1, TestType>;
+  constexpr auto reduce_op         = op_t{};
+  constexpr auto std_reduce_op     = cub_operator_to_std_t<value_t, op_t>{};
+  constexpr auto operator_identity = cuda::identity_element<op_t, value_t>();
   CAPTURE(c2h::type_name<value_t>(), max_size, c2h::type_name<decltype(reduce_op)>());
   c2h::device_vector<value_t> d_in(max_size);
   c2h::device_vector<value_t> d_out(1);
@@ -360,9 +291,10 @@ C2H_TEST("ThreadReduce Integral Type Tests", "[reduce][thread]", integral_type_l
 C2H_TEST("ThreadReduce Floating-Point Type Tests", "[reduce][thread]", fp_type_list, cub_operator_fp_list)
 {
   using value_t                = c2h::get<0, TestType>;
-  constexpr auto reduce_op     = c2h::get<1, TestType>{};
-  constexpr auto std_reduce_op = cub_operator_to_std_t<value_t, c2h::get<1, TestType>>{};
-  const auto operator_identity = cub_operator_to_identity<value_t, c2h::get<1, TestType>>::value();
+  using op_t                   = c2h::get<1, TestType>;
+  constexpr auto reduce_op     = op_t{};
+  constexpr auto std_reduce_op = cub_operator_to_std_t<value_t, op_t>{};
+  const auto operator_identity = cuda::identity_element<op_t, value_t>();
   CAPTURE(c2h::type_name<value_t>(), max_size, c2h::type_name<decltype(reduce_op)>());
   c2h::device_vector<value_t> d_in(max_size);
   c2h::device_vector<value_t> d_out(1);
@@ -384,9 +316,10 @@ C2H_TEST("ThreadReduce Narrow PrecisionType Tests",
          cub_operator_fp_list)
 {
   using value_t                = c2h::get<0, TestType>;
-  constexpr auto reduce_op     = c2h::get<1, TestType>{};
-  constexpr auto std_reduce_op = cub_operator_to_std_t<float, c2h::get<1, TestType>>{};
-  const auto operator_identity = cub_operator_to_identity<float, c2h::get<1, TestType>>::value();
+  using op_t                   = c2h::get<1, TestType>;
+  constexpr auto reduce_op     = op_t{};
+  constexpr auto std_reduce_op = cub_operator_to_std_t<float, op_t>{};
+  const auto operator_identity = cuda::identity_element<op_t, float>();
   c2h::device_vector<value_t> d_in(max_size);
   c2h::device_vector<value_t> d_out(1);
   c2h::gen(C2H_SEED(num_seeds), d_in, value_t{1.0f}, value_t{2.0f});