
Commit 27f4a8b

xavier-nvidia (Xavier Simmons) authored and committed
Fix GEMM+AR nvbugs 5219533,5127801,5072306
Update NVLS bootstrap to support MNNVL

Signed-off-by: xsimmons <[email protected]>
1 parent 9db769e commit 27f4a8b

27 files changed: +918, -480 lines

cpp/CMakeLists.txt

Lines changed: 18 additions & 2 deletions
@@ -44,6 +44,7 @@ option(ENABLE_MULTI_DEVICE
 option(ENABLE_UCX "Enable building with UCX (Uniform Communication X) support"
        ON)
 option(NVRTC_DYNAMIC_LINKING "Link against the dynamic NVRTC libraries" OFF)
+option(ENABLE_NVSHMEM "Enable building with NVSHMEM support" OFF)
 option(USING_OSS_CUTLASS_LOW_LATENCY_GEMM
        "Using open sourced Cutlass low latency gemm kernel" ON)
 option(USING_OSS_CUTLASS_FP4_GEMM "Using open sourced Cutlass fp4 gemm kernel"
@@ -53,6 +54,8 @@ option(USING_OSS_CUTLASS_MOE_GEMM "Using open sourced Cutlass moe gemm kernel"
 option(USING_OSS_CUTLASS_ALLREDUCE_GEMM
        "Using open sourced Cutlass AR gemm kernel" ON)
 
+message(STATUS "ENABLE_NVSHMEM is ${ENABLE_NVSHMEM}")
+
 if(NVTX_DISABLE)
   add_compile_definitions("NVTX_DISABLE")
   message(STATUS "NVTX is disabled")
@@ -165,6 +168,7 @@ message(STATUS "CUDA library status:")
 message(STATUS "  version: ${CUDAToolkit_VERSION}")
 message(STATUS "  libraries: ${CUDAToolkit_LIBRARY_DIR}")
 message(STATUS "  include path: ${CUDAToolkit_INCLUDE_DIRS}")
+message(STATUS "CUDA_NVML_LIB: ${CUDA_NVML_LIB}")
 
 # Prevent CMake from creating a response file for CUDA compiler, so clangd can
 # pick up on the includes
@@ -256,9 +260,21 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DBUILD_SYSTEM=cmake_oss ")
 # note: cmake expr generation $<BOOL:${ENABLE_MULTI_DEVICE}> is a build time
 # evaluation so hard to debug at cmake time
 if(ENABLE_MULTI_DEVICE)
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DENABLE_MULTI_DEVICE=1")
+  # Add target definitions for both C++ and CUDA
+  add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:ENABLE_MULTI_DEVICE=1>
+                          $<$<COMPILE_LANGUAGE:CUDA>:ENABLE_MULTI_DEVICE=1>)
+else()
+  # Add target definitions for both C++ and CUDA
+  add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:ENABLE_MULTI_DEVICE=0>
+                          $<$<COMPILE_LANGUAGE:CUDA>:ENABLE_MULTI_DEVICE=0>)
+endif()
+
+if(ENABLE_NVSHMEM)
+  add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:ENABLE_NVSHMEM=1>
+                          $<$<COMPILE_LANGUAGE:CUDA>:ENABLE_NVSHMEM=1>)
 else()
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DENABLE_MULTI_DEVICE=0")
+  add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:ENABLE_NVSHMEM=0>
+                          $<$<COMPILE_LANGUAGE:CUDA>:ENABLE_NVSHMEM=0>)
 endif()
 
 # Fix linking issue with TRT 10, the detailed description about `--mcmodel` can
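The move from appending -DENABLE_MULTI_DEVICE to CMAKE_CXX_FLAGS to add_compile_definitions() with $<COMPILE_LANGUAGE:CXX> and $<COMPILE_LANGUAGE:CUDA> generator expressions makes the macro visible to CUDA translation units as well as host C++ ones. A minimal sketch of code that relies on this (hypothetical file, not part of the commit; only the #if guard pattern comes from the build change):

    // multi_device_probe.cu -- hypothetical example showing why the macro must be
    // defined for CUDA files too: the same guard now compiles consistently whether
    // the translation unit goes through the C++ or the CUDA compiler.
    #include <cstdio>

    #if ENABLE_MULTI_DEVICE
    #include <mpi.h>
    #endif

    int getWorldSize()
    {
    #if ENABLE_MULTI_DEVICE
        int worldSize = 1;
        MPI_Comm_size(MPI_COMM_WORLD, &worldSize); // assumes MPI_Init() already ran
        return worldSize;
    #else
        return 1; // single-device build: no MPI dependency
    #endif
    }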

cpp/tensorrt_llm/CMakeLists.txt

Lines changed: 10 additions & 1 deletion
@@ -72,6 +72,12 @@ if(ENABLE_MULTI_DEVICE)
   include_directories(${MPI_C_INCLUDE_DIRS})
 endif()
 
+if(ENABLE_NVSHMEM)
+  # Add hints for aarch64
+  find_package(NVSHMEM REQUIRED HINTS /usr/lib/sbsa-linux-gnu/cmake/nvshmem/)
+  include_directories(/usr/include/nvshmem/)
+endif()
+
 if(NOT WIN32)
   set(DECODER_SHARED_TARGET_0 decoder_attention_0)
   set(DECODER_SHARED_TARGET_1 decoder_attention_1)
@@ -231,7 +237,10 @@ if(ENABLE_MULTI_DEVICE)
   set(TRTLLM_LINK_LIBS ${TRTLLM_LINK_LIBS} ${MPI_C_LIBRARIES} ${NCCL_LIB})
 endif()
 
-message("TRTLLM_LINK_LIBS: ${TRTLLM_LINK_LIBS}")
+if(ENABLE_NVSHMEM)
+  set(TRTLLM_LINK_LIBS ${TRTLLM_LINK_LIBS} nvshmem::nvshmem_host
+                       nvshmem::nvshmem_device)
+endif()
 
 if(NOT WIN32) # Unix-like compilers
   set(UNDEFINED_FLAG "-Wl,--no-undefined")
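Since the nvshmem::nvshmem_host and nvshmem::nvshmem_device targets are only found and linked when ENABLE_NVSHMEM is ON, any call into the NVSHMEM host API has to sit behind the matching compile definition introduced above. A hedged sketch (hypothetical file, not part of the commit; the NVSHMEM calls shown are standard host API entry points):

    // nvshmem_probe.cpp -- hypothetical example of guarding NVSHMEM usage.
    #include <cstdio>

    #if ENABLE_NVSHMEM
    #include <nvshmem.h>
    #endif

    void reportNvshmemPe()
    {
    #if ENABLE_NVSHMEM
        nvshmem_init(); // assumes a valid bootstrap (e.g. launched under MPI/PMI)
        std::printf("PE %d of %d\n", nvshmem_my_pe(), nvshmem_n_pes());
        nvshmem_finalize();
    #else
        std::printf("built without NVSHMEM support\n");
    #endif
    }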

cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm_configs.h

Lines changed: 2 additions & 2 deletions
@@ -332,12 +332,12 @@ enum class ClusterShape
     ClusterShape_1x2x1,
     ClusterShape_2x2x1,
     ClusterShape_1x4x1,
+    ClusterShape_4x1x1,
     ClusterShape_4x2x1,
     ClusterShape_2x4x1,
     ClusterShape_4x4x1,
     ClusterShape_1x8x1,
-    ClusterShape_8x1x1,
-    ClusterShape_4x1x1
+    ClusterShape_8x1x1
 };
 
 static auto get_cluster_shape_name(ClusterShape Shape_MNK)

cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/system_barrier.h

Lines changed: 22 additions & 15 deletions
@@ -22,6 +22,8 @@
 
 #include "cutlass/barrier.h"
 
+#include <cuda/atomic>
+
 namespace cutlass
 {
 
@@ -43,7 +45,7 @@ __forceinline__ __device__ uint32_t atomicCAS_system_acq(uint32_t* p, uint32_t c
 
 } // namespace detail
 
-template <class Sync, bool SafeBetweenPhases, bool UseMembarGPU>
+template <class Sync, bool SafeBetweenPhases>
 struct MulticastSystemBarrier : public GenericBarrier<Sync>
 {
 
@@ -57,23 +59,27 @@ struct MulticastSystemBarrier : public GenericBarrier<Sync>
 
 protected:
     /// Reduce into flag, with release pattern (int specialization)
-    CUTLASS_DEVICE
-    static void red_release(T* mc_ptr, int val)
+    template <cuda::thread_scope Scope>
+    CUTLASS_DEVICE static void red_release(T* mc_ptr, int val)
     {
 #if defined(CUTE_ARCH_MULTIMEM_SM90_ENABLED)
         // atomic reduction to all replicas
        // this can be conceptually thought of as __threadfence_system(); atomicAdd_system(arrival_counter_mc, 1);
        // See
        // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-multimem-ld-reduce-multimem-st-multimem-red
        // for multimem PTX doc
-        if constexpr (UseMembarGPU)
+        if constexpr (Scope == cuda::thread_scope::thread_scope_device)
         {
             asm volatile("multimem.red.release.gpu.global.add.u32 [%0], %1;" ::"l"(mc_ptr), "r"(val) : "memory");
         }
-        else
+        else if constexpr (Scope == cuda::thread_scope::thread_scope_system)
         {
             asm volatile("multimem.red.release.sys.global.add.u32 [%0], %1;" ::"l"(mc_ptr), "r"(val) : "memory");
         }
+        else
+        {
+            CUTE_INVALID_CONTROL_PATH("Invalid thread scope for MulticastSystemBarrier.");
+        }
 
        // Need a fence between MC and UC access to the same memory:
        // - fence.proxy instructions establish an ordering between memory accesses that may happen through different
@@ -128,8 +134,8 @@ struct MulticastSystemBarrier : public GenericBarrier<Sync>
         Sync::sync();
     }
 
-    CUTLASS_DEVICE
-    static T arrive_inc_get(T* mc_ptr, T* uc_ptr, int thread_idx, int flag_idx, int rank, int world_size)
+    template <cuda::thread_scope Scope>
+    CUTLASS_DEVICE static T arrive_inc_get(T* mc_ptr, T* uc_ptr, int thread_idx, int flag_idx, int rank, int world_size)
     {
         T* mc_barrier_ptr = mc_ptr + flag_idx;
         T* uc_barrier_ptr = uc_ptr + flag_idx;
@@ -156,37 +162,38 @@ struct MulticastSystemBarrier : public GenericBarrier<Sync>
             // can be immediately reused.
             bool master = rank == 0;
             int val = master ? 0x80000000 - (world_size - 1) : 1;
-            red_release(mc_barrier_ptr, val);
+            red_release<Scope>(mc_barrier_ptr, val);
         }
         return old_arrive;
     }
 
-    CUTLASS_DEVICE
-    static void arrive_inc(Params const& params, int thread_idx, int flag_idx, int rank, int world_size)
+    template <cuda::thread_scope Scope = cuda::thread_scope::thread_scope_system>
+    CUTLASS_DEVICE static void arrive_inc(Params const& params, int thread_idx, int flag_idx, int rank, int world_size)
    {
        T* mc_barrier = params.mc_barrier_ptr + flag_idx;
 
        Sync::sync();
 
        if (thread_idx == 0)
        {
-            red_release(mc_barrier, 1);
+            red_release<Scope>(mc_barrier, 1);
        }
    }
 
-    CUTLASS_DEVICE
-    static void arrive_and_wait(Params const& params, int thread_idx, int flag_idx, int rank, int world_size)
+    template <cuda::thread_scope Scope = cuda::thread_scope::thread_scope_system>
+    CUTLASS_DEVICE static void arrive_and_wait(
+        Params const& params, int thread_idx, int flag_idx, int rank, int world_size)
    {
        auto mc_ptr = params.mc_barrier_ptr;
        auto uc_ptr = params.uc_barrier_ptr;
        if constexpr (SafeBetweenPhases)
        {
-            auto old_arrive = arrive_inc_get(mc_ptr, uc_ptr, thread_idx, flag_idx, rank, world_size);
+            auto old_arrive = arrive_inc_get<Scope>(mc_ptr, uc_ptr, thread_idx, flag_idx, rank, world_size);
            wait(old_arrive, uc_ptr, thread_idx, flag_idx);
        }
        else
        {
-            arrive_inc(params, thread_idx, flag_idx, rank, world_size);
+            arrive_inc<Scope>(params, thread_idx, flag_idx, rank, world_size);
            wait_eq_reset(uc_ptr, thread_idx, flag_idx, world_size);
        }
    }
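Replacing the UseMembarGPU boolean with a cuda::thread_scope template parameter (hence the new <cuda/atomic> include) lets each call site choose the release scope of the multicast reduction at compile time: .gpu for device scope, .sys for the system-wide scope that cross-node NVLink (MNNVL) synchronization relies on. A standalone sketch of the same dispatch pattern, under the assumption of an SM90+ target where multimem.red is available (illustrative helper, not the library's code):

    #include <cuda/atomic> // provides cuda::thread_scope

    // Compile-time selection of the PTX release scope for a multicast atomic add.
    template <cuda::thread_scope Scope>
    __device__ __forceinline__ void multimem_red_add(unsigned int* mc_ptr, unsigned int val)
    {
    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
        if constexpr (Scope == cuda::thread_scope_device)
        {
            // Release ordering visible to the issuing GPU only.
            asm volatile("multimem.red.release.gpu.global.add.u32 [%0], %1;" ::"l"(mc_ptr), "r"(val) : "memory");
        }
        else if constexpr (Scope == cuda::thread_scope_system)
        {
            // Release ordering visible to every device in the system.
            asm volatile("multimem.red.release.sys.global.add.u32 [%0], %1;" ::"l"(mc_ptr), "r"(val) : "memory");
        }
        else
        {
            static_assert(Scope == cuda::thread_scope_device || Scope == cuda::thread_scope_system,
                "unsupported thread scope for multimem_red_add");
        }
    #endif
    }

    // Usage (device code): multimem_red_add<cuda::thread_scope_system>(barrier_mc_ptr, 1u);

With the defaults chosen in the diff, callers that do not specify a scope keep the system-scope behavior, while the GEMM epilogues below opt into the narrower device scope explicitly.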

cpp/tensorrt_llm/kernels/cutlass_kernels/CMakeLists.txt

Lines changed: 6 additions & 2 deletions
@@ -181,8 +181,7 @@ endif()
 if(USING_OSS_CUTLASS_ALLREDUCE_GEMM)
   add_library(
     ar_gemm_src STATIC
-    ${ARGEMM_SRC_CU}
-    ${CMAKE_CURRENT_SOURCE_DIR}/../../runtime/ipcNvlsMemory.cpp)
+    ${ARGEMM_SRC_CU} ${CMAKE_CURRENT_SOURCE_DIR}/../../runtime/ipcNvlsMemory.cu)
   target_include_directories(
     ar_gemm_src
     PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../internal_cutlass_kernels/include)
@@ -233,6 +232,11 @@ function(process_target target_name enable_hopper enable_blackwell)
     target_link_libraries(${target_name} PRIVATE ${MPI_C_LIBRARIES})
   endif()
 
+  if(ENABLE_NVSHMEM)
+    target_link_libraries(${target_name} PRIVATE nvshmem::nvshmem_host
+                                                 nvshmem::nvshmem_device)
+  endif()
+
 endfunction()
 
 set(TARGET_LIB

cpp/tensorrt_llm/kernels/cutlass_kernels/allreduce_gemm/allreduce_gemm_impl_sm100.h

Lines changed: 1 addition & 1 deletion
@@ -138,7 +138,7 @@ class GemmAllReduceImplTwoshot_Sm100 : public GemmAllReduceImplInterface
     // Epilogue
     ////////////////
     using FusionCallbacks = cutlass::epilogue::fusion::LinearCombination<ElementD, float, void, float>;
-    using TileBarrierType = cutlass::MulticastSystemBarrier<cutlass::detail::SyncNoOp, true, true>;
+    using TileBarrierType = cutlass::MulticastSystemBarrier<cutlass::detail::SyncNoOp, true>;
     using EpilogueScheduleType = typename MmaAdapter<MmaType, IsFP4>::EpilogueSchedule;
     using EpilogueTileType = cutlass::epilogue::collective::EpilogueTileAuto;
     using FusionOp

cpp/tensorrt_llm/kernels/cutlass_kernels/allreduce_gemm/allreduce_gemm_impl_sm90.h

Lines changed: 1 addition & 2 deletions
@@ -100,8 +100,7 @@ class GemmAllReduceImplTwoshot_Sm90 : public GemmAllReduceImplInterface
     using RasterOrderOptions =
         typename cutlass::gemm::kernel::detail::PersistentTileSchedulerSm90Params::RasterOrderOptions;
 
-    using TileBarrierType = cutlass::MulticastSystemBarrier<cutlass::detail::SyncNoOp, true /* Safe across phases */,
-        true /* membar.gpu */>;
+    using TileBarrierType = cutlass::MulticastSystemBarrier<cutlass::detail::SyncNoOp, true /* Safe across phases */>;
 
     // 16B alignment for TMA
     static constexpr int AlignmentA = 16 / sizeof(ElementA);

cpp/tensorrt_llm/kernels/cutlass_kernels/allreduce_gemm/communication/sm90_allreduce_nvls_warpspecialized.hpp

Lines changed: 27 additions & 12 deletions
@@ -201,7 +201,7 @@ class CollectiveAllReduceMulticastWarpSpecialized
         auto [M, N, K, L] = problem_shape;
         auto [m, n, k, l] = tile_coord;
 
-        if (!tile_valid(m, n) || params_ptr->world_size == 1)
+        if (!tile_valid(m, n) || params_ptr->world_size <= 2)
         {
             return; // nothing to do
         }
@@ -212,7 +212,7 @@ class CollectiveAllReduceMulticastWarpSpecialized
 
         // Wait for all multicast writes to be visible to us.
         // This is safe between phases.
-        SystemBarrier::arrive_and_wait(
+        SystemBarrier::arrive_and_wait<cuda::thread_scope::thread_scope_system>(
             params_ptr->barrier_params_final_sync, thread_idx, tile_index, params_ptr->rank, params_ptr->world_size);
     }
 
@@ -297,21 +297,28 @@ class CollectiveAllReduceMulticastWarpSpecialized
                 Tensor tGR_gD1_vec = zipped_divide(tGR_gD1(_, _, _, red_m, red_n), Vec);
                 Tensor tRG_gOut_vec = zipped_divide(tRG_gOut(_, _, _, red_m, red_n), Vec);
 
-                auto pred_fn
-                    = [&](auto const&... coords) { return elem_less(tGR_pD_vec(_0{}, coords...), problem_shape); };
+                // Create predicate tensor for bounds checking
+                Tensor pred_tensor = make_tensor<bool>(make_shape(size(tGR_pD_vec)), Stride<_1>{});
+
+                // Set predicate values based on coordinate bounds
+                CUTLASS_PRAGMA_UNROLL
+                for (int i = 0; i < size(pred_tensor); ++i)
+                {
+                    pred_tensor(i) = elem_less(tGR_pD_vec(_0{}, i), problem_shape);
+                }
 
                 // Read from self.
-                cute::copy_if(CopyAtomG2R{}, pred_fn, tGR_gD0_vec, tGR_rD0_vec);
+                cute::copy_if(CopyAtomG2R{}, pred_tensor, tGR_gD0_vec, tGR_rD0_vec);
                 // Read from remote.
-                cute::copy_if(CopyAtomG2R{}, pred_fn, tGR_gD1_vec, tGR_rD1_vec);
+                cute::copy_if(CopyAtomG2R{}, pred_tensor, tGR_gD1_vec, tGR_rD1_vec);
                 // Reduce
                 CUTLASS_PRAGMA_UNROLL
                 for (int i = 0; i < size(tGR_rD0_vec); i++)
                 {
                     tGR_rD0_vec(i) += tGR_rD1_vec(i);
                 }
                 // store to self.
-                cute::copy_if(CopyAtomG2R{}, pred_fn, tGR_rD0_vec, tRG_gOut_vec);
+                cute::copy_if(CopyAtomG2R{}, pred_tensor, tGR_rD0_vec, tRG_gOut_vec);
             }
         }
     }
@@ -386,13 +393,21 @@ class CollectiveAllReduceMulticastWarpSpecialized
                 Tensor tGR_gD_vec = zipped_divide(tGR_gD(_, _, _, red_m, red_n), Vec);
                 Tensor tRG_gD_vec = zipped_divide(tRG_gD(_, _, _, red_m, red_n), Vec);
                 Tensor tGR_pD_vec = zipped_divide(tGR_pD(_, _, _, red_m, red_n), Vec);
-                // problem shape bounds check
-                auto pred_fn
-                    = [&](auto const&... coords) { return elem_less(tGR_pD_vec(_0{}, coords...), problem_shape); };
+
+                // Create predicate tensor for bounds checking
+                Tensor pred_tensor = make_tensor<bool>(make_shape(size(tGR_gD_vec)), Stride<_1>{});
+
+                // Set predicate values based on coordinate bounds
+                CUTLASS_PRAGMA_UNROLL
+                for (int i = 0; i < size(pred_tensor); ++i)
+                {
+                    pred_tensor(i) = elem_less(tGR_pD_vec(_0{}, i), problem_shape);
+                }
+
                 // load-reduce in switch
-                cute::copy_if(CopyAtomG2R{}, pred_fn, tGR_gD_vec, tGR_rD_vec);
+                cute::copy_if(CopyAtomG2R{}, pred_tensor, tGR_gD_vec, tGR_rD_vec);
                 // store switch multicast
-                cute::copy_if(CopyAtomR2G{}, pred_fn, tGR_rD_vec, tRG_gD_vec);
+                cute::copy_if(CopyAtomR2G{}, pred_tensor, tGR_rD_vec, tRG_gD_vec);
             }
         }
     }
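The predicate lambda previously passed to cute::copy_if is replaced by an explicit per-element boolean tensor filled from the coordinate (identity) tensor, and the result is handed to the copy_if overload that takes a predicate tensor. A condensed sketch of that pattern, assuming CuTe headers and a source fragment whose size is known at compile time (illustrative helper, not the kernel's code):

    #include <cute/tensor.hpp>

    // Predicated copy driven by an explicit bool tensor rather than a lambda.
    template <class CopyAtom, class SrcTensor, class DstTensor, class CoordTensor, class ProblemShape>
    CUTE_DEVICE void predicated_copy(CopyAtom const& copy_atom, SrcTensor const& src, DstTensor& dst,
        CoordTensor const& coords, ProblemShape const& problem_shape)
    {
        using namespace cute;
        // One boolean per element: true when the element's global coordinate is in bounds.
        Tensor pred = make_tensor<bool>(make_shape(size(src)), Stride<_1>{});
        CUTE_UNROLL
        for (int i = 0; i < size(pred); ++i)
        {
            pred(i) = elem_less(coords(i), problem_shape);
        }
        // Elements with a false predicate are skipped by the copy.
        copy_if(copy_atom, pred, src, dst);
    }

A call site would pass the thread-local source and destination fragments together with the matching slice of the coordinate tensor, as the kernel does with tGR_pD_vec above.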

cpp/tensorrt_llm/kernels/cutlass_kernels/allreduce_gemm/epilogue/sm100_visitor_allreduce_tma_warpspecialized.hpp

Lines changed: 1 addition & 1 deletion
@@ -171,7 +171,7 @@ struct Sm100AllReduceArrive
         tma_store_wait<0>();
 
         int tile_idx = params_ptr->tile_layout(m, n);
-        SystemBarrier::arrive_inc(
+        SystemBarrier::arrive_inc<cuda::thread_scope::thread_scope_device>(
             params_ptr->barrier_params, thread_idx, tile_idx, params_ptr->rank, params_ptr->world_size);
     }
 }

cpp/tensorrt_llm/kernels/cutlass_kernels/allreduce_gemm/epilogue/sm90_visitor_allreduce_tma_warpspecialized.hpp

Lines changed: 1 addition & 1 deletion
@@ -268,7 +268,7 @@ struct Sm90AuxAllReduce
         tma_store_wait<0>();
 
         int tile_idx = params_ptr->tile_layout(m, n);
-        SystemBarrier::arrive_inc(
+        SystemBarrier::arrive_inc<cuda::thread_scope::thread_scope_device>(
            params_ptr->barrier_params, thread_idx, tile_idx, params_ptr->rank, params_ptr->world_size);
     }
 };
