sgl-project · zhyncs · Aug 21, 2025 · Aug 20, 2025 · Aug 20, 2025 · gemini-code-assist
@@ -1488,6 +1488,10 @@ struct CollectiveMmaArrayMixedInput<
   template <class... TMs>
   CUTLASS_DEVICE void
   tensormaps_cp_fence_release(TensorMapStorage& shared_tensormaps, cute::tuple<TMs...> const& input_tensormaps) {
+    if (cute::elect_one_sync()) {
+      cute::tma_desc_commit_group();
+      cute::tma_desc_wait_group();
+    }
     // The fence-release calls below must be executed by the entire warp (i.e. the operation is warp-aligned)
     tma_descriptor_cp_fence_release(get<0>(input_tensormaps), shared_tensormaps.smem_tensormap_A);
     tma_descriptor_cp_fence_release(get<1>(input_tensormaps), shared_tensormaps.smem_tensormap_B);