21 changes: 19 additions & 2 deletions paddle/phi/kernels/kps/reduce_kernel.cu
@@ -248,8 +248,25 @@ void SumRawKernel(const Context& dev_ctx,
"now."));
#endif
} else {
phi::Reduce<T, kps::AddFunctor, kps::IdentityFunctor>(
dev_ctx, x, reduce_all, dims.GetData(), keep_dim, out_dtype, out);
if (x.dtype() == phi::DataType::BFLOAT16 &&
out_dtype == phi::DataType::FLOAT32) {
std::vector<int> reduce_dims = phi::funcs::details::GetReduceDim(
dims.GetData(), x.dims().size(), reduce_all);

phi::funcs::ReduceKernel<
phi::dtype::bfloat16,
float,
kps::AddFunctor,
kps::IdentityFunctor<phi::dtype::bfloat16, float>>(
dev_ctx,
x,
out,
kps::IdentityFunctor<phi::dtype::bfloat16, float>(),
reduce_dims);
} else {
phi::Reduce<T, kps::AddFunctor, kps::IdentityFunctor>(
dev_ctx, x, reduce_all, dims.GetData(), keep_dim, out_dtype, out);
}
}
}
} // namespace phi
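Why the new branch matters: bfloat16 keeps only 8 mantissa bits, so a running bf16 sum stops growing once the gap between representable values exceeds the addends. The sketch below is plain Python that emulates bf16 by truncating a float32 to its top 16 bits (real hardware rounds to nearest-even, which stalls at the same point for this input); it shows the failure mode the float32 accumulator avoids.

import struct

def to_bf16(x: float) -> float:
    # Emulate bfloat16 by keeping only the top 16 bits of the float32
    # encoding (sign + 8 exponent bits + 7 explicit mantissa bits survive).
    (bits,) = struct.unpack("<I", struct.pack("<f", x))
    (y,) = struct.unpack("<f", struct.pack("<I", bits & 0xFFFF0000))
    return y

# Sum 4096 ones with a bf16 accumulator: above 256 the spacing between
# representable bf16 values is 2, so 256 + 1 rounds back to 256 and the
# sum saturates far below the true result.
acc = 0.0
for _ in range(4096):
    acc = to_bf16(acc + 1.0)
print(acc)  # 256.0, not 4096.0

With this change, a reduction that requests a float32 output on a bfloat16 tensor, e.g. paddle.sum(x, dtype=paddle.float32) (assuming that call lowers to SumRawKernel with out_dtype FLOAT32), accumulates in float32 and returns the exact 4096.0.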
1 change: 1 addition & 0 deletions python/paddle/distributed/fleet/layers/mpu/mp_layers.py
@@ -306,6 +306,7 @@ def backward(ctx, dy):
             task.wait()
             return dx, None, None
         else:
+            # When main_grad is not enabled and gradient accumulation is used, grad is not initialized yet on the first accumulation step.
             (
                 dw,
                 dbias,
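For context on the new comment, here is a minimal sketch (hypothetical helper, not the Paddle API) of the accumulation behavior it describes: on the first micro-batch no grad buffer exists yet, so the fused grad-add path must create the buffer instead of accumulating into it.

def accumulate_weight_grad(weight, dw):
    # First accumulation step: grad is not initialized yet, so create it.
    if weight.grad is None:
        weight.grad = dw
    else:
        # Later steps accumulate into the existing buffer.
        weight.grad += dw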
19 changes: 19 additions & 0 deletions python/paddle/distributed/fleet/utils/sequence_parallel_utils.py
@@ -14,6 +14,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import os
+
 import paddle
 from paddle import distributed as dist
 from paddle.autograd import PyLayer
@@ -28,6 +30,8 @@
     functional as F,
 )
 
+from .log_util import logger
+
 ####################################################
 #                                                  #
 #        Distributed Communication Operator        #
@@ -234,6 +238,9 @@ def is_fused_linear_param_grad_add_supported():
         return False
 
 
+_raise_cuda_env_unset_warning_for_sp = True
+
+
 class SPInnerOverlapLinear(paddle.autograd.PyLayer):
     @staticmethod
     def forward(
@@ -290,6 +297,17 @@ def backward(ctx, dy):
             group=group,
             sync_op=False,
         )
+        # Launch a small op to preempt GPU SMs so the pending all_reduce can overlap with subsequent compute.
+        if int(os.getenv("CUDA_DEVICE_MAX_CONNECTIONS", "0")) != 1:
+            global _raise_cuda_env_unset_warning_for_sp
+            if _raise_cuda_env_unset_warning_for_sp:
+                logger.warning(
+                    "You set mp_async_allreduce=True but forgot to set the environment "
+                    "variable CUDA_DEVICE_MAX_CONNECTIONS=1, which may lead to performance "
+                    "loss. Export CUDA_DEVICE_MAX_CONNECTIONS=1 for better performance."
+                )
+                _raise_cuda_env_unset_warning_for_sp = False
+        tmp = paddle.ones([512])
 
         if ctx.mp_fused_linear_param_grad_add:
             if not is_fused_linear_param_grad_add_supported():
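A note on the check above: CUDA_DEVICE_MAX_CONNECTIONS caps the number of hardware work queues the CUDA driver multiplexes streams onto, and it is read when the CUDA context is created. With a single connection, kernels reach the GPU strictly in launch order, so the small paddle.ones launch can reliably grab SMs ahead of later compute while the asynchronous all_reduce overlaps it. A minimal sketch of setting it at the top of a training entry point (the variable must be set before the first CUDA call):

import os

# Must run before Paddle creates the CUDA context; one connection means
# kernel launches are issued to the GPU in order, enabling the overlap
# trick above.
os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "1"

import paddle  # noqa: E402  (imported after the env var on purpose)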
@@ -355,6 +373,7 @@ def backward(ctx, dy):
             task.wait()
             return dx, None, None
         else:
+            # When main_grad is not enabled and gradient accumulation is used, grad is not initialized yet on the first accumulation step.
             (
                 dw,
                 dbias,