
Commit 1f90301

Merge branch 'PaddlePaddle:develop' into tensor

2 parents 4c9f968 + ecbee80
File tree

7 files changed: +212 −39 lines

paddle/fluid/distributed/collective/deep_ep/include/event_pool.h

Lines changed: 1 addition & 1 deletion

@@ -22,7 +22,7 @@ namespace deep_ep::detail {
 
 class EventPool {
  public:
-  EventPool() = default;
+  EventPool();
  EventPool(const EventPool&) = delete;
  EventPool(EventPool&&) = delete;
  ~EventPool();

paddle/fluid/distributed/collective/deep_ep/src/event_pool.cc

Lines changed: 10 additions & 0 deletions

@@ -22,6 +22,16 @@ EventPool &EventPool::Instance() {
   return pool;
 }
 
+EventPool::EventPool() {
+  for (size_t i = 0; i < 1000; ++i) {
+    cudaEvent_t new_event;
+    CUDA_CHECK(cudaEventCreate(&new_event));
+
+    cudaEventRecord(new_event, 0);
+    incomplished_events_.push(new_event);
+  }
+}
+
 EventPool::~EventPool() {
   const auto &DestroyEvent = [](cudaEvent_t event) {
     cudaError_t e = cudaEventDestroy(event);
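
The new constructor pre-creates 1000 CUDA events and records each one on the default stream before pushing it into incomplished_events_, so later requests avoid the event-creation cost. Below is a minimal pure-Python sketch of the same pre-allocation pattern; placeholder objects stand in for cudaEvent_t, and the acquire helper is hypothetical, not part of this commit.

import queue


class _Event:
    """Placeholder standing in for a cudaEvent_t handle."""


class EventPoolSketch:
    def __init__(self, prealloc: int = 1000) -> None:
        # Mirror of the new constructor: create events up front and park
        # them in a queue so later requests skip the creation cost.
        self._incomplished_events = queue.Queue()
        for _ in range(prealloc):
            self._incomplished_events.put(_Event())

    def acquire(self) -> _Event:
        # Hypothetical accessor: reuse a pre-created event if available,
        # otherwise fall back to creating a fresh one.
        try:
            return self._incomplished_events.get_nowait()
        except queue.Empty:
            return _Event()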

python/paddle/distributed/fleet/meta_parallel/dualpipev.py

Lines changed: 20 additions & 1 deletion

@@ -37,7 +37,7 @@
     PipelineParallel,
 )
 from .pp_utils.batch_comm_helper import BatchCommHelper
-from .zero_bubble_utils import WeightGradStore
+from .zero_bubble_utils import EventStore, WeightGradStore
 
 __all__ = []

@@ -358,6 +358,10 @@ def _commit_and_wait_comm(
             else 0
         )
         if common_forward_ops_num == 0 and common_backward_ops_num == 0:
+            if EventStore.event is not None:
+                e_t = EventStore.event
+                EventStore.event = None
+                return e_t
             return deep_ep.get_event_from_custom_stream(
                 paddle.device.current_stream().stream_base
             )

@@ -387,13 +391,28 @@ def _commit_and_wait_comm(
                 pp_raw_stream
             )
 
+        backward_outer_event_wait = False
+        if EventStore.event is not None:
+            with paddle.device.stream_guard(
+                paddle.device.Stream(stream_base=pp_raw_stream)
+            ):
+                EventStore.event.current_stream_wait()
+
+            EventStore.set(None)
+            self.pp_group.process_group.set_outer_wait(True)
+
+            backward_outer_event_wait = True
+
         if common_backward_ops_num > 0:
             bwd_reqs = batch_isend_irecv(self.comm_backward_ops)
 
             if not use_stream_wait_event:
                 for req in bwd_reqs:
                     req.wait()
 
+        if backward_outer_event_wait:
+            self.pp_group.process_group.set_outer_wait(False)
+
         if use_stream_wait_event:
             forward_event_to_wait.current_stream_wait()
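
A framework-free sketch of the control flow this change adds to _commit_and_wait_comm: when an event has been parked in EventStore, it is either returned directly (no comm ops pending) or waited on before the backward batch is issued, with the send/recv bracketed by set_outer_wait(True/False). Stream guards, deep_ep, and the real request objects are stubbed out here; only the branching mirrors the diff.

class EventStore:
    # same shape as the class added in zero_bubble_utils.py below
    event = None

    @classmethod
    def set(cls, event):
        cls.event = event


def commit_and_wait_comm(forward_ops, backward_ops, process_group):
    if not forward_ops and not backward_ops:
        if EventStore.event is not None:
            e_t = EventStore.event
            EventStore.event = None
            return e_t
        return "event-from-current-stream"  # stands in for deep_ep.get_event_from_custom_stream

    backward_outer_event_wait = False
    if EventStore.event is not None:
        # In the real code this wait runs under a stream_guard on the pp stream.
        EventStore.event.current_stream_wait()
        EventStore.set(None)
        process_group.set_outer_wait(True)
        backward_outer_event_wait = True

    # ... batch_isend_irecv(backward_ops) and the req.wait() loop happen here ...

    if backward_outer_event_wait:
        process_group.set_outer_wait(False)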

python/paddle/distributed/fleet/meta_parallel/zero_bubble_utils.py

Lines changed: 9 additions & 0 deletions

@@ -54,6 +54,15 @@ def clear(cls) -> None:
         cls.funcs_queue = queue.Queue()
 
 
+class EventStore:
+
+    event = None
+
+    @classmethod
+    def set(cls, event) -> None:
+        cls.event = event
+
+
 def fold_init_dims(tensor):
     # NOTE(zhangyuqin1998): Reshape a rank-3 tensor from P x M x N to (P * M) x N,
     # to keep weight_grad in a correct rank. See phi::FoldInitDims.
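
EventStore is a process-wide, class-level slot holding at most one event: one part of the scheduler parks an event with set(), another reads and clears it. A short usage sketch, assuming the class is importable from its new location and using a dummy event object in place of a real communication event:

from paddle.distributed.fleet.meta_parallel.zero_bubble_utils import EventStore


class _DummyEvent:
    def current_stream_wait(self):
        pass  # a real event would make the current stream wait until it completes


EventStore.set(_DummyEvent())   # producer side: park an event
pending = EventStore.event      # consumer side: pick it up...
EventStore.set(None)            # ...and clear the slot
if pending is not None:
    pending.current_stream_wait()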

python/paddle/nn/functional/activation.py

Lines changed: 23 additions & 4 deletions

@@ -14,7 +14,7 @@
 
 from __future__ import annotations
 
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Literal
 
 import paddle
 from paddle import _C_ops, in_dynamic_mode

@@ -150,14 +150,18 @@ def elu_(x: Tensor, alpha: float = 1.0, name: str | None = None) -> Tensor:
 
 
 def gelu(
-    x: Tensor, approximate: bool = False, name: str | None = None
+    x: Tensor,
+    approximate: Literal["tanh", "none"] | bool = False,
+    name: str | None = None,
 ) -> Tensor:
     r"""
     gelu activation.
 
     The activation function of Gelu is calculated element by element. More information refers to :ref: `Gaussian Error Linear Units`.
 
-    if approximate is True
+    approximate parameter must be True, False, "tanh", "none".
+
+    if approximate is True or "tanh"
 
     .. math::

@@ -171,7 +175,7 @@ def gelu(
 
     Parameters:
         x (Tensor): The input Tensor with data type float32, float64.
-        approximate (bool, optional): Whether to enable approximation. Default is False.
+        approximate (str|bool, optional): Whether to enable approximation. Default is False.
         name (str|None, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None.
 
     Returns:

@@ -194,8 +198,23 @@ def gelu(
             Tensor(shape=[2, 2], dtype=float32, place=Place(cpu), stop_gradient=True,
             [[-0.15880796, 0.34571400],
             [ 0.84119201, 1.39957154]])
+            >>> out3 = F.gelu(x, "none")
+            >>> print(out3)
+            Tensor(shape=[2, 2], dtype=float32, place=Place(cpu), stop_gradient=True,
+            [[-0.15865529, 0.34573123],
+            [ 0.84134471, 1.39978933]])
+            >>> out4 = F.gelu(x, "tanh")
+            >>> print(out4)
+            Tensor(shape=[2, 2], dtype=float32, place=Place(cpu), stop_gradient=True,
+            [[-0.15880796, 0.34571400],
+            [ 0.84119201, 1.39957154]])
     """
 
+    if approximate == "tanh":
+        approximate = True
+    elif approximate == "none":
+        approximate = False
+
     if in_dynamic_or_pir_mode():
         return _C_ops.gelu(x, approximate)
     else:
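
The new string spellings are folded onto the existing boolean flag before dispatch, so F.gelu(x, "tanh") matches F.gelu(x, True) and F.gelu(x, "none") matches F.gelu(x, False), as the docstring examples show. A standalone mirror of that normalization (the helper name is illustrative, not a Paddle API):

def normalize_gelu_approximate(approximate):
    # Mirrors the branch added at the top of F.gelu.
    if approximate == "tanh":
        return True
    if approximate == "none":
        return False
    return approximate  # already a bool


assert normalize_gelu_approximate("tanh") is True
assert normalize_gelu_approximate("none") is False
assert normalize_gelu_approximate(False) is False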

python/paddle/nn/layer/activation.py

Lines changed: 26 additions & 4 deletions

@@ -15,7 +15,7 @@
 
 from __future__ import annotations
 
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Literal
 
 from paddle.framework import get_default_dtype

@@ -176,7 +176,9 @@ class GELU(Layer):
     r"""
     GELU Activation.
 
-    If approximate is True
+    approximate parameter must be True, False, "tanh", "none".
+
+    If approximate is True or "tanh"
 
     .. math::

@@ -189,7 +191,7 @@ class GELU(Layer):
         GELU(x) = 0.5 * x * (1 + erf(\frac{x}{\sqrt{2}}))
 
     Parameters:
-        approximate (bool, optional): Whether to enable approximation. Default is False.
+        approximate (str|bool, optional): Whether to enable approximation. Default is False.
         name (str|None, optional): Name for the operation (optional, default is None).
             For more information, please refer to :ref:`api_guide_Name`.

@@ -208,6 +210,24 @@ class GELU(Layer):
            Tensor(shape=[2, 2], dtype=float32, place=Place(cpu), stop_gradient=True,
            [[-0.15865529, 0.34573123],
            [ 0.84134471, 1.39978933]])
+           >>> m = paddle.nn.GELU(False)
+           >>> out = m(x)
+           >>> print(out)
+           Tensor(shape=[2, 2], dtype=float32, place=Place(cpu), stop_gradient=True,
+           [[-0.15865529, 0.34573123],
+           [ 0.84134471, 1.39978933]])
+           >>> m = paddle.nn.GELU("none")
+           >>> out = m(x)
+           >>> print(out)
+           Tensor(shape=[2, 2], dtype=float32, place=Place(cpu), stop_gradient=True,
+           [[-0.15865529, 0.34573123],
+           [ 0.84134471, 1.39978933]])
+           >>> m = paddle.nn.GELU("tanh")
+           >>> out = m(x)
+           >>> print(out)
+           Tensor(shape=[2, 2], dtype=float32, place=Place(cpu), stop_gradient=True,
+           [[-0.15880796, 0.34571400],
+           [ 0.84119201, 1.39957154]])
            >>> m = paddle.nn.GELU(True)
            >>> out = m(x)
            >>> print(out)

@@ -217,7 +237,9 @@ class GELU(Layer):
     """
 
     def __init__(
-        self, approximate: bool = False, name: str | None = None
+        self,
+        approximate: Literal["tanh", "none"] | bool = False,
+        name: str | None = None,
     ) -> None:
         super().__init__()
         self._approximate = approximate
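
__init__ stores the approximate value unchanged, so no normalization is needed at the layer level once the functional gelu accepts the string forms. A simplified stand-in layer (not Paddle's GELU implementation) illustrating that pass-through, assuming the layer ultimately delegates to F.gelu:

import paddle
import paddle.nn.functional as F


class GELULike(paddle.nn.Layer):
    """Illustrative wrapper only; paddle.nn.GELU is the real class."""

    def __init__(self, approximate=False, name=None):
        super().__init__()
        self._approximate = approximate  # stored as given: bool or "tanh"/"none"
        self._name = name

    def forward(self, x):
        # F.gelu now interprets the string forms, so the stored value
        # can be forwarded as-is.
        return F.gelu(x, self._approximate, self._name)


x = paddle.to_tensor([[-1.0, 0.5], [1.0, 1.5]])
print(GELULike("tanh")(x))  # expected to match paddle.nn.GELU("tanh") on the same input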
