Commit cc9a768

umiswing and kuizhiqing authored
[cherry-pick] Integration flash attention 2 (#56015)
* [FlashAttn] add flash randomness control (#52902)
  * add flash randomness control
  * fix VLOG undefined
* [WIP] Integration flash attention 2 (#55758)
  * Work for FA-2 padded fwd. Code to be cleaned.
  * Work for FA-2 unpadded fwd.
  * Work for padded bwd; dk gets a small diff with np.random.seed(0).
  * Passes Paddle's unit tests, except returning softmax without dropout.
  * Clean code.
  * Modify interface.
  * Clean code and add some checks.
  * Easy compile for dev.
  * Fix CI.
  * Fix CI build.
  * Add -std=c++17 option again.
  * Limit max jobs when compiling FA-2.
  * Remove const_cast.
  * Add fwd params, to be cleaned.
  * Clean code.
  * Add bwd params.
  * Clean code.
  * Add enforce.
  * Use v2.0.4.
  * Pass RNG state to the FA-2 C API.
  * Fix review comments.
  * Add assert.
  * Skip compile for SM less than 80.

---------

Co-authored-by: Chitsing KUI <[email protected]>
1 parent 8d3a988 commit cc9a768

File tree

13 files changed: +682 -344 lines

cmake/external/flashattn.cmake

Lines changed: 5 additions & 3 deletions
@@ -17,10 +17,10 @@ include(ExternalProject)
 add_definitions(-DPADDLE_WITH_FLASHATTN)
 
 set(FLASHATTN_PREFIX_DIR ${THIRD_PARTY_PATH}/flashattn)
-set(FLASHATTN_SOURCE_SUBDIR csrc/flash_attn)
+set(FLASHATTN_SOURCE_SUBDIR csrc)
 set(FLASHATTN_INSTALL_DIR ${THIRD_PARTY_PATH}/install/flashattn)
 set(FLASHATTN_REPOSITORY ${GIT_URL}/PaddlePaddle/flash-attention.git)
-set(FLASHATTN_TAG 5ff4bbf56ad066750407c4aef16ac740ebda0717)
+set(FLASHATTN_TAG b5bdb79d5e1f2f88b1ef62e86899a14f82fa079a)
 
 set(FLASHATTN_INCLUDE_DIR
     "${FLASHATTN_INSTALL_DIR}/include"
@@ -62,7 +62,7 @@ else()
   set(FLASHATTN_C_FLAGS ${CMAKE_C_FLAGS})
   set(FLASHATTN_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG})
   set(FLASHATTN_C_FLAGS_RELEASE ${CMAKE_C_FLAGS_RELEASE})
-  set(FLASHATTN_CXX_FLAGS ${CMAKE_CXX_FLAGS})
+  set(FLASHATTN_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17")
  set(FLASHATTN_CXX_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE})
  set(FLASHATTN_CXX_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG})
 endif()
@@ -93,6 +93,8 @@ ExternalProject_Add(
              -DBUILD_SHARED=ON
              -DCMAKE_POSITION_INDEPENDENT_CODE=ON
              -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
+             -DCMAKE_JOB_POOL_COMPILE:STRING=compile
+             -DCMAKE_JOB_POOLS:STRING=compile=4
              ${EXTERNAL_OPTIONAL_ARGS}
   CMAKE_CACHE_ARGS
              -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}

cmake/third_party.cmake

Lines changed: 9 additions & 4 deletions
@@ -512,10 +512,15 @@ if(WITH_GPU
     list(APPEND third_party_deps extern_cutlass)
     set(WITH_CUTLASS ON)
   endif()
-  if(${CMAKE_CUDA_COMPILER_VERSION} GREATER_EQUAL 11.2)
-    include(external/flashattn)
-    list(APPEND third_party_deps extern_flashattn)
-    set(WITH_FLASHATTN ON)
+  if(${CMAKE_CUDA_COMPILER_VERSION} GREATER_EQUAL 11.4)
+    foreach(arch ${NVCC_ARCH_BIN})
+      if(${arch} GREATER_EQUAL 80)
+        include(external/flashattn)
+        list(APPEND third_party_deps extern_flashattn)
+        set(WITH_FLASHATTN ON)
+        break()
+      endif()
+    endforeach()
   endif()
 endif()
 
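This guard now requires CUDA 11.4+ and pulls in the flash-attention 2 dependency only when at least one target architecture in NVCC_ARCH_BIN is SM 80 (Ampere) or newer. A minimal sketch of the equivalent runtime capability check, written against the plain CUDA runtime API rather than Paddle's internals; the threshold of 80 simply mirrors the CMake guard above:

#include <cstdio>
#include <cuda_runtime.h>

// Returns true if the given device reports SM 8.0 or newer, the same
// cutoff as the `if(${arch} GREATER_EQUAL 80)` check in the build guard.
bool DeviceSupportsFlashAttn2(int device_id = 0) {
  cudaDeviceProp prop{};
  if (cudaGetDeviceProperties(&prop, device_id) != cudaSuccess) {
    return false;  // no usable CUDA device
  }
  const int sm = prop.major * 10 + prop.minor;  // e.g. 80 on A100, 90 on H100
  return sm >= 80;
}

int main() {
  std::printf("flash-attn 2 eligible: %s\n",
              DeviceSupportsFlashAttn2() ? "yes" : "no");
  return 0;
}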

paddle/phi/api/yaml/backward.yaml

Lines changed: 2 additions & 2 deletions
@@ -617,7 +617,7 @@
   inplace : (out_grad -> x_grad)
 
 - backward_op : flash_attn_grad
-  forward : flash_attn (Tensor q, Tensor k, Tensor v, float dropout = 0.0, bool causal = false, bool return_softmax = false, bool is_test = false) -> Tensor(out), Tensor(softmax), Tensor(softmax_lse), Tensor(seed_offset)
+  forward : flash_attn (Tensor q, Tensor k, Tensor v, Tensor fixed_seed_offset, float dropout = 0.0, bool causal = false, bool return_softmax = false, bool is_test = false, str rng_name = "") -> Tensor(out), Tensor(softmax), Tensor(softmax_lse), Tensor(seed_offset)
   args : (Tensor q, Tensor k, Tensor v, Tensor out, Tensor softmax_lse, Tensor seed_offset, Tensor out_grad, float dropout = 0.0, bool causal = false)
   output : Tensor(q_grad), Tensor(k_grad), Tensor(v_grad)
   infer_meta :
@@ -628,7 +628,7 @@
   data_type: q
 
 - backward_op : flash_attn_unpadded_grad
-  forward : flash_attn_unpadded (Tensor q, Tensor k, Tensor v, Tensor cu_seqlens_q, Tensor cu_seqlens_k, int64_t max_seqlen_q, int64_t max_seqlen_k, float scale, float dropout = 0.0, bool causal = false, bool return_softmax = false, bool is_test = false) -> Tensor(out), Tensor(softmax), Tensor(softmax_lse), Tensor(seed_offset)
+  forward : flash_attn_unpadded (Tensor q, Tensor k, Tensor v, Tensor cu_seqlens_q, Tensor cu_seqlens_k, Tensor fixed_seed_offset, int64_t max_seqlen_q, int64_t max_seqlen_k, float scale, float dropout = 0.0, bool causal = false, bool return_softmax = false, bool is_test = false, str rng_name = "") -> Tensor(out), Tensor(softmax), Tensor(softmax_lse), Tensor(seed_offset)
   args : (Tensor q, Tensor k, Tensor v, Tensor cu_seqlens_q, Tensor cu_seqlens_k, Tensor out, Tensor softmax_lse, Tensor seed_offset, Tensor out_grad, int64_t max_seqlen_q, int64_t max_seqlen_k, float scale, float dropout = 0.0, bool causal = false)
   output : Tensor(q_grad), Tensor(k_grad), Tensor(v_grad)
   infer_meta :

paddle/phi/api/yaml/ops.yaml

Lines changed: 4 additions & 2 deletions
@@ -678,8 +678,9 @@
   backward : fill_diagonal_tensor_grad
 
 - op : flash_attn
-  args : (Tensor q, Tensor k, Tensor v, float dropout = 0.0, bool causal = false, bool return_softmax = false, bool is_test = false)
+  args : (Tensor q, Tensor k, Tensor v, Tensor fixed_seed_offset, float dropout = 0.0, bool causal = false, bool return_softmax = false, bool is_test = false, str rng_name = "")
   output : Tensor(out), Tensor(softmax), Tensor(softmax_lse), Tensor(seed_offset)
+  optional : fixed_seed_offset
   infer_meta :
     func : FlashAttnInferMeta
     param : [q, k, v]
@@ -690,8 +691,9 @@
   backward : flash_attn_grad
 
 - op : flash_attn_unpadded
-  args : (Tensor q, Tensor k, Tensor v, Tensor cu_seqlens_q, Tensor cu_seqlens_k, int64_t max_seqlen_q, int64_t max_seqlen_k, float scale, float dropout = 0.0, bool causal = false, bool return_softmax = false, bool is_test = false)
+  args : (Tensor q, Tensor k, Tensor v, Tensor cu_seqlens_q, Tensor cu_seqlens_k, Tensor fixed_seed_offset, int64_t max_seqlen_q, int64_t max_seqlen_k, float scale, float dropout = 0.0, bool causal = false, bool return_softmax = false, bool is_test = false, str rng_name = "")
   output : Tensor(out), Tensor(softmax), Tensor(softmax_lse), Tensor(seed_offset)
+  optional : fixed_seed_offset
   infer_meta :
     func : FlashAttnInferMeta
     param : [q, k, v]

paddle/phi/backends/dynload/flashattn.h

Lines changed: 7 additions & 3 deletions
@@ -43,9 +43,13 @@ extern void* flashattn_dso_handle;
 #define DECLARE_DYNAMIC_LOAD_FLASHATTN_WRAP(__name) \
   DYNAMIC_LOAD_FLASHATTN_WRAP(__name)
 
-#define FLASHATTN_ROUTINE_EACH(__macro) \
-  __macro(flash_attn_fwd);              \
-  __macro(flash_attn_bwd);              \
+#define FLASHATTN_ROUTINE_EACH(__macro)       \
+  __macro(flash_attn_fwd);                    \
+  __macro(flash_attn_varlen_fwd);             \
+  __macro(flash_attn_bwd);                    \
+  __macro(flash_attn_varlen_bwd);             \
+  __macro(flash_attn_fwd_with_bias_and_mask); \
+  __macro(flash_attn_bwd_with_bias_and_mask); \
   __macro(flash_attn_error);
 
 FLASHATTN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_FLASHATTN_WRAP);
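The routine list now also declares wrappers for the varlen and bias/mask entry points exported by the updated flashattn library. A hedged sketch of the general dlopen/dlsym lazy-loading pattern such wrapper macros expand to; the names, error handling, and the flash_attn_error signature here are simplified assumptions, not the actual macro expansion:

#include <dlfcn.h>
#include <stdexcept>
#include <string>

// Resolve a symbol from libflashattn.so on first use and cache the handle.
template <typename FuncPtr>
FuncPtr LoadFlashAttnSymbol(const char* name) {
  static void* handle = [] {
    void* h = dlopen("libflashattn.so", RTLD_LAZY | RTLD_LOCAL);
    if (h == nullptr) throw std::runtime_error("cannot load libflashattn.so");
    return h;
  }();
  void* sym = dlsym(handle, name);
  if (sym == nullptr) {
    throw std::runtime_error(std::string("missing symbol: ") + name);
  }
  return reinterpret_cast<FuncPtr>(sym);
}

// Example wrapper: look up flash_attn_error once, then call through it
// (assumed here to return an error string).
using flash_attn_error_fn = const char* (*)();
const char* CallFlashAttnError() {
  static auto fn = LoadFlashAttnSymbol<flash_attn_error_fn>("flash_attn_error");
  return fn();
}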

paddle/phi/kernels/flash_attn_kernel.h

Lines changed: 22 additions & 17 deletions
@@ -20,33 +20,38 @@
 namespace phi {
 
 template <typename T, typename Context>
-void FlashAttnUnpaddedKernel(const Context& ctx,
-                             const DenseTensor& q,
-                             const DenseTensor& k,
-                             const DenseTensor& v,
-                             const DenseTensor& cu_seqlens_q,
-                             const DenseTensor& cu_seqlens_k,
-                             int64_t max_seqlen_q,
-                             int64_t max_seqlen_k,
-                             float scale,
-                             float dropout,
-                             bool causal,
-                             bool return_softmax,
-                             bool is_test,
-                             DenseTensor* out,
-                             DenseTensor* softmax,
-                             DenseTensor* softmax_lse,
-                             DenseTensor* seed_offset);
+void FlashAttnUnpaddedKernel(
+    const Context& ctx,
+    const DenseTensor& q,
+    const DenseTensor& k,
+    const DenseTensor& v,
+    const DenseTensor& cu_seqlens_q,
+    const DenseTensor& cu_seqlens_k,
+    const paddle::optional<DenseTensor>& fixed_seed_offset,
+    int64_t max_seqlen_q,
+    int64_t max_seqlen_k,
+    float scale,
+    float dropout,
+    bool causal,
+    bool return_softmax,
+    bool is_test,
+    const std::string& rng_name,
+    DenseTensor* out,
+    DenseTensor* softmax,
+    DenseTensor* softmax_lse,
+    DenseTensor* seed_offset);
 
 template <typename T, typename Context>
 void FlashAttnKernel(const Context& ctx,
                      const DenseTensor& q,
                      const DenseTensor& k,
                      const DenseTensor& v,
+                     const paddle::optional<DenseTensor>& fixed_seed_offset,
                      float dropout,
                      bool causal,
                      bool return_softmax,
                      bool is_test,
+                     const std::string& rng_name,
                      DenseTensor* out,
                      DenseTensor* softmax,
                      DenseTensor* softmax_lse,
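FlashAttnUnpaddedKernel works on packed variable-length batches: q/k/v hold all tokens back to back, and cu_seqlens_q/cu_seqlens_k mark where each sequence starts. A small illustrative helper (not Paddle code; the int32 dtype and prefix-sum layout follow the usual flash-attention varlen convention and are assumptions here) that builds such an offsets array from per-sequence lengths:

#include <cstdint>
#include <iostream>
#include <vector>

// cu_seqlens has batch_size + 1 entries: sequence i occupies rows
// [cu_seqlens[i], cu_seqlens[i + 1]) of the packed tensor, and
// max_seqlen_* should be at least the largest individual length.
std::vector<int32_t> BuildCuSeqlens(const std::vector<int32_t>& seqlens) {
  std::vector<int32_t> cu(seqlens.size() + 1, 0);
  for (size_t i = 0; i < seqlens.size(); ++i) {
    cu[i + 1] = cu[i] + seqlens[i];
  }
  return cu;
}

int main() {
  // Lengths {5, 2, 7} -> cu_seqlens {0, 5, 7, 14}.
  for (int32_t v : BuildCuSeqlens({5, 2, 7})) std::cout << v << ' ';
  std::cout << '\n';
  return 0;
}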
