
Commit c8e6545

umiswing authored and kuizhiqing committed
part-3 cherry-pick from: [cherry-pick] Integration flash attention 2 (PaddlePaddle#56015)

* [FlashAttn] add flash randomness control (PaddlePaddle#52902)
* add flash randomness control
* fix VLOG undefined
* [WIP] Integration flash attention 2 (PaddlePaddle#55758)
* Work for fa-2 padded fwd. Code to be cleaned.
* Work for fa-2 unpadded fwd.
* Work for padded bwd; dk gets a small diff with np.random.seed(0).
* Anyway I pass Paddle's utest, except return softmax without dropout.
* Clean code.
* Modify interface.
* Clean code and add some checks.
* Easy compile for dev.
* Fix CI.
* Fix CI build.
* Add std c++17 option again.
* Limit max jobs when compiling fa-2.
* Remove const_cast.
* Add fwd params, to be cleaned.
* Clean code.
* Add bwd params.
* Clean code.
* Add enforce.
* Use v2.0.4.
* Pass RNG state to fa-2 C API.
* Fix review.
* Add assert.
* Skip compile for sm less than 80.

Co-authored-by: Chitsing KUI <[email protected]>
1 parent: cc7279c

File tree

5 files changed (+182, -39 lines)

cmake/external/flashattn.cmake

Lines changed: 1 addition & 1 deletion

@@ -20,7 +20,7 @@ set(FLASHATTN_PREFIX_DIR ${THIRD_PARTY_PATH}/flashattn)
 set(FLASHATTN_SOURCE_SUBDIR csrc)
 set(FLASHATTN_INSTALL_DIR ${THIRD_PARTY_PATH}/install/flashattn)
 set(SOURCE_DIR ${PADDLE_SOURCE_DIR}/third_party/flashattn)
-set(FLASHATTN_TAG 18106c1ba0ccee81b97ca947397c08a141815a47)
+set(FLASHATTN_TAG b5bdb79d5e1f2f88b1ef62e86899a14f82fa079a)

 set(FLASHATTN_INCLUDE_DIR
     "${FLASHATTN_INSTALL_DIR}/include"

paddle/phi/kernels/gpu/flash_attn_grad_kernel.cu

Lines changed: 5 additions & 1 deletion

@@ -21,9 +21,13 @@
 #include "paddle/phi/core/tensor_utils.h"
 #include "paddle/phi/kernels/arange_kernel.h"
 #include "paddle/phi/kernels/empty_kernel.h"
-#include "paddle/phi/kernels/gpu/flash_attn_utils.h"
 #include "paddle/phi/kernels/reshape_kernel.h"

+#ifdef PADDLE_WITH_FLASHATTN
+#include "paddle/phi/backends/dynload/flashattn.h"
+#include "paddle/phi/kernels/gpu/flash_attn_utils.h"
+#endif
+
 PD_DECLARE_bool(cudnn_deterministic);

 namespace phi {

paddle/phi/kernels/gpu/flash_attn_kernel.cu

Lines changed: 4 additions & 2 deletions

@@ -21,10 +21,12 @@
 #include "paddle/phi/core/tensor_utils.h"
 #include "paddle/phi/kernels/arange_kernel.h"
 #include "paddle/phi/kernels/empty_kernel.h"
-#include "paddle/phi/kernels/gpu/flash_attn_utils.h"
 #include "paddle/phi/kernels/reshape_kernel.h"

-PD_DECLARE_bool(cudnn_deterministic);
+#ifdef PADDLE_WITH_FLASHATTN
+#include "paddle/phi/backends/dynload/flashattn.h"
+#include "paddle/phi/kernels/gpu/flash_attn_utils.h"
+#endif

 namespace phi {

Lines changed: 101 additions & 0 deletions

@@ -0,0 +1,101 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License
+
+import logging
+
+from ...utils.log_utils import get_logger
+
+_logger = get_logger(logging.INFO)
+from ..random import determinate_rng, is_enable_auto_rand_ctrl
+from .common import (
+    DistributedOperatorImplContainer,
+    register_distributed_operator_impl,
+    register_distributed_operator_impl_container,
+)
+from .dist_eltwise import DistributedDefaultImpl0, DistributedElementwiseImpl0
+
+
+class DistributedFlashAttn(DistributedOperatorImplContainer):
+    def __init__(self, op_type):
+        super().__init__(op_type)
+
+
+register_distributed_operator_impl_container(DistributedFlashAttn("flash_attn"))
+
+
+# Dist FlashAttn with Random Control
+class DistributedFlashAttnImpl0(DistributedElementwiseImpl0):
+    def __init__(self, name):
+        super().__init__(name)
+        self._forward_implemented = True
+        self._backward_implemented = True
+
+    def is_input_compatible(self, dist_op):
+        return True
+
+    def is_output_compatible(self, dist_op):
+        return True
+
+    def is_auto_compatible(self, dist_op):
+        return True
+
+    @staticmethod
+    def forward(ctx, *args, **kwargs):
+        dist_op_context = ctx.dist_op_context
+        main_block = dist_op_context.work_block
+        startup_block = dist_op_context.startup_block
+        src_op = dist_op_context.cur_src_op
+        rank_id = dist_op_context.rank_id
+        op_dist_attr = ctx.get_op_dist_attr_for_program(src_op)
+
+        if (
+            is_enable_auto_rand_ctrl()
+            and not op_dist_attr.is_recompute
+            and rank_id in op_dist_attr.process_mesh.process_ids
+        ):
+            assert (
+                op_dist_attr is not None
+            ), f"forward op [{str(src_op)}] don't have dist attribute !"
+
+            if (
+                len(kwargs.get('fixed_seed_offset', [])) > 0
+                or len(src_op.input("fixed_seed_offset")) > 0
+            ):
+                # TODO(kuizhiqing) recompute should go here
+                pass
+            else:
+                # determinate rng
+                q_var = main_block._var_recursive(kwargs['q'][0])
+                k_var = main_block._var_recursive(kwargs['k'][0])
+                q_dims_mapping = op_dist_attr.get_input_dims_mapping(q_var.name)
+                k_dims_mapping = op_dist_attr.get_input_dims_mapping(k_var.name)
+                process_mesh = op_dist_attr.process_mesh
+                dims_mapping = q_dims_mapping[:3] + [q_dims_mapping[2]]
+
+                rng_name = determinate_rng(rank_id, dims_mapping, process_mesh)
+                assert rng_name is not None and rng_name != ""
+
+                src_op._set_attr('rng_name', rng_name)
+
+        DistributedDefaultImpl0.forward(ctx, *args, **kwargs)
+
+    @staticmethod
+    def backward(ctx, *args, **kwargs):
+        # dropout backward is deterministic by mask, and not need for random state control
+        DistributedDefaultImpl0.backward(ctx, *args, **kwargs)
+
+
+register_distributed_operator_impl(
+    "flash_attn", DistributedFlashAttnImpl0("random_control")
+)
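
To make the dims_mapping arithmetic in forward() above concrete, here is a small self-contained sketch (plain Python lists, not the Paddle API; the example mapping values are assumptions for illustration). q is taken to be laid out as [batch, seqlen, num_heads, head_dim], and each dims_mapping entry names the process-mesh axis that dimension is sharded along, with -1 meaning replicated.

# Illustrative sketch only: mirrors `dims_mapping = q_dims_mapping[:3] + [q_dims_mapping[2]]`.
# Assumed layout for q: [batch, seqlen, num_heads, head_dim]; -1 means replicated.
q_dims_mapping = [0, -1, 1, -1]  # batch sharded on mesh axis 0, heads on mesh axis 1

# Keep the first three entries and reuse the num_heads entry in place of head_dim,
# so the mapping handed to determinate_rng ignores q's head_dim sharding.
dims_mapping = q_dims_mapping[:3] + [q_dims_mapping[2]]
print(dims_mapping)  # [0, -1, 1, 1]

The intent, judging from the surrounding code, is that ranks covering the same logical shard derive the same rng_name and therefore the same dropout randomness.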

python/paddle/nn/functional/flash_attention.py

Lines changed: 71 additions & 35 deletions

@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import os
+
 import paddle
 import paddle.nn.functional as F
 from paddle import _C_ops, in_dynamic_mode
@@ -22,6 +24,10 @@
 g_enable_flash = None
 g_enable_mem_efficient = None

+g_use_flash_attn_v1 = (
+    os.getenv('FLAGS_flash_attn_version', 'v2').strip().lower() == 'v1'
+)
+

 @signature_safe_contextmanager
 def sdp_kernel(enable_math=False, enable_flash=True, enable_mem_efficient=True):
@@ -222,21 +228,32 @@ def flash_attention(

     if sdp_func_name == "flash_attn":
         if in_dynamic_mode():
-            (
-                result_attention,
-                result_softmax,
-            ) = _C_ops.flash_attn(
-                query,
-                key,
-                value,
-                fixed_seed_offset,
-                None,
-                dropout,
-                causal,
-                return_softmax,
-                not training,
-                rng_name,
-            )
+            if g_use_flash_attn_v1:
+                (result_attention, result_softmax, _, _) = _C_ops.flash_attn_v1(
+                    query,
+                    key,
+                    value,
+                    dropout,
+                    causal,
+                    return_softmax,
+                    not training,
+                )
+            else:
+                (
+                    result_attention,
+                    result_softmax,
+                ) = _C_ops.flash_attn(
+                    query,
+                    key,
+                    value,
+                    fixed_seed_offset,
+                    None,
+                    dropout,
+                    causal,
+                    return_softmax,
+                    not training,
+                    rng_name,
+                )
             return result_attention, result_softmax if return_softmax else None

     helper = LayerHelper('flash_attn', **locals())
@@ -377,26 +394,45 @@ def flash_attn_unpadded(

     """
     if in_dynamic_mode():
-        (
-            result_attention,
-            result_softmax,
-        ) = _C_ops.flash_attn_unpadded(
-            query,
-            key,
-            value,
-            cu_seqlens_q,
-            cu_seqlens_k,
-            fixed_seed_offset,
-            None,
-            max_seqlen_q,
-            max_seqlen_k,
-            scale,
-            dropout,
-            causal,
-            return_softmax,
-            not training,
-            rng_name,
-        )
+        if g_use_flash_attn_v1:
+            (
+                result_attention,
+                result_softmax,
+            ) = _C_ops.flash_attn_unpadded(
+                query,
+                key,
+                value,
+                cu_seqlens_q,
+                cu_seqlens_k,
+                max_seqlen_q,
+                max_seqlen_k,
+                scale,
+                dropout,
+                causal,
+                return_softmax,
+                not training,
+            )
+        else:
+            (
+                result_attention,
+                result_softmax,
+            ) = _C_ops.flash_attn_unpadded(
+                query,
+                key,
+                value,
+                cu_seqlens_q,
+                cu_seqlens_k,
+                fixed_seed_offset,
+                None,
+                max_seqlen_q,
+                max_seqlen_k,
+                scale,
+                dropout,
+                causal,
+                return_softmax,
+                not training,
+                rng_name,
+            )
         return result_attention, result_softmax if return_softmax else None

     helper = LayerHelper('flash_attn_unpadded', **locals())
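
For context, a minimal usage sketch of the Python API touched in this file. It assumes a GPU build of Paddle with flash-attn compiled in; the tensor shapes are illustrative. Because g_use_flash_attn_v1 is evaluated once at import time, FLAGS_flash_attn_version has to be set in the environment before this module is first imported.

import os

# Set before importing paddle.nn.functional.flash_attention; 'v1' routes the
# dynamic-mode path to _C_ops.flash_attn_v1, anything else keeps the default v2 path.
os.environ.setdefault('FLAGS_flash_attn_version', 'v2')

import paddle
from paddle.nn.functional.flash_attention import flash_attention

# Illustrative shapes: [batch, seqlen, num_heads, head_dim], half precision on GPU.
q = paddle.randn([2, 128, 8, 64], dtype='float16')
k = paddle.randn([2, 128, 8, 64], dtype='float16')
v = paddle.randn([2, 128, 8, 64], dtype='float16')

out, softmax = flash_attention(q, k, v, dropout=0.1, causal=True, return_softmax=False)
print(out.shape)  # [2, 128, 8, 64]
print(softmax)    # None, since return_softmax=False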
