
Commit 802b3d2

Enable Fwd and Backward
Enable fwd and varlen_fwd on AMD (#63)
* flash_attn_func works. Compress. This is a combination of 12 commits: add scripts save add our kernel import our kernel round trip use bshd layout figure out segfault fix show backward failure with prints save backward work run forward only test smallest config on everything add test fix remove pre-commit install triton skip dropout pin d 32 factor d just run power of 2 remove timeout run serially clean up clean up 2
* Varlen works. This is a combination of 6 commits: save some tests passing enable more enable everything move around alibi works
* keep interface and kernel separate
* clean up

enable flash_attn_with_kvcache (#68)
* Compress kvcache work. This is a combination of 11 commits: kvcache work. This is a combination of 4 commits: kvcache is not supported save save decode save clean up merge save cases save save save save key mask on triton side fix q size issue test combos save
* fix causal. use cache_seqlens
* clean and test what works
* some configs work on new_kv but fails on 1,8
* cache overwrite correct
* new_kv works more or less
* test local
* work on paged kv attention
* prefill paged attention
* fix has_batch_idx and skip local and rotary emb
* save
* save
* save
* save
* handle new_kv when paged kv cache
* all except has_batch_idx works
* major options are green
* test all
* add tests
* save
* clean up
* minor clean up
* simplest config
* save debug true
* save
* refactor slightly
* save work
* need key masking
* force hip
* use is_hip
* save
* fix cache_seq_len issue
* work on new_kv
* pass new_kv data
* save
* benchmark fwd only
* disable debug
* pandas pdf
* save
* set methods
* record number of heads
* use configs
* flexible dim, n-heads, head dim
* better benchmarking
* basic inplace update working
* works up to 64
* new_kv supported!
* test case for has_batch_idx
* has_batch_idx works!
* save
* save
* save
* save ref
* fix mqa and gqa by duplicating
* GQA and MQA working by kernel modifications
* fix new_kv with gqa
* cache index
* deal with nans on fwd_splitk
* save
* causal working on basic case
* causal works!
* alibi works!
* clean up
* clean prefill changes
* remove bwd stuff
* limit decode test to test_op_fwd
* add ref
* use bfloat

Fixes after rebase
Fixes after rebase
rebase fixes
deal with kvcache failure
new run for branch cancel-in-progress
fix varlen_fwd bug

enable packed layouts and all configs (#72)

Clean up for Upstream (#81)
* Clean. This is a combination of 4 commits: clean 1 clean 2 clean more match main typo fix
* use is_hip()
* clean up more
* skip odd d only
* fix bug
* skip randomly
* use Flag
* update readme
* remove quantization
* remove bwd
* minor
* print
* remove verbose print
* quantize zero's out the d stride

Enable Vanilla Bwd and Refactor (#86)
* Vanilla BWD. This is a combination of 79 commits: save test_flash_attn_output use impl functions pass layout add ref move around impls fix stride issue save oai kernel add baseline impl save bwd kernel working remove old impl remove block_ptrs from bwd pass padded dmodel and apply masking. the old test cases work but cases with small d don't work save save more prints rename M to L save add notes add old_bwd back fa failure fails in kernels too isolate new bwd and keep old bwd in place clean up softmax_lse does not match reference LOG flag softmax_lse with LN2 move qk_scale to loop pass ln2 to fwd just print kernel input test softmax output from forward test exp_scores_triton save all the ref create ref USE_EXP2 path return scores mask scores when returning them. Basic impl test passes scores and output match show max_diff return score needs to be adjusted as we find new maxes all good outputs. old style RCP2 example prep bwd_impl test save try openai save fix softmax_lse bug test_op_bwd_impl starting to work! new kernel. exp2 works but exp is failing fix bwd exp2 add m and n masks. small cases still don't work match old and new kernel prints compare old and new print inputs save old kernel match on dv dq works compare to pytorch including softmax in forward fix bwd impl bug small sizes in bwd impl work old bwd test pass. Moving on to kernel tests dq, dk and dv are filled in place if given. Need to match cast to match fa fix non bug fix dv mismatch. use_exp2 was set to true in fwd fix case up 128 refactor and clean up a bit more issue is that dq and dk are not zeros dq must be zeroed out ignore segfaults fa ref and my ref match! all tests run use tolerance 1e-3 we need to figure out preprocessing save clean up save test delta diff move old impl out new preprocess function preprocessing_use_o flag working _bwd_preprocess_use_p basic cases pass all green fwd exp2 usage is done right before exp
* refactor
* refactor 2
* refactor 3
* fix bug
* try ci
* add flag
* rename to utils
* skip test_op_fwd_decode_int4_kv
* reduce head size
* try again
* go back to old head sizes
* Use Strides. This is a combination of 11 commits: use strides in bwd add layout test in forward fix shape layout function smaller tests save fix varlen error no headsize passed to bwd deal with varlen layout save save save save
* use gen scripts
* varlen fwd passing
* core fwd ref impl
* fix minor bugs
* wrap varlen launcher attention_forward_pytorch_ref_impl
* varlen backward ref added
* add offsets for varlen
* fix delta bug
* varlen bwd working
* save
* runs on MI200
* just test basics
* save
* fix bug
* fix varlen int64 bug
* add ref
* test_impl working with causal
* fix qkvpacked issue
* qkvpacked run tests
* remove test_backward
* save
* just test output
* dump into tensors
* softmax_lse layout for varlen
* small cases working
* bwd thd green, although maybe some oom
* forward out and lse are good. Something wrong with backward ref
* make varlen ref work
* save work, ref is working mostly
* 91 failed, 6542 passed, 6336 skipped, 1 warning
* ref is all green
* debug flag in utils
* found bad softmax_lse in varlen fwd
* fix bug in softmax_lse. strides in varlen were not right
* add causal tests and 32x32 bwd does not have segfault
* save
* fix oom by reducing block size for small heads
* bwd ref with causal working
* test impl
* causal test passes
* causal working
* fix tests
* nicer bench
* fix qvpacked error
* fix varlen qvpacked bug
* fix minor bug
* bench prefill and prefill_old using the same script
* autotune configs for fwd
* autotune flag
* clean up decode impl
* clean up
* clean up more
* bench everything by default and return time
* clean up readmes

REBASE: fix interface changes in rebase
rename test to test_flash_attn_triton_amd
REBASE: fix unpad diffs
minor clean up in setup
FLASH_ATTENTION_TRITON_AMD flags
bench fwd and bwd
fix sequence_parallel
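Several of the squashed commits above revolve around the USE_EXP2 path and the softmax_lse bookkeeping ("softmax_lse with LN2", "pass ln2 to fwd", "fwd exp2 usage is done right before exp"). As a rough illustration of the identity those commits rely on, here is a hedged PyTorch sketch; the tensor names and shapes are made up for the example and are not taken from the kernels.

```
import math
import torch

# Illustration only: exp(x) can be rewritten as exp2(x * log2(e)), which is the
# substitution the USE_EXP2 path is based on. The log-sum-exp (softmax_lse)
# then has to be converted back to the natural-log base with a factor of ln(2).
torch.manual_seed(0)
scores = torch.randn(4, 128)  # hypothetical attention scores for one query block

# reference: natural-base softmax numerator and log-sum-exp
ref_max = scores.max(dim=-1, keepdim=True).values
ref_p = torch.exp(scores - ref_max)
ref_lse = ref_max.squeeze(-1) + torch.log(ref_p.sum(dim=-1))

# exp2 variant: scale by log2(e) before exp2, rescale the lse by ln(2) afterwards
RCP_LN2 = 1.0 / math.log(2.0)  # log2(e)
p2 = torch.exp2((scores - ref_max) * RCP_LN2)
lse2 = ref_max.squeeze(-1) + math.log(2.0) * torch.log2(p2.sum(dim=-1))

assert torch.allclose(ref_p, p2, atol=1e-6)
assert torch.allclose(ref_lse, lse2, atol=1e-6)
```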
1 parent 34a3656 commit 802b3d2

10 files changed

Lines changed: 287 additions & 161 deletions

File tree

.github/workflows/amd_tests.yml

Lines changed: 79 additions & 0 deletions
@@ -0,0 +1,79 @@
+name: AMD Perf Kernel Tests
+
+on:
+  workflow_dispatch:
+  pull_request:
+    branches: [main_perf]
+  merge_group:
+    branches: [main_perf]
+    types: [checks_requested]
+  push:
+    branches: [main_perf, micmelesse/upstream_pr]
+
+concurrency:
+  group: ${{ github.ref }}
+  cancel-in-progress: true
+
+permissions: read-all
+
+jobs:
+  Runner-Preparation-AMD:
+    runs-on: ubuntu-latest
+    timeout-minutes: 30
+    outputs:
+      matrix-HIP: ${{ steps.set-matrix.outputs.matrix-HIP }}
+    steps:
+      - name: Prepare runner matrix
+        id: set-matrix
+        run: |
+          if [ x"${{ github.repository }}" == x"ROCm/flash-attention" ]; then
+            echo '::set-output name=matrix-HIP::[["self-hosted", "rocm"]]'
+          else
+            echo '::set-output name=matrix-HIP::[["ubuntu-latest"]]'
+          fi
+
+  Integration-Tests-AMD:
+    needs: Runner-Preparation-AMD
+    if: needs.Runner-Preparation-AMD.outputs.matrix-HIP != ''
+    runs-on: ${{ matrix.runner }}
+    strategy:
+      matrix:
+        runner: ${{fromJson(needs.Runner-Preparation-AMD.outputs.matrix-HIP)}}
+    container:
+      image: rocm/pytorch:rocm6.0.2_ubuntu22.04_py3.10_pytorch_2.1.2
+      options: --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --user root
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+      - name: Install Triton
+        run: |
+          pip uninstall -y triton
+          pip install matplotlib pandas pytest
+          git clone https://github.com/triton-lang/triton
+          cd triton
+          git checkout 2e9f2c2d20601c24b91a4c32a7b97ad1f8a55d88
+          pip install --verbose -e python
+          cd ..
+      - name: Build
+        run: |
+          export FLASH_ATTENTION_TRITON_AMD_ENABLE="TRUE"
+          python setup.py install
+      - name: Flash Attention Tests Using Reference Impl
+        run: |
+          export FLASH_ATTENTION_TRITON_AMD_ENABLE="TRUE"
+          export FLASH_ATTENTION_TRITON_AMD_REF=1
+          pytest tests/test_flash_attn_triton_amd.py
+      - name: Flash Attention Tests
+        run: |
+          export FLASH_ATTENTION_TRITON_AMD_ENABLE="TRUE"
+          export FLASH_ATTENTION_TRITON_AMD_AUTOTUNE=0
+          pytest tests/test_flash_attn_triton_amd.py
+      - name: AMD Kernel Tests
+        run: |
+          export FLASH_ATTENTION_TRITON_AMD_ENABLE="TRUE"
+          export FLASH_ATTENTION_TRITON_AMD_AUTOTUNE=0
+          pytest -v -s flash_attn/flash_attn_triton_amd/test.py
+      - name: AMD Kernel Bench
+        run: |
+          export FLASH_ATTENTION_TRITON_AMD_ENABLE="TRUE"
+          python flash_attn/flash_attn_triton_amd/bench.py

.gitignore

Lines changed: 7 additions & 1 deletion
@@ -22,10 +22,16 @@ var/
 *.egg-info/
 .installed.cfg
 *.egg
-.eggs/
+.eggs

 # IDE-related
 .idea/

 # Dev
 venv
+scripts
+*.log
+core.*
+*.csv
+*.png
+*.html

README.md

Lines changed: 30 additions & 29 deletions
@@ -164,48 +164,49 @@ git checkout main_perf
 FLASH_ATTENTION_TRITON_AMD_ENABLE="TRUE" python setup.py install
 ```

-To test that things are working, you can run our tests. These tests take hours so you don't need to run the full thing.
-```
-FLASH_ATTENTION_TRITON_AMD_ENABLE="TRUE" pytest tests/test_flash_attn_triton_amd.py
-```
-
-You can use autotune for better performance by using this flag `FLASH_ATTENTION_TRITON_AMD_AUTOTUNE="TRUE"`
-```
-FLASH_ATTENTION_TRITON_AMD_ENABLE="TRUE" FLASH_ATTENTION_TRITON_AMD_AUTOTUNE="TRUE" python $PATH_TO_CODE
-```
+#### Triton Backend
+The Triton implementation of the [Flash Attention v2](https://tridao.me/publications/flash2/flash2.pdf) is currently a work in progress.

-###### Docker
-You can also use the Dockerfile below which does the above steps on top of the latest rocm/pytorch image.
-```
-FROM rocm/pytorch:latest
+It supports AMD's CDNA (MI200, MI300) and RDNA GPU's using fp16, bf16 and fp32 datatypes.

-WORKDIR /workspace
+These features are supported in Fwd and Bwd
+1) Fwd and Bwd with causal masking
+2) Variable sequence lengths
+3) Arbitrary Q and KV sequence lengths
+4) Arbitrary head sizes

-# install triton
-RUN pip install triton==3.2.0
+These features are supported in Fwd for now. We will add them to backward soon.
+1) Multi and grouped query attention
+2) ALiBi and matrix bias

-# install flash attention
-ENV FLASH_ATTENTION_TRITON_AMD_ENABLE="TRUE"
+These features are in development
+1) Paged Attention
+2) Sliding Window
+3) Rotary embeddings
+4) Dropout
+5) Performance Improvements

-RUN git clone https://github.com/ROCm/flash-attention.git &&\
-    cd flash-attention &&\
-    git checkout main_perf &&\
-    python setup.py install
+#### Getting Started
+To get started with the triton backend for AMD, follow the steps below.

-# set working dir
-WORKDIR /workspace/flash-attention
-```
+First install the recommended Triton [commit](https://github.com/triton-lang/triton/commit/2e9f2c2d20601c24b91a4c32a7b97ad1f8a55d88).

-To build the docker file
 ```
-docker build -t fa_triton .
+git clone https://github.com/triton-lang/triton
+cd triton
+git checkout 2e9f2c2d20601c24b91a4c32a7b97ad1f8a55d88
+pip install --verbose -e python
 ```
+Then install and test Flash Attention with the flag `FLASH_ATTENTION_TRITON_AMD_ENABLE` set to `"TRUE"`.

-To run the docker image
 ```
-docker run -it --network=host --user root --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --ipc=host --shm-size 16G --device=/dev/kfd --device=/dev/dri fa_triton
+export FLASH_ATTENTION_TRITON_AMD_ENABLE="TRUE"
+cd flash-attention
+python setup.py install
+pytest tests/test_flash_attn.py
 ```

+
 ## How to use FlashAttention

 The main functions implement scaled dot product attention (softmax(Q @ K^T *
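For context on the features the updated README lists (forward and backward with causal masking, fixed or variable sequence lengths), below is a minimal usage sketch of `flash_attn_func`. It assumes the package was built with `FLASH_ATTENTION_TRITON_AMD_ENABLE="TRUE"` as described above, and the shapes and dtype are only illustrative.

```
import torch
from flash_attn import flash_attn_func

# Illustrative shapes only; flash_attn_func expects (batch, seqlen, nheads, headdim) tensors.
batch, seqlen, nheads, headdim = 2, 1024, 16, 64
device, dtype = "cuda", torch.float16  # ROCm builds of PyTorch expose HIP devices under "cuda"

q = torch.randn(batch, seqlen, nheads, headdim, device=device, dtype=dtype, requires_grad=True)
k = torch.randn(batch, seqlen, nheads, headdim, device=device, dtype=dtype, requires_grad=True)
v = torch.randn(batch, seqlen, nheads, headdim, device=device, dtype=dtype, requires_grad=True)

out = flash_attn_func(q, k, v, causal=True)  # forward with causal masking
out.sum().backward()                         # backward pass, which this commit enables on AMD
```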

flash_attn/flash_attn_triton_amd/README.md

Lines changed: 5 additions & 2 deletions
@@ -25,10 +25,13 @@ We are working on the following things
 ##### Getting Started
 To get started with the triton backend for AMD, follow the steps below.

-First install the recommended Triton version
+First install the recommended Triton [commit](https://github.com/triton-lang/triton/commit/2e9f2c2d20601c24b91a4c32a7b97ad1f8a55d88).

 ```
-pip install triton==3.2.0
+git clone https://github.com/triton-lang/triton
+cd triton
+git checkout 2e9f2c2d20601c24b91a4c32a7b97ad1f8a55d88
+pip install --verbose -e python
 ```
 Then install Flash Attention with the flag `FLASH_ATTENTION_TRITON_AMD_ENABLE` set to `"TRUE"`.
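The CI workflow earlier in this commit exports `FLASH_ATTENTION_TRITON_AMD_ENABLE` both when building and when running the tests, so the flag is also consulted at runtime. A small sketch of setting it from Python before importing the package; whether setting it this late in the process is sufficient for your setup is an assumption, not something this commit states.

```
import os

# Assumed pattern: select the Triton backend before flash_attn is imported,
# mirroring the `export FLASH_ATTENTION_TRITON_AMD_ENABLE="TRUE"` lines in the CI steps above.
os.environ.setdefault("FLASH_ATTENTION_TRITON_AMD_ENABLE", "TRUE")
os.environ.setdefault("FLASH_ATTENTION_TRITON_AMD_AUTOTUNE", "0")  # flag also used in the workflow

import flash_attn
print(flash_attn.__version__)
```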

flash_attn/flash_attn_triton_amd/bench.py

Lines changed: 47 additions & 14 deletions
@@ -58,20 +58,53 @@
     "flash_attn_with_kvcache": ["ck", "triton"],
 }

-VALID_MODES = ['fwd', 'bwd', 'full']
-SUPPORTED_MODES = {
-    "flash_attn_func": ["fwd", "bwd", "full"],
-    "flash_attn_fp8_func": ["fwd", "bwd", "full"],
-    "flash_attn_kvpacked_func": ["fwd", "bwd", "full"],
-    "flash_attn_qkvpacked_func": ["fwd", "bwd", "full"],
-    "flash_attn_qkvpacked_fp8_func": ["fwd", "bwd", "full"],
-    "flash_attn_varlen_func": ["fwd", "bwd", "full"],
-    "flash_attn_varlen_fp8_func": ["fwd", "bwd", "full"],
-    "flash_attn_varlen_kvpacked_func": ["fwd", "bwd", "full"],
-    "flash_attn_varlen_qkvpacked_func": ["fwd", "bwd", "full"],
-    "flash_attn_varlen_qkvpacked_fp8_func": ["fwd", "bwd", "full"],
-    "flash_attn_with_kvcache": ["fwd"],
-}
+def get_benchmark_configs(args, varlen=False):
+    """
+    Returns benchmark configurations based on whether variable-length sequences are used.
+    """
+    if args.custom_config:
+        hk = args.hq if not args.hk else args.hk
+        sk = args.sq if not args.sk else args.sk
+        return [(args.b, args.hq, hk, args.sq, sk)]
+    elif varlen:
+        return [
+            (2, 16, 4, 1024, 1024),
+            (8, 16, 2, 2048, 2048),
+            (4, 16, 8, 4096, 4096),
+            (2, 16, 4, 8192, 8192),
+            (2, 16, 8, 16384, 16384),
+            (2, 48, 12, 1024, 1024),
+            (2, 48, 24, 2048, 2048),
+            (2, 48, 8, 4096, 4096),
+            (2, 48, 4, 8192, 8192),
+            (2, 48, 2, 16384, 16384),
+            (2, 64, 32, 1024, 1024),
+            (4, 64, 16, 2048, 2048),
+            (4, 64, 8, 4096, 4096),
+            (4, 64, 32, 8192, 8192),
+            (4, 128, 16, 16384, 16384),
+        ]
+    else:
+        return [
+            (16, 16, 16, 1024, 1024),
+            (8, 16, 16, 2048, 2048),
+            (4, 16, 16, 4096, 4096),
+            (2, 16, 16, 8192, 8192),
+            (1, 16, 16, 16384, 16384),
+            (2, 48, 48, 1024, 1024),
+            (2, 48, 48, 2048, 1024),
+            (2, 48, 48, 4096, 8192),
+            (2, 48, 48, 8192, 4096),
+            (2, 48, 48, 16384, 8192),
+            (8, 16, 16, 1989, 15344),
+            (4, 16, 16, 4097, 163),
+            (2, 16, 16, 8122, 2159),
+            (1, 16, 16, 16281, 7),
+            (2, 48, 48, 1021, 1020),
+            (2, 48, 48, 2001, 2048),
+            (2, 48, 48, 3996, 9639),
+            (2, 48, 48, 8181, 1021),
+        ]

 @dataclass
 class EnvVariableConfig:
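With the static mode table replaced by `get_benchmark_configs`, each benchmark case is a `(batch, hq, hk, sq, sk)` tuple, as the `custom_config` branch shows. A hedged sketch of how such a tuple might be turned into benchmark inputs; the tensor construction and head size below are assumptions for illustration, not code from bench.py.

```
import torch

# One tuple from get_benchmark_configs(args): (batch, query heads, kv heads, q seqlen, k seqlen).
batch, hq, hk, sq, sk = (4, 16, 8, 4096, 4096)
headdim = 128  # assumed head size; bench.py chooses its own

q = torch.randn(batch, sq, hq, headdim, device="cuda", dtype=torch.float16)
k = torch.randn(batch, sk, hk, headdim, device="cuda", dtype=torch.float16)
v = torch.randn(batch, sk, hk, headdim, device="cuda", dtype=torch.float16)
# q, k, v would then be fed to the function under test, e.g. flash_attn_func(q, k, v).
```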

flash_attn/flash_attn_triton_amd/bwd_prefill.py

Lines changed: 53 additions & 29 deletions
@@ -2,12 +2,7 @@
 import torch
 import triton
 import triton.language as tl
-from .utils import DEBUG, DROPOUT_USE_PYTORCH, DROPOUT_DUMP, compute_fp8_scaling_factors, get_shapes_from_layout, get_strides_from_layout, is_fp8, write_dropout_mask, create_dropout_mask
-
-# TODO: move this into utils.py so it's shared among kernels
-# NOTE: triton fails to import tl.constexprs so create them here for the file
-tl_DROPOUT_USE_PYTORCH: tl.constexpr = triton.language.constexpr(DROPOUT_USE_PYTORCH)
-tl_DROPOUT_DUMP: tl.constexpr = triton.language.constexpr(DROPOUT_DUMP)
+from .utils import get_shape_from_layout, get_strides_from_layout, DEBUG

 @triton.jit
 def _bwd_preprocess(
@@ -89,6 +84,7 @@ def _bwd_preprocess(
     tl.store(delta_ptrs, delta, mask=mask_m)


+
 @triton.jit
 def _bwd_kernel_one_col_block(
     Q,
@@ -419,9 +415,11 @@ def _bwd_kernel(
     l_offset = L + off_z * stride_deltaz + off_hq * stride_deltah + q_start * stride_deltam
     delta_offset = Delta + off_z * stride_deltaz + off_hq * stride_deltah + q_start * stride_deltam

-    if DROPOUT:
-        batch_philox_offset = philox_offset_base + off_z * stride_dropoutz + off_hq * stride_dropouth #+ q_start * stride_dropoutm
-        dropout_offset = Dropout_mask + off_z * stride_dropoutz + off_hq * stride_dropouth #+ q_start * stride_dropoutm
+    # output tensor offsets
+    dk_offset = DK + off_z * stride_kz + off_h * stride_kh + k_start * stride_kn
+    dv_offset = DV + off_z * stride_vz + off_h * stride_vh + k_start * stride_vn
+    if SEQUENCE_PARALLEL:
+        dq_offset = DQ + stride_dq_all * start_n + off_z * stride_qz + off_h * stride_qh + q_start * stride_qm
     else:
         batch_philox_offset = 0
         dropout_offset = 0
@@ -600,12 +598,7 @@ def attention_prefill_backward_triton_impl(
     philox_seed: Optional[int],
     philox_offset: Optional[int],
     use_exp2: bool,
-    sequence_parallel: bool = True,
-    # fp8
-    descale_q: Optional[torch.Tensor] = None,
-    descale_k: Optional[torch.Tensor] = None,
-    descale_v: Optional[torch.Tensor] = None,
-    descale_do: Optional[torch.Tensor] = None,
+    sequence_parallel = False,
 ):
     if DEBUG:
         print()
@@ -656,6 +649,8 @@ def attention_prefill_backward_triton_impl(
     stride_kz, stride_kh, stride_kn, stride_kk = k_strides
     stride_vz, stride_vh, stride_vn, stride_vk = v_strides
     stride_oz, stride_oh, stride_om, stride_ok = o_strides
+    stride_dq_all = q.numel()
+    batch_headsize = batch * nheads_q
     is_varlen = layout == "thd"
     group_size = nheads_q // nheads_k
     use_dropout = (dropout_p > 0.0)
@@ -687,13 +682,33 @@ def attention_prefill_backward_triton_impl(
     ACTUAL_BLOCK_DMODEL = head_size

     do = do.contiguous()
-
-    # deal with dq
     if sequence_parallel:
-        dq = dq.unsqueeze(0).repeat(num_blocks_n, *([1] * len(q.shape))) # we do repeat instead of expand because we need to write data so views are not enough
-        stride_dq_all = dq.stride()[0]
+        # replicate q for each parallel sequence
+        replicas = num_blocks_n
+        dq_shape = (replicas,) + q.shape
+    else:
+        dq_shape = q.shape
+
+    is_qkvpacked = False
+    if dq is None or dk is None or dv is None:
+        dq = torch.zeros(dq_shape, device=q.device, dtype=q.dtype)
+        dk = torch.empty_like(k)
+        dv = torch.empty_like(v)
+    elif (not dq.is_contiguous()) or (not dq.is_contiguous()) or (not dq.is_contiguous()):
+        if DEBUG:
+            print("Not contigious and setting is packed to True")
+        is_qkvpacked = True
+        dq_og = dq
+        dq = dq.contiguous()
+        dk_og = dk
+        dk = dk.contiguous()
+        dv_og = dv
+        dv = dv.contiguous()
+
+    # NOTE: the kernel does inplace accumlation so dq has to be zeros. This avoids the case where we are passed empty dq and it is not all zeros
+    dq.zero_()

-    # assert contiguous
+    # assert contigious
     assert do.is_contiguous()
     assert q.is_contiguous()
     assert k.is_contiguous()
@@ -798,17 +813,26 @@ def attention_prefill_backward_triton_impl(
         FP8_MAX=FP8_MAX
     )

-    if sequence_parallel:
+    if len(dq.shape) == 5:
         dq = dq.sum(dim=0)

     if DEBUG:
-        print("attention_prefill_backward_triton_impl outputs")
-        print("dv:", dv, dv.shape)
-        print("dk:", dk, dk.shape)
+        print("_bwd_kernel outputs")
         print("dq:", dq, dq.shape)
-        if use_dropout:
-            print("dropout_mask:", dropout_mask, dropout_mask.shape if dropout_mask is not None else None)
-            print("dropout_fraction bwd:", 1.0 - (dropout_mask.sum()/ dropout_mask.numel()).item())
-            write_dropout_mask(dropout_mask, "dropout_mask_bwd")
+        print("dk:", dk, dk.shape)
+        print("dv:", dv, dv.shape)
+        print("delta:", delta, delta.shape)
+
+    if is_qkvpacked:
+        if DEBUG:
+            print("Copying back to original tensors due to ispacked")
+
+        # copy back results to og tensors
+        dq_og.copy_(dq)
+        dk_og.copy_(dk)
+        dv_og.copy_(dv)
+        return dq_og, dk_og, dv_og, delta, None, None
+    else:
+        return dq, dk, dv, delta, None, None
+

-    return delta
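The hunks above allocate `dq` with an extra leading dimension when `sequence_parallel` is set, zero it because the kernel accumulates in place, and reduce it with `dq.sum(dim=0)` after the launch. Below is a small PyTorch sketch of that pattern, detached from the Triton kernel; the per-block gradient computation here is a stand-in, not the real kernel.

```
import torch

num_blocks_n = 4                 # hypothetical number of K/V column blocks
q = torch.randn(2, 8, 128, 64)   # (batch, heads, seqlen_q, headdim); shapes illustrative

# One dq replica per column block, zero-initialised because each block accumulates in place.
dq = torch.zeros((num_blocks_n,) + q.shape, dtype=q.dtype)

for block in range(num_blocks_n):
    partial = torch.randn_like(q)  # stand-in for the dq contribution of this K/V block
    dq[block] += partial           # in-place accumulation into this block's replica

# After the launch the replicas are reduced, matching `if len(dq.shape) == 5: dq = dq.sum(dim=0)`.
dq = dq.sum(dim=0)
assert dq.shape == q.shape
```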
