From aebc5a74c6f031d5374faa83fdcc2ce92fef805f Mon Sep 17 00:00:00 2001
From: Timothy Fei Truong Jr <ttruong@ne47.bio>
Date: Fri, 6 Sep 2024 15:14:52 +0000
Subject: [PATCH 01/24] convert alibi bias to simply bh bias (same as alibi
 without dependence on position idx)

---
 csrc/flash_attn/src/mask.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/csrc/flash_attn/src/mask.h b/csrc/flash_attn/src/mask.h
index 7ba435a37b..361ea2380e 100644
--- a/csrc/flash_attn/src/mask.h
+++ b/csrc/flash_attn/src/mask.h
@@ -153,7 +153,7 @@ struct Mask {
                         for (int mi = 0; mi < size<0>(tensor); ++mi) {
                             // No causal, no local
                             if constexpr (Has_alibi) {
-                                tensor(mi, make_coord(j, nj)) += alibi_slope * col_idx;
+                                tensor(mi, make_coord(j, nj)) += alibi_slope; // * col_idx;
                             }
                             if constexpr (!Is_even_MN) {
                                 if (col_idx >= max_seqlen_k) { tensor(mi, make_coord(j, nj)) = -INFINITY; }
@@ -178,9 +178,9 @@ struct Mask {
                                 const int col_idx = col_idx_base + j;
                                 if constexpr (Has_alibi) {
                                     if constexpr (Is_causal) {
-                                        tensor(make_coord(i, mi), make_coord(j, nj)) += alibi_slope * col_idx;
+                                        tensor(make_coord(i, mi), make_coord(j, nj)) += alibi_slope; // * col_idx;
                                     } else {
-                                        tensor(make_coord(i, mi), make_coord(j, nj)) -= alibi_slope * abs(row_idx + max_seqlen_k - max_seqlen_q - col_idx);
+                                        tensor(make_coord(i, mi), make_coord(j, nj)) -= alibi_slope; // * abs(row_idx + max_seqlen_k - max_seqlen_q - col_idx);
 
                                     }
                                 }

From cfca2e8153962ec2cd0bed7c1ae0ad394f3f03ca Mon Sep 17 00:00:00 2001
From: Timothy Fei Truong Jr <ttruong@ne47.bio>
Date: Fri, 6 Sep 2024 15:26:07 +0000
Subject: [PATCH 02/24] publish fewer versions

---
 .github/workflows/publish.yml | 70 +++++++++++++++++------------------
 1 file changed, 35 insertions(+), 35 deletions(-)

diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index ead7ff04b8..cfc1aa3378 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -43,14 +43,14 @@ jobs:
           # Using ubuntu-20.04 instead of 22.04 for more compatibility (glibc). Ideally we'd use the
           # manylinux docker image, but I haven't figured out how to install CUDA on manylinux.
           os: [ubuntu-20.04]
-          python-version: ['3.8', '3.9', '3.10', '3.11', '3.12']
-          torch-version: ['2.0.1', '2.1.2', '2.2.2', '2.3.1', '2.4.0']
-          cuda-version: ['11.8.0', '12.3.2']
+          python-version: ['3.9', '3.10']
+          torch-version: ['2.4.0']
+          cuda-version: ['11.8.0']
           # We need separate wheels that either uses C++11 ABI (-D_GLIBCXX_USE_CXX11_ABI) or not.
           # Pytorch wheels currently don't use it, but nvcr images have Pytorch compiled with C++11 ABI.
           # Without this we get import error (undefined symbol: _ZN3c105ErrorC2ENS_14SourceLocationESs)
           # when building without C++11 ABI and using it on nvcr images.
-          cxx11_abi: ['FALSE', 'TRUE']
+          cxx11_abi: ['FALSE']
           exclude:
             # see https://github.com/pytorch/pytorch/blob/main/RELEASE.md#release-compatibility-matrix
             # Pytorch < 2.2 does not support Python 3.12
@@ -179,34 +179,34 @@ jobs:
           asset_name: ${{env.wheel_name}}
           asset_content_type: application/*
 
-  publish_package:
-    name: Publish package
-    needs: [build_wheels]
-
-    runs-on: ubuntu-latest
-
-    steps:
-      - uses: actions/checkout@v3
-
-      - uses: actions/setup-python@v4
-        with:
-          python-version: '3.10'
-
-      - name: Install dependencies
-        run: |
-          pip install ninja packaging setuptools wheel twine
-          # We don't want to download anything CUDA-related here
-          pip install torch --index-url https://download.pytorch.org/whl/cpu
-
-      - name: Build core package
-        env:
-          FLASH_ATTENTION_SKIP_CUDA_BUILD: "TRUE"
-        run: |
-          python setup.py sdist --dist-dir=dist
-
-      - name: Deploy
-        env:
-          TWINE_USERNAME: "__token__"
-          TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }}
-        run: |
-          python -m twine upload dist/*
+  # publish_package:
+  #   name: Publish package
+  #   needs: [build_wheels]
+
+  #   runs-on: ubuntu-latest
+
+  #   steps:
+  #     - uses: actions/checkout@v3
+
+  #     - uses: actions/setup-python@v4
+  #       with:
+  #         python-version: '3.10'
+
+  #     - name: Install dependencies
+  #       run: |
+  #         pip install ninja packaging setuptools wheel twine
+  #         # We don't want to download anything CUDA-related here
+  #         pip install torch --index-url https://download.pytorch.org/whl/cpu
+
+  #     - name: Build core package
+  #       env:
+  #         FLASH_ATTENTION_SKIP_CUDA_BUILD: "TRUE"
+  #       run: |
+  #         python setup.py sdist --dist-dir=dist
+
+  #     - name: Deploy
+  #       env:
+  #         TWINE_USERNAME: "__token__"
+  #         TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }}
+  #       run: |
+  #         python -m twine upload dist/*

From 14b704d64b3df8ed166dda5d17c2027987d6bf9c Mon Sep 17 00:00:00 2001
From: Timothy Fei Truong Jr <ttruong@ne47.bio>
Date: Sat, 7 Sep 2024 03:40:04 +0000
Subject: [PATCH 03/24] fix diagonal bias

---
 csrc/flash_attn/src/mask.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/csrc/flash_attn/src/mask.h b/csrc/flash_attn/src/mask.h
index 361ea2380e..c21933e454 100644
--- a/csrc/flash_attn/src/mask.h
+++ b/csrc/flash_attn/src/mask.h
@@ -153,7 +153,7 @@ struct Mask {
                         for (int mi = 0; mi < size<0>(tensor); ++mi) {
                             // No causal, no local
                             if constexpr (Has_alibi) {
-                                tensor(mi, make_coord(j, nj)) += alibi_slope; // * col_idx;
+                                tensor(mi, make_coord(j, nj)) += (col_idx == 0 ? 0 : alibi_slope); // alibi_slope * col_idx;
                             }
                             if constexpr (!Is_even_MN) {
                                 if (col_idx >= max_seqlen_k) { tensor(mi, make_coord(j, nj)) = -INFINITY; }
@@ -178,9 +178,10 @@ struct Mask {
                                 const int col_idx = col_idx_base + j;
                                 if constexpr (Has_alibi) {
                                     if constexpr (Is_causal) {
-                                        tensor(make_coord(i, mi), make_coord(j, nj)) += alibi_slope; // * col_idx;
+                                        tensor(make_coord(i, mi), make_coord(j, nj)) += (col_idx == row_idx ? 0 : alibi_slope); // alibi_slope * col_idx;
+
                                     } else {
-                                        tensor(make_coord(i, mi), make_coord(j, nj)) -= alibi_slope; // * abs(row_idx + max_seqlen_k - max_seqlen_q - col_idx);
+                                        tensor(make_coord(i, mi), make_coord(j, nj)) += (col_idx == row_idx ? 0 : alibi_slope); // -= alibi_slope * abs(row_idx + max_seqlen_k - max_seqlen_q - col_idx);
 
                                     }
                                 }

From 5996687628275b1dd544f067ec4a34ecd747eea6 Mon Sep 17 00:00:00 2001
From: Timothy Fei Truong Jr <ttruong@ne47.bio>
Date: Sat, 7 Sep 2024 13:41:10 +0000
Subject: [PATCH 04/24] try just setting bias to 0 when Col_idx_only

---
 csrc/flash_attn/src/mask.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/csrc/flash_attn/src/mask.h b/csrc/flash_attn/src/mask.h
index c21933e454..13c2688af5 100644
--- a/csrc/flash_attn/src/mask.h
+++ b/csrc/flash_attn/src/mask.h
@@ -153,7 +153,7 @@ struct Mask {
                         for (int mi = 0; mi < size<0>(tensor); ++mi) {
                             // No causal, no local
                             if constexpr (Has_alibi) {
-                                tensor(mi, make_coord(j, nj)) += (col_idx == 0 ? 0 : alibi_slope); // alibi_slope * col_idx;
+                                tensor(mi, make_coord(j, nj)) += 0; // alibi_slope * col_idx;
                             }
                             if constexpr (!Is_even_MN) {
                                 if (col_idx >= max_seqlen_k) { tensor(mi, make_coord(j, nj)) = -INFINITY; }

From 3b267ff203c102569887a3ceb3e3ed153f6104c4 Mon Sep 17 00:00:00 2001
From: Timothy Fei Truong Jr <ttruong@ne47.bio>
Date: Sat, 7 Sep 2024 14:07:09 +0000
Subject: [PATCH 05/24] try disabling Col_idx_only if alibi

---
 csrc/flash_attn/src/mask.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/csrc/flash_attn/src/mask.h b/csrc/flash_attn/src/mask.h
index 13c2688af5..0f1f3763a1 100644
--- a/csrc/flash_attn/src/mask.h
+++ b/csrc/flash_attn/src/mask.h
@@ -139,7 +139,7 @@ struct Mask {
             // Reshape tensor_ from (MMA=4, MMA_M, MMA_N) to (nrow=(2, MMA_M), ncol=(2, MMA_N))
             Tensor tensor = make_tensor(tensor_.data(), flash::convert_layout_acc_rowcol(tensor_.layout()));
             // Do we need both row and column indices, or just column incides?
-            static constexpr bool Col_idx_only = !(Has_alibi && !Is_causal) && !Is_local && !Causal_mask;
+            static constexpr bool Col_idx_only = !Has_alibi && !(Has_alibi && !Is_causal) && !Is_local && !Causal_mask;
             const int lane_id = threadIdx.x % 32;
             const int col_idx_offset = col_idx_offset_ + (lane_id % 4) * 2;
             if constexpr (Col_idx_only) {

From b9ec21569c1bfa415539698f723711d7c32e67fa Mon Sep 17 00:00:00 2001
From: Timothy Fei Truong Jr <ttruong@ne47.bio>
Date: Sun, 8 Sep 2024 01:36:38 +0000
Subject: [PATCH 06/24] fix bh bias diagonal handling

---
 csrc/flash_attn/src/mask.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/csrc/flash_attn/src/mask.h b/csrc/flash_attn/src/mask.h
index 0f1f3763a1..16a84c6ebc 100644
--- a/csrc/flash_attn/src/mask.h
+++ b/csrc/flash_attn/src/mask.h
@@ -153,7 +153,7 @@ struct Mask {
                         for (int mi = 0; mi < size<0>(tensor); ++mi) {
                             // No causal, no local
                             if constexpr (Has_alibi) {
-                                tensor(mi, make_coord(j, nj)) += 0; // alibi_slope * col_idx;
+                                tensor(mi, make_coord(j, nj)) += alibi_slope * col_idx;
                             }
                             if constexpr (!Is_even_MN) {
                                 if (col_idx >= max_seqlen_k) { tensor(mi, make_coord(j, nj)) = -INFINITY; }
@@ -178,10 +178,10 @@ struct Mask {
                                 const int col_idx = col_idx_base + j;
                                 if constexpr (Has_alibi) {
                                     if constexpr (Is_causal) {
-                                        tensor(make_coord(i, mi), make_coord(j, nj)) += (col_idx == row_idx ? 0 : alibi_slope); // alibi_slope * col_idx;
+                                        tensor(make_coord(i, mi), make_coord(j, nj)) += ((col_idx == (col_idx_limit_right - 1)) ? 0 : alibi_slope);
 
                                     } else {
-                                        tensor(make_coord(i, mi), make_coord(j, nj)) += (col_idx == row_idx ? 0 : alibi_slope); // -= alibi_slope * abs(row_idx + max_seqlen_k - max_seqlen_q - col_idx);
+                                        tensor(make_coord(i, mi), make_coord(j, nj)) -= alibi_slope * abs(row_idx + max_seqlen_k - max_seqlen_q - col_idx);
 
                                     }
                                 }

From 42a895e6cf8e1906f353f694246a4d143521b421 Mon Sep 17 00:00:00 2001
From: timt51 <ttruong@ne47.bio>
Date: Wed, 5 Feb 2025 18:41:27 -0500
Subject: [PATCH 07/24] compile for pytorch 2.5.1

---
 .github/workflows/publish.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index cfc1aa3378..5745280bb3 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -44,7 +44,7 @@ jobs:
           # manylinux docker image, but I haven't figured out how to install CUDA on manylinux.
           os: [ubuntu-20.04]
           python-version: ['3.9', '3.10']
-          torch-version: ['2.4.0']
+          torch-version: ['2.5.1']
           cuda-version: ['11.8.0']
           # We need separate wheels that either uses C++11 ABI (-D_GLIBCXX_USE_CXX11_ABI) or not.
           # Pytorch wheels currently don't use it, but nvcr images have Pytorch compiled with C++11 ABI.

From f124d98a3695f7b27b7c75b148ae79b18697d7b3 Mon Sep 17 00:00:00 2001
From: timt51 <ttruong@ne47.bio>
Date: Wed, 5 Feb 2025 18:58:54 -0500
Subject: [PATCH 08/24] fix TORCH_CUDA_VERSION env var for pytorch 2.5

---
 .github/workflows/publish.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index 5745280bb3..0f37f8e654 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -118,8 +118,8 @@ jobs:
           # see https://github.com/pytorch/pytorch/blob/main/RELEASE.md#release-compatibility-matrix
           # This code is ugly, maybe there's a better way to do this.
           export TORCH_CUDA_VERSION=$(python -c "from os import environ as env; \
-            minv = {'2.0': 117, '2.1': 118, '2.2': 118, '2.3': 118, '2.4': 118}[env['MATRIX_TORCH_VERSION']]; \
-            maxv = {'2.0': 118, '2.1': 121, '2.2': 121, '2.3': 121, '2.4': 121}[env['MATRIX_TORCH_VERSION']]; \
+            minv = {'2.0': 117, '2.1': 118, '2.2': 118, '2.3': 118, '2.4': 118, '2.5': 118}[env['MATRIX_TORCH_VERSION']]; \
+            maxv = {'2.0': 118, '2.1': 121, '2.2': 121, '2.3': 121, '2.4': 121, '2.5': 124}[env['MATRIX_TORCH_VERSION']]; \
             print(max(min(int(env['MATRIX_CUDA_VERSION']), maxv), minv))" \
           )
           if [[ ${{ matrix.torch-version }} == *"dev"* ]]; then

From 4f8b153f25bf5237f0d97d42e2a2b7956cc40a90 Mon Sep 17 00:00:00 2001
From: timt51 <ttruong@ne47.bio>
Date: Fri, 7 Feb 2025 09:10:16 -0500
Subject: [PATCH 09/24] adjust alibi for non causal too

---
 csrc/flash_attn/src/mask.h | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/csrc/flash_attn/src/mask.h b/csrc/flash_attn/src/mask.h
index 16a84c6ebc..b42edbbac9 100644
--- a/csrc/flash_attn/src/mask.h
+++ b/csrc/flash_attn/src/mask.h
@@ -177,13 +177,7 @@ struct Mask {
                             for (int j = 0; j < size<1, 0>(tensor); ++j) {
                                 const int col_idx = col_idx_base + j;
                                 if constexpr (Has_alibi) {
-                                    if constexpr (Is_causal) {
-                                        tensor(make_coord(i, mi), make_coord(j, nj)) += ((col_idx == (col_idx_limit_right - 1)) ? 0 : alibi_slope);
-
-                                    } else {
-                                        tensor(make_coord(i, mi), make_coord(j, nj)) -= alibi_slope * abs(row_idx + max_seqlen_k - max_seqlen_q - col_idx);
-
-                                    }
+                                    tensor(make_coord(i, mi), make_coord(j, nj)) += ((col_idx == (col_idx_limit_right - 1)) ? 0 : alibi_slope);
                                 }
                                 if constexpr (Causal_mask) {
                                     if (col_idx >= col_idx_limit_right) {

From d2dd0fdf215aa8bbe6263e5c288f8a26b64e6551 Mon Sep 17 00:00:00 2001
From: timt51 <ttruong@ne47.bio>
Date: Fri, 7 Feb 2025 09:12:08 -0500
Subject: [PATCH 10/24] compile for torch 2.4.1 too

---
 .github/workflows/publish.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index 0f37f8e654..67a7965a8b 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -44,7 +44,7 @@ jobs:
           # manylinux docker image, but I haven't figured out how to install CUDA on manylinux.
           os: [ubuntu-20.04]
           python-version: ['3.9', '3.10']
-          torch-version: ['2.5.1']
+          torch-version: ['2.4.1', '2.5.1']
           cuda-version: ['11.8.0']
           # We need separate wheels that either uses C++11 ABI (-D_GLIBCXX_USE_CXX11_ABI) or not.
           # Pytorch wheels currently don't use it, but nvcr images have Pytorch compiled with C++11 ABI.

From 980f524999b3c446f040349dcef69df58cbb71fd Mon Sep 17 00:00:00 2001
From: timt51 <ttruong@ne47.bio>
Date: Fri, 7 Feb 2025 09:28:34 -0500
Subject: [PATCH 11/24] make a version that always adds the alibi slope...

---
 csrc/flash_attn/src/mask.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/csrc/flash_attn/src/mask.h b/csrc/flash_attn/src/mask.h
index b42edbbac9..31df79f4e8 100644
--- a/csrc/flash_attn/src/mask.h
+++ b/csrc/flash_attn/src/mask.h
@@ -177,7 +177,7 @@ struct Mask {
                             for (int j = 0; j < size<1, 0>(tensor); ++j) {
                                 const int col_idx = col_idx_base + j;
                                 if constexpr (Has_alibi) {
-                                    tensor(make_coord(i, mi), make_coord(j, nj)) += ((col_idx == (col_idx_limit_right - 1)) ? 0 : alibi_slope);
+                                    tensor(make_coord(i, mi), make_coord(j, nj)) += alibi_slope;
                                 }
                                 if constexpr (Causal_mask) {
                                     if (col_idx >= col_idx_limit_right) {

From c42bf6bc66797306028f3d9246db62897ced164e Mon Sep 17 00:00:00 2001
From: timt51 <ttruong@ne47.bio>
Date: Fri, 7 Feb 2025 09:29:06 -0500
Subject: [PATCH 12/24] ...and only compile for py310 and torch24

---
 .github/workflows/publish.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index 67a7965a8b..e7a586e2c0 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -43,8 +43,8 @@ jobs:
           # Using ubuntu-20.04 instead of 22.04 for more compatibility (glibc). Ideally we'd use the
           # manylinux docker image, but I haven't figured out how to install CUDA on manylinux.
           os: [ubuntu-20.04]
-          python-version: ['3.9', '3.10']
-          torch-version: ['2.4.1', '2.5.1']
+          python-version: ['3.10']
+          torch-version: ['2.4.1']
           cuda-version: ['11.8.0']
           # We need separate wheels that either uses C++11 ABI (-D_GLIBCXX_USE_CXX11_ABI) or not.
           # Pytorch wheels currently don't use it, but nvcr images have Pytorch compiled with C++11 ABI.

From a8d7fccd26e58eefe89f1b79454ea88886f89d21 Mon Sep 17 00:00:00 2001
From: timt51 <ttruong@ne47.bio>
Date: Fri, 7 Feb 2025 09:47:15 -0500
Subject: [PATCH 13/24] redo causal and noncausal alibi for diagonal

---
 csrc/flash_attn/src/mask.h | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/csrc/flash_attn/src/mask.h b/csrc/flash_attn/src/mask.h
index 31df79f4e8..709493331a 100644
--- a/csrc/flash_attn/src/mask.h
+++ b/csrc/flash_attn/src/mask.h
@@ -177,7 +177,13 @@ struct Mask {
                             for (int j = 0; j < size<1, 0>(tensor); ++j) {
                                 const int col_idx = col_idx_base + j;
                                 if constexpr (Has_alibi) {
-                                    tensor(make_coord(i, mi), make_coord(j, nj)) += alibi_slope;
+                                    if constexpr (Is_causal) {
+                                        tensor(make_coord(i, mi), make_coord(j, nj)) += ((col_idx == (col_idx_limit_right - 1)) ? 0 : alibi_slope);
+
+                                    } else {
+                                        tensor(make_coord(i, mi), make_coord(j, nj)) += ((col_idx == row_idx) ? 0 : alibi_slope);
+
+                                    }
                                 }
                                 if constexpr (Causal_mask) {
                                     if (col_idx >= col_idx_limit_right) {

From 599e3225e34fdd5aa9a9261247b44e56df85849f Mon Sep 17 00:00:00 2001
From: timt51 <ttruong@ne47.bio>
Date: Fri, 7 Feb 2025 09:59:36 -0500
Subject: [PATCH 14/24] bidirectional diagonal handling - take into account
 seqlens

---
 csrc/flash_attn/src/mask.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/csrc/flash_attn/src/mask.h b/csrc/flash_attn/src/mask.h
index 709493331a..234f2f61a6 100644
--- a/csrc/flash_attn/src/mask.h
+++ b/csrc/flash_attn/src/mask.h
@@ -181,7 +181,7 @@ struct Mask {
                                         tensor(make_coord(i, mi), make_coord(j, nj)) += ((col_idx == (col_idx_limit_right - 1)) ? 0 : alibi_slope);
 
                                     } else {
-                                        tensor(make_coord(i, mi), make_coord(j, nj)) += ((col_idx == row_idx) ? 0 : alibi_slope);
+                                        tensor(make_coord(i, mi), make_coord(j, nj)) += ((row_idx + max_seqlen_k == max_seqlen_q - col_idx) ? 0 : alibi_slope);
 
                                     }
                                 }

From fdc50f3987bb3c93b3b10641726130826f699e66 Mon Sep 17 00:00:00 2001
From: timt51 <ttruong@ne47.bio>
Date: Fri, 7 Feb 2025 10:06:31 -0500
Subject: [PATCH 15/24] fix diagonal formula

---
 csrc/flash_attn/src/mask.h | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/csrc/flash_attn/src/mask.h b/csrc/flash_attn/src/mask.h
index 234f2f61a6..3c38aaa4d8 100644
--- a/csrc/flash_attn/src/mask.h
+++ b/csrc/flash_attn/src/mask.h
@@ -177,13 +177,7 @@ struct Mask {
                             for (int j = 0; j < size<1, 0>(tensor); ++j) {
                                 const int col_idx = col_idx_base + j;
                                 if constexpr (Has_alibi) {
-                                    if constexpr (Is_causal) {
-                                        tensor(make_coord(i, mi), make_coord(j, nj)) += ((col_idx == (col_idx_limit_right - 1)) ? 0 : alibi_slope);
-
-                                    } else {
-                                        tensor(make_coord(i, mi), make_coord(j, nj)) += ((row_idx + max_seqlen_k == max_seqlen_q - col_idx) ? 0 : alibi_slope);
-
-                                    }
+                                    tensor(make_coord(i, mi), make_coord(j, nj)) += ((row_idx + max_seqlen_k == max_seqlen_q + col_idx) ? 0 : alibi_slope);
                                 }
                                 if constexpr (Causal_mask) {
                                     if (col_idx >= col_idx_limit_right) {

From a645eddfb8b51dd6788efb1a8a816e2a29da282d Mon Sep 17 00:00:00 2001
From: timt51 <ttruong@ne47.bio>
Date: Fri, 7 Feb 2025 21:38:12 -0500
Subject: [PATCH 16/24] diagonal noncausal try accounting for max_seqlens too

---
 csrc/flash_attn/src/mask.h | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/csrc/flash_attn/src/mask.h b/csrc/flash_attn/src/mask.h
index 3c38aaa4d8..5645e6db99 100644
--- a/csrc/flash_attn/src/mask.h
+++ b/csrc/flash_attn/src/mask.h
@@ -177,7 +177,13 @@ struct Mask {
                             for (int j = 0; j < size<1, 0>(tensor); ++j) {
                                 const int col_idx = col_idx_base + j;
                                 if constexpr (Has_alibi) {
-                                    tensor(make_coord(i, mi), make_coord(j, nj)) += ((row_idx + max_seqlen_k == max_seqlen_q + col_idx) ? 0 : alibi_slope);
+                                    if constexpr (Is_causal) {
+                                        tensor(make_coord(i, mi), make_coord(j, nj)) += ((col_idx == (col_idx_limit_right - 1)) ? 0 : alibi_slope);
+
+                                    } else {
+                                        tensor(make_coord(i, mi), make_coord(j, nj)) += ((row_idx + max_seqlen_k == max_seqlen_q + col_idx) ? 0 : alibi_slope);
+
+                                    }
                                 }
                                 if constexpr (Causal_mask) {
                                     if (col_idx >= col_idx_limit_right) {

From f5ce6ee9bdfb04f565c39475e63afb68fbaa78c0 Mon Sep 17 00:00:00 2001
From: timt51 <ttruong@ne47.bio>
Date: Sat, 8 Feb 2025 08:29:40 -0500
Subject: [PATCH 17/24] follow the original expression more closely

---
 csrc/flash_attn/src/mask.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/csrc/flash_attn/src/mask.h b/csrc/flash_attn/src/mask.h
index 5645e6db99..649a1dc854 100644
--- a/csrc/flash_attn/src/mask.h
+++ b/csrc/flash_attn/src/mask.h
@@ -181,7 +181,7 @@ struct Mask {
                                         tensor(make_coord(i, mi), make_coord(j, nj)) += ((col_idx == (col_idx_limit_right - 1)) ? 0 : alibi_slope);
 
                                     } else {
-                                        tensor(make_coord(i, mi), make_coord(j, nj)) += ((row_idx + max_seqlen_k == max_seqlen_q + col_idx) ? 0 : alibi_slope);
+                                        tensor(make_coord(i, mi), make_coord(j, nj)) += (((row_idx + max_seqlen_k - max_seqlen_q - col_idx) == 0) ? 0 : alibi_slope);
 
                                     }
                                 }

From efbbaf4530c88b2557e5dda2b1717eb02b040406 Mon Sep 17 00:00:00 2001
From: timt51 <ttruong@ne47.bio>
Date: Sat, 8 Feb 2025 09:22:31 -0500
Subject: [PATCH 18/24] also modify alibi.h

---
 csrc/flash_attn/src/alibi.h | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/csrc/flash_attn/src/alibi.h b/csrc/flash_attn/src/alibi.h
index e714233e7e..52057a9598 100644
--- a/csrc/flash_attn/src/alibi.h
+++ b/csrc/flash_attn/src/alibi.h
@@ -37,14 +37,20 @@ struct Alibi {
         const int col_idx_offset = col_idx_offset_ + (lane_id % 4) * 2;
         if constexpr (Is_causal) {  // Simpler, we add the same bias vector to all rows
             #pragma unroll
-            for (int nj = 0; nj < size<1, 1>(tensor); ++nj) {
-                const int col_idx_base = col_idx_offset + nj * 8;
+            for (int mi = 0; mi < size<0, 1>(tensor); ++mi) {
+                const int row_idx_base = row_idx_offset + mi * warp_row_stride;
                 #pragma unroll
-                for (int j = 0; j < size<1, 0>(tensor); ++j) {
-                    const int col_idx = col_idx_base + j;
+                for (int i = 0; i < size<0, 0>(tensor); ++i) {
+                    const int row_idx = row_idx_base + i * 8;
+                    const int col_idx_limit_right = std::min(max_seqlen_k, row_idx + 1 + max_seqlen_k - max_seqlen_q);
                     #pragma unroll
-                    for (int mi = 0; mi < size<0>(tensor); ++mi) {
-                        tensor(mi, make_coord(j, nj)) += alibi_slope * col_idx;
+                    for (int nj = 0; nj < size<1, 1>(tensor); ++nj) {
+                        const int col_idx_base = col_idx_offset + nj * 8;
+                        #pragma unroll
+                        for (int j = 0; j < size<1, 0>(tensor); ++j) {
+                            const int col_idx = col_idx_base + j;
+                            tensor(make_coord(i, mi), make_coord(j, nj)) += ((col_idx == (col_idx_limit_right - 1)) ? 0 : alibi_slope);
+                        }
                     }
                 }
             }
@@ -61,7 +67,7 @@ struct Alibi {
                         #pragma unroll
                         for (int j = 0; j < size<1, 0>(tensor); ++j) {
                             const int col_idx = col_idx_base + j;
-                            tensor(make_coord(i, mi), make_coord(j, nj)) -= alibi_slope * abs(row_idx + max_seqlen_k - max_seqlen_q - col_idx);
+                            tensor(make_coord(i, mi), make_coord(j, nj)) += (((row_idx + max_seqlen_k - max_seqlen_q - col_idx) == 0) ? 0 : alibi_slope);
                         }
                     }
                 }

From 88e1cc9c47a6bb6317432d58305196b6abdebf8d Mon Sep 17 00:00:00 2001
From: timt51 <ttruong@ne47.bio>
Date: Sat, 8 Feb 2025 09:38:28 -0500
Subject: [PATCH 19/24] Update mask.h

---
 csrc/flash_attn/src/mask.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/csrc/flash_attn/src/mask.h b/csrc/flash_attn/src/mask.h
index 649a1dc854..709493331a 100644
--- a/csrc/flash_attn/src/mask.h
+++ b/csrc/flash_attn/src/mask.h
@@ -181,7 +181,7 @@ struct Mask {
                                         tensor(make_coord(i, mi), make_coord(j, nj)) += ((col_idx == (col_idx_limit_right - 1)) ? 0 : alibi_slope);
 
                                     } else {
-                                        tensor(make_coord(i, mi), make_coord(j, nj)) += (((row_idx + max_seqlen_k - max_seqlen_q - col_idx) == 0) ? 0 : alibi_slope);
+                                        tensor(make_coord(i, mi), make_coord(j, nj)) += ((col_idx == row_idx) ? 0 : alibi_slope);
 
                                     }
                                 }

From 4bda2db97ba780d3fc342935f8bdcfbce1c5f76e Mon Sep 17 00:00:00 2001
From: timt51 <ttruong@ne47.bio>
Date: Sat, 8 Feb 2025 09:38:56 -0500
Subject: [PATCH 20/24] alibi.h use the non max seqlen formula which seems more
 correct actually?

---
 csrc/flash_attn/src/alibi.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/csrc/flash_attn/src/alibi.h b/csrc/flash_attn/src/alibi.h
index 52057a9598..5300cec763 100644
--- a/csrc/flash_attn/src/alibi.h
+++ b/csrc/flash_attn/src/alibi.h
@@ -67,7 +67,7 @@ struct Alibi {
                         #pragma unroll
                         for (int j = 0; j < size<1, 0>(tensor); ++j) {
                             const int col_idx = col_idx_base + j;
-                            tensor(make_coord(i, mi), make_coord(j, nj)) += (((row_idx + max_seqlen_k - max_seqlen_q - col_idx) == 0) ? 0 : alibi_slope);
+                            tensor(make_coord(i, mi), make_coord(j, nj)) += ((col_idx == row_idx) ? 0 : alibi_slope);
                         }
                     }
                 }

From 5b937eb306d6a2a04b723ddde7a06dfd332a5195 Mon Sep 17 00:00:00 2001
From: Timothy Fei Truong Jr <ttruong@ne47.bio>
Date: Mon, 24 Mar 2025 21:29:27 +0000
Subject: [PATCH 21/24] Revert "alibi.h use the non max seqlen formula which
 seems more correct actually?"

This reverts commit 4bda2db97ba780d3fc342935f8bdcfbce1c5f76e.
---
 csrc/flash_attn/src/alibi.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/csrc/flash_attn/src/alibi.h b/csrc/flash_attn/src/alibi.h
index 5300cec763..52057a9598 100644
--- a/csrc/flash_attn/src/alibi.h
+++ b/csrc/flash_attn/src/alibi.h
@@ -67,7 +67,7 @@ struct Alibi {
                         #pragma unroll
                         for (int j = 0; j < size<1, 0>(tensor); ++j) {
                             const int col_idx = col_idx_base + j;
-                            tensor(make_coord(i, mi), make_coord(j, nj)) += ((col_idx == row_idx) ? 0 : alibi_slope);
+                            tensor(make_coord(i, mi), make_coord(j, nj)) += (((row_idx + max_seqlen_k - max_seqlen_q - col_idx) == 0) ? 0 : alibi_slope);
                         }
                     }
                 }

From cb7632e7998143aea6754ad1d148522d3d2fe3a4 Mon Sep 17 00:00:00 2001
From: Timothy Fei Truong Jr <ttruong@ne47.bio>
Date: Mon, 24 Mar 2025 21:29:29 +0000
Subject: [PATCH 22/24] Revert "Update mask.h"

This reverts commit 88e1cc9c47a6bb6317432d58305196b6abdebf8d.
---
 csrc/flash_attn/src/mask.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/csrc/flash_attn/src/mask.h b/csrc/flash_attn/src/mask.h
index 709493331a..649a1dc854 100644
--- a/csrc/flash_attn/src/mask.h
+++ b/csrc/flash_attn/src/mask.h
@@ -181,7 +181,7 @@ struct Mask {
                                         tensor(make_coord(i, mi), make_coord(j, nj)) += ((col_idx == (col_idx_limit_right - 1)) ? 0 : alibi_slope);
 
                                     } else {
-                                        tensor(make_coord(i, mi), make_coord(j, nj)) += ((col_idx == row_idx) ? 0 : alibi_slope);
+                                        tensor(make_coord(i, mi), make_coord(j, nj)) += (((row_idx + max_seqlen_k - max_seqlen_q - col_idx) == 0) ? 0 : alibi_slope);
 
                                     }
                                 }

From 7104b3495ea8aaf825a9248354508f216a7b9bfd Mon Sep 17 00:00:00 2001
From: timt51 <ttruong@ne47.bio>
Date: Mon, 24 Mar 2025 17:32:01 -0400
Subject: [PATCH 23/24] publish for pytorch 2.5.1 and 2.6.0 too

---
 .github/workflows/publish.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index e7a586e2c0..42a7db735b 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -44,7 +44,7 @@ jobs:
           # manylinux docker image, but I haven't figured out how to install CUDA on manylinux.
           os: [ubuntu-20.04]
           python-version: ['3.10']
-          torch-version: ['2.4.1']
+          torch-version: ['2.4.1', '2.5.1', '2.6.0']
           cuda-version: ['11.8.0']
           # We need separate wheels that either uses C++11 ABI (-D_GLIBCXX_USE_CXX11_ABI) or not.
           # Pytorch wheels currently don't use it, but nvcr images have Pytorch compiled with C++11 ABI.

From 4ebb8be5df0bb73ae7da14f408b9795e9423a2a1 Mon Sep 17 00:00:00 2001
From: timt51 <ttruong@ne47.bio>
Date: Mon, 24 Mar 2025 20:51:27 -0400
Subject: [PATCH 24/24] export minv and maxv for pytorch 2.6

---
 .github/workflows/publish.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index 42a7db735b..7997ef0572 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -118,8 +118,8 @@ jobs:
           # see https://github.com/pytorch/pytorch/blob/main/RELEASE.md#release-compatibility-matrix
           # This code is ugly, maybe there's a better way to do this.
           export TORCH_CUDA_VERSION=$(python -c "from os import environ as env; \
-            minv = {'2.0': 117, '2.1': 118, '2.2': 118, '2.3': 118, '2.4': 118, '2.5': 118}[env['MATRIX_TORCH_VERSION']]; \
-            maxv = {'2.0': 118, '2.1': 121, '2.2': 121, '2.3': 121, '2.4': 121, '2.5': 124}[env['MATRIX_TORCH_VERSION']]; \
+            minv = {'2.0': 117, '2.1': 118, '2.2': 118, '2.3': 118, '2.4': 118, '2.5': 118, '2.6': 118}[env['MATRIX_TORCH_VERSION']]; \
+            maxv = {'2.0': 118, '2.1': 121, '2.2': 121, '2.3': 121, '2.4': 121, '2.5': 124, '2.6': 126}[env['MATRIX_TORCH_VERSION']]; \
             print(max(min(int(env['MATRIX_CUDA_VERSION']), maxv), minv))" \
           )
           if [[ ${{ matrix.torch-version }} == *"dev"* ]]; then