From aebc5a74c6f031d5374faa83fdcc2ce92fef805f Mon Sep 17 00:00:00 2001 From: Timothy Fei Truong Jr Date: Fri, 6 Sep 2024 15:14:52 +0000 Subject: [PATCH 01/24] convert alibi bias to simply bh bias (same as alibi without dependence on position idx) --- csrc/flash_attn/src/mask.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/csrc/flash_attn/src/mask.h b/csrc/flash_attn/src/mask.h index 7ba435a37b..361ea2380e 100644 --- a/csrc/flash_attn/src/mask.h +++ b/csrc/flash_attn/src/mask.h @@ -153,7 +153,7 @@ struct Mask { for (int mi = 0; mi < size<0>(tensor); ++mi) { // No causal, no local if constexpr (Has_alibi) { - tensor(mi, make_coord(j, nj)) += alibi_slope * col_idx; + tensor(mi, make_coord(j, nj)) += alibi_slope; // * col_idx; } if constexpr (!Is_even_MN) { if (col_idx >= max_seqlen_k) { tensor(mi, make_coord(j, nj)) = -INFINITY; } @@ -178,9 +178,9 @@ struct Mask { const int col_idx = col_idx_base + j; if constexpr (Has_alibi) { if constexpr (Is_causal) { - tensor(make_coord(i, mi), make_coord(j, nj)) += alibi_slope * col_idx; + tensor(make_coord(i, mi), make_coord(j, nj)) += alibi_slope; // * col_idx; } else { - tensor(make_coord(i, mi), make_coord(j, nj)) -= alibi_slope * abs(row_idx + max_seqlen_k - max_seqlen_q - col_idx); + tensor(make_coord(i, mi), make_coord(j, nj)) -= alibi_slope; // * abs(row_idx + max_seqlen_k - max_seqlen_q - col_idx); } } From cfca2e8153962ec2cd0bed7c1ae0ad394f3f03ca Mon Sep 17 00:00:00 2001 From: Timothy Fei Truong Jr Date: Fri, 6 Sep 2024 15:26:07 +0000 Subject: [PATCH 02/24] publish fewer versions --- .github/workflows/publish.yml | 70 +++++++++++++++++------------------ 1 file changed, 35 insertions(+), 35 deletions(-) diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index ead7ff04b8..cfc1aa3378 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -43,14 +43,14 @@ jobs: # Using ubuntu-20.04 instead of 22.04 for more compatibility (glibc). Ideally we'd use the # manylinux docker image, but I haven't figured out how to install CUDA on manylinux. os: [ubuntu-20.04] - python-version: ['3.8', '3.9', '3.10', '3.11', '3.12'] - torch-version: ['2.0.1', '2.1.2', '2.2.2', '2.3.1', '2.4.0'] - cuda-version: ['11.8.0', '12.3.2'] + python-version: ['3.9', '3.10'] + torch-version: ['2.4.0'] + cuda-version: ['11.8.0'] # We need separate wheels that either uses C++11 ABI (-D_GLIBCXX_USE_CXX11_ABI) or not. # Pytorch wheels currently don't use it, but nvcr images have Pytorch compiled with C++11 ABI. # Without this we get import error (undefined symbol: _ZN3c105ErrorC2ENS_14SourceLocationESs) # when building without C++11 ABI and using it on nvcr images. - cxx11_abi: ['FALSE', 'TRUE'] + cxx11_abi: ['FALSE'] exclude: # see https://github.com/pytorch/pytorch/blob/main/RELEASE.md#release-compatibility-matrix # Pytorch < 2.2 does not support Python 3.12 @@ -179,34 +179,34 @@ jobs: asset_name: ${{env.wheel_name}} asset_content_type: application/* - publish_package: - name: Publish package - needs: [build_wheels] - - runs-on: ubuntu-latest - - steps: - - uses: actions/checkout@v3 - - - uses: actions/setup-python@v4 - with: - python-version: '3.10' - - - name: Install dependencies - run: | - pip install ninja packaging setuptools wheel twine - # We don't want to download anything CUDA-related here - pip install torch --index-url https://download.pytorch.org/whl/cpu - - - name: Build core package - env: - FLASH_ATTENTION_SKIP_CUDA_BUILD: "TRUE" - run: | - python setup.py sdist --dist-dir=dist - - - name: Deploy - env: - TWINE_USERNAME: "__token__" - TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }} - run: | - python -m twine upload dist/* + # publish_package: + # name: Publish package + # needs: [build_wheels] + + # runs-on: ubuntu-latest + + # steps: + # - uses: actions/checkout@v3 + + # - uses: actions/setup-python@v4 + # with: + # python-version: '3.10' + + # - name: Install dependencies + # run: | + # pip install ninja packaging setuptools wheel twine + # # We don't want to download anything CUDA-related here + # pip install torch --index-url https://download.pytorch.org/whl/cpu + + # - name: Build core package + # env: + # FLASH_ATTENTION_SKIP_CUDA_BUILD: "TRUE" + # run: | + # python setup.py sdist --dist-dir=dist + + # - name: Deploy + # env: + # TWINE_USERNAME: "__token__" + # TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }} + # run: | + # python -m twine upload dist/* From 14b704d64b3df8ed166dda5d17c2027987d6bf9c Mon Sep 17 00:00:00 2001 From: Timothy Fei Truong Jr Date: Sat, 7 Sep 2024 03:40:04 +0000 Subject: [PATCH 03/24] fix diagonal bias --- csrc/flash_attn/src/mask.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/csrc/flash_attn/src/mask.h b/csrc/flash_attn/src/mask.h index 361ea2380e..c21933e454 100644 --- a/csrc/flash_attn/src/mask.h +++ b/csrc/flash_attn/src/mask.h @@ -153,7 +153,7 @@ struct Mask { for (int mi = 0; mi < size<0>(tensor); ++mi) { // No causal, no local if constexpr (Has_alibi) { - tensor(mi, make_coord(j, nj)) += alibi_slope; // * col_idx; + tensor(mi, make_coord(j, nj)) += (col_idx == 0 ? 0 : alibi_slope); // alibi_slope * col_idx; } if constexpr (!Is_even_MN) { if (col_idx >= max_seqlen_k) { tensor(mi, make_coord(j, nj)) = -INFINITY; } @@ -178,9 +178,10 @@ struct Mask { const int col_idx = col_idx_base + j; if constexpr (Has_alibi) { if constexpr (Is_causal) { - tensor(make_coord(i, mi), make_coord(j, nj)) += alibi_slope; // * col_idx; + tensor(make_coord(i, mi), make_coord(j, nj)) += (col_idx == row_idx ? 0 : alibi_slope); // alibi_slope * col_idx; + } else { - tensor(make_coord(i, mi), make_coord(j, nj)) -= alibi_slope; // * abs(row_idx + max_seqlen_k - max_seqlen_q - col_idx); + tensor(make_coord(i, mi), make_coord(j, nj)) += (col_idx == row_idx ? 0 : alibi_slope); // -= alibi_slope * abs(row_idx + max_seqlen_k - max_seqlen_q - col_idx); } } From 5996687628275b1dd544f067ec4a34ecd747eea6 Mon Sep 17 00:00:00 2001 From: Timothy Fei Truong Jr Date: Sat, 7 Sep 2024 13:41:10 +0000 Subject: [PATCH 04/24] try just setting bias to 0 when Col_idx_only --- csrc/flash_attn/src/mask.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/flash_attn/src/mask.h b/csrc/flash_attn/src/mask.h index c21933e454..13c2688af5 100644 --- a/csrc/flash_attn/src/mask.h +++ b/csrc/flash_attn/src/mask.h @@ -153,7 +153,7 @@ struct Mask { for (int mi = 0; mi < size<0>(tensor); ++mi) { // No causal, no local if constexpr (Has_alibi) { - tensor(mi, make_coord(j, nj)) += (col_idx == 0 ? 0 : alibi_slope); // alibi_slope * col_idx; + tensor(mi, make_coord(j, nj)) += 0; // alibi_slope * col_idx; } if constexpr (!Is_even_MN) { if (col_idx >= max_seqlen_k) { tensor(mi, make_coord(j, nj)) = -INFINITY; } From 3b267ff203c102569887a3ceb3e3ed153f6104c4 Mon Sep 17 00:00:00 2001 From: Timothy Fei Truong Jr Date: Sat, 7 Sep 2024 14:07:09 +0000 Subject: [PATCH 05/24] try disabling Col_idx_only if alibi --- csrc/flash_attn/src/mask.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/flash_attn/src/mask.h b/csrc/flash_attn/src/mask.h index 13c2688af5..0f1f3763a1 100644 --- a/csrc/flash_attn/src/mask.h +++ b/csrc/flash_attn/src/mask.h @@ -139,7 +139,7 @@ struct Mask { // Reshape tensor_ from (MMA=4, MMA_M, MMA_N) to (nrow=(2, MMA_M), ncol=(2, MMA_N)) Tensor tensor = make_tensor(tensor_.data(), flash::convert_layout_acc_rowcol(tensor_.layout())); // Do we need both row and column indices, or just column incides? - static constexpr bool Col_idx_only = !(Has_alibi && !Is_causal) && !Is_local && !Causal_mask; + static constexpr bool Col_idx_only = !Has_alibi && !(Has_alibi && !Is_causal) && !Is_local && !Causal_mask; const int lane_id = threadIdx.x % 32; const int col_idx_offset = col_idx_offset_ + (lane_id % 4) * 2; if constexpr (Col_idx_only) { From b9ec21569c1bfa415539698f723711d7c32e67fa Mon Sep 17 00:00:00 2001 From: Timothy Fei Truong Jr Date: Sun, 8 Sep 2024 01:36:38 +0000 Subject: [PATCH 06/24] fix bh bias diagonal handling --- csrc/flash_attn/src/mask.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/csrc/flash_attn/src/mask.h b/csrc/flash_attn/src/mask.h index 0f1f3763a1..16a84c6ebc 100644 --- a/csrc/flash_attn/src/mask.h +++ b/csrc/flash_attn/src/mask.h @@ -153,7 +153,7 @@ struct Mask { for (int mi = 0; mi < size<0>(tensor); ++mi) { // No causal, no local if constexpr (Has_alibi) { - tensor(mi, make_coord(j, nj)) += 0; // alibi_slope * col_idx; + tensor(mi, make_coord(j, nj)) += alibi_slope * col_idx; } if constexpr (!Is_even_MN) { if (col_idx >= max_seqlen_k) { tensor(mi, make_coord(j, nj)) = -INFINITY; } @@ -178,10 +178,10 @@ struct Mask { const int col_idx = col_idx_base + j; if constexpr (Has_alibi) { if constexpr (Is_causal) { - tensor(make_coord(i, mi), make_coord(j, nj)) += (col_idx == row_idx ? 0 : alibi_slope); // alibi_slope * col_idx; + tensor(make_coord(i, mi), make_coord(j, nj)) += ((col_idx == (col_idx_limit_right - 1)) ? 0 : alibi_slope); } else { - tensor(make_coord(i, mi), make_coord(j, nj)) += (col_idx == row_idx ? 0 : alibi_slope); // -= alibi_slope * abs(row_idx + max_seqlen_k - max_seqlen_q - col_idx); + tensor(make_coord(i, mi), make_coord(j, nj)) -= alibi_slope * abs(row_idx + max_seqlen_k - max_seqlen_q - col_idx); } } From 42a895e6cf8e1906f353f694246a4d143521b421 Mon Sep 17 00:00:00 2001 From: timt51 Date: Wed, 5 Feb 2025 18:41:27 -0500 Subject: [PATCH 07/24] compile for pytorch 2.5.1 --- .github/workflows/publish.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index cfc1aa3378..5745280bb3 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -44,7 +44,7 @@ jobs: # manylinux docker image, but I haven't figured out how to install CUDA on manylinux. os: [ubuntu-20.04] python-version: ['3.9', '3.10'] - torch-version: ['2.4.0'] + torch-version: ['2.5.1'] cuda-version: ['11.8.0'] # We need separate wheels that either uses C++11 ABI (-D_GLIBCXX_USE_CXX11_ABI) or not. # Pytorch wheels currently don't use it, but nvcr images have Pytorch compiled with C++11 ABI. From f124d98a3695f7b27b7c75b148ae79b18697d7b3 Mon Sep 17 00:00:00 2001 From: timt51 Date: Wed, 5 Feb 2025 18:58:54 -0500 Subject: [PATCH 08/24] fix TORCH_CUDA_VERSION env var for pytorch 2.5 --- .github/workflows/publish.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 5745280bb3..0f37f8e654 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -118,8 +118,8 @@ jobs: # see https://github.com/pytorch/pytorch/blob/main/RELEASE.md#release-compatibility-matrix # This code is ugly, maybe there's a better way to do this. export TORCH_CUDA_VERSION=$(python -c "from os import environ as env; \ - minv = {'2.0': 117, '2.1': 118, '2.2': 118, '2.3': 118, '2.4': 118}[env['MATRIX_TORCH_VERSION']]; \ - maxv = {'2.0': 118, '2.1': 121, '2.2': 121, '2.3': 121, '2.4': 121}[env['MATRIX_TORCH_VERSION']]; \ + minv = {'2.0': 117, '2.1': 118, '2.2': 118, '2.3': 118, '2.4': 118, '2.5': 118}[env['MATRIX_TORCH_VERSION']]; \ + maxv = {'2.0': 118, '2.1': 121, '2.2': 121, '2.3': 121, '2.4': 121, '2.5': 124}[env['MATRIX_TORCH_VERSION']]; \ print(max(min(int(env['MATRIX_CUDA_VERSION']), maxv), minv))" \ ) if [[ ${{ matrix.torch-version }} == *"dev"* ]]; then From 4f8b153f25bf5237f0d97d42e2a2b7956cc40a90 Mon Sep 17 00:00:00 2001 From: timt51 Date: Fri, 7 Feb 2025 09:10:16 -0500 Subject: [PATCH 09/24] adjust alibi for non causal too --- csrc/flash_attn/src/mask.h | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/csrc/flash_attn/src/mask.h b/csrc/flash_attn/src/mask.h index 16a84c6ebc..b42edbbac9 100644 --- a/csrc/flash_attn/src/mask.h +++ b/csrc/flash_attn/src/mask.h @@ -177,13 +177,7 @@ struct Mask { for (int j = 0; j < size<1, 0>(tensor); ++j) { const int col_idx = col_idx_base + j; if constexpr (Has_alibi) { - if constexpr (Is_causal) { - tensor(make_coord(i, mi), make_coord(j, nj)) += ((col_idx == (col_idx_limit_right - 1)) ? 0 : alibi_slope); - - } else { - tensor(make_coord(i, mi), make_coord(j, nj)) -= alibi_slope * abs(row_idx + max_seqlen_k - max_seqlen_q - col_idx); - - } + tensor(make_coord(i, mi), make_coord(j, nj)) += ((col_idx == (col_idx_limit_right - 1)) ? 0 : alibi_slope); } if constexpr (Causal_mask) { if (col_idx >= col_idx_limit_right) { From d2dd0fdf215aa8bbe6263e5c288f8a26b64e6551 Mon Sep 17 00:00:00 2001 From: timt51 Date: Fri, 7 Feb 2025 09:12:08 -0500 Subject: [PATCH 10/24] compile for torch 2.4.1 too --- .github/workflows/publish.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 0f37f8e654..67a7965a8b 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -44,7 +44,7 @@ jobs: # manylinux docker image, but I haven't figured out how to install CUDA on manylinux. os: [ubuntu-20.04] python-version: ['3.9', '3.10'] - torch-version: ['2.5.1'] + torch-version: ['2.4.1', '2.5.1'] cuda-version: ['11.8.0'] # We need separate wheels that either uses C++11 ABI (-D_GLIBCXX_USE_CXX11_ABI) or not. # Pytorch wheels currently don't use it, but nvcr images have Pytorch compiled with C++11 ABI. From 980f524999b3c446f040349dcef69df58cbb71fd Mon Sep 17 00:00:00 2001 From: timt51 Date: Fri, 7 Feb 2025 09:28:34 -0500 Subject: [PATCH 11/24] make a version that always adds the alibi slope... --- csrc/flash_attn/src/mask.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/flash_attn/src/mask.h b/csrc/flash_attn/src/mask.h index b42edbbac9..31df79f4e8 100644 --- a/csrc/flash_attn/src/mask.h +++ b/csrc/flash_attn/src/mask.h @@ -177,7 +177,7 @@ struct Mask { for (int j = 0; j < size<1, 0>(tensor); ++j) { const int col_idx = col_idx_base + j; if constexpr (Has_alibi) { - tensor(make_coord(i, mi), make_coord(j, nj)) += ((col_idx == (col_idx_limit_right - 1)) ? 0 : alibi_slope); + tensor(make_coord(i, mi), make_coord(j, nj)) += alibi_slope; } if constexpr (Causal_mask) { if (col_idx >= col_idx_limit_right) { From c42bf6bc66797306028f3d9246db62897ced164e Mon Sep 17 00:00:00 2001 From: timt51 Date: Fri, 7 Feb 2025 09:29:06 -0500 Subject: [PATCH 12/24] ...and only compile for py310 and torch24 --- .github/workflows/publish.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 67a7965a8b..e7a586e2c0 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -43,8 +43,8 @@ jobs: # Using ubuntu-20.04 instead of 22.04 for more compatibility (glibc). Ideally we'd use the # manylinux docker image, but I haven't figured out how to install CUDA on manylinux. os: [ubuntu-20.04] - python-version: ['3.9', '3.10'] - torch-version: ['2.4.1', '2.5.1'] + python-version: ['3.10'] + torch-version: ['2.4.1'] cuda-version: ['11.8.0'] # We need separate wheels that either uses C++11 ABI (-D_GLIBCXX_USE_CXX11_ABI) or not. # Pytorch wheels currently don't use it, but nvcr images have Pytorch compiled with C++11 ABI. From a8d7fccd26e58eefe89f1b79454ea88886f89d21 Mon Sep 17 00:00:00 2001 From: timt51 Date: Fri, 7 Feb 2025 09:47:15 -0500 Subject: [PATCH 13/24] redo causal and noncausal alibi for diagonal --- csrc/flash_attn/src/mask.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/csrc/flash_attn/src/mask.h b/csrc/flash_attn/src/mask.h index 31df79f4e8..709493331a 100644 --- a/csrc/flash_attn/src/mask.h +++ b/csrc/flash_attn/src/mask.h @@ -177,7 +177,13 @@ struct Mask { for (int j = 0; j < size<1, 0>(tensor); ++j) { const int col_idx = col_idx_base + j; if constexpr (Has_alibi) { - tensor(make_coord(i, mi), make_coord(j, nj)) += alibi_slope; + if constexpr (Is_causal) { + tensor(make_coord(i, mi), make_coord(j, nj)) += ((col_idx == (col_idx_limit_right - 1)) ? 0 : alibi_slope); + + } else { + tensor(make_coord(i, mi), make_coord(j, nj)) += ((col_idx == row_idx) ? 0 : alibi_slope); + + } } if constexpr (Causal_mask) { if (col_idx >= col_idx_limit_right) { From 599e3225e34fdd5aa9a9261247b44e56df85849f Mon Sep 17 00:00:00 2001 From: timt51 Date: Fri, 7 Feb 2025 09:59:36 -0500 Subject: [PATCH 14/24] bidirectional diagonal handling - take into account seqlens --- csrc/flash_attn/src/mask.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/flash_attn/src/mask.h b/csrc/flash_attn/src/mask.h index 709493331a..234f2f61a6 100644 --- a/csrc/flash_attn/src/mask.h +++ b/csrc/flash_attn/src/mask.h @@ -181,7 +181,7 @@ struct Mask { tensor(make_coord(i, mi), make_coord(j, nj)) += ((col_idx == (col_idx_limit_right - 1)) ? 0 : alibi_slope); } else { - tensor(make_coord(i, mi), make_coord(j, nj)) += ((col_idx == row_idx) ? 0 : alibi_slope); + tensor(make_coord(i, mi), make_coord(j, nj)) += ((row_idx + max_seqlen_k == max_seqlen_q - col_idx) ? 0 : alibi_slope); } } From fdc50f3987bb3c93b3b10641726130826f699e66 Mon Sep 17 00:00:00 2001 From: timt51 Date: Fri, 7 Feb 2025 10:06:31 -0500 Subject: [PATCH 15/24] fix diagonal formula --- csrc/flash_attn/src/mask.h | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/csrc/flash_attn/src/mask.h b/csrc/flash_attn/src/mask.h index 234f2f61a6..3c38aaa4d8 100644 --- a/csrc/flash_attn/src/mask.h +++ b/csrc/flash_attn/src/mask.h @@ -177,13 +177,7 @@ struct Mask { for (int j = 0; j < size<1, 0>(tensor); ++j) { const int col_idx = col_idx_base + j; if constexpr (Has_alibi) { - if constexpr (Is_causal) { - tensor(make_coord(i, mi), make_coord(j, nj)) += ((col_idx == (col_idx_limit_right - 1)) ? 0 : alibi_slope); - - } else { - tensor(make_coord(i, mi), make_coord(j, nj)) += ((row_idx + max_seqlen_k == max_seqlen_q - col_idx) ? 0 : alibi_slope); - - } + tensor(make_coord(i, mi), make_coord(j, nj)) += ((row_idx + max_seqlen_k == max_seqlen_q + col_idx) ? 0 : alibi_slope); } if constexpr (Causal_mask) { if (col_idx >= col_idx_limit_right) { From a645eddfb8b51dd6788efb1a8a816e2a29da282d Mon Sep 17 00:00:00 2001 From: timt51 Date: Fri, 7 Feb 2025 21:38:12 -0500 Subject: [PATCH 16/24] diagonal noncausal try accounting for max_seqlens too --- csrc/flash_attn/src/mask.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/csrc/flash_attn/src/mask.h b/csrc/flash_attn/src/mask.h index 3c38aaa4d8..5645e6db99 100644 --- a/csrc/flash_attn/src/mask.h +++ b/csrc/flash_attn/src/mask.h @@ -177,7 +177,13 @@ struct Mask { for (int j = 0; j < size<1, 0>(tensor); ++j) { const int col_idx = col_idx_base + j; if constexpr (Has_alibi) { - tensor(make_coord(i, mi), make_coord(j, nj)) += ((row_idx + max_seqlen_k == max_seqlen_q + col_idx) ? 0 : alibi_slope); + if constexpr (Is_causal) { + tensor(make_coord(i, mi), make_coord(j, nj)) += ((col_idx == (col_idx_limit_right - 1)) ? 0 : alibi_slope); + + } else { + tensor(make_coord(i, mi), make_coord(j, nj)) += ((row_idx + max_seqlen_k == max_seqlen_q + col_idx) ? 0 : alibi_slope); + + } } if constexpr (Causal_mask) { if (col_idx >= col_idx_limit_right) { From f5ce6ee9bdfb04f565c39475e63afb68fbaa78c0 Mon Sep 17 00:00:00 2001 From: timt51 Date: Sat, 8 Feb 2025 08:29:40 -0500 Subject: [PATCH 17/24] follow the original expression more closely --- csrc/flash_attn/src/mask.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/flash_attn/src/mask.h b/csrc/flash_attn/src/mask.h index 5645e6db99..649a1dc854 100644 --- a/csrc/flash_attn/src/mask.h +++ b/csrc/flash_attn/src/mask.h @@ -181,7 +181,7 @@ struct Mask { tensor(make_coord(i, mi), make_coord(j, nj)) += ((col_idx == (col_idx_limit_right - 1)) ? 0 : alibi_slope); } else { - tensor(make_coord(i, mi), make_coord(j, nj)) += ((row_idx + max_seqlen_k == max_seqlen_q + col_idx) ? 0 : alibi_slope); + tensor(make_coord(i, mi), make_coord(j, nj)) += (((row_idx + max_seqlen_k - max_seqlen_q - col_idx) == 0) ? 0 : alibi_slope); } } From efbbaf4530c88b2557e5dda2b1717eb02b040406 Mon Sep 17 00:00:00 2001 From: timt51 Date: Sat, 8 Feb 2025 09:22:31 -0500 Subject: [PATCH 18/24] also modify alibi.h --- csrc/flash_attn/src/alibi.h | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/csrc/flash_attn/src/alibi.h b/csrc/flash_attn/src/alibi.h index e714233e7e..52057a9598 100644 --- a/csrc/flash_attn/src/alibi.h +++ b/csrc/flash_attn/src/alibi.h @@ -37,14 +37,20 @@ struct Alibi { const int col_idx_offset = col_idx_offset_ + (lane_id % 4) * 2; if constexpr (Is_causal) { // Simpler, we add the same bias vector to all rows #pragma unroll - for (int nj = 0; nj < size<1, 1>(tensor); ++nj) { - const int col_idx_base = col_idx_offset + nj * 8; + for (int mi = 0; mi < size<0, 1>(tensor); ++mi) { + const int row_idx_base = row_idx_offset + mi * warp_row_stride; #pragma unroll - for (int j = 0; j < size<1, 0>(tensor); ++j) { - const int col_idx = col_idx_base + j; + for (int i = 0; i < size<0, 0>(tensor); ++i) { + const int row_idx = row_idx_base + i * 8; + const int col_idx_limit_right = std::min(max_seqlen_k, row_idx + 1 + max_seqlen_k - max_seqlen_q); #pragma unroll - for (int mi = 0; mi < size<0>(tensor); ++mi) { - tensor(mi, make_coord(j, nj)) += alibi_slope * col_idx; + for (int nj = 0; nj < size<1, 1>(tensor); ++nj) { + const int col_idx_base = col_idx_offset + nj * 8; + #pragma unroll + for (int j = 0; j < size<1, 0>(tensor); ++j) { + const int col_idx = col_idx_base + j; + tensor(make_coord(i, mi), make_coord(j, nj)) += ((col_idx == (col_idx_limit_right - 1)) ? 0 : alibi_slope); + } } } } @@ -61,7 +67,7 @@ struct Alibi { #pragma unroll for (int j = 0; j < size<1, 0>(tensor); ++j) { const int col_idx = col_idx_base + j; - tensor(make_coord(i, mi), make_coord(j, nj)) -= alibi_slope * abs(row_idx + max_seqlen_k - max_seqlen_q - col_idx); + tensor(make_coord(i, mi), make_coord(j, nj)) += (((row_idx + max_seqlen_k - max_seqlen_q - col_idx) == 0) ? 0 : alibi_slope); } } } From 88e1cc9c47a6bb6317432d58305196b6abdebf8d Mon Sep 17 00:00:00 2001 From: timt51 Date: Sat, 8 Feb 2025 09:38:28 -0500 Subject: [PATCH 19/24] Update mask.h --- csrc/flash_attn/src/mask.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/flash_attn/src/mask.h b/csrc/flash_attn/src/mask.h index 649a1dc854..709493331a 100644 --- a/csrc/flash_attn/src/mask.h +++ b/csrc/flash_attn/src/mask.h @@ -181,7 +181,7 @@ struct Mask { tensor(make_coord(i, mi), make_coord(j, nj)) += ((col_idx == (col_idx_limit_right - 1)) ? 0 : alibi_slope); } else { - tensor(make_coord(i, mi), make_coord(j, nj)) += (((row_idx + max_seqlen_k - max_seqlen_q - col_idx) == 0) ? 0 : alibi_slope); + tensor(make_coord(i, mi), make_coord(j, nj)) += ((col_idx == row_idx) ? 0 : alibi_slope); } } From 4bda2db97ba780d3fc342935f8bdcfbce1c5f76e Mon Sep 17 00:00:00 2001 From: timt51 Date: Sat, 8 Feb 2025 09:38:56 -0500 Subject: [PATCH 20/24] alibi.h use the non max seqlen formula which seems more correct actually? --- csrc/flash_attn/src/alibi.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/flash_attn/src/alibi.h b/csrc/flash_attn/src/alibi.h index 52057a9598..5300cec763 100644 --- a/csrc/flash_attn/src/alibi.h +++ b/csrc/flash_attn/src/alibi.h @@ -67,7 +67,7 @@ struct Alibi { #pragma unroll for (int j = 0; j < size<1, 0>(tensor); ++j) { const int col_idx = col_idx_base + j; - tensor(make_coord(i, mi), make_coord(j, nj)) += (((row_idx + max_seqlen_k - max_seqlen_q - col_idx) == 0) ? 0 : alibi_slope); + tensor(make_coord(i, mi), make_coord(j, nj)) += ((col_idx == row_idx) ? 0 : alibi_slope); } } } From 5b937eb306d6a2a04b723ddde7a06dfd332a5195 Mon Sep 17 00:00:00 2001 From: Timothy Fei Truong Jr Date: Mon, 24 Mar 2025 21:29:27 +0000 Subject: [PATCH 21/24] Revert "alibi.h use the non max seqlen formula which seems more correct actually?" This reverts commit 4bda2db97ba780d3fc342935f8bdcfbce1c5f76e. --- csrc/flash_attn/src/alibi.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/flash_attn/src/alibi.h b/csrc/flash_attn/src/alibi.h index 5300cec763..52057a9598 100644 --- a/csrc/flash_attn/src/alibi.h +++ b/csrc/flash_attn/src/alibi.h @@ -67,7 +67,7 @@ struct Alibi { #pragma unroll for (int j = 0; j < size<1, 0>(tensor); ++j) { const int col_idx = col_idx_base + j; - tensor(make_coord(i, mi), make_coord(j, nj)) += ((col_idx == row_idx) ? 0 : alibi_slope); + tensor(make_coord(i, mi), make_coord(j, nj)) += (((row_idx + max_seqlen_k - max_seqlen_q - col_idx) == 0) ? 0 : alibi_slope); } } } From cb7632e7998143aea6754ad1d148522d3d2fe3a4 Mon Sep 17 00:00:00 2001 From: Timothy Fei Truong Jr Date: Mon, 24 Mar 2025 21:29:29 +0000 Subject: [PATCH 22/24] Revert "Update mask.h" This reverts commit 88e1cc9c47a6bb6317432d58305196b6abdebf8d. --- csrc/flash_attn/src/mask.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/flash_attn/src/mask.h b/csrc/flash_attn/src/mask.h index 709493331a..649a1dc854 100644 --- a/csrc/flash_attn/src/mask.h +++ b/csrc/flash_attn/src/mask.h @@ -181,7 +181,7 @@ struct Mask { tensor(make_coord(i, mi), make_coord(j, nj)) += ((col_idx == (col_idx_limit_right - 1)) ? 0 : alibi_slope); } else { - tensor(make_coord(i, mi), make_coord(j, nj)) += ((col_idx == row_idx) ? 0 : alibi_slope); + tensor(make_coord(i, mi), make_coord(j, nj)) += (((row_idx + max_seqlen_k - max_seqlen_q - col_idx) == 0) ? 0 : alibi_slope); } } From 7104b3495ea8aaf825a9248354508f216a7b9bfd Mon Sep 17 00:00:00 2001 From: timt51 Date: Mon, 24 Mar 2025 17:32:01 -0400 Subject: [PATCH 23/24] publish for pytorch 2.5.1 and 2.6.0 too --- .github/workflows/publish.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index e7a586e2c0..42a7db735b 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -44,7 +44,7 @@ jobs: # manylinux docker image, but I haven't figured out how to install CUDA on manylinux. os: [ubuntu-20.04] python-version: ['3.10'] - torch-version: ['2.4.1'] + torch-version: ['2.4.1', '2.5.1', '2.6.0'] cuda-version: ['11.8.0'] # We need separate wheels that either uses C++11 ABI (-D_GLIBCXX_USE_CXX11_ABI) or not. # Pytorch wheels currently don't use it, but nvcr images have Pytorch compiled with C++11 ABI. From 4ebb8be5df0bb73ae7da14f408b9795e9423a2a1 Mon Sep 17 00:00:00 2001 From: timt51 Date: Mon, 24 Mar 2025 20:51:27 -0400 Subject: [PATCH 24/24] export minv and maxv for pytorch 2.6 --- .github/workflows/publish.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 42a7db735b..7997ef0572 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -118,8 +118,8 @@ jobs: # see https://github.com/pytorch/pytorch/blob/main/RELEASE.md#release-compatibility-matrix # This code is ugly, maybe there's a better way to do this. export TORCH_CUDA_VERSION=$(python -c "from os import environ as env; \ - minv = {'2.0': 117, '2.1': 118, '2.2': 118, '2.3': 118, '2.4': 118, '2.5': 118}[env['MATRIX_TORCH_VERSION']]; \ - maxv = {'2.0': 118, '2.1': 121, '2.2': 121, '2.3': 121, '2.4': 121, '2.5': 124}[env['MATRIX_TORCH_VERSION']]; \ + minv = {'2.0': 117, '2.1': 118, '2.2': 118, '2.3': 118, '2.4': 118, '2.5': 118, '2.6': 118}[env['MATRIX_TORCH_VERSION']]; \ + maxv = {'2.0': 118, '2.1': 121, '2.2': 121, '2.3': 121, '2.4': 121, '2.5': 124, '2.6': 126}[env['MATRIX_TORCH_VERSION']]; \ print(max(min(int(env['MATRIX_CUDA_VERSION']), maxv), minv))" \ ) if [[ ${{ matrix.torch-version }} == *"dev"* ]]; then