diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index ead7ff04b8..7997ef0572 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -43,14 +43,14 @@ jobs: # Using ubuntu-20.04 instead of 22.04 for more compatibility (glibc). Ideally we'd use the # manylinux docker image, but I haven't figured out how to install CUDA on manylinux. os: [ubuntu-20.04] - python-version: ['3.8', '3.9', '3.10', '3.11', '3.12'] - torch-version: ['2.0.1', '2.1.2', '2.2.2', '2.3.1', '2.4.0'] - cuda-version: ['11.8.0', '12.3.2'] + python-version: ['3.10'] + torch-version: ['2.4.1', '2.5.1', '2.6.0'] + cuda-version: ['11.8.0'] # We need separate wheels that either uses C++11 ABI (-D_GLIBCXX_USE_CXX11_ABI) or not. # Pytorch wheels currently don't use it, but nvcr images have Pytorch compiled with C++11 ABI. # Without this we get import error (undefined symbol: _ZN3c105ErrorC2ENS_14SourceLocationESs) # when building without C++11 ABI and using it on nvcr images. - cxx11_abi: ['FALSE', 'TRUE'] + cxx11_abi: ['FALSE'] exclude: # see https://github.com/pytorch/pytorch/blob/main/RELEASE.md#release-compatibility-matrix # Pytorch < 2.2 does not support Python 3.12 @@ -118,8 +118,8 @@ jobs: # see https://github.com/pytorch/pytorch/blob/main/RELEASE.md#release-compatibility-matrix # This code is ugly, maybe there's a better way to do this. 
export TORCH_CUDA_VERSION=$(python -c "from os import environ as env; \ - minv = {'2.0': 117, '2.1': 118, '2.2': 118, '2.3': 118, '2.4': 118}[env['MATRIX_TORCH_VERSION']]; \ - maxv = {'2.0': 118, '2.1': 121, '2.2': 121, '2.3': 121, '2.4': 121}[env['MATRIX_TORCH_VERSION']]; \ + minv = {'2.0': 117, '2.1': 118, '2.2': 118, '2.3': 118, '2.4': 118, '2.5': 118, '2.6': 118}[env['MATRIX_TORCH_VERSION']]; \ + maxv = {'2.0': 118, '2.1': 121, '2.2': 121, '2.3': 121, '2.4': 121, '2.5': 124, '2.6': 126}[env['MATRIX_TORCH_VERSION']]; \ print(max(min(int(env['MATRIX_CUDA_VERSION']), maxv), minv))" \ ) if [[ ${{ matrix.torch-version }} == *"dev"* ]]; then @@ -179,34 +179,34 @@ jobs: asset_name: ${{env.wheel_name}} asset_content_type: application/* - publish_package: - name: Publish package - needs: [build_wheels] - - runs-on: ubuntu-latest - - steps: - - uses: actions/checkout@v3 - - - uses: actions/setup-python@v4 - with: - python-version: '3.10' - - - name: Install dependencies - run: | - pip install ninja packaging setuptools wheel twine - # We don't want to download anything CUDA-related here - pip install torch --index-url https://download.pytorch.org/whl/cpu - - - name: Build core package - env: - FLASH_ATTENTION_SKIP_CUDA_BUILD: "TRUE" - run: | - python setup.py sdist --dist-dir=dist - - - name: Deploy - env: - TWINE_USERNAME: "__token__" - TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }} - run: | - python -m twine upload dist/* + # publish_package: + # name: Publish package + # needs: [build_wheels] + + # runs-on: ubuntu-latest + + # steps: + # - uses: actions/checkout@v3 + + # - uses: actions/setup-python@v4 + # with: + # python-version: '3.10' + + # - name: Install dependencies + # run: | + # pip install ninja packaging setuptools wheel twine + # # We don't want to download anything CUDA-related here + # pip install torch --index-url https://download.pytorch.org/whl/cpu + + # - name: Build core package + # env: + # FLASH_ATTENTION_SKIP_CUDA_BUILD: "TRUE" + # run: | + # 
python setup.py sdist --dist-dir=dist + + # - name: Deploy + # env: + # TWINE_USERNAME: "__token__" + # TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }} + # run: | + # python -m twine upload dist/* diff --git a/csrc/flash_attn/src/alibi.h b/csrc/flash_attn/src/alibi.h index e714233e7e..52057a9598 100644 --- a/csrc/flash_attn/src/alibi.h +++ b/csrc/flash_attn/src/alibi.h @@ -37,14 +37,20 @@ struct Alibi { const int col_idx_offset = col_idx_offset_ + (lane_id % 4) * 2; if constexpr (Is_causal) { // Simpler, we add the same bias vector to all rows #pragma unroll - for (int nj = 0; nj < size<1, 1>(tensor); ++nj) { - const int col_idx_base = col_idx_offset + nj * 8; + for (int mi = 0; mi < size<0, 1>(tensor); ++mi) { + const int row_idx_base = row_idx_offset + mi * warp_row_stride; #pragma unroll - for (int j = 0; j < size<1, 0>(tensor); ++j) { - const int col_idx = col_idx_base + j; + for (int i = 0; i < size<0, 0>(tensor); ++i) { + const int row_idx = row_idx_base + i * 8; + const int col_idx_limit_right = std::min(max_seqlen_k, row_idx + 1 + max_seqlen_k - max_seqlen_q); #pragma unroll - for (int mi = 0; mi < size<0>(tensor); ++mi) { - tensor(mi, make_coord(j, nj)) += alibi_slope * col_idx; + for (int nj = 0; nj < size<1, 1>(tensor); ++nj) { + const int col_idx_base = col_idx_offset + nj * 8; + #pragma unroll + for (int j = 0; j < size<1, 0>(tensor); ++j) { + const int col_idx = col_idx_base + j; + tensor(make_coord(i, mi), make_coord(j, nj)) += ((col_idx == (col_idx_limit_right - 1)) ? 0 : alibi_slope); + } } } } @@ -61,7 +67,7 @@ struct Alibi { #pragma unroll for (int j = 0; j < size<1, 0>(tensor); ++j) { const int col_idx = col_idx_base + j; - tensor(make_coord(i, mi), make_coord(j, nj)) -= alibi_slope * abs(row_idx + max_seqlen_k - max_seqlen_q - col_idx); + tensor(make_coord(i, mi), make_coord(j, nj)) += (((row_idx + max_seqlen_k - max_seqlen_q - col_idx) == 0) ? 
0 : alibi_slope); } } } diff --git a/csrc/flash_attn/src/mask.h b/csrc/flash_attn/src/mask.h index 7ba435a37b..649a1dc854 100644 --- a/csrc/flash_attn/src/mask.h +++ b/csrc/flash_attn/src/mask.h @@ -139,7 +139,7 @@ struct Mask { // Reshape tensor_ from (MMA=4, MMA_M, MMA_N) to (nrow=(2, MMA_M), ncol=(2, MMA_N)) Tensor tensor = make_tensor(tensor_.data(), flash::convert_layout_acc_rowcol(tensor_.layout())); // Do we need both row and column indices, or just column incides? - static constexpr bool Col_idx_only = !(Has_alibi && !Is_causal) && !Is_local && !Causal_mask; + static constexpr bool Col_idx_only = !Has_alibi && !Is_local && !Causal_mask; const int lane_id = threadIdx.x % 32; const int col_idx_offset = col_idx_offset_ + (lane_id % 4) * 2; if constexpr (Col_idx_only) { @@ -178,9 +178,10 @@ struct Mask { const int col_idx = col_idx_base + j; if constexpr (Has_alibi) { if constexpr (Is_causal) { - tensor(make_coord(i, mi), make_coord(j, nj)) += alibi_slope * col_idx; + tensor(make_coord(i, mi), make_coord(j, nj)) += ((col_idx == (col_idx_limit_right - 1)) ? 0 : alibi_slope); + } else { - tensor(make_coord(i, mi), make_coord(j, nj)) -= alibi_slope * abs(row_idx + max_seqlen_k - max_seqlen_q - col_idx); + tensor(make_coord(i, mi), make_coord(j, nj)) += (((row_idx + max_seqlen_k - max_seqlen_q - col_idx) == 0) ? 0 : alibi_slope); } }