
Commit ac00641

danthe3rd authored and xFormers Bot committed
Bump version to 0.0.32 (fairinternal/xformers#1418)
__original_commit__ = fairinternal/xformers@dee2011
1 parent 635a8ab commit ac00641

10 files changed (+35, -22 lines changed)


.github/actions/setup-build-cuda/action.yml

Lines changed: 4 additions & 2 deletions
@@ -24,9 +24,10 @@ runs:
         print(sys.version)
         cushort = "${{ inputs.toolkit_short_version }}"
         # Version uploaded to pypi (rather than PyTorch s3)
-        TORCH_CUDA_DEFAULT = "126"  # since pytorch 2.7.0
+        TORCH_CUDA_DEFAULT = "128"  # since pytorch 2.8.0
         # https://github.com/Jimver/cuda-toolkit/blob/master/src/links/linux-links.ts
         full_version, install_script = {
+            "129": ("12.9.0", "https://developer.download.nvidia.com/compute/cuda/12.9.1/local_installers/cuda_12.9.1_575.57.08_linux.run"),
             "128": ("12.8.1", "https://developer.download.nvidia.com/compute/cuda/12.8.1/local_installers/cuda_12.8.1_570.124.06_linux.run"),
             # (Build with nvcc 12.8 on linux even when building for 12.6 to avoid seg fault in Flash3 build)
             "126": ("12.8.1", "https://developer.download.nvidia.com/compute/cuda/12.8.1/local_installers/cuda_12.8.1_570.124.06_linux.run"),
@@ -35,6 +36,7 @@ runs:
             "6.1": ("6.1.2", "https://repo.radeon.com/amdgpu-install/6.1.3/rhel/8.9/amdgpu-install-6.1.60103-1.el8.noarch.rpm"),
             "6.2.4": ("6.2.4", "https://repo.radeon.com/amdgpu-install/6.2.4/rhel/8.9/amdgpu-install-6.2.60204-1.el8.noarch.rpm"),
             "6.3": ("6.3.1", "https://repo.radeon.com/amdgpu-install/6.3.1/rhel/8.9/amdgpu-install-6.3.60301-1.el8.noarch.rpm"),
+            "6.4": ("6.4.2", "https://repo.radeon.com/amdgpu-install/6.4.2/rhel/8.9/amdgpu-install-6.4.60402-1.el8.noarch.rpm"),
         }[cushort]
         with open(os.environ['GITHUB_OUTPUT'], "r+") as fp:
             fp.write("CUDA_VERSION=" + full_version + "\n")
@@ -50,7 +52,7 @@ runs:
   - name: Install cuda
     if: runner.os == 'Windows' && inputs.toolkit_type == 'cuda'
     id: cuda-toolkit
-    uses: Jimver/cuda-toolkit@v0.2.23
+    uses: Jimver/cuda-toolkit@v0.2.24
    with:
       cuda: ${{ steps.cuda_info.outputs.CUDA_VERSION }}
       method: network
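
For reference, the inline Python in this action maps the `toolkit_short_version` input to a full toolkit version plus an installer URL and exports the result through `GITHUB_OUTPUT`. A minimal standalone sketch of that lookup is below; `cushort` is hard-coded, only a few table entries are kept, and the fallback output path is an assumption for local testing.

    # Standalone sketch of the toolkit lookup in the action's inline Python.
    import os

    cushort = "129"  # the CUDA 12.9 entry added by this commit
    full_version, install_script = {
        "129": ("12.9.0", "https://developer.download.nvidia.com/compute/cuda/12.9.1/local_installers/cuda_12.9.1_575.57.08_linux.run"),
        "128": ("12.8.1", "https://developer.download.nvidia.com/compute/cuda/12.8.1/local_installers/cuda_12.8.1_570.124.06_linux.run"),
        "6.4": ("6.4.2", "https://repo.radeon.com/amdgpu-install/6.4.2/rhel/8.9/amdgpu-install-6.4.60402-1.el8.noarch.rpm"),
    }[cushort]  # a KeyError means the requested short version has no mapping yet

    # The real step writes to $GITHUB_OUTPUT; fall back to a local file when testing.
    with open(os.environ.get("GITHUB_OUTPUT", "github_output.txt"), "a") as fp:
        fp.write("CUDA_VERSION=" + full_version + "\n")
        # The installer URL is exported similarly; its output key is not visible in this hunk.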

.github/workflows/rocm_build.yml

Lines changed: 2 additions & 2 deletions
@@ -22,9 +22,9 @@ jobs:
       matrix:
         os: ['ubuntu-alola']
         python: ['3.11']
-        torch_version: ['2.7.1']
+        torch_version: ['2.8.0']
         toolkit_type: ['rocm']
-        toolkit_short_version: ['6.2.4', '6.3']
+        toolkit_short_version: ['6.3', '6.4']
 
     uses: ./.github/workflows/wheels_build.yml
     if: github.repository == 'rocm/xformers'

.github/workflows/wheels.yml

Lines changed: 6 additions & 6 deletions
@@ -33,12 +33,12 @@ jobs:
         PYTHON_VERSION = "3.9"
         # NOTE: Don't forget to update `upload_pt`'s matrix
         # when changing the CUDA/ROCM versions below!
-        CU_VERSIONS = ['118', '126', '128']
-        ROCM_VERSIONS = ["6.2.4", "6.3"]
+        CU_VERSIONS = ['126', '128', '129']
+        ROCM_VERSIONS = ['6.3', '6.4']
 
         include = []
         for os in ['8-core-ubuntu', 'windows-8-core']:
-            for torch_version in ['2.7.1']:
+            for torch_version in ['2.8.0']:
                 # CUDA builds
                 for cuda_short_version in CU_VERSIONS:
                     if cuda_short_version < "124" and "windows" in os:
@@ -88,7 +88,7 @@ jobs:
     uses: ./.github/workflows/wheels_upload_pip.yml
     with:
       twine_username: __token__
-      filter: "*torch2.7.1+cu126*"
+      filter: "*torch2.8.0+cu128*"
       execute: ${{ github.repository == 'facebookresearch/xformers' && github.event_name != 'pull_request' }}
     secrets:
       twine_password: ${{ secrets.PYPI_TOKEN }}
@@ -99,15 +99,15 @@ jobs:
       fail-fast: false
       matrix:
         suffix:
-          - cu118
           - cu126
           - cu128
+          - cu129
           - rocm6.2.4
           - rocm6.3
     uses: ./.github/workflows/wheels_upload_s3.yml
     with:
       aws_role: "arn:aws:iam::749337293305:role/pytorch_bot_uploader_role"
       s3_path: s3://pytorch/whl/${{ matrix.suffix }}/
       aws_s3_cp_extra_args: --acl public-read
-      filter: "*torch2.7.1+${{ matrix.suffix }}*"
+      filter: "*torch2.8.0+${{ matrix.suffix }}*"
       execute: ${{ github.repository == 'facebookresearch/xformers' && github.ref_type == 'tag' }}
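
The first hunk above edits the inline script that expands the wheel build matrix. A rough standalone sketch of that expansion with the new version lists follows; the per-entry fields and the "no ROCm on Windows" rule are assumptions, since only the loops, the version lists, and the old-CUDA-on-Windows skip appear in the hunk.

    # Sketch of the wheel build-matrix expansion after this change.
    CU_VERSIONS = ['126', '128', '129']
    ROCM_VERSIONS = ['6.3', '6.4']

    include = []
    for os_name in ['8-core-ubuntu', 'windows-8-core']:  # named `os` in the workflow script
        for torch_version in ['2.8.0']:
            # CUDA builds
            for cuda_short_version in CU_VERSIONS:
                if cuda_short_version < "124" and "windows" in os_name:
                    continue  # older CUDA toolkits are skipped on Windows runners
                include.append({"os": os_name, "torch": torch_version, "toolkit": "cu" + cuda_short_version})
            # ROCm builds (assumed Linux-only)
            if "windows" not in os_name:
                for rocm_short_version in ROCM_VERSIONS:
                    include.append({"os": os_name, "torch": torch_version, "toolkit": "rocm" + rocm_short_version})

    print(len(include), "build configurations")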

.github/workflows/win-build.yml

Lines changed: 1 addition & 1 deletion
@@ -73,7 +73,7 @@ jobs:
 
   - name: Install build dependencies
     run: |
-      $PY -m pip install wheel setuptools ninja torch==2.7.1 -r requirements.txt --extra-index-url https://download.pytorch.org/whl/cu126
+      $PY -m pip install wheel setuptools ninja torch==2.8.0 -r requirements.txt --extra-index-url https://download.pytorch.org/whl/cu126
       git config --global --add safe.directory "*"
       $PY -c "import torch; print('torch', torch.__version__)"
       $PY -c "import torch; print('torch.cuda', torch.version.cuda)"

CHANGELOG.md

Lines changed: 12 additions & 1 deletion
@@ -4,9 +4,20 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
-## [0.0.32] - 2025-??-??
+## [0.0.33] - 2025-??-??
+
+## [0.0.32] - 2025-08-13
+Pre-built binary wheels are available for PyTorch 2.8.0.
+
+### Added
+- Support flash-attention package up to 2.8.2
+- Speed improvements to `python -m xformers.profiler.find_slowest`
+
 ### Removed
 - Removed autograd backward pass for merge_attentions as it is easy to use incorrectly.
+- Attention biases are no longer `torch.Tensor` subclasses. This is no longer
+  necessary for torch.compile to work, and was adding more complexity
+
 
 ## [0.0.31] - 2025-06-25
 Pre-built binary wheels are available for PyTorch 2.7.1.

requirements.txt

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
 # Example requirement, can be anything that pip knows
 # install with `pip install -r requirements.txt`, and make sure that CI does the same
-torch >= 2.7
+torch >= 2.8
 numpy

version.txt

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-0.0.32
+0.0.33

xformers/ops/fmha/flash.py

Lines changed: 1 addition & 1 deletion
@@ -74,7 +74,7 @@
 
     FLASH_VERSION = flash_attn.__version__
     FLASH_VER_MIN = parse_version("2.7.1")
-    FLASH_VER_LAST = parse_version("2.8.0.post2")  # last supported, inclusive
+    FLASH_VER_LAST = parse_version("2.8.2")  # last supported, inclusive
     flash_ver_parsed = parse_version(FLASH_VERSION)
     if (
         flash_ver_parsed < FLASH_VER_MIN or flash_ver_parsed > FLASH_VER_LAST
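
This widens the supported flash-attention window to 2.8.2, inclusive on both ends. A minimal sketch of the bounds check is below; xformers defines its own `parse_version` helper, which is assumed here to behave like `packaging.version.parse`, and the function name is invented for illustration.

    # Minimal sketch of the version-window check.
    from packaging.version import parse as parse_version

    FLASH_VER_MIN = parse_version("2.7.1")
    FLASH_VER_LAST = parse_version("2.8.2")  # last supported, inclusive

    def flash_version_supported(installed: str) -> bool:
        v = parse_version(installed)
        return FLASH_VER_MIN <= v <= FLASH_VER_LAST

    assert flash_version_supported("2.8.2")        # newly accepted by this commit
    assert not flash_version_supported("2.8.3")    # beyond the inclusive upper bound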

xformers/ops/fmha/flash3.py

Lines changed: 2 additions & 2 deletions
@@ -71,7 +71,7 @@ def _flash_attention3_incompatible_reason() -> Optional[str]:
         torch.ops.flash_attn_3, "bwd"
     ):
         return "PyTorch has no `flash_attn_3` - is your Flash-Attention version recent enough?"
-    if not torch.ops.flash_attn_3.fwd.default._schema.is_backward_compatible_with(
+    if not torch.ops.flash_attn_3.fwd.default._schema.is_backward_compatible_with(  # type: ignore
         parse_schema(
             "flash_attn_3::fwd(Tensor q, Tensor k, Tensor v, Tensor(k_new!)? k_new=None, "
             "Tensor(v_new!)? v_new=None, Tensor? q_v=None, Tensor(out!)? out=None, "
@@ -87,7 +87,7 @@ def _flash_attention3_incompatible_reason() -> Optional[str]:
         )
     ):
         return "flash_attn_3::fwd operator is not compatible"
-    if not torch.ops.flash_attn_3.bwd.default._schema.is_backward_compatible_with(
+    if not torch.ops.flash_attn_3.bwd.default._schema.is_backward_compatible_with(  # type: ignore
         parse_schema(
             "flash_attn_3::bwd(Tensor dout, Tensor q, Tensor k, Tensor v, Tensor out, Tensor softmax_lse, "
             "Tensor(dq!)? dq=None, Tensor(dk!)? dk=None, Tensor(dv!)? dv=None, Tensor? cu_seqlens_q=None, "

xformers/ops/fmha/torch_attention_compat.py

Lines changed: 5 additions & 5 deletions
@@ -27,7 +27,7 @@ def is_pt_cutlass_compatible(force: bool = False) -> bool:
     expected_fwd_schema = parse_schema(fwd_schema_str)
 
     current_schema = torch.ops.aten._efficient_attention_forward.default._schema
-    if not current_schema.is_backward_compatible_with(expected_fwd_schema):
+    if not current_schema.is_backward_compatible_with(expected_fwd_schema):  # type: ignore
         compatible = False
 
     if force:
@@ -48,7 +48,7 @@ def is_pt_cutlass_compatible(force: bool = False) -> bool:
     expected_bwd_schema = parse_schema(bwd_schema_str)
 
     current_schema = torch.ops.aten._efficient_attention_backward.default._schema
-    if not current_schema.is_backward_compatible_with(expected_bwd_schema):
+    if not current_schema.is_backward_compatible_with(expected_bwd_schema):  # type: ignore
         compatible = False
 
     if force:
@@ -108,8 +108,8 @@ def is_pt_flash_old(force: bool) -> Optional[bool]:
     expected_old_fwd_schema = parse_schema(old_fwd_schema_str)
 
     current_schema = torch.ops.aten._flash_attention_forward.default._schema
-    old = current_schema.is_backward_compatible_with(expected_old_fwd_schema)
-    if not old and not current_schema.is_backward_compatible_with(expected_fwd_schema):
+    old = current_schema.is_backward_compatible_with(expected_old_fwd_schema)  # type: ignore
+    if not old and not current_schema.is_backward_compatible_with(expected_fwd_schema):  # type: ignore
         compatible = False
 
     if force:
@@ -136,7 +136,7 @@ def is_pt_flash_old(force: bool) -> Optional[bool]:
     expected_bwd_schema = parse_schema(bwd_schema_old_str if old else bwd_schema_str)
 
     current_schema = torch.ops.aten._flash_attention_backward.default._schema
-    if not current_schema.is_backward_compatible_with(expected_bwd_schema):
+    if not current_schema.is_backward_compatible_with(expected_bwd_schema):  # type: ignore
         compatible = False
 
     if force:
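
Same `# type: ignore` treatment as in flash3.py, applied here to the aten efficient-attention and flash-attention schema checks. The repeated pattern sets `compatible = False` on a mismatch and then consults `force`; a hedged sketch of that flow is below, where the helper name and the assumption that `force` escalates the mismatch to an error are not taken from the diff.

    # Hedged sketch of the check-or-raise pattern behind is_pt_cutlass_compatible
    # and is_pt_flash_old; only the flag handling shown in the hunks is mirrored.
    from torch._C import parse_schema

    def check_op_schema(op_overload, expected_schema_str: str, force: bool = False) -> bool:
        compatible = True
        current_schema = op_overload._schema
        if not current_schema.is_backward_compatible_with(parse_schema(expected_schema_str)):  # type: ignore
            compatible = False
            if force:
                # Assumption: force turns a silent fallback into a hard error.
                raise RuntimeError(f"Unexpected operator schema: {current_schema}")
        return compatible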
