diff --git a/.github/actions/setup-build-cuda/action.yml b/.github/actions/setup-build-cuda/action.yml
index 67d08ac7b0..b4766a3b13 100644
--- a/.github/actions/setup-build-cuda/action.yml
+++ b/.github/actions/setup-build-cuda/action.yml
@@ -24,9 +24,10 @@ runs:
         print(sys.version)
         cushort = "${{ inputs.toolkit_short_version }}"
         # Version uploaded to pypi (rather than PyTorch s3)
-        TORCH_CUDA_DEFAULT = "128"  # since pytorch 2.8.0
+        TORCH_CUDA_DEFAULT = "130"  # since pytorch 2.9.0
         # https://github.com/Jimver/cuda-toolkit/blob/master/src/links/linux-links.ts
         full_version, install_script = {
+            "130": ("13.0.1", "https://developer.download.nvidia.com/compute/cuda/13.0.1/local_installers/cuda_13.0.1_580.82.07_linux.run"),
             "129": ("12.9.0", "https://developer.download.nvidia.com/compute/cuda/12.9.1/local_installers/cuda_12.9.1_575.57.08_linux.run"),
             "128": ("12.8.1", "https://developer.download.nvidia.com/compute/cuda/12.8.1/local_installers/cuda_12.8.1_570.124.06_linux.run"),
             # (Build with nvcc 12.8 on linux even when building for 12.6 to avoid seg fault in Flash3 build)
@@ -52,7 +53,8 @@ runs:
     - name: Install cuda
       if: runner.os == 'Windows' && inputs.toolkit_type == 'cuda'
       id: cuda-toolkit
-      uses: Jimver/cuda-toolkit@v0.2.24
+      # Using N-Storm fork until https://github.com/Jimver/cuda-toolkit/issues/395 is resolved
+      uses: N-Storm/cuda-toolkit@v0.2.28
       with:
         cuda: ${{ steps.cuda_info.outputs.CUDA_VERSION }}
         method: network
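Note: the untouched `129` entry pairs full version `12.9.0` with the 12.9.1 installer URL; this diff leaves that mismatch as-is. For reference, a minimal Python sketch of the mapping the new `130` entry extends. How the action indexes the dict after the lines shown is an assumption (presumably a plain `[cushort]` lookup), and the URLs are abbreviated:

```python
# Sketch only: maps the CUDA "short version" to (full_version, installer_url).
TOOLKITS = {
    "130": ("13.0.1", ".../cuda_13.0.1_580.82.07_linux.run"),
    "129": ("12.9.0", ".../cuda_12.9.1_575.57.08_linux.run"),
    "128": ("12.8.1", ".../cuda_12.8.1_570.124.06_linux.run"),
}

cushort = "130"  # inputs.toolkit_short_version; "130" is the new default
full_version, install_script = TOOLKITS[cushort]
assert full_version == "13.0.1"
```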
"import torch; print(f'PyTorch version {torch.__version__}')" - + python -m pip install ninja scipy pytest pytest-html - name: Pre-build clean @@ -72,16 +72,16 @@ jobs: run: | export PATH=/opt/conda/envs/xformers/bin:$PATH export MAX_JOBS=20 - + python -m pip install -e ./_xformers --verbose python -m xformers.info - name: Run python tests run: | export PATH=/opt/conda/envs/xformers/bin:$PATH - + python -m pytest --html=test_mem_eff_attention.html --self-contained-html -rpfs ./_xformers/tests/test_mem_eff_attention.py - + - name: Archive logs if: '!cancelled()' uses: actions/upload-artifact@v4 diff --git a/.github/workflows/rocm_docker.yml b/.github/workflows/rocm_docker.yml index 31fc242a71..d774306c08 100644 --- a/.github/workflows/rocm_docker.yml +++ b/.github/workflows/rocm_docker.yml @@ -12,13 +12,13 @@ jobs: steps: - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 - + - name: Login to Docker Hub uses: docker/login-action@v3 with: username: ${{ vars.DOCKERHUB_USERNAME }} password: ${{ secrets.DOCKERHUB_TOKEN }} - + - name: Build and push uses: docker/build-push-action@v6 with: diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index 0be59490eb..57363a5018 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -29,16 +29,16 @@ jobs: environ = os.environ # All builds are python-version agnostic, - # and built with python 3.9 - PYTHON_VERSION = "3.9" + # and built with python 3.10 + PYTHON_VERSION = "3.10" # NOTE: Don't forget to update `upload_pt`'s matrix # when changing the CUDA/ROCM versions below! - CU_VERSIONS = ['126', '128', '129'] + CU_VERSIONS = ['126', '129', '130'] ROCM_VERSIONS = ['6.4'] include = [] for os in ['8-core-ubuntu', 'windows-8-core']: - for torch_version in ['2.8.0']: + for torch_version in ['2.9.0']: # CUDA builds for cuda_short_version in CU_VERSIONS: if cuda_short_version < "124" and "windows" in os: @@ -88,7 +88,7 @@ jobs: uses: ./.github/workflows/wheels_upload_pip.yml with: twine_username: __token__ - filter: "*torch2.8.0+cu128*" + filter: "*torch2.9.0+cu130*" execute: ${{ github.repository == 'facebookresearch/xformers' && github.event_name != 'pull_request' }} secrets: twine_password: ${{ secrets.PYPI_TOKEN }} @@ -100,13 +100,13 @@ jobs: matrix: suffix: - cu126 - - cu128 - cu129 + - cu130 - rocm6.4 uses: ./.github/workflows/wheels_upload_s3.yml with: aws_role: "arn:aws:iam::749337293305:role/pytorch_bot_uploader_role" s3_path: s3://pytorch/whl/${{ matrix.suffix }}/ aws_s3_cp_extra_args: --acl public-read - filter: "*torch2.8.0+${{ matrix.suffix }}*" + filter: "*torch2.9.0+${{ matrix.suffix }}*" execute: ${{ github.repository == 'facebookresearch/xformers' && github.ref_type == 'tag' }} diff --git a/.github/workflows/wheels_build.yml b/.github/workflows/wheels_build.yml index 654602a26b..f2b0440f62 100644 --- a/.github/workflows/wheels_build.yml +++ b/.github/workflows/wheels_build.yml @@ -53,9 +53,13 @@ jobs: run: shell: bash steps: - - if: contains(inputs.toolkit_type, 'cuda') && fromJSON(inputs.toolkit_short_version) >= 120 + - if: contains(inputs.toolkit_type, 'cuda') && fromJSON(inputs.toolkit_short_version) >= 120 && fromJSON(inputs.toolkit_short_version) < 130 run: | - echo "TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST 9.0a" >> ${GITHUB_ENV} + echo "TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST 8.0 9.0a" >> ${GITHUB_ENV} + + - if: contains(inputs.toolkit_type, 'cuda') && fromJSON(inputs.toolkit_short_version) >= 130 + run: | + echo "TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST 8.0 9.0a 10.0a 
diff --git a/.github/workflows/wheels_build.yml b/.github/workflows/wheels_build.yml
index 654602a26b..f2b0440f62 100644
--- a/.github/workflows/wheels_build.yml
+++ b/.github/workflows/wheels_build.yml
@@ -53,9 +53,13 @@ jobs:
       run:
         shell: bash
     steps:
-      - if: contains(inputs.toolkit_type, 'cuda') && fromJSON(inputs.toolkit_short_version) >= 120
+      - if: contains(inputs.toolkit_type, 'cuda') && fromJSON(inputs.toolkit_short_version) >= 120 && fromJSON(inputs.toolkit_short_version) < 130
         run: |
-          echo "TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST 9.0a" >> ${GITHUB_ENV}
+          echo "TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST 8.0 9.0a" >> ${GITHUB_ENV}
+
+      - if: contains(inputs.toolkit_type, 'cuda') && fromJSON(inputs.toolkit_short_version) >= 130
+        run: |
+          echo "TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST 8.0 9.0a 10.0a 10.3a 11.0a 12.0a 12.1a" >> ${GITHUB_ENV}
 
       - if: runner.os == 'Windows'
         run: git config --system core.longpaths true
diff --git a/.github/workflows/win-build.yml b/.github/workflows/win-build.yml
index bac84742c6..dd12adb98b 100644
--- a/.github/workflows/win-build.yml
+++ b/.github/workflows/win-build.yml
@@ -61,8 +61,8 @@ jobs:
       uses: ./.github/actions/setup-build-cuda
       with:
         toolkit_type: "cuda"
-        toolkit_short_version: "128"
-        python: "3.9"
+        toolkit_short_version: "130"
+        python: "3.10"
 
     - name: Remove internal code
       run: |
@@ -73,7 +73,7 @@ jobs:
 
     - name: Install build dependencies
       run: |
-        $PY -m pip install wheel setuptools ninja torch==2.8.0 -r requirements.txt --extra-index-url https://download.pytorch.org/whl/cu126
+        $PY -m pip install wheel setuptools ninja torch==2.9.0 -r requirements.txt --extra-index-url https://download.pytorch.org/whl/cu130
         git config --global --add safe.directory "*"
         $PY -c "import torch; print('torch', torch.__version__)"
         $PY -c "import torch; print('torch.cuda', torch.version.cuda)"
diff --git a/.gitignore b/.gitignore
index 978b6be3e0..a32f07f167 100644
--- a/.gitignore
+++ b/.gitignore
@@ -71,5 +71,3 @@ xformers/csrc/attention/hip_fmha/instances/*_hip.h
 xformers/csrc/attention/hip_decoder/*.cu
 xformers/csrc/attention/hip_decoder/*.hip
 xformers/csrc/attention/hip_decoder/*_hip.h
-
-
diff --git a/requirements.txt b/requirements.txt
index 399c304661..5ab1043984 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
 # Example requirement, can be anything that pip knows
 # install with `pip install -r requirements.txt`, and make sure that CI does the same
-torch >= 2.8
+torch >= 2.9
 numpy
diff --git a/setup.py b/setup.py
index 967732f639..1cac34c942 100644
--- a/setup.py
+++ b/setup.py
@@ -176,7 +176,9 @@ def get_flash_attention2_nvcc_archs_flags(cuda_version: int):
         return []
     # Figure out default archs to target
     DEFAULT_ARCHS_LIST = ""
-    if cuda_version >= 1208:
+    if cuda_version >= 1300:
+        DEFAULT_ARCHS_LIST = "8.0;8.6;9.0;10.0;11.0;12.0"
+    elif cuda_version >= 1208:
         DEFAULT_ARCHS_LIST = "8.0;8.6;9.0;10.0;12.0"
     elif cuda_version >= 1108:
         DEFAULT_ARCHS_LIST = "8.0;8.6;9.0"
@@ -281,9 +283,15 @@ def get_flash_attention3_nvcc_archs_flags(cuda_version: int):
         return []
     if cuda_version < 1203:
         return []
+    if ((sys.platform == "win32" or platform.system() == "Windows")
+            and cuda_version >= 1300):
+        return []
     archs_list = os.environ.get("TORCH_CUDA_ARCH_LIST")
     if archs_list is None:
-        if torch.cuda.get_device_capability("cuda") != (9, 0):
+        if torch.cuda.get_device_capability("cuda") != (
+            9,
+            0,
+        ) and torch.cuda.get_device_capability("cuda") != (8, 0):
             return []
         archs_list = "8.0 9.0a"
     nvcc_archs_flags = []
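In `get_flash_attention2_nvcc_archs_flags`, `cuda_version` packs major and minor as `major * 100 + minor` (CUDA 12.8 is 1208, CUDA 13.0 is 1300), so the new branch selects the CUDA 13 arch list, which adds 11.0, before falling through to the older defaults. A condensed, runnable paraphrase of the cascade:

```python
def default_fa2_archs(cuda_version: int) -> str:
    # cuda_version = major * 100 + minor, e.g. CUDA 12.8 -> 1208.
    if cuda_version >= 1300:
        return "8.0;8.6;9.0;10.0;11.0;12.0"
    elif cuda_version >= 1208:
        return "8.0;8.6;9.0;10.0;12.0"
    elif cuda_version >= 1108:
        return "8.0;8.6;9.0"
    return ""  # older toolkits: no default archs

assert default_fa2_archs(1300) == "8.0;8.6;9.0;10.0;11.0;12.0"
assert default_fa2_archs(1209) == "8.0;8.6;9.0;10.0;12.0"
```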
diff --git a/tests/readme_test_on_rocm.txt b/tests/readme_test_on_rocm.txt
index c21fd0d587..754fac7fe7 100644
--- a/tests/readme_test_on_rocm.txt
+++ b/tests/readme_test_on_rocm.txt
@@ -3,11 +3,9 @@
 
 2. verify testing for generic fmha inference on ROCM
 
-   #> pytest tests/test_mem_eff_attention.py::test_forward 
+   #> pytest tests/test_mem_eff_attention.py::test_forward
 
 3. verify testing for decoder fmha inference on ROCM
 
    #> pytest tests/test_mem_eff_attention.py::test_decoder
    #> pytest tests/test_mem_eff_attention.py::test_splitk_decoder
-
-
diff --git a/third_party/cutlass b/third_party/cutlass
index e9627ce55b..8afb19d904 160000
--- a/third_party/cutlass
+++ b/third_party/cutlass
@@ -1 +1 @@
-Subproject commit e9627ce55b42fd2599f58cd4396da9380954def0
+Subproject commit 8afb19d9047afc26816a046059afe66763e68aa5
diff --git a/third_party/flash-attention b/third_party/flash-attention
index c485eeade0..de1584b532 160000
--- a/third_party/flash-attention
+++ b/third_party/flash-attention
@@ -1 +1 @@
-Subproject commit c485eeade0c3ec9ce186c3640c52c9f1ce090b81
+Subproject commit de1584b5328321189a4d7832fe29bbd6813bf6ed
diff --git a/xformers/benchmarks/readme_benchmark_on_rocm.txt b/xformers/benchmarks/readme_benchmark_on_rocm.txt
index 9ae61f5294..cb64bb912d 100644
--- a/xformers/benchmarks/readme_benchmark_on_rocm.txt
+++ b/xformers/benchmarks/readme_benchmark_on_rocm.txt
@@ -8,10 +8,9 @@
 
 3. Benchmark for decoder fmha inference on ROCM
 
-   #> python xformers/benchmarks/benchmark_mem_eff_attn_decoder.py 
+   #> python xformers/benchmarks/benchmark_mem_eff_attn_decoder.py
 
 4. Other Benchmarks for fmha inference on ROCM
 
    #> python xformers/benchmarks/benchmark_attn_decoding.py
    #> python xformers/benchmarks/benchmark_mem_eff_attention_mqa.py
-
diff --git a/xformers/csrc/attention/hip_decoder/CMakeLists.txt b/xformers/csrc/attention/hip_decoder/CMakeLists.txt
index 97e2ab0b22..75e075b09e 100644
--- a/xformers/csrc/attention/hip_decoder/CMakeLists.txt
+++ b/xformers/csrc/attention/hip_decoder/CMakeLists.txt
@@ -36,14 +36,14 @@ set_target_properties(${exe_name} ${splitk_exe_name} PROPERTIES LINKER_LANGUAGE
 set_target_properties(${exe_name} ${splitk_exe_name} PROPERTIES POSITION_INDEPENDENT_CODE ON)
 set_target_properties(${exe_name} ${splitk_exe_name} PROPERTIES HIP_ARCHITECTURES ${GPU_TARGETS})
 
-target_compile_options(${exe_name} PUBLIC 
+target_compile_options(${exe_name} PUBLIC
   -fno-gpu-rdc
   $<$<COMPILE_LANGUAGE:HIP>:
   --save-temps
   >
 )
 
-target_compile_options(${splitk_exe_name} PUBLIC 
+target_compile_options(${splitk_exe_name} PUBLIC
   -fno-gpu-rdc
   $<$<COMPILE_LANGUAGE:HIP>:
   --save-temps
@@ -52,13 +52,13 @@ target_compile_options(${splitk_exe_name} PUBLIC
   >
 )
 
-target_include_directories(${exe_name} PUBLIC 
+target_include_directories(${exe_name} PUBLIC
   ${ck_include}                            # ck includes
   ${torch_include}                         # aten includes
   ${torch_include}/torch/csrc/api/include  # torch includes
 )
 
-target_include_directories(${splitk_exe_name} PUBLIC 
+target_include_directories(${splitk_exe_name} PUBLIC
   ${ck_include}                            # ck includes
   ${torch_include}                         # aten includes
   ${torch_include}/torch/csrc/api/include  # torch includes
@@ -93,14 +93,14 @@ target_link_libraries(${splitk_exe_name} PUBLIC
   amdhip64
 )
 
-target_compile_definitions(${exe_name} PUBLIC 
+target_compile_definitions(${exe_name} PUBLIC
   ATTN_FWD_DECODER_MAIN=1
   GLIBCXX_USE_CXX11_ABI=1
   __HIP_PLATFORM_HCC__=1
   USE_ROCM=1
 )
 
-target_compile_definitions(${splitk_exe_name} PUBLIC 
+target_compile_definitions(${splitk_exe_name} PUBLIC
   ATTN_FWD_SPLITK_DECODER_MAIN=1
   GLIBCXX_USE_CXX11_ABI=1
   __HIP_PLATFORM_HCC__=1
@@ -108,13 +108,13 @@ target_compile_definitions(${splitk_exe_name} PUBLIC
 )
 
 include(CMakePrintHelpers)
-cmake_print_properties(TARGETS ${exe_name} ${splitk_exe_name} PROPERTIES 
-  LINK_LIBRARIES 
-  LINK_DIRECTORIES 
-  INCLUDE_DIRECTORIES 
-  COMPILE_DEFINITIONS 
+cmake_print_properties(TARGETS ${exe_name} ${splitk_exe_name} PROPERTIES
+  LINK_LIBRARIES
+  LINK_DIRECTORIES
+  INCLUDE_DIRECTORIES
+  COMPILE_DEFINITIONS
   COMPILE_OPTIONS
   SOURCES
   HIP_ARCHITECTURES)
 
-rocm_install(TARGETS ${exe_name} ${splitk_exe_name})
\ No newline at end of file
+rocm_install(TARGETS ${exe_name} ${splitk_exe_name})
diff --git a/xformers/csrc/attention/hip_fmha/GENERATE_INSTANCES.md b/xformers/csrc/attention/hip_fmha/GENERATE_INSTANCES.md
index 829df66469..72cfc4f641 100644
--- a/xformers/csrc/attention/hip_fmha/GENERATE_INSTANCES.md
+++ b/xformers/csrc/attention/hip_fmha/GENERATE_INSTANCES.md
@@ -1,16 +1,16 @@
 # Instances generator
 
-   The instances generator is a simple python tool used to generate several hundred of instances (.cpp files) and their references (.h files). 
-   Without this tool, manually writing those instances and references will be very laborious and easy to get wrong. 
-
-   The instances generated by this scripts are divided into three categories visible from the scripts: 
+   The instances generator is a simple python tool used to generate several hundred instances (.cpp files) and their references (.h files).
+   Without this tool, manually writing those instances and references would be laborious and error-prone.
+
+   The instances generated by this script are divided into three categories, visible in the script:
   * Infer    -- which refers to instances for calling inference-only kernels
   * Forward  -- which refers to instances for calling training forward kernels
   * Backward -- which refers to instances for calling training backward kernels
-
-   The instance generator is for being used by the HIP fmha developers themselves. It is not supposed to be used by the xformers users for 
-   building xformers, since for xformers users, the instances are already well prepared as part of the xformers codes. 
+
+   The instance generator is meant to be used by the HIP fmha developers themselves. It is not intended for xformers users
+   building xformers, since the instances they need are already provided as part of the xformers code.
 
 ## how to use instance generator
 
@@ -21,7 +21,7 @@
    ```
 
 * To generate reduced instances (when headdim256 is not required)
-   ``` 
+   ```
    #> python xformers/csrc/attention/hip_fmha/generate_instances.py --ignore-hd256
    ```
 * More options except for `--ignore-hd256` could be added to suppport further customization in generating instances as required
@@ -29,5 +29,3 @@
 ## where the instances files are located
 
    The instances files and references files are always located under a folder `instances/` that is located under the same directory as the file `generate_instances.py` itself
-
-
diff --git a/xformers/csrc/sparse24/sparse24_gemm_sm90.cu b/xformers/csrc/sparse24/sparse24_gemm_sm90.cu
index cabdc7799c..303eca1991 100644
--- a/xformers/csrc/sparse24/sparse24_gemm_sm90.cu
+++ b/xformers/csrc/sparse24/sparse24_gemm_sm90.cu
@@ -96,10 +96,10 @@ struct SparseRowwiseKernel<cutlass::bfloat16_t> {
           float,
           ElementOut,
           cutlass::layout::RowMajor,
-          1,
+          8,
           ElementOut,
           cutlass::layout::RowMajor,
-          1,
+          8,
           cutlass::epilogue::TmaWarpSpecializedCooperative,
           EpilogueEVT>::CollectiveOp;
@@ -176,10 +176,10 @@ struct SparseRowwiseKernel<cutlass::half_t> {
           float,
           ElementOut,
           cutlass::layout::RowMajor,
-          1,
+          8,
           ElementOut,
           cutlass::layout::RowMajor,
-          1,
+          8,
           cutlass::epilogue::TmaWarpSpecializedCooperative,
           EpilogueEVT>::CollectiveOp;
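The `1 -> 8` changes above are the C/D operand alignments of the CUTLASS epilogue, counted in elements. Assuming `ElementOut` is a 16-bit type (bf16/fp16, as the two `SparseRowwiseKernel` specializations suggest), alignment 8 corresponds to one 128-bit access per store instead of scalar 16-bit stores; the implied trade-off, standard CUTLASS semantics rather than anything stated in this diff, is that output pointers and leading strides must now be 8-element aligned. The arithmetic:

```python
# CUTLASS alignment is in elements; ElementOut is bf16/fp16 (2 bytes).
elem_bytes = 2
for alignment in (1, 8):
    print(alignment, "->", alignment * elem_bytes * 8, "bit accesses")
# 1 -> 16 bit accesses (scalar), 8 -> 128 bit accesses (vectorized)
```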
diff --git a/xformers/ops/fmha/dispatch.py b/xformers/ops/fmha/dispatch.py
index 5908635dac..be0f75ead4 100644
--- a/xformers/ops/fmha/dispatch.py
+++ b/xformers/ops/fmha/dispatch.py
@@ -31,7 +31,7 @@ def _get_use_fa3() -> bool:
 
 def fa3_available() -> bool:
     has_cuda = torch.version.cuda is not None
-    is_90a = has_cuda and torch.cuda.get_device_capability() >= (9, 0)
+    is_90a = has_cuda and (8, 0) <= torch.cuda.get_device_capability() <= (9, 0)
     has_valid_flash3 = flash3._C_flashattention3 is not None  # pyre-ignore[16]
     return is_90a and has_valid_flash3
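Python compares capability tuples lexicographically, so the rewritten gate admits every capability from (8, 0) through (9, 0) inclusive; sm86 and sm89 pass too, not only the 8.0/9.0a targets the wheels are built for, and the variable keeps its now slightly misleading `is_90a` name. A quick check:

```python
# The new fa3_available() bound, evaluated over a few capabilities.
for cap in [(7, 5), (8, 0), (8, 6), (8, 9), (9, 0), (10, 0), (12, 0)]:
    print(cap, (8, 0) <= cap <= (9, 0))
# False, True, True, True, True, False, False
```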
diff --git a/xformers/ops/fmha/flash.py b/xformers/ops/fmha/flash.py
index 63e436698c..4956bcf25d 100644
--- a/xformers/ops/fmha/flash.py
+++ b/xformers/ops/fmha/flash.py
@@ -71,7 +71,7 @@
     FLASH_VERSION = flash_attn.__version__
     FLASH_VER_MIN = parse_version("2.7.1")
-    FLASH_VER_LAST = parse_version("2.8.3")  # last supported, inclusive
+    FLASH_VER_LAST = parse_version("2.8.4")  # last supported, inclusive
     flash_ver_parsed = parse_version(FLASH_VERSION)
     if (
         flash_ver_parsed < FLASH_VER_MIN or flash_ver_parsed > FLASH_VER_LAST
diff --git a/xformers/ops/fmha/flash3.py b/xformers/ops/fmha/flash3.py
index b4e55f41ba..770960f1b3 100644
--- a/xformers/ops/fmha/flash3.py
+++ b/xformers/ops/fmha/flash3.py
@@ -647,6 +647,14 @@ class FwOp(AttentionFwOpBase):
     @classmethod
     def not_supported_reasons(cls, d: Inputs) -> List[str]:
         reasons = super(FwOp, cls).not_supported_reasons(d)
+        device_type = d.query.device.type
+        if device_type == "cuda" and (torch.version.hip is None):
+            device_capability = torch.cuda.get_device_capability(d.device)
+            if device_capability > cls.CUDA_MINIMUM_COMPUTE_CAPABILITY:
+                reasons.append(
+                    f"requires device with capability == {cls.CUDA_MINIMUM_COMPUTE_CAPABILITY} "
+                    f"but your GPU has capability {device_capability} (too new)"
+                )
         check_lastdim_alignment_stride1(reasons, "query", d.query, 8)
         check_lastdim_alignment_stride1(reasons, "key", d.value, 8)
         check_lastdim_alignment_stride1(reasons, "value", d.value, 8)
@@ -801,6 +809,14 @@ class BwOp(AttentionBwOpBase):
     @classmethod
     def not_supported_reasons(cls, d: Inputs) -> List[str]:
         reasons = super(BwOp, cls).not_supported_reasons(d)
+        device_type = d.query.device.type
+        if device_type == "cuda" and (torch.version.hip is None):
+            device_capability = torch.cuda.get_device_capability(d.device)
+            if device_capability > cls.CUDA_MINIMUM_COMPUTE_CAPABILITY:
+                reasons.append(
+                    f"requires device with capability == {cls.CUDA_MINIMUM_COMPUTE_CAPABILITY} "
+                    f"but your GPU has capability {device_capability} (too new)"
+                )
         check_lastdim_alignment_stride1(reasons, "query", d.query, 8)
         check_lastdim_alignment_stride1(reasons, "key", d.value, 8)
         check_lastdim_alignment_stride1(reasons, "value", d.value, 8)
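Both hunks above add the same "too new" gate to `not_supported_reasons`. Incidentally, the surrounding context lines pass `d.value` for the "key" alignment check, which looks like a pre-existing copy-paste slip left untouched by this diff. A self-contained sketch of the gate's behaviour; the `(9, 0)` threshold below is a placeholder, since the actual value of `cls.CUDA_MINIMUM_COMPUTE_CAPABILITY` is not shown here:

```python
from typing import Optional, Tuple

# Placeholder for cls.CUDA_MINIMUM_COMPUTE_CAPABILITY (value assumed).
MIN_CAPABILITY: Tuple[int, int] = (9, 0)

def too_new_reason(device_capability: Tuple[int, int]) -> Optional[str]:
    # Mirrors the check added to FwOp/BwOp.not_supported_reasons.
    if device_capability > MIN_CAPABILITY:
        return (
            f"requires device with capability == {MIN_CAPABILITY} "
            f"but your GPU has capability {device_capability} (too new)"
        )
    return None

print(too_new_reason((12, 0)))  # flagged: newer than the FA3 build targets
print(too_new_reason((9, 0)))   # None: supported
```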