From c0c3c342fa58d8715378c4a0ff75a0a658cf701c Mon Sep 17 00:00:00 2001
From: johnnynunez <johnnynuca14@gmail.com>
Date: Thu, 9 Oct 2025 12:18:32 +0200
Subject: [PATCH 01/22] Add CUDA 13.0 build support

---
 .github/actions/setup-build-cuda/action.yml  |  5 +++--
 .github/actions/setup-env-build/action.yml   |  2 +-
 .github/workflows/wheels.yml                 |  4 ++--
 .github/workflows/wheels_build.yml           |  8 ++++++--
 setup.py                                     |  6 ++++--
 third_party/cutlass                          |  2 +-
 third_party/flash-attention                  |  2 +-
 xformers/csrc/sparse24/sparse24_gemm_sm90.cu |  8 ++++----
 xformers/ops/fmha/dispatch.py                |  2 +-
 xformers/ops/fmha/flash.py                   |  2 +-
 xformers/ops/fmha/flash3.py                  | 16 ++++++++++++++++
 11 files changed, 40 insertions(+), 17 deletions(-)

diff --git a/.github/actions/setup-build-cuda/action.yml b/.github/actions/setup-build-cuda/action.yml
index 67d08ac7b0..151c393595 100644
--- a/.github/actions/setup-build-cuda/action.yml
+++ b/.github/actions/setup-build-cuda/action.yml
@@ -24,9 +24,10 @@ runs:
         print(sys.version)
         cushort = "${{ inputs.toolkit_short_version }}"
         # Version uploaded to pypi (rather than PyTorch s3)
-        TORCH_CUDA_DEFAULT = "128"  # since pytorch 2.8.0
+        TORCH_CUDA_DEFAULT = "129"  # since pytorch 2.8.0
         # https://github.com/Jimver/cuda-toolkit/blob/master/src/links/linux-links.ts
         full_version, install_script = {
+          "130": ("13.0.1", "https://developer.download.nvidia.com/compute/cuda/13.0.1/local_installers/cuda_13.0.1_580.82.07_linux.run"),
           "129": ("12.9.0", "https://developer.download.nvidia.com/compute/cuda/12.9.1/local_installers/cuda_12.9.1_575.57.08_linux.run"),
           "128": ("12.8.1", "https://developer.download.nvidia.com/compute/cuda/12.8.1/local_installers/cuda_12.8.1_570.124.06_linux.run"),
           # (Build with nvcc 12.8 on linux even when building for 12.6 to avoid seg fault in Flash3 build)
@@ -52,7 +53,7 @@ runs:
     - name: Install cuda
       if: runner.os == 'Windows' && inputs.toolkit_type == 'cuda'
       id: cuda-toolkit
-      uses: Jimver/cuda-toolkit@v0.2.24
+      uses: Jimver/cuda-toolkit@v0.2.27
       with:
         cuda: ${{ steps.cuda_info.outputs.CUDA_VERSION }}
         method: network
diff --git a/.github/actions/setup-env-build/action.yml b/.github/actions/setup-env-build/action.yml
index 495288a398..8c16200cb8 100644
--- a/.github/actions/setup-env-build/action.yml
+++ b/.github/actions/setup-env-build/action.yml
@@ -26,7 +26,7 @@ runs:
 
         CONDA_INSTALL_CMD = "micromamba create python=${{ inputs.python }} zlib pip ninja ccache=4.8 -c conda-forge -q -y"
 
-        conda_env_key = CONDA_INSTALL_CMD + "[cu129][v2]"
+        conda_env_key = CONDA_INSTALL_CMD + "[cu130][v2]"
         for file in sorted(glob.glob("requirement*.txt")):
           conda_env_key += f"\n########## {file}\n"
           conda_env_key += Path(file).read_text()
diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml
index 0be59490eb..ecbf163394 100644
--- a/.github/workflows/wheels.yml
+++ b/.github/workflows/wheels.yml
@@ -33,7 +33,7 @@ jobs:
         PYTHON_VERSION = "3.9"
         # NOTE: Don't forget to update `upload_pt`'s matrix
         # when changing the CUDA/ROCM versions below!
-        CU_VERSIONS = ['126', '128', '129']
+        CU_VERSIONS = ['126', '129', '130']
         ROCM_VERSIONS = ['6.4']
 
         include = []
@@ -100,8 +100,8 @@ jobs:
       matrix:
         suffix:
           - cu126
-          - cu128
           - cu129
+          - cu130
           - rocm6.4
     uses: ./.github/workflows/wheels_upload_s3.yml
     with:
diff --git a/.github/workflows/wheels_build.yml b/.github/workflows/wheels_build.yml
index 654602a26b..87bfe2a355 100644
--- a/.github/workflows/wheels_build.yml
+++ b/.github/workflows/wheels_build.yml
@@ -53,9 +53,13 @@ jobs:
       run:
         shell: bash
     steps:
-      - if: contains(inputs.toolkit_type, 'cuda') && fromJSON(inputs.toolkit_short_version) >= 120
+      - if: contains(inputs.toolkit_type, 'cuda') && fromJSON(inputs.toolkit_short_version) >= 120 && fromJSON(inputs.toolkit_short_version) < 130
         run: |
-          echo "TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST 9.0a" >> ${GITHUB_ENV}
+          echo "TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST 8.0 9.0a" >> ${GITHUB_ENV}
+
+      - if: contains(inputs.toolkit_type, 'cuda') && fromJSON(inputs.toolkit_short_version) >= 130
+        run: |
+          echo "TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST 8.0 9.0a 10.0f 11.0f 12.0f" >> ${GITHUB_ENV}
 
       - if: runner.os == 'Windows'
         run: git config --system core.longpaths true
diff --git a/setup.py b/setup.py
index 967732f639..2317d57c6e 100644
--- a/setup.py
+++ b/setup.py
@@ -176,7 +176,9 @@ def get_flash_attention2_nvcc_archs_flags(cuda_version: int):
         return []
     # Figure out default archs to target
     DEFAULT_ARCHS_LIST = ""
-    if cuda_version >= 1208:
+    if cuda_version >= 1300:
+        DEFAULT_ARCHS_LIST = "8.0;8.6;9.0;10.0f;11.0f;12.0f"
+    elif cuda_version >= 1208:
         DEFAULT_ARCHS_LIST = "8.0;8.6;9.0;10.0;12.0"
     elif cuda_version >= 1108:
         DEFAULT_ARCHS_LIST = "8.0;8.6;9.0"
@@ -283,7 +285,7 @@ def get_flash_attention3_nvcc_archs_flags(cuda_version: int):
         return []
     archs_list = os.environ.get("TORCH_CUDA_ARCH_LIST")
     if archs_list is None:
-        if torch.cuda.get_device_capability("cuda") != (9, 0):
+        if torch.cuda.get_device_capability("cuda") != (9, 0) and torch.cuda.get_device_capability("cuda") != (8, 0):
             return []
         archs_list = "8.0 9.0a"
     nvcc_archs_flags = []
diff --git a/third_party/cutlass b/third_party/cutlass
index e9627ce55b..c6aeb9179c 160000
--- a/third_party/cutlass
+++ b/third_party/cutlass
@@ -1 +1 @@
-Subproject commit e9627ce55b42fd2599f58cd4396da9380954def0
+Subproject commit c6aeb9179c5f74a0fcdbd28527bf4b6ba8c60752
diff --git a/third_party/flash-attention b/third_party/flash-attention
index c485eeade0..5183de4335 160000
--- a/third_party/flash-attention
+++ b/third_party/flash-attention
@@ -1 +1 @@
-Subproject commit c485eeade0c3ec9ce186c3640c52c9f1ce090b81
+Subproject commit 5183de433587a8aedd2450e9f18166c24521af29
diff --git a/xformers/csrc/sparse24/sparse24_gemm_sm90.cu b/xformers/csrc/sparse24/sparse24_gemm_sm90.cu
index cabdc7799c..303eca1991 100644
--- a/xformers/csrc/sparse24/sparse24_gemm_sm90.cu
+++ b/xformers/csrc/sparse24/sparse24_gemm_sm90.cu
@@ -96,10 +96,10 @@ struct SparseRowwiseKernel<cutlass::float_e4m3_t> {
           float,
           ElementOut,
           cutlass::layout::RowMajor,
-          1,
+          8,
           ElementOut,
           cutlass::layout::RowMajor,
-          1,
+          8,
           cutlass::epilogue::TmaWarpSpecializedCooperative,
           EpilogueEVT>::CollectiveOp;
 
@@ -176,10 +176,10 @@ struct SparseRowwiseKernel<cutlass::bfloat16_t> {
           float,
           ElementOut,
           cutlass::layout::RowMajor,
-          1,
+          8,
           ElementOut,
           cutlass::layout::RowMajor,
-          1,
+          8,
           cutlass::epilogue::TmaWarpSpecializedCooperative,
           EpilogueEVT>::CollectiveOp;
 
diff --git a/xformers/ops/fmha/dispatch.py b/xformers/ops/fmha/dispatch.py
index 5908635dac..be0f75ead4 100644
--- a/xformers/ops/fmha/dispatch.py
+++ b/xformers/ops/fmha/dispatch.py
@@ -31,7 +31,7 @@ def _get_use_fa3() -> bool:
 
 def fa3_available() -> bool:
     has_cuda = torch.version.cuda is not None
-    is_90a = has_cuda and torch.cuda.get_device_capability() >= (9, 0)
+    is_90a = has_cuda and (8, 0) <= torch.cuda.get_device_capability() <= (9, 0)
     has_valid_flash3 = flash3._C_flashattention3 is not None  # pyre-ignore[16]
     return is_90a and has_valid_flash3
 
diff --git a/xformers/ops/fmha/flash.py b/xformers/ops/fmha/flash.py
index 63e436698c..4956bcf25d 100644
--- a/xformers/ops/fmha/flash.py
+++ b/xformers/ops/fmha/flash.py
@@ -71,7 +71,7 @@
 
     FLASH_VERSION = flash_attn.__version__
     FLASH_VER_MIN = parse_version("2.7.1")
-    FLASH_VER_LAST = parse_version("2.8.3")  # last supported, inclusive
+    FLASH_VER_LAST = parse_version("2.8.4")  # last supported, inclusive
     flash_ver_parsed = parse_version(FLASH_VERSION)
     if (
         flash_ver_parsed < FLASH_VER_MIN or flash_ver_parsed > FLASH_VER_LAST
diff --git a/xformers/ops/fmha/flash3.py b/xformers/ops/fmha/flash3.py
index b4e55f41ba..770960f1b3 100644
--- a/xformers/ops/fmha/flash3.py
+++ b/xformers/ops/fmha/flash3.py
@@ -647,6 +647,14 @@ class FwOp(AttentionFwOpBase):
     @classmethod
     def not_supported_reasons(cls, d: Inputs) -> List[str]:
         reasons = super(FwOp, cls).not_supported_reasons(d)
+        device_type = d.query.device.type
+        if device_type == "cuda" and (torch.version.hip is None):
+            device_capability = torch.cuda.get_device_capability(d.device)
+            if device_capability > cls.CUDA_MINIMUM_COMPUTE_CAPABILITY:
+                reasons.append(
+                    f"requires device with capability == {cls.CUDA_MINIMUM_COMPUTE_CAPABILITY} "
+                    f"but your GPU has capability {device_capability} (too new)"
+                )
         check_lastdim_alignment_stride1(reasons, "query", d.query, 8)
         check_lastdim_alignment_stride1(reasons, "key", d.value, 8)
         check_lastdim_alignment_stride1(reasons, "value", d.value, 8)
@@ -801,6 +809,14 @@ class BwOp(AttentionBwOpBase):
     @classmethod
     def not_supported_reasons(cls, d: Inputs) -> List[str]:
         reasons = super(BwOp, cls).not_supported_reasons(d)
+        device_type = d.query.device.type
+        if device_type == "cuda" and (torch.version.hip is None):
+            device_capability = torch.cuda.get_device_capability(d.device)
+            if device_capability > cls.CUDA_MINIMUM_COMPUTE_CAPABILITY:
+                reasons.append(
+                    f"requires device with capability == {cls.CUDA_MINIMUM_COMPUTE_CAPABILITY} "
+                    f"but your GPU has capability {device_capability} (too new)"
+                )
         check_lastdim_alignment_stride1(reasons, "query", d.query, 8)
         check_lastdim_alignment_stride1(reasons, "key", d.value, 8)
         check_lastdim_alignment_stride1(reasons, "value", d.value, 8)

From 3bd4670c6e2f7b088e887f36b6dd502e298957b4 Mon Sep 17 00:00:00 2001
From: Johnny <johnnync13@gmail.com>
Date: Thu, 9 Oct 2025 12:22:04 +0200
Subject: [PATCH 02/22] Set TORCH_CUDA_DEFAULT to 130 in setup-build-cuda action

---
 .github/actions/setup-build-cuda/action.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/actions/setup-build-cuda/action.yml b/.github/actions/setup-build-cuda/action.yml
index 151c393595..f5d9a6f771 100644
--- a/.github/actions/setup-build-cuda/action.yml
+++ b/.github/actions/setup-build-cuda/action.yml
@@ -24,7 +24,7 @@ runs:
         print(sys.version)
         cushort = "${{ inputs.toolkit_short_version }}"
         # Version uploaded to pypi (rather than PyTorch s3)
-        TORCH_CUDA_DEFAULT = "129"  # since pytorch 2.8.0
+        TORCH_CUDA_DEFAULT = "130"  # since pytorch 2.8.0
         # https://github.com/Jimver/cuda-toolkit/blob/master/src/links/linux-links.ts
         full_version, install_script = {
           "130": ("13.0.1", "https://developer.download.nvidia.com/compute/cuda/13.0.1/local_installers/cuda_13.0.1_580.82.07_linux.run"),

From f0e238402c41fc0d1790e405ec20b895a80c1d5f Mon Sep 17 00:00:00 2001
From: Johnny <johnnync13@gmail.com>
Date: Wed, 15 Oct 2025 06:08:43 +0200
Subject: [PATCH 03/22] Bump Jimver/cuda-toolkit action to v0.2.28

---
 .github/actions/setup-build-cuda/action.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/actions/setup-build-cuda/action.yml b/.github/actions/setup-build-cuda/action.yml
index f5d9a6f771..a9bf2fd785 100644
--- a/.github/actions/setup-build-cuda/action.yml
+++ b/.github/actions/setup-build-cuda/action.yml
@@ -53,7 +53,7 @@ runs:
     - name: Install cuda
       if: runner.os == 'Windows' && inputs.toolkit_type == 'cuda'
       id: cuda-toolkit
-      uses: Jimver/cuda-toolkit@v0.2.27
+      uses: Jimver/cuda-toolkit@v0.2.28
       with:
         cuda: ${{ steps.cuda_info.outputs.CUDA_VERSION }}
         method: network

From 37eed3afbbb418ca09e2dd98213b3bd40c2ff22d Mon Sep 17 00:00:00 2001
From: johnnynunez <johnnynuca14@gmail.com>
Date: Wed, 15 Oct 2025 15:14:37 +0200
Subject: [PATCH 04/22] Update flash-attention hash

---
 third_party/flash-attention | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/flash-attention b/third_party/flash-attention
index 5183de4335..a76e692a6e 160000
--- a/third_party/flash-attention
+++ b/third_party/flash-attention
@@ -1 +1 @@
-Subproject commit 5183de433587a8aedd2450e9f18166c24521af29
+Subproject commit a76e692a6eb13121c27db6187629acacda6160bc

From db622e4f0f333df50a249d4f062de84e0166e5d9 Mon Sep 17 00:00:00 2001
From: johnnynunez <johnnynuca14@gmail.com>
Date: Tue, 21 Oct 2025 12:18:10 -0700
Subject: [PATCH 05/22] Update cutlass and flash-attention submodules to fix CCCL error

---
 third_party/cutlass         | 2 +-
 third_party/flash-attention | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/third_party/cutlass b/third_party/cutlass
index c6aeb9179c..b1d6e2c9b3 160000
--- a/third_party/cutlass
+++ b/third_party/cutlass
@@ -1 +1 @@
-Subproject commit c6aeb9179c5f74a0fcdbd28527bf4b6ba8c60752
+Subproject commit b1d6e2c9b334dfa811e4183dfbd02419249e4b52
diff --git a/third_party/flash-attention b/third_party/flash-attention
index a76e692a6e..933b2c3ebb 160000
--- a/third_party/flash-attention
+++ b/third_party/flash-attention
@@ -1 +1 @@
-Subproject commit a76e692a6eb13121c27db6187629acacda6160bc
+Subproject commit 933b2c3ebb8a3da378f5fefb4e398c8a9970ad81

From 920ade351460a830dcd89d4b2f9e8f4a62205b2c Mon Sep 17 00:00:00 2001
From: Johnny <johnnync13@gmail.com>
Date: Tue, 21 Oct 2025 16:47:55 -0700
Subject: [PATCH 06/22] Require torch >= 2.9 in requirements.txt

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 399c304661..5ab1043984 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
 # Example requirement, can be anything that pip knows
 # install with `pip install -r requirements.txt`, and make sure that CI does the same
-torch >= 2.8
+torch >= 2.9
 numpy

From c2407a6cddd60ba990b06f7563303f1f64fcf590 Mon Sep 17 00:00:00 2001
From: johnnynunez <johnnynuca14@gmail.com>
Date: Tue, 21 Oct 2025 16:53:26 -0700
Subject: [PATCH 07/22] Bump CI to PyTorch 2.9.0 and update flash-attention submodule

---
 .github/actions/setup-build-cuda/action.yml | 2 +-
 .github/workflows/wheels.yml                | 6 +++---
 .github/workflows/win-build.yml             | 2 +-
 third_party/flash-attention                 | 2 +-
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/.github/actions/setup-build-cuda/action.yml b/.github/actions/setup-build-cuda/action.yml
index a9bf2fd785..2b140009b1 100644
--- a/.github/actions/setup-build-cuda/action.yml
+++ b/.github/actions/setup-build-cuda/action.yml
@@ -24,7 +24,7 @@ runs:
         print(sys.version)
         cushort = "${{ inputs.toolkit_short_version }}"
         # Version uploaded to pypi (rather than PyTorch s3)
-        TORCH_CUDA_DEFAULT = "130"  # since pytorch 2.8.0
+        TORCH_CUDA_DEFAULT = "130"  # since pytorch 2.9.0
         # https://github.com/Jimver/cuda-toolkit/blob/master/src/links/linux-links.ts
         full_version, install_script = {
           "130": ("13.0.1", "https://developer.download.nvidia.com/compute/cuda/13.0.1/local_installers/cuda_13.0.1_580.82.07_linux.run"),
diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml
index ecbf163394..d56c3640dd 100644
--- a/.github/workflows/wheels.yml
+++ b/.github/workflows/wheels.yml
@@ -38,7 +38,7 @@ jobs:
 
         include = []
         for os in ['8-core-ubuntu', 'windows-8-core']:
-          for torch_version in ['2.8.0']:
+          for torch_version in ['2.9.0']:
             # CUDA builds
             for cuda_short_version in CU_VERSIONS:
               if cuda_short_version < "124" and "windows" in os:
@@ -88,7 +88,7 @@ jobs:
     uses: ./.github/workflows/wheels_upload_pip.yml
     with:
       twine_username: __token__
-      filter: "*torch2.8.0+cu128*"
+      filter: "*torch2.9.0+cu130*"
       execute: ${{ github.repository == 'facebookresearch/xformers' && github.event_name != 'pull_request' }}
     secrets:
       twine_password: ${{ secrets.PYPI_TOKEN }}
@@ -108,5 +108,5 @@ jobs:
       aws_role: "arn:aws:iam::749337293305:role/pytorch_bot_uploader_role"
       s3_path: s3://pytorch/whl/${{ matrix.suffix }}/
       aws_s3_cp_extra_args: --acl public-read
-      filter: "*torch2.8.0+${{ matrix.suffix }}*"
+      filter: "*torch2.9.0+${{ matrix.suffix }}*"
       execute: ${{ github.repository == 'facebookresearch/xformers' && github.ref_type == 'tag' }}
diff --git a/.github/workflows/win-build.yml b/.github/workflows/win-build.yml
index bac84742c6..109cb007e6 100644
--- a/.github/workflows/win-build.yml
+++ b/.github/workflows/win-build.yml
@@ -73,7 +73,7 @@ jobs:
 
       - name: Install build dependencies
         run: |
-          $PY -m pip install wheel setuptools ninja torch==2.8.0 -r requirements.txt --extra-index-url https://download.pytorch.org/whl/cu126
+          $PY -m pip install wheel setuptools ninja torch==2.9.0 -r requirements.txt --extra-index-url https://download.pytorch.org/whl/cu126
           git config --global --add safe.directory "*"
           $PY -c "import torch; print('torch', torch.__version__)"
           $PY -c "import torch; print('torch.cuda', torch.version.cuda)"
diff --git a/third_party/flash-attention b/third_party/flash-attention
index 933b2c3ebb..9dbed03d1a 160000
--- a/third_party/flash-attention
+++ b/third_party/flash-attention
@@ -1 +1 @@
-Subproject commit 933b2c3ebb8a3da378f5fefb4e398c8a9970ad81
+Subproject commit 9dbed03d1a7a5862998c182c83d8265fea9dc21b

From 3e2e11e3ced1f1085a23f2c9db9edb191e17f7f9 Mon Sep 17 00:00:00 2001
From: Johnny <johnnync13@gmail.com>
Date: Tue, 28 Oct 2025 08:11:39 -0700
Subject: [PATCH 08/22] Use setup-python@v5 and Python 3.10 in linters_reusable.yml

---
 .github/workflows/linters_reusable.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/linters_reusable.yml b/.github/workflows/linters_reusable.yml
index 9100b63803..a6eee05af7 100644
--- a/.github/workflows/linters_reusable.yml
+++ b/.github/workflows/linters_reusable.yml
@@ -15,9 +15,9 @@ jobs:
         with:
           fetch-depth: 0
       - name: Setup Python
-        uses: actions/setup-python@v2
+        uses: actions/setup-python@v5
         with:
-          python-version: '3.9'
+          python-version: '3.10'
       - name: Run pre-script
         if: ${{ inputs.pre-script }}
         run: ${{ inputs.pre-script }}

From bfb2271dcbb250aee14894e94a17a54ee92df8ce Mon Sep 17 00:00:00 2001
From: Johnny <johnnynuca14@gmail.com>
Date: Tue, 28 Oct 2025 08:15:07 -0700
Subject: [PATCH 09/22] Update CUDA toolkit and Python versions in workflow

---
 .github/workflows/win-build.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/win-build.yml b/.github/workflows/win-build.yml
index 109cb007e6..e44cc95bab 100644
--- a/.github/workflows/win-build.yml
+++ b/.github/workflows/win-build.yml
@@ -61,8 +61,8 @@ jobs:
         uses: ./.github/actions/setup-build-cuda
         with:
           toolkit_type: "cuda"
-          toolkit_short_version: "128"
-          python: "3.9"
+          toolkit_short_version: "130"
+          python: "3.10"
 
       - name: Remove internal code
         run: |

From d434cb39bdd2462799d141ba30f1a94ea3032b3c Mon Sep 17 00:00:00 2001
From: Johnny <johnnynuca14@gmail.com>
Date: Tue, 28 Oct 2025 08:18:33 -0700
Subject: [PATCH 10/22] Update Python version from 3.9 to 3.10

---
 .github/workflows/wheels.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml
index d56c3640dd..57363a5018 100644
--- a/.github/workflows/wheels.yml
+++ b/.github/workflows/wheels.yml
@@ -29,8 +29,8 @@ jobs:
         environ = os.environ
 
         # All builds are python-version agnostic,
-        # and built with python 3.9
-        PYTHON_VERSION = "3.9"
+        # and built with python 3.10
+        PYTHON_VERSION = "3.10"
         # NOTE: Don't forget to update `upload_pt`'s matrix
         # when changing the CUDA/ROCM versions below!
         CU_VERSIONS = ['126', '129', '130']

From ea4407122ac380e35262459005ec45fae719f0ff Mon Sep 17 00:00:00 2001
From: johnnynunez <johnnynuca14@gmail.com>
Date: Tue, 28 Oct 2025 08:55:30 -0700
Subject: [PATCH 11/22] Update cutlass and flash-attention submodules to upstream

---
 third_party/cutlass         | 2 +-
 third_party/flash-attention | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/third_party/cutlass b/third_party/cutlass
index b1d6e2c9b3..b2ca083d2b 160000
--- a/third_party/cutlass
+++ b/third_party/cutlass
@@ -1 +1 @@
-Subproject commit b1d6e2c9b334dfa811e4183dfbd02419249e4b52
+Subproject commit b2ca083d2bb96c41d9b3c5a930637c641f6669bf
diff --git a/third_party/flash-attention b/third_party/flash-attention
index 9dbed03d1a..b3f1b6a5bd 160000
--- a/third_party/flash-attention
+++ b/third_party/flash-attention
@@ -1 +1 @@
-Subproject commit 9dbed03d1a7a5862998c182c83d8265fea9dc21b
+Subproject commit b3f1b6a5bdcce820e74cc0bb6f615165387195cc

From dbe25a2f5bf8730de12c90e3f60d6301c1e38b54 Mon Sep 17 00:00:00 2001
From: johnnynunez <johnnynuca14@gmail.com>
Date: Tue, 28 Oct 2025 09:01:56 -0700
Subject: [PATCH 12/22] Fix lint: strip trailing whitespace and reformat setup.py

---
 .github/workflows/linters.yml                 |  1 -
 .github/workflows/rocm_ci.yml                 | 16 ++++++-------
 .github/workflows/rocm_docker.yml             |  4 ++--
 .gitignore                                    |  2 --
 setup.py                                      |  5 +++-
 tests/readme_test_on_rocm.txt                 |  4 +---
 .../benchmarks/readme_benchmark_on_rocm.txt   |  3 +--
 .../csrc/attention/hip_decoder/CMakeLists.txt | 24 +++++++++----------
 .../attention/hip_fmha/GENERATE_INSTANCES.md  | 18 +++++++-------
 9 files changed, 36 insertions(+), 41 deletions(-)

diff --git a/.github/workflows/linters.yml b/.github/workflows/linters.yml
index 44dd0cab33..9bc445c770 100644
--- a/.github/workflows/linters.yml
+++ b/.github/workflows/linters.yml
@@ -7,4 +7,3 @@ on:
 jobs:
   repo:
     uses: ./.github/workflows/linters_reusable.yml
-
diff --git a/.github/workflows/rocm_ci.yml b/.github/workflows/rocm_ci.yml
index 1897eab1d1..f27f393ffc 100644
--- a/.github/workflows/rocm_ci.yml
+++ b/.github/workflows/rocm_ci.yml
@@ -1,6 +1,6 @@
 name: rocm-ci
 
-on: 
+on:
   pull_request:
     types: [labeled, synchronize, reopened]
   workflow_dispatch: {}
@@ -43,23 +43,23 @@ jobs:
 
         export GIT_BRANCH=${GITHUB_BASE_REF:-${GITHUB_REF#refs/heads/}}
         echo GIT_BRANCH        = $GIT_BRANCH
-        
+
         export ROCM_PATH=/opt/rocm
         echo ROCM_PATH         = $ROCM_PATH
 
         hipcc --version
         rocm-smi
         rocminfo | grep "gfx"
-        
+
     - name: Setup build env
       run: |
         conda create -n xformers python=3.11
         export PATH=/opt/conda/envs/xformers/bin:$PATH
         python -VV
-        
+
         python -m pip install -U torch --index-url=https://download.pytorch.org/whl/rocm6.2
         python -c "import torch; print(f'PyTorch version {torch.__version__}')"
-        
+
         python -m pip install ninja scipy pytest pytest-html
 
     - name: Pre-build clean
@@ -72,16 +72,16 @@ jobs:
       run: |
         export PATH=/opt/conda/envs/xformers/bin:$PATH
         export MAX_JOBS=20
-        
+
         python -m pip install -e ./_xformers --verbose
         python -m xformers.info
 
     - name: Run python tests
       run: |
         export PATH=/opt/conda/envs/xformers/bin:$PATH
-        
+
         python -m pytest --html=test_mem_eff_attention.html --self-contained-html -rpfs ./_xformers/tests/test_mem_eff_attention.py
-    
+
     - name: Archive logs
       if: '!cancelled()'
       uses: actions/upload-artifact@v4
diff --git a/.github/workflows/rocm_docker.yml b/.github/workflows/rocm_docker.yml
index 31fc242a71..d774306c08 100644
--- a/.github/workflows/rocm_docker.yml
+++ b/.github/workflows/rocm_docker.yml
@@ -12,13 +12,13 @@ jobs:
     steps:
       - name: Set up Docker Buildx
         uses: docker/setup-buildx-action@v3
-      
+
       - name: Login to Docker Hub
         uses: docker/login-action@v3
         with:
           username: ${{ vars.DOCKERHUB_USERNAME }}
           password: ${{ secrets.DOCKERHUB_TOKEN }}
-      
+
       - name: Build and push
         uses: docker/build-push-action@v6
         with:
diff --git a/.gitignore b/.gitignore
index 978b6be3e0..a32f07f167 100644
--- a/.gitignore
+++ b/.gitignore
@@ -71,5 +71,3 @@ xformers/csrc/attention/hip_fmha/instances/*_hip.h
 xformers/csrc/attention/hip_decoder/*.cu
 xformers/csrc/attention/hip_decoder/*.hip
 xformers/csrc/attention/hip_decoder/*_hip.h
-
-
diff --git a/setup.py b/setup.py
index 2317d57c6e..1bd352d25f 100644
--- a/setup.py
+++ b/setup.py
@@ -285,7 +285,10 @@ def get_flash_attention3_nvcc_archs_flags(cuda_version: int):
         return []
     archs_list = os.environ.get("TORCH_CUDA_ARCH_LIST")
     if archs_list is None:
-        if torch.cuda.get_device_capability("cuda") != (9, 0) and torch.cuda.get_device_capability("cuda") != (8, 0):
+        if torch.cuda.get_device_capability("cuda") != (
+            9,
+            0,
+        ) and torch.cuda.get_device_capability("cuda") != (8, 0):
             return []
         archs_list = "8.0 9.0a"
     nvcc_archs_flags = []
diff --git a/tests/readme_test_on_rocm.txt b/tests/readme_test_on_rocm.txt
index c21fd0d587..754fac7fe7 100644
--- a/tests/readme_test_on_rocm.txt
+++ b/tests/readme_test_on_rocm.txt
@@ -3,11 +3,9 @@
 
    2. verify testing for generic fmha inference on ROCM
 
-      #> pytest tests/test_mem_eff_attention.py::test_forward  
+      #> pytest tests/test_mem_eff_attention.py::test_forward
 
    3. verify testing for decoder fmha inference on ROCM
 
       #> pytest tests/test_mem_eff_attention.py::test_decoder
       #> pytest tests/test_mem_eff_attention.py::test_splitk_decoder
-
-
diff --git a/xformers/benchmarks/readme_benchmark_on_rocm.txt b/xformers/benchmarks/readme_benchmark_on_rocm.txt
index 9ae61f5294..cb64bb912d 100644
--- a/xformers/benchmarks/readme_benchmark_on_rocm.txt
+++ b/xformers/benchmarks/readme_benchmark_on_rocm.txt
@@ -8,10 +8,9 @@
 
     3. Benchmark for decoder fmha inference on ROCM
 
-       #> python xformers/benchmarks/benchmark_mem_eff_attn_decoder.py  
+       #> python xformers/benchmarks/benchmark_mem_eff_attn_decoder.py
 
     4. Other Benchmarks for fmha inference on ROCM
 
        #> python xformers/benchmarks/benchmark_attn_decoding.py
        #> python xformers/benchmarks/benchmark_mem_eff_attention_mqa.py
-
diff --git a/xformers/csrc/attention/hip_decoder/CMakeLists.txt b/xformers/csrc/attention/hip_decoder/CMakeLists.txt
index 97e2ab0b22..75e075b09e 100644
--- a/xformers/csrc/attention/hip_decoder/CMakeLists.txt
+++ b/xformers/csrc/attention/hip_decoder/CMakeLists.txt
@@ -36,14 +36,14 @@ set_target_properties(${exe_name} ${splitk_exe_name} PROPERTIES LINKER_LANGUAGE
 set_target_properties(${exe_name} ${splitk_exe_name} PROPERTIES POSITION_INDEPENDENT_CODE ON)
 set_target_properties(${exe_name} ${splitk_exe_name} PROPERTIES HIP_ARCHITECTURES ${GPU_TARGETS})
 
-target_compile_options(${exe_name} PUBLIC 
+target_compile_options(${exe_name} PUBLIC
   -fno-gpu-rdc
   $<$<CONFIG:Debug>:
   --save-temps
   >
 )
 
-target_compile_options(${splitk_exe_name} PUBLIC 
+target_compile_options(${splitk_exe_name} PUBLIC
   -fno-gpu-rdc
   $<$<CONFIG:Debug>:
   --save-temps
@@ -52,13 +52,13 @@ target_compile_options(${splitk_exe_name} PUBLIC
   >
 )
 
-target_include_directories(${exe_name} PUBLIC 
+target_include_directories(${exe_name} PUBLIC
   ${ck_include}                           # ck includes
   ${torch_include}                        # aten includes
   ${torch_include}/torch/csrc/api/include # torch includes
 )
 
-target_include_directories(${splitk_exe_name} PUBLIC 
+target_include_directories(${splitk_exe_name} PUBLIC
   ${ck_include}                           # ck includes
   ${torch_include}                        # aten includes
   ${torch_include}/torch/csrc/api/include # torch includes
@@ -93,14 +93,14 @@ target_link_libraries(${splitk_exe_name} PUBLIC
   amdhip64
 )
 
-target_compile_definitions(${exe_name} PUBLIC 
+target_compile_definitions(${exe_name} PUBLIC
   ATTN_FWD_DECODER_MAIN=1
   GLIBCXX_USE_CXX11_ABI=1
   __HIP_PLATFORM_HCC__=1
   USE_ROCM=1
 )
 
-target_compile_definitions(${splitk_exe_name} PUBLIC 
+target_compile_definitions(${splitk_exe_name} PUBLIC
   ATTN_FWD_SPLITK_DECODER_MAIN=1
   GLIBCXX_USE_CXX11_ABI=1
   __HIP_PLATFORM_HCC__=1
@@ -108,13 +108,13 @@ target_compile_definitions(${splitk_exe_name} PUBLIC
 )
 
 include(CMakePrintHelpers)
-cmake_print_properties(TARGETS ${exe_name} ${splitk_exe_name} PROPERTIES 
-  LINK_LIBRARIES 
-  LINK_DIRECTORIES 
-  INCLUDE_DIRECTORIES 
-  COMPILE_DEFINITIONS 
+cmake_print_properties(TARGETS ${exe_name} ${splitk_exe_name} PROPERTIES
+  LINK_LIBRARIES
+  LINK_DIRECTORIES
+  INCLUDE_DIRECTORIES
+  COMPILE_DEFINITIONS
   COMPILE_OPTIONS
   SOURCES
   HIP_ARCHITECTURES)
 
-rocm_install(TARGETS ${exe_name} ${splitk_exe_name})
\ No newline at end of file
+rocm_install(TARGETS ${exe_name} ${splitk_exe_name})
diff --git a/xformers/csrc/attention/hip_fmha/GENERATE_INSTANCES.md b/xformers/csrc/attention/hip_fmha/GENERATE_INSTANCES.md
index 829df66469..72cfc4f641 100644
--- a/xformers/csrc/attention/hip_fmha/GENERATE_INSTANCES.md
+++ b/xformers/csrc/attention/hip_fmha/GENERATE_INSTANCES.md
@@ -1,16 +1,16 @@
 
 # Instances generator
 
-  The instances generator is a simple python tool used to generate several hundred of instances (.cpp files) and their references (.h files). 
-  Without this tool, manually writing those instances and references will be very laborious and easy to get wrong. 
-  
-  The instances generated by this scripts are divided into three categories visible from the scripts: 
+  The instances generator is a simple python tool used to generate several hundred of instances (.cpp files) and their references (.h files).
+  Without this tool, manually writing those instances and references will be very laborious and easy to get wrong.
+
+  The instances generated by this scripts are divided into three categories visible from the scripts:
    * Infer -- which refers to instances for calling inference-only kernels
    * Forward -- which refers to instances for calling training forward kernels
    * Backward -- which refers to instances for calling training backward kernels
-     
-  The instance generator is for being used by the HIP fmha developers themselves. It is not supposed to be used by the xformers users for 
-  building xformers, since for xformers users, the instances are already well prepared as part of the xformers codes. 
+
+  The instance generator is for being used by the HIP fmha developers themselves. It is not supposed to be used by the xformers users for
+  building xformers, since for xformers users, the instances are already well prepared as part of the xformers codes.
 
 ## how to use instance generator
 
@@ -21,7 +21,7 @@
      ```
    * To generate reduced instances (when headdim256 is not required)
 
-     ``` 
+     ```
       #> python xformers/csrc/attention/hip_fmha/generate_instances.py --ignore-hd256
      ```
    * More options except for `--ignore-hd256` could be added to suppport further customization in generating instances as required
@@ -29,5 +29,3 @@
 ## where the instances files are located
    The instances files and references files are always located under a folder `instances/` that is located under the same directory
    as the file `generate_instances.py` itself
-
-     

From b819a230a984a6684bf73bfacdeccc8d99386c15 Mon Sep 17 00:00:00 2001
From: Johnny <johnnynuca14@gmail.com>
Date: Tue, 28 Oct 2025 10:30:26 -0700
Subject: [PATCH 13/22] Add use-github-cache option to CUDA setup action

---
 .github/actions/setup-build-cuda/action.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/actions/setup-build-cuda/action.yml b/.github/actions/setup-build-cuda/action.yml
index 2b140009b1..61165689f3 100644
--- a/.github/actions/setup-build-cuda/action.yml
+++ b/.github/actions/setup-build-cuda/action.yml
@@ -57,6 +57,7 @@ runs:
       with:
         cuda: ${{ steps.cuda_info.outputs.CUDA_VERSION }}
         method: network
+        use-github-cache: false
     - if: runner.os == 'Windows' && inputs.toolkit_type == 'cuda'
       shell: bash
       run: |

From a62a9a9543760a424acc5389f4f490806b479bbb Mon Sep 17 00:00:00 2001
From: Johnny <johnnynuca14@gmail.com>
Date: Tue, 28 Oct 2025 18:53:27 -0700
Subject: [PATCH 14/22] Update cuda-toolkit action to use N-Storm fork

Switched from Jimver to N-Storm fork of cuda-toolkit.
---
 .github/actions/setup-build-cuda/action.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/actions/setup-build-cuda/action.yml b/.github/actions/setup-build-cuda/action.yml
index 61165689f3..b4766a3b13 100644
--- a/.github/actions/setup-build-cuda/action.yml
+++ b/.github/actions/setup-build-cuda/action.yml
@@ -53,11 +53,11 @@ runs:
     - name: Install cuda
       if: runner.os == 'Windows' && inputs.toolkit_type == 'cuda'
       id: cuda-toolkit
-      uses: Jimver/cuda-toolkit@v0.2.28
+      # Using N-Storm fork until https://github.com/Jimver/cuda-toolkit/issues/395 is resolved
+      uses: N-Storm/cuda-toolkit@v0.2.28
       with:
         cuda: ${{ steps.cuda_info.outputs.CUDA_VERSION }}
         method: network
-        use-github-cache: false
     - if: runner.os == 'Windows' && inputs.toolkit_type == 'cuda'
       shell: bash
       run: |

From f732af6d99c43c726d007d6f0a4cf9c3de049398 Mon Sep 17 00:00:00 2001
From: Johnny <johnnynuca14@gmail.com>
Date: Tue, 28 Oct 2025 19:01:29 -0700
Subject: [PATCH 15/22] Fix CUDA architecture list format in setup.py

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 1bd352d25f..1527570426 100644
--- a/setup.py
+++ b/setup.py
@@ -177,7 +177,7 @@ def get_flash_attention2_nvcc_archs_flags(cuda_version: int):
     # Figure out default archs to target
     DEFAULT_ARCHS_LIST = ""
     if cuda_version >= 1300:
-        DEFAULT_ARCHS_LIST = "8.0;8.6;9.0;10.0f;11.0f;12.0f"
+        DEFAULT_ARCHS_LIST = "8.0;8.6;9.0;10.0;11.0;12.0"
     elif cuda_version >= 1208:
         DEFAULT_ARCHS_LIST = "8.0;8.6;9.0;10.0;12.0"
     elif cuda_version >= 1108:

From 40872c08bbbfdaa68f009e5987999ff7d02eb349 Mon Sep 17 00:00:00 2001
From: Johnny <johnnynuca14@gmail.com>
Date: Wed, 29 Oct 2025 16:36:02 +0100
Subject: [PATCH 16/22] Update TORCH_CUDA_ARCH_LIST for toolkit versioning

---
 .github/workflows/wheels_build.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/wheels_build.yml b/.github/workflows/wheels_build.yml
index 87bfe2a355..f2b0440f62 100644
--- a/.github/workflows/wheels_build.yml
+++ b/.github/workflows/wheels_build.yml
@@ -59,7 +59,7 @@ jobs:
 
       - if: contains(inputs.toolkit_type, 'cuda') && fromJSON(inputs.toolkit_short_version) >= 130
         run: |
-          echo "TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST 8.0 9.0a 10.0f 11.0f 12.0f" >> ${GITHUB_ENV}
+          echo "TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST 8.0 9.0a 10.0a 10.3a 11.0a 12.0a 12.1a" >> ${GITHUB_ENV}
 
       - if: runner.os == 'Windows'
         run: git config --system core.longpaths true

From 6be38b95b1b6fe626301fd22446faa87a0a3a110 Mon Sep 17 00:00:00 2001
From: johnnynunez <johnnynuca14@gmail.com>
Date: Thu, 30 Oct 2025 13:27:15 +0100
Subject: [PATCH 17/22] try fix windows

---
 .github/workflows/win-build.yml |  2 +-
 setup.py                        | 17 ++++++++++++++---
 2 files changed, 15 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/win-build.yml b/.github/workflows/win-build.yml
index e44cc95bab..dd12adb98b 100644
--- a/.github/workflows/win-build.yml
+++ b/.github/workflows/win-build.yml
@@ -73,7 +73,7 @@ jobs:
 
       - name: Install build dependencies
         run: |
-          $PY -m pip install wheel setuptools ninja torch==2.9.0 -r requirements.txt --extra-index-url https://download.pytorch.org/whl/cu126
+          $PY -m pip install wheel setuptools ninja torch==2.9.0 -r requirements.txt --extra-index-url https://download.pytorch.org/whl/cu130
           git config --global --add safe.directory "*"
           $PY -c "import torch; print('torch', torch.__version__)"
           $PY -c "import torch; print('torch.cuda', torch.version.cuda)"
diff --git a/setup.py b/setup.py
index 1bd352d25f..6b7c952f84 100644
--- a/setup.py
+++ b/setup.py
@@ -299,9 +299,20 @@ def get_flash_attention3_nvcc_archs_flags(cuda_version: int):
         if num not in [80, 90]:  # only support Sm80/Sm90
             continue
         suffix = match.group("suffix")
-        nvcc_archs_flags.append(
-            f"-gencode=arch=compute_{num}{suffix},code=sm_{num}{suffix}"
-        )
+        # On Windows, avoid assembling SASS for sm_90a due to nvcc/ptxas instability.
+        # Emit PTX for 90a and let the driver JIT at runtime.
+        if (
+            (sys.platform == "win32" or platform.system() == "Windows")
+            and num == 90
+            and suffix == "a"
+        ):
+            nvcc_archs_flags.append(
+                f"-gencode=arch=compute_{num}{suffix},code=compute_{num}{suffix}"
+            )
+        else:
+            nvcc_archs_flags.append(
+                f"-gencode=arch=compute_{num}{suffix},code=sm_{num}{suffix}"
+            )
         if match.group("ptx") is not None:
             nvcc_archs_flags.append(
                 f"-gencode=arch=compute_{num}{suffix},code=compute_{num}{suffix}"

From 6f6e99e75ce67785f38be75b402f6de25b585294 Mon Sep 17 00:00:00 2001
From: johnnynunez <johnnynuca14@gmail.com>
Date: Thu, 30 Oct 2025 14:39:58 +0100
Subject: [PATCH 18/22] avoid compile fa3 windows with cu130

---
 setup.py                    | 19 +++++--------------
 third_party/cutlass         |  2 +-
 third_party/flash-attention |  2 +-
 3 files changed, 7 insertions(+), 16 deletions(-)

diff --git a/setup.py b/setup.py
index 11ddd2b92e..df4a449533 100644
--- a/setup.py
+++ b/setup.py
@@ -283,6 +283,8 @@ def get_flash_attention3_nvcc_archs_flags(cuda_version: int):
         return []
     if cuda_version < 1203:
         return []
+    if cuda_version >= 1300:
+        return []
     archs_list = os.environ.get("TORCH_CUDA_ARCH_LIST")
     if archs_list is None:
         if torch.cuda.get_device_capability("cuda") != (
@@ -299,20 +301,9 @@ def get_flash_attention3_nvcc_archs_flags(cuda_version: int):
         if num not in [80, 90]:  # only support Sm80/Sm90
             continue
         suffix = match.group("suffix")
-        # On Windows, avoid assembling SASS for sm_90a due to nvcc/ptxas instability.
-        # Emit PTX for 90a and let the driver JIT at runtime.
-        if (
-            (sys.platform == "win32" or platform.system() == "Windows")
-            and num == 90
-            and suffix == "a"
-        ):
-            nvcc_archs_flags.append(
-                f"-gencode=arch=compute_{num}{suffix},code=compute_{num}{suffix}"
-            )
-        else:
-            nvcc_archs_flags.append(
-                f"-gencode=arch=compute_{num}{suffix},code=sm_{num}{suffix}"
-            )
+        nvcc_archs_flags.append(
+            f"-gencode=arch=compute_{num}{suffix},code=sm_{num}{suffix}"
+        )
         if match.group("ptx") is not None:
             nvcc_archs_flags.append(
                 f"-gencode=arch=compute_{num}{suffix},code=compute_{num}{suffix}"
diff --git a/third_party/cutlass b/third_party/cutlass
index b2ca083d2b..8afb19d904 160000
--- a/third_party/cutlass
+++ b/third_party/cutlass
@@ -1 +1 @@
-Subproject commit b2ca083d2bb96c41d9b3c5a930637c641f6669bf
+Subproject commit 8afb19d9047afc26816a046059afe66763e68aa5
diff --git a/third_party/flash-attention b/third_party/flash-attention
index b3f1b6a5bd..de1584b532 160000
--- a/third_party/flash-attention
+++ b/third_party/flash-attention
@@ -1 +1 @@
-Subproject commit b3f1b6a5bdcce820e74cc0bb6f615165387195cc
+Subproject commit de1584b5328321189a4d7832fe29bbd6813bf6ed

From d5acb1527b951194676cb1d488020ba5692f01ae Mon Sep 17 00:00:00 2001
From: Johnny <johnnynuca14@gmail.com>
Date: Thu, 30 Oct 2025 14:42:15 +0100
Subject: [PATCH 19/22] Update CUDA version from 13.0.1 to 13.0.2

---
 .github/actions/setup-build-cuda/action.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/actions/setup-build-cuda/action.yml b/.github/actions/setup-build-cuda/action.yml
index b4766a3b13..862bea7db9 100644
--- a/.github/actions/setup-build-cuda/action.yml
+++ b/.github/actions/setup-build-cuda/action.yml
@@ -27,7 +27,7 @@ runs:
         TORCH_CUDA_DEFAULT = "130"  # since pytorch 2.9.0
         # https://github.com/Jimver/cuda-toolkit/blob/master/src/links/linux-links.ts
         full_version, install_script = {
-          "130": ("13.0.1", "https://developer.download.nvidia.com/compute/cuda/13.0.1/local_installers/cuda_13.0.1_580.82.07_linux.run"),
+          "130": ("13.0.2", "https://developer.download.nvidia.com/compute/cuda/13.0.2/local_installers/cuda_13.0.2_580.95.05_linux.run"),
           "129": ("12.9.0", "https://developer.download.nvidia.com/compute/cuda/12.9.1/local_installers/cuda_12.9.1_575.57.08_linux.run"),
           "128": ("12.8.1", "https://developer.download.nvidia.com/compute/cuda/12.8.1/local_installers/cuda_12.8.1_570.124.06_linux.run"),
           # (Build with nvcc 12.8 on linux even when building for 12.6 to avoid seg fault in Flash3 build)

From 3b8fcb262bd127e3544c8a9d9f2fedd111fa19b8 Mon Sep 17 00:00:00 2001
From: Johnny <johnnynuca14@gmail.com>
Date: Thu, 30 Oct 2025 15:01:15 +0100
Subject: [PATCH 20/22] Modify CUDA version check for Windows platform

---
 setup.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index df4a449533..ee681c7d29 100644
--- a/setup.py
+++ b/setup.py
@@ -283,7 +283,8 @@ def get_flash_attention3_nvcc_archs_flags(cuda_version: int):
         return []
     if cuda_version < 1203:
         return []
-    if cuda_version >= 1300:
+if ((sys.platform == "win32" or platform.system() == "Windows")
+    and cuda_version >= 1300):
         return []
     archs_list = os.environ.get("TORCH_CUDA_ARCH_LIST")
     if archs_list is None:

From 7281508d1612a93bb29d3e5da49e6fc5464a7290 Mon Sep 17 00:00:00 2001
From: Johnny <johnnynuca14@gmail.com>
Date: Thu, 30 Oct 2025 15:13:49 +0100
Subject: [PATCH 21/22] Update CUDA version from 13.0.2 to 13.0.1

Because the action runner on Windows has not been updated yet.
---
 .github/actions/setup-build-cuda/action.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/actions/setup-build-cuda/action.yml b/.github/actions/setup-build-cuda/action.yml
index 862bea7db9..b4766a3b13 100644
--- a/.github/actions/setup-build-cuda/action.yml
+++ b/.github/actions/setup-build-cuda/action.yml
@@ -27,7 +27,7 @@ runs:
         TORCH_CUDA_DEFAULT = "130"  # since pytorch 2.9.0
         # https://github.com/Jimver/cuda-toolkit/blob/master/src/links/linux-links.ts
         full_version, install_script = {
-          "130": ("13.0.2", "https://developer.download.nvidia.com/compute/cuda/13.0.2/local_installers/cuda_13.0.2_580.95.05_linux.run"),
+          "130": ("13.0.1", "https://developer.download.nvidia.com/compute/cuda/13.0.1/local_installers/cuda_13.0.1_580.82.07_linux.run"),
           "129": ("12.9.0", "https://developer.download.nvidia.com/compute/cuda/12.9.1/local_installers/cuda_12.9.1_575.57.08_linux.run"),
           "128": ("12.8.1", "https://developer.download.nvidia.com/compute/cuda/12.8.1/local_installers/cuda_12.8.1_570.124.06_linux.run"),
           # (Build with nvcc 12.8 on linux even when building for 12.6 to avoid seg fault in Flash3 build)

From fc6b421ec5102be5eda4992a6fdc4f716332fbd9 Mon Sep 17 00:00:00 2001
From: Johnny <johnnynuca14@gmail.com>
Date: Thu, 30 Oct 2025 15:21:13 +0100
Subject: [PATCH 22/22] Update setup.py

Co-authored-by: dan_the_3rd <43445237+danthe3rd@users.noreply.github.com>
---
 setup.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/setup.py b/setup.py
index ee681c7d29..1cac34c942 100644
--- a/setup.py
+++ b/setup.py
@@ -283,8 +283,8 @@ def get_flash_attention3_nvcc_archs_flags(cuda_version: int):
         return []
     if cuda_version < 1203:
         return []
-if ((sys.platform == "win32" or platform.system() == "Windows")
-    and cuda_version >= 1300):
+    if ((sys.platform == "win32" or platform.system() == "Windows")
+        and cuda_version >= 1300):
         return []
     archs_list = os.environ.get("TORCH_CUDA_ARCH_LIST")
     if archs_list is None: