From c0c3c342fa58d8715378c4a0ff75a0a658cf701c Mon Sep 17 00:00:00 2001 From: johnnynunez Date: Thu, 9 Oct 2025 12:18:32 +0200 Subject: [PATCH 01/22] BUILD CUDA 13 --- .github/actions/setup-build-cuda/action.yml | 5 +++-- .github/actions/setup-env-build/action.yml | 2 +- .github/workflows/wheels.yml | 4 ++-- .github/workflows/wheels_build.yml | 8 ++++++-- setup.py | 6 ++++-- third_party/cutlass | 2 +- third_party/flash-attention | 2 +- xformers/csrc/sparse24/sparse24_gemm_sm90.cu | 8 ++++---- xformers/ops/fmha/dispatch.py | 2 +- xformers/ops/fmha/flash.py | 2 +- xformers/ops/fmha/flash3.py | 16 ++++++++++++++++ 11 files changed, 40 insertions(+), 17 deletions(-) diff --git a/.github/actions/setup-build-cuda/action.yml b/.github/actions/setup-build-cuda/action.yml index 67d08ac7b0..151c393595 100644 --- a/.github/actions/setup-build-cuda/action.yml +++ b/.github/actions/setup-build-cuda/action.yml @@ -24,9 +24,10 @@ runs: print(sys.version) cushort = "${{ inputs.toolkit_short_version }}" # Version uploaded to pypi (rather than PyTorch s3) - TORCH_CUDA_DEFAULT = "128" # since pytorch 2.8.0 + TORCH_CUDA_DEFAULT = "129" # since pytorch 2.8.0 # https://github.com/Jimver/cuda-toolkit/blob/master/src/links/linux-links.ts full_version, install_script = { + "130": ("13.0.1", "https://developer.download.nvidia.com/compute/cuda/13.0.1/local_installers/cuda_13.0.1_580.82.07_linux.run"), "129": ("12.9.0", "https://developer.download.nvidia.com/compute/cuda/12.9.1/local_installers/cuda_12.9.1_575.57.08_linux.run"), "128": ("12.8.1", "https://developer.download.nvidia.com/compute/cuda/12.8.1/local_installers/cuda_12.8.1_570.124.06_linux.run"), # (Build with nvcc 12.8 on linux even when building for 12.6 to avoid seg fault in Flash3 build) @@ -52,7 +53,7 @@ runs: - name: Install cuda if: runner.os == 'Windows' && inputs.toolkit_type == 'cuda' id: cuda-toolkit - uses: Jimver/cuda-toolkit@v0.2.24 + uses: Jimver/cuda-toolkit@v0.2.27 with: cuda: ${{ steps.cuda_info.outputs.CUDA_VERSION }} method: network diff --git a/.github/actions/setup-env-build/action.yml b/.github/actions/setup-env-build/action.yml index 495288a398..8c16200cb8 100644 --- a/.github/actions/setup-env-build/action.yml +++ b/.github/actions/setup-env-build/action.yml @@ -26,7 +26,7 @@ runs: CONDA_INSTALL_CMD = "micromamba create python=${{ inputs.python }} zlib pip ninja ccache=4.8 -c conda-forge -q -y" - conda_env_key = CONDA_INSTALL_CMD + "[cu129][v2]" + conda_env_key = CONDA_INSTALL_CMD + "[cu130][v2]" for file in sorted(glob.glob("requirement*.txt")): conda_env_key += f"\n########## {file}\n" conda_env_key += Path(file).read_text() diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index 0be59490eb..ecbf163394 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -33,7 +33,7 @@ jobs: PYTHON_VERSION = "3.9" # NOTE: Don't forget to update `upload_pt`'s matrix # when changing the CUDA/ROCM versions below! - CU_VERSIONS = ['126', '128', '129'] + CU_VERSIONS = ['126', '129', '130'] ROCM_VERSIONS = ['6.4'] include = [] @@ -100,8 +100,8 @@ jobs: matrix: suffix: - cu126 - - cu128 - cu129 + - cu130 - rocm6.4 uses: ./.github/workflows/wheels_upload_s3.yml with: diff --git a/.github/workflows/wheels_build.yml b/.github/workflows/wheels_build.yml index 654602a26b..87bfe2a355 100644 --- a/.github/workflows/wheels_build.yml +++ b/.github/workflows/wheels_build.yml @@ -53,9 +53,13 @@ jobs: run: shell: bash steps: - - if: contains(inputs.toolkit_type, 'cuda') && fromJSON(inputs.toolkit_short_version) >= 120 + - if: contains(inputs.toolkit_type, 'cuda') && fromJSON(inputs.toolkit_short_version) >= 120 && fromJSON(inputs.toolkit_short_version) < 130 run: | - echo "TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST 9.0a" >> ${GITHUB_ENV} + echo "TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST 8.0 9.0a" >> ${GITHUB_ENV} + + - if: contains(inputs.toolkit_type, 'cuda') && fromJSON(inputs.toolkit_short_version) >= 130 + run: | + echo "TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST 8.0 9.0a 10.0f 11.0f 12.0f" >> ${GITHUB_ENV} - if: runner.os == 'Windows' run: git config --system core.longpaths true diff --git a/setup.py b/setup.py index 967732f639..2317d57c6e 100644 --- a/setup.py +++ b/setup.py @@ -176,7 +176,9 @@ def get_flash_attention2_nvcc_archs_flags(cuda_version: int): return [] # Figure out default archs to target DEFAULT_ARCHS_LIST = "" - if cuda_version >= 1208: + if cuda_version >= 1300: + DEFAULT_ARCHS_LIST = "8.0;8.6;9.0;10.0f;11.0f;12.0f" + elif cuda_version >= 1208: DEFAULT_ARCHS_LIST = "8.0;8.6;9.0;10.0;12.0" elif cuda_version >= 1108: DEFAULT_ARCHS_LIST = "8.0;8.6;9.0" @@ -283,7 +285,7 @@ def get_flash_attention3_nvcc_archs_flags(cuda_version: int): return [] archs_list = os.environ.get("TORCH_CUDA_ARCH_LIST") if archs_list is None: - if torch.cuda.get_device_capability("cuda") != (9, 0): + if torch.cuda.get_device_capability("cuda") != (9, 0) and torch.cuda.get_device_capability("cuda") != (8, 0): return [] archs_list = "8.0 9.0a" nvcc_archs_flags = [] diff --git a/third_party/cutlass b/third_party/cutlass index e9627ce55b..c6aeb9179c 160000 --- a/third_party/cutlass +++ b/third_party/cutlass @@ -1 +1 @@ -Subproject commit e9627ce55b42fd2599f58cd4396da9380954def0 +Subproject commit c6aeb9179c5f74a0fcdbd28527bf4b6ba8c60752 diff --git a/third_party/flash-attention b/third_party/flash-attention index c485eeade0..5183de4335 160000 --- a/third_party/flash-attention +++ b/third_party/flash-attention @@ -1 +1 @@ -Subproject commit c485eeade0c3ec9ce186c3640c52c9f1ce090b81 +Subproject commit 5183de433587a8aedd2450e9f18166c24521af29 diff --git a/xformers/csrc/sparse24/sparse24_gemm_sm90.cu b/xformers/csrc/sparse24/sparse24_gemm_sm90.cu index cabdc7799c..303eca1991 100644 --- a/xformers/csrc/sparse24/sparse24_gemm_sm90.cu +++ b/xformers/csrc/sparse24/sparse24_gemm_sm90.cu @@ -96,10 +96,10 @@ struct SparseRowwiseKernel { float, ElementOut, cutlass::layout::RowMajor, - 1, + 8, ElementOut, cutlass::layout::RowMajor, - 1, + 8, cutlass::epilogue::TmaWarpSpecializedCooperative, EpilogueEVT>::CollectiveOp; @@ -176,10 +176,10 @@ struct SparseRowwiseKernel { float, ElementOut, cutlass::layout::RowMajor, - 1, + 8, ElementOut, cutlass::layout::RowMajor, - 1, + 8, cutlass::epilogue::TmaWarpSpecializedCooperative, EpilogueEVT>::CollectiveOp; diff --git a/xformers/ops/fmha/dispatch.py b/xformers/ops/fmha/dispatch.py index 5908635dac..be0f75ead4 100644 --- a/xformers/ops/fmha/dispatch.py +++ b/xformers/ops/fmha/dispatch.py @@ -31,7 +31,7 @@ def _get_use_fa3() -> bool: def fa3_available() -> bool: has_cuda = torch.version.cuda is not None - is_90a = has_cuda and torch.cuda.get_device_capability() >= (9, 0) + is_90a = has_cuda and (8, 0) <= torch.cuda.get_device_capability() <= (9, 0) has_valid_flash3 = flash3._C_flashattention3 is not None # pyre-ignore[16] return is_90a and has_valid_flash3 diff --git a/xformers/ops/fmha/flash.py b/xformers/ops/fmha/flash.py index 63e436698c..4956bcf25d 100644 --- a/xformers/ops/fmha/flash.py +++ b/xformers/ops/fmha/flash.py @@ -71,7 +71,7 @@ FLASH_VERSION = flash_attn.__version__ FLASH_VER_MIN = parse_version("2.7.1") - FLASH_VER_LAST = parse_version("2.8.3") # last supported, inclusive + FLASH_VER_LAST = parse_version("2.8.4") # last supported, inclusive flash_ver_parsed = parse_version(FLASH_VERSION) if ( flash_ver_parsed < FLASH_VER_MIN or flash_ver_parsed > FLASH_VER_LAST diff --git a/xformers/ops/fmha/flash3.py b/xformers/ops/fmha/flash3.py index b4e55f41ba..770960f1b3 100644 --- a/xformers/ops/fmha/flash3.py +++ b/xformers/ops/fmha/flash3.py @@ -647,6 +647,14 @@ class FwOp(AttentionFwOpBase): @classmethod def not_supported_reasons(cls, d: Inputs) -> List[str]: reasons = super(FwOp, cls).not_supported_reasons(d) + device_type = d.query.device.type + if device_type == "cuda" and (torch.version.hip is None): + device_capability = torch.cuda.get_device_capability(d.device) + if device_capability > cls.CUDA_MINIMUM_COMPUTE_CAPABILITY: + reasons.append( + f"requires device with capability == {cls.CUDA_MINIMUM_COMPUTE_CAPABILITY} " + f"but your GPU has capability {device_capability} (too new)" + ) check_lastdim_alignment_stride1(reasons, "query", d.query, 8) check_lastdim_alignment_stride1(reasons, "key", d.value, 8) check_lastdim_alignment_stride1(reasons, "value", d.value, 8) @@ -801,6 +809,14 @@ class BwOp(AttentionBwOpBase): @classmethod def not_supported_reasons(cls, d: Inputs) -> List[str]: reasons = super(BwOp, cls).not_supported_reasons(d) + device_type = d.query.device.type + if device_type == "cuda" and (torch.version.hip is None): + device_capability = torch.cuda.get_device_capability(d.device) + if device_capability > cls.CUDA_MINIMUM_COMPUTE_CAPABILITY: + reasons.append( + f"requires device with capability == {cls.CUDA_MINIMUM_COMPUTE_CAPABILITY} " + f"but your GPU has capability {device_capability} (too new)" + ) check_lastdim_alignment_stride1(reasons, "query", d.query, 8) check_lastdim_alignment_stride1(reasons, "key", d.value, 8) check_lastdim_alignment_stride1(reasons, "value", d.value, 8) From 3bd4670c6e2f7b088e887f36b6dd502e298957b4 Mon Sep 17 00:00:00 2001 From: Johnny Date: Thu, 9 Oct 2025 12:22:04 +0200 Subject: [PATCH 02/22] Update action.yml --- .github/actions/setup-build-cuda/action.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/actions/setup-build-cuda/action.yml b/.github/actions/setup-build-cuda/action.yml index 151c393595..f5d9a6f771 100644 --- a/.github/actions/setup-build-cuda/action.yml +++ b/.github/actions/setup-build-cuda/action.yml @@ -24,7 +24,7 @@ runs: print(sys.version) cushort = "${{ inputs.toolkit_short_version }}" # Version uploaded to pypi (rather than PyTorch s3) - TORCH_CUDA_DEFAULT = "129" # since pytorch 2.8.0 + TORCH_CUDA_DEFAULT = "130" # since pytorch 2.8.0 # https://github.com/Jimver/cuda-toolkit/blob/master/src/links/linux-links.ts full_version, install_script = { "130": ("13.0.1", "https://developer.download.nvidia.com/compute/cuda/13.0.1/local_installers/cuda_13.0.1_580.82.07_linux.run"), From f0e238402c41fc0d1790e405ec20b895a80c1d5f Mon Sep 17 00:00:00 2001 From: Johnny Date: Wed, 15 Oct 2025 06:08:43 +0200 Subject: [PATCH 03/22] Update action.yml --- .github/actions/setup-build-cuda/action.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/actions/setup-build-cuda/action.yml b/.github/actions/setup-build-cuda/action.yml index f5d9a6f771..a9bf2fd785 100644 --- a/.github/actions/setup-build-cuda/action.yml +++ b/.github/actions/setup-build-cuda/action.yml @@ -53,7 +53,7 @@ runs: - name: Install cuda if: runner.os == 'Windows' && inputs.toolkit_type == 'cuda' id: cuda-toolkit - uses: Jimver/cuda-toolkit@v0.2.27 + uses: Jimver/cuda-toolkit@v0.2.28 with: cuda: ${{ steps.cuda_info.outputs.CUDA_VERSION }} method: network From 37eed3afbbb418ca09e2dd98213b3bd40c2ff22d Mon Sep 17 00:00:00 2001 From: johnnynunez Date: Wed, 15 Oct 2025 15:14:37 +0200 Subject: [PATCH 04/22] Update flash-attention hash --- third_party/flash-attention | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/flash-attention b/third_party/flash-attention index 5183de4335..a76e692a6e 160000 --- a/third_party/flash-attention +++ b/third_party/flash-attention @@ -1 +1 @@ -Subproject commit 5183de433587a8aedd2450e9f18166c24521af29 +Subproject commit a76e692a6eb13121c27db6187629acacda6160bc From db622e4f0f333df50a249d4f062de84e0166e5d9 Mon Sep 17 00:00:00 2001 From: johnnynunez Date: Tue, 21 Oct 2025 12:18:10 -0700 Subject: [PATCH 05/22] fix error cccl --- third_party/cutlass | 2 +- third_party/flash-attention | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/third_party/cutlass b/third_party/cutlass index c6aeb9179c..b1d6e2c9b3 160000 --- a/third_party/cutlass +++ b/third_party/cutlass @@ -1 +1 @@ -Subproject commit c6aeb9179c5f74a0fcdbd28527bf4b6ba8c60752 +Subproject commit b1d6e2c9b334dfa811e4183dfbd02419249e4b52 diff --git a/third_party/flash-attention b/third_party/flash-attention index a76e692a6e..933b2c3ebb 160000 --- a/third_party/flash-attention +++ b/third_party/flash-attention @@ -1 +1 @@ -Subproject commit a76e692a6eb13121c27db6187629acacda6160bc +Subproject commit 933b2c3ebb8a3da378f5fefb4e398c8a9970ad81 From 920ade351460a830dcd89d4b2f9e8f4a62205b2c Mon Sep 17 00:00:00 2001 From: Johnny Date: Tue, 21 Oct 2025 16:47:55 -0700 Subject: [PATCH 06/22] Update requirements.txt --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 399c304661..5ab1043984 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ # Example requirement, can be anything that pip knows # install with `pip install -r requirements.txt`, and make sure that CI does the same -torch >= 2.8 +torch >= 2.9 numpy From c2407a6cddd60ba990b06f7563303f1f64fcf590 Mon Sep 17 00:00:00 2001 From: johnnynunez Date: Tue, 21 Oct 2025 16:53:26 -0700 Subject: [PATCH 07/22] fix error pytorch 2.9.0 in CI --- .github/actions/setup-build-cuda/action.yml | 2 +- .github/workflows/wheels.yml | 6 +++--- .github/workflows/win-build.yml | 2 +- third_party/flash-attention | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/actions/setup-build-cuda/action.yml b/.github/actions/setup-build-cuda/action.yml index a9bf2fd785..2b140009b1 100644 --- a/.github/actions/setup-build-cuda/action.yml +++ b/.github/actions/setup-build-cuda/action.yml @@ -24,7 +24,7 @@ runs: print(sys.version) cushort = "${{ inputs.toolkit_short_version }}" # Version uploaded to pypi (rather than PyTorch s3) - TORCH_CUDA_DEFAULT = "130" # since pytorch 2.8.0 + TORCH_CUDA_DEFAULT = "130" # since pytorch 2.9.0 # https://github.com/Jimver/cuda-toolkit/blob/master/src/links/linux-links.ts full_version, install_script = { "130": ("13.0.1", "https://developer.download.nvidia.com/compute/cuda/13.0.1/local_installers/cuda_13.0.1_580.82.07_linux.run"), diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index ecbf163394..d56c3640dd 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -38,7 +38,7 @@ jobs: include = [] for os in ['8-core-ubuntu', 'windows-8-core']: - for torch_version in ['2.8.0']: + for torch_version in ['2.9.0']: # CUDA builds for cuda_short_version in CU_VERSIONS: if cuda_short_version < "124" and "windows" in os: @@ -88,7 +88,7 @@ jobs: uses: ./.github/workflows/wheels_upload_pip.yml with: twine_username: __token__ - filter: "*torch2.8.0+cu128*" + filter: "*torch2.9.0+cu130*" execute: ${{ github.repository == 'facebookresearch/xformers' && github.event_name != 'pull_request' }} secrets: twine_password: ${{ secrets.PYPI_TOKEN }} @@ -108,5 +108,5 @@ jobs: aws_role: "arn:aws:iam::749337293305:role/pytorch_bot_uploader_role" s3_path: s3://pytorch/whl/${{ matrix.suffix }}/ aws_s3_cp_extra_args: --acl public-read - filter: "*torch2.8.0+${{ matrix.suffix }}*" + filter: "*torch2.9.0+${{ matrix.suffix }}*" execute: ${{ github.repository == 'facebookresearch/xformers' && github.ref_type == 'tag' }} diff --git a/.github/workflows/win-build.yml b/.github/workflows/win-build.yml index bac84742c6..109cb007e6 100644 --- a/.github/workflows/win-build.yml +++ b/.github/workflows/win-build.yml @@ -73,7 +73,7 @@ jobs: - name: Install build dependencies run: | - $PY -m pip install wheel setuptools ninja torch==2.8.0 -r requirements.txt --extra-index-url https://download.pytorch.org/whl/cu126 + $PY -m pip install wheel setuptools ninja torch==2.9.0 -r requirements.txt --extra-index-url https://download.pytorch.org/whl/cu126 git config --global --add safe.directory "*" $PY -c "import torch; print('torch', torch.__version__)" $PY -c "import torch; print('torch.cuda', torch.version.cuda)" diff --git a/third_party/flash-attention b/third_party/flash-attention index 933b2c3ebb..9dbed03d1a 160000 --- a/third_party/flash-attention +++ b/third_party/flash-attention @@ -1 +1 @@ -Subproject commit 933b2c3ebb8a3da378f5fefb4e398c8a9970ad81 +Subproject commit 9dbed03d1a7a5862998c182c83d8265fea9dc21b From 3e2e11e3ced1f1085a23f2c9db9edb191e17f7f9 Mon Sep 17 00:00:00 2001 From: Johnny Date: Tue, 28 Oct 2025 08:11:39 -0700 Subject: [PATCH 08/22] Update linters_reusable.yml --- .github/workflows/linters_reusable.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/linters_reusable.yml b/.github/workflows/linters_reusable.yml index 9100b63803..a6eee05af7 100644 --- a/.github/workflows/linters_reusable.yml +++ b/.github/workflows/linters_reusable.yml @@ -15,9 +15,9 @@ jobs: with: fetch-depth: 0 - name: Setup Python - uses: actions/setup-python@v2 + uses: actions/setup-python@v5 with: - python-version: '3.9' + python-version: '3.10' - name: Run pre-script if: ${{ inputs.pre-script }} run: ${{ inputs.pre-script }} From bfb2271dcbb250aee14894e94a17a54ee92df8ce Mon Sep 17 00:00:00 2001 From: Johnny Date: Tue, 28 Oct 2025 08:15:07 -0700 Subject: [PATCH 09/22] Update CUDA toolkit and Python versions in workflow --- .github/workflows/win-build.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/win-build.yml b/.github/workflows/win-build.yml index 109cb007e6..e44cc95bab 100644 --- a/.github/workflows/win-build.yml +++ b/.github/workflows/win-build.yml @@ -61,8 +61,8 @@ jobs: uses: ./.github/actions/setup-build-cuda with: toolkit_type: "cuda" - toolkit_short_version: "128" - python: "3.9" + toolkit_short_version: "130" + python: "3.10" - name: Remove internal code run: | From d434cb39bdd2462799d141ba30f1a94ea3032b3c Mon Sep 17 00:00:00 2001 From: Johnny Date: Tue, 28 Oct 2025 08:18:33 -0700 Subject: [PATCH 10/22] Update Python version from 3.9 to 3.10 --- .github/workflows/wheels.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index d56c3640dd..57363a5018 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -29,8 +29,8 @@ jobs: environ = os.environ # All builds are python-version agnostic, - # and built with python 3.9 - PYTHON_VERSION = "3.9" + # and built with python 3.10 + PYTHON_VERSION = "3.10" # NOTE: Don't forget to update `upload_pt`'s matrix # when changing the CUDA/ROCM versions below! CU_VERSIONS = ['126', '129', '130'] From ea4407122ac380e35262459005ec45fae719f0ff Mon Sep 17 00:00:00 2001 From: johnnynunez Date: Tue, 28 Oct 2025 08:55:30 -0700 Subject: [PATCH 11/22] upstream --- third_party/cutlass | 2 +- third_party/flash-attention | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/third_party/cutlass b/third_party/cutlass index b1d6e2c9b3..b2ca083d2b 160000 --- a/third_party/cutlass +++ b/third_party/cutlass @@ -1 +1 @@ -Subproject commit b1d6e2c9b334dfa811e4183dfbd02419249e4b52 +Subproject commit b2ca083d2bb96c41d9b3c5a930637c641f6669bf diff --git a/third_party/flash-attention b/third_party/flash-attention index 9dbed03d1a..b3f1b6a5bd 160000 --- a/third_party/flash-attention +++ b/third_party/flash-attention @@ -1 +1 @@ -Subproject commit 9dbed03d1a7a5862998c182c83d8265fea9dc21b +Subproject commit b3f1b6a5bdcce820e74cc0bb6f615165387195cc From dbe25a2f5bf8730de12c90e3f60d6301c1e38b54 Mon Sep 17 00:00:00 2001 From: johnnynunez Date: Tue, 28 Oct 2025 09:01:56 -0700 Subject: [PATCH 12/22] lint --- .github/workflows/linters.yml | 1 - .github/workflows/rocm_ci.yml | 16 ++++++------- .github/workflows/rocm_docker.yml | 4 ++-- .gitignore | 2 -- setup.py | 5 +++- tests/readme_test_on_rocm.txt | 4 +--- .../benchmarks/readme_benchmark_on_rocm.txt | 3 +-- .../csrc/attention/hip_decoder/CMakeLists.txt | 24 +++++++++---------- .../attention/hip_fmha/GENERATE_INSTANCES.md | 18 +++++++------- 9 files changed, 36 insertions(+), 41 deletions(-) diff --git a/.github/workflows/linters.yml b/.github/workflows/linters.yml index 44dd0cab33..9bc445c770 100644 --- a/.github/workflows/linters.yml +++ b/.github/workflows/linters.yml @@ -7,4 +7,3 @@ on: jobs: repo: uses: ./.github/workflows/linters_reusable.yml - diff --git a/.github/workflows/rocm_ci.yml b/.github/workflows/rocm_ci.yml index 1897eab1d1..f27f393ffc 100644 --- a/.github/workflows/rocm_ci.yml +++ b/.github/workflows/rocm_ci.yml @@ -1,6 +1,6 @@ name: rocm-ci -on: +on: pull_request: types: [labeled, synchronize, reopened] workflow_dispatch: {} @@ -43,23 +43,23 @@ jobs: export GIT_BRANCH=${GITHUB_BASE_REF:-${GITHUB_REF#refs/heads/}} echo GIT_BRANCH = $GIT_BRANCH - + export ROCM_PATH=/opt/rocm echo ROCM_PATH = $ROCM_PATH hipcc --version rocm-smi rocminfo | grep "gfx" - + - name: Setup build env run: | conda create -n xformers python=3.11 export PATH=/opt/conda/envs/xformers/bin:$PATH python -VV - + python -m pip install -U torch --index-url=https://download.pytorch.org/whl/rocm6.2 python -c "import torch; print(f'PyTorch version {torch.__version__}')" - + python -m pip install ninja scipy pytest pytest-html - name: Pre-build clean @@ -72,16 +72,16 @@ jobs: run: | export PATH=/opt/conda/envs/xformers/bin:$PATH export MAX_JOBS=20 - + python -m pip install -e ./_xformers --verbose python -m xformers.info - name: Run python tests run: | export PATH=/opt/conda/envs/xformers/bin:$PATH - + python -m pytest --html=test_mem_eff_attention.html --self-contained-html -rpfs ./_xformers/tests/test_mem_eff_attention.py - + - name: Archive logs if: '!cancelled()' uses: actions/upload-artifact@v4 diff --git a/.github/workflows/rocm_docker.yml b/.github/workflows/rocm_docker.yml index 31fc242a71..d774306c08 100644 --- a/.github/workflows/rocm_docker.yml +++ b/.github/workflows/rocm_docker.yml @@ -12,13 +12,13 @@ jobs: steps: - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 - + - name: Login to Docker Hub uses: docker/login-action@v3 with: username: ${{ vars.DOCKERHUB_USERNAME }} password: ${{ secrets.DOCKERHUB_TOKEN }} - + - name: Build and push uses: docker/build-push-action@v6 with: diff --git a/.gitignore b/.gitignore index 978b6be3e0..a32f07f167 100644 --- a/.gitignore +++ b/.gitignore @@ -71,5 +71,3 @@ xformers/csrc/attention/hip_fmha/instances/*_hip.h xformers/csrc/attention/hip_decoder/*.cu xformers/csrc/attention/hip_decoder/*.hip xformers/csrc/attention/hip_decoder/*_hip.h - - diff --git a/setup.py b/setup.py index 2317d57c6e..1bd352d25f 100644 --- a/setup.py +++ b/setup.py @@ -285,7 +285,10 @@ def get_flash_attention3_nvcc_archs_flags(cuda_version: int): return [] archs_list = os.environ.get("TORCH_CUDA_ARCH_LIST") if archs_list is None: - if torch.cuda.get_device_capability("cuda") != (9, 0) and torch.cuda.get_device_capability("cuda") != (8, 0): + if torch.cuda.get_device_capability("cuda") != ( + 9, + 0, + ) and torch.cuda.get_device_capability("cuda") != (8, 0): return [] archs_list = "8.0 9.0a" nvcc_archs_flags = [] diff --git a/tests/readme_test_on_rocm.txt b/tests/readme_test_on_rocm.txt index c21fd0d587..754fac7fe7 100644 --- a/tests/readme_test_on_rocm.txt +++ b/tests/readme_test_on_rocm.txt @@ -3,11 +3,9 @@ 2. verify testing for generic fmha inference on ROCM - #> pytest tests/test_mem_eff_attention.py::test_forward + #> pytest tests/test_mem_eff_attention.py::test_forward 3. verify testing for decoder fmha inference on ROCM #> pytest tests/test_mem_eff_attention.py::test_decoder #> pytest tests/test_mem_eff_attention.py::test_splitk_decoder - - diff --git a/xformers/benchmarks/readme_benchmark_on_rocm.txt b/xformers/benchmarks/readme_benchmark_on_rocm.txt index 9ae61f5294..cb64bb912d 100644 --- a/xformers/benchmarks/readme_benchmark_on_rocm.txt +++ b/xformers/benchmarks/readme_benchmark_on_rocm.txt @@ -8,10 +8,9 @@ 3. Benchmark for decoder fmha inference on ROCM - #> python xformers/benchmarks/benchmark_mem_eff_attn_decoder.py + #> python xformers/benchmarks/benchmark_mem_eff_attn_decoder.py 4. Other Benchmarks for fmha inference on ROCM #> python xformers/benchmarks/benchmark_attn_decoding.py #> python xformers/benchmarks/benchmark_mem_eff_attention_mqa.py - diff --git a/xformers/csrc/attention/hip_decoder/CMakeLists.txt b/xformers/csrc/attention/hip_decoder/CMakeLists.txt index 97e2ab0b22..75e075b09e 100644 --- a/xformers/csrc/attention/hip_decoder/CMakeLists.txt +++ b/xformers/csrc/attention/hip_decoder/CMakeLists.txt @@ -36,14 +36,14 @@ set_target_properties(${exe_name} ${splitk_exe_name} PROPERTIES LINKER_LANGUAGE set_target_properties(${exe_name} ${splitk_exe_name} PROPERTIES POSITION_INDEPENDENT_CODE ON) set_target_properties(${exe_name} ${splitk_exe_name} PROPERTIES HIP_ARCHITECTURES ${GPU_TARGETS}) -target_compile_options(${exe_name} PUBLIC +target_compile_options(${exe_name} PUBLIC -fno-gpu-rdc $<$: --save-temps > ) -target_compile_options(${splitk_exe_name} PUBLIC +target_compile_options(${splitk_exe_name} PUBLIC -fno-gpu-rdc $<$: --save-temps @@ -52,13 +52,13 @@ target_compile_options(${splitk_exe_name} PUBLIC > ) -target_include_directories(${exe_name} PUBLIC +target_include_directories(${exe_name} PUBLIC ${ck_include} # ck includes ${torch_include} # aten includes ${torch_include}/torch/csrc/api/include # torch includes ) -target_include_directories(${splitk_exe_name} PUBLIC +target_include_directories(${splitk_exe_name} PUBLIC ${ck_include} # ck includes ${torch_include} # aten includes ${torch_include}/torch/csrc/api/include # torch includes @@ -93,14 +93,14 @@ target_link_libraries(${splitk_exe_name} PUBLIC amdhip64 ) -target_compile_definitions(${exe_name} PUBLIC +target_compile_definitions(${exe_name} PUBLIC ATTN_FWD_DECODER_MAIN=1 GLIBCXX_USE_CXX11_ABI=1 __HIP_PLATFORM_HCC__=1 USE_ROCM=1 ) -target_compile_definitions(${splitk_exe_name} PUBLIC +target_compile_definitions(${splitk_exe_name} PUBLIC ATTN_FWD_SPLITK_DECODER_MAIN=1 GLIBCXX_USE_CXX11_ABI=1 __HIP_PLATFORM_HCC__=1 @@ -108,13 +108,13 @@ target_compile_definitions(${splitk_exe_name} PUBLIC ) include(CMakePrintHelpers) -cmake_print_properties(TARGETS ${exe_name} ${splitk_exe_name} PROPERTIES - LINK_LIBRARIES - LINK_DIRECTORIES - INCLUDE_DIRECTORIES - COMPILE_DEFINITIONS +cmake_print_properties(TARGETS ${exe_name} ${splitk_exe_name} PROPERTIES + LINK_LIBRARIES + LINK_DIRECTORIES + INCLUDE_DIRECTORIES + COMPILE_DEFINITIONS COMPILE_OPTIONS SOURCES HIP_ARCHITECTURES) -rocm_install(TARGETS ${exe_name} ${splitk_exe_name}) \ No newline at end of file +rocm_install(TARGETS ${exe_name} ${splitk_exe_name}) diff --git a/xformers/csrc/attention/hip_fmha/GENERATE_INSTANCES.md b/xformers/csrc/attention/hip_fmha/GENERATE_INSTANCES.md index 829df66469..72cfc4f641 100644 --- a/xformers/csrc/attention/hip_fmha/GENERATE_INSTANCES.md +++ b/xformers/csrc/attention/hip_fmha/GENERATE_INSTANCES.md @@ -1,16 +1,16 @@ # Instances generator - The instances generator is a simple python tool used to generate several hundred of instances (.cpp files) and their references (.h files). - Without this tool, manually writing those instances and references will be very laborious and easy to get wrong. - - The instances generated by this scripts are divided into three categories visible from the scripts: + The instances generator is a simple python tool used to generate several hundred of instances (.cpp files) and their references (.h files). + Without this tool, manually writing those instances and references will be very laborious and easy to get wrong. + + The instances generated by this scripts are divided into three categories visible from the scripts: * Infer -- which refers to instances for calling inference-only kernels * Forward -- which refers to instances for calling training forward kernels * Backward -- which refers to instances for calling training backward kernels - - The instance generator is for being used by the HIP fmha developers themselves. It is not supposed to be used by the xformers users for - building xformers, since for xformers users, the instances are already well prepared as part of the xformers codes. + + The instance generator is for being used by the HIP fmha developers themselves. It is not supposed to be used by the xformers users for + building xformers, since for xformers users, the instances are already well prepared as part of the xformers codes. ## how to use instance generator @@ -21,7 +21,7 @@ ``` * To generate reduced instances (when headdim256 is not required) - ``` + ``` #> python xformers/csrc/attention/hip_fmha/generate_instances.py --ignore-hd256 ``` * More options except for `--ignore-hd256` could be added to suppport further customization in generating instances as required @@ -29,5 +29,3 @@ ## where the instances files are located The instances files and references files are always located under a folder `instances/` that is located under the same directory as the file `generate_instances.py` itself - - From b819a230a984a6684bf73bfacdeccc8d99386c15 Mon Sep 17 00:00:00 2001 From: Johnny Date: Tue, 28 Oct 2025 10:30:26 -0700 Subject: [PATCH 13/22] Add use-github-cache option to CUDA setup action --- .github/actions/setup-build-cuda/action.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/actions/setup-build-cuda/action.yml b/.github/actions/setup-build-cuda/action.yml index 2b140009b1..61165689f3 100644 --- a/.github/actions/setup-build-cuda/action.yml +++ b/.github/actions/setup-build-cuda/action.yml @@ -57,6 +57,7 @@ runs: with: cuda: ${{ steps.cuda_info.outputs.CUDA_VERSION }} method: network + use-github-cache: false - if: runner.os == 'Windows' && inputs.toolkit_type == 'cuda' shell: bash run: | From a62a9a9543760a424acc5389f4f490806b479bbb Mon Sep 17 00:00:00 2001 From: Johnny Date: Tue, 28 Oct 2025 18:53:27 -0700 Subject: [PATCH 14/22] Update cuda-toolkit action to use N-Storm fork Switched from Jimver to N-Storm fork of cuda-toolkit. --- .github/actions/setup-build-cuda/action.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/actions/setup-build-cuda/action.yml b/.github/actions/setup-build-cuda/action.yml index 61165689f3..b4766a3b13 100644 --- a/.github/actions/setup-build-cuda/action.yml +++ b/.github/actions/setup-build-cuda/action.yml @@ -53,11 +53,11 @@ runs: - name: Install cuda if: runner.os == 'Windows' && inputs.toolkit_type == 'cuda' id: cuda-toolkit - uses: Jimver/cuda-toolkit@v0.2.28 + # Using N-Storm fork until https://github.com/Jimver/cuda-toolkit/issues/395 is resolved + uses: N-Storm/cuda-toolkit@v0.2.28 with: cuda: ${{ steps.cuda_info.outputs.CUDA_VERSION }} method: network - use-github-cache: false - if: runner.os == 'Windows' && inputs.toolkit_type == 'cuda' shell: bash run: | From f732af6d99c43c726d007d6f0a4cf9c3de049398 Mon Sep 17 00:00:00 2001 From: Johnny Date: Tue, 28 Oct 2025 19:01:29 -0700 Subject: [PATCH 15/22] Fix CUDA architecture list format in setup.py --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 1bd352d25f..1527570426 100644 --- a/setup.py +++ b/setup.py @@ -177,7 +177,7 @@ def get_flash_attention2_nvcc_archs_flags(cuda_version: int): # Figure out default archs to target DEFAULT_ARCHS_LIST = "" if cuda_version >= 1300: - DEFAULT_ARCHS_LIST = "8.0;8.6;9.0;10.0f;11.0f;12.0f" + DEFAULT_ARCHS_LIST = "8.0;8.6;9.0;10.0;11.0;12.0" elif cuda_version >= 1208: DEFAULT_ARCHS_LIST = "8.0;8.6;9.0;10.0;12.0" elif cuda_version >= 1108: From 40872c08bbbfdaa68f009e5987999ff7d02eb349 Mon Sep 17 00:00:00 2001 From: Johnny Date: Wed, 29 Oct 2025 16:36:02 +0100 Subject: [PATCH 16/22] Update TORCH_CUDA_ARCH_LIST for toolkit versioning --- .github/workflows/wheels_build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/wheels_build.yml b/.github/workflows/wheels_build.yml index 87bfe2a355..f2b0440f62 100644 --- a/.github/workflows/wheels_build.yml +++ b/.github/workflows/wheels_build.yml @@ -59,7 +59,7 @@ jobs: - if: contains(inputs.toolkit_type, 'cuda') && fromJSON(inputs.toolkit_short_version) >= 130 run: | - echo "TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST 8.0 9.0a 10.0f 11.0f 12.0f" >> ${GITHUB_ENV} + echo "TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST 8.0 9.0a 10.0a 10.3a 11.0a 12.0a 12.1a" >> ${GITHUB_ENV} - if: runner.os == 'Windows' run: git config --system core.longpaths true From 6be38b95b1b6fe626301fd22446faa87a0a3a110 Mon Sep 17 00:00:00 2001 From: johnnynunez Date: Thu, 30 Oct 2025 13:27:15 +0100 Subject: [PATCH 17/22] try fix windows --- .github/workflows/win-build.yml | 2 +- setup.py | 17 ++++++++++++++--- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/.github/workflows/win-build.yml b/.github/workflows/win-build.yml index e44cc95bab..dd12adb98b 100644 --- a/.github/workflows/win-build.yml +++ b/.github/workflows/win-build.yml @@ -73,7 +73,7 @@ jobs: - name: Install build dependencies run: | - $PY -m pip install wheel setuptools ninja torch==2.9.0 -r requirements.txt --extra-index-url https://download.pytorch.org/whl/cu126 + $PY -m pip install wheel setuptools ninja torch==2.9.0 -r requirements.txt --extra-index-url https://download.pytorch.org/whl/cu130 git config --global --add safe.directory "*" $PY -c "import torch; print('torch', torch.__version__)" $PY -c "import torch; print('torch.cuda', torch.version.cuda)" diff --git a/setup.py b/setup.py index 1bd352d25f..6b7c952f84 100644 --- a/setup.py +++ b/setup.py @@ -299,9 +299,20 @@ def get_flash_attention3_nvcc_archs_flags(cuda_version: int): if num not in [80, 90]: # only support Sm80/Sm90 continue suffix = match.group("suffix") - nvcc_archs_flags.append( - f"-gencode=arch=compute_{num}{suffix},code=sm_{num}{suffix}" - ) + # On Windows, avoid assembling SASS for sm_90a due to nvcc/ptxas instability. + # Emit PTX for 90a and let the driver JIT at runtime. + if ( + (sys.platform == "win32" or platform.system() == "Windows") + and num == 90 + and suffix == "a" + ): + nvcc_archs_flags.append( + f"-gencode=arch=compute_{num}{suffix},code=compute_{num}{suffix}" + ) + else: + nvcc_archs_flags.append( + f"-gencode=arch=compute_{num}{suffix},code=sm_{num}{suffix}" + ) if match.group("ptx") is not None: nvcc_archs_flags.append( f"-gencode=arch=compute_{num}{suffix},code=compute_{num}{suffix}" From 6f6e99e75ce67785f38be75b402f6de25b585294 Mon Sep 17 00:00:00 2001 From: johnnynunez Date: Thu, 30 Oct 2025 14:39:58 +0100 Subject: [PATCH 18/22] avoid compile fa3 windows with cu130 --- setup.py | 19 +++++-------------- third_party/cutlass | 2 +- third_party/flash-attention | 2 +- 3 files changed, 7 insertions(+), 16 deletions(-) diff --git a/setup.py b/setup.py index 11ddd2b92e..df4a449533 100644 --- a/setup.py +++ b/setup.py @@ -283,6 +283,8 @@ def get_flash_attention3_nvcc_archs_flags(cuda_version: int): return [] if cuda_version < 1203: return [] + if cuda_version >= 1300: + return [] archs_list = os.environ.get("TORCH_CUDA_ARCH_LIST") if archs_list is None: if torch.cuda.get_device_capability("cuda") != ( @@ -299,20 +301,9 @@ def get_flash_attention3_nvcc_archs_flags(cuda_version: int): if num not in [80, 90]: # only support Sm80/Sm90 continue suffix = match.group("suffix") - # On Windows, avoid assembling SASS for sm_90a due to nvcc/ptxas instability. - # Emit PTX for 90a and let the driver JIT at runtime. - if ( - (sys.platform == "win32" or platform.system() == "Windows") - and num == 90 - and suffix == "a" - ): - nvcc_archs_flags.append( - f"-gencode=arch=compute_{num}{suffix},code=compute_{num}{suffix}" - ) - else: - nvcc_archs_flags.append( - f"-gencode=arch=compute_{num}{suffix},code=sm_{num}{suffix}" - ) + nvcc_archs_flags.append( + f"-gencode=arch=compute_{num}{suffix},code=sm_{num}{suffix}" + ) if match.group("ptx") is not None: nvcc_archs_flags.append( f"-gencode=arch=compute_{num}{suffix},code=compute_{num}{suffix}" diff --git a/third_party/cutlass b/third_party/cutlass index b2ca083d2b..8afb19d904 160000 --- a/third_party/cutlass +++ b/third_party/cutlass @@ -1 +1 @@ -Subproject commit b2ca083d2bb96c41d9b3c5a930637c641f6669bf +Subproject commit 8afb19d9047afc26816a046059afe66763e68aa5 diff --git a/third_party/flash-attention b/third_party/flash-attention index b3f1b6a5bd..de1584b532 160000 --- a/third_party/flash-attention +++ b/third_party/flash-attention @@ -1 +1 @@ -Subproject commit b3f1b6a5bdcce820e74cc0bb6f615165387195cc +Subproject commit de1584b5328321189a4d7832fe29bbd6813bf6ed From d5acb1527b951194676cb1d488020ba5692f01ae Mon Sep 17 00:00:00 2001 From: Johnny Date: Thu, 30 Oct 2025 14:42:15 +0100 Subject: [PATCH 19/22] Update CUDA version from 13.0.1 to 13.0.2 --- .github/actions/setup-build-cuda/action.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/actions/setup-build-cuda/action.yml b/.github/actions/setup-build-cuda/action.yml index b4766a3b13..862bea7db9 100644 --- a/.github/actions/setup-build-cuda/action.yml +++ b/.github/actions/setup-build-cuda/action.yml @@ -27,7 +27,7 @@ runs: TORCH_CUDA_DEFAULT = "130" # since pytorch 2.9.0 # https://github.com/Jimver/cuda-toolkit/blob/master/src/links/linux-links.ts full_version, install_script = { - "130": ("13.0.1", "https://developer.download.nvidia.com/compute/cuda/13.0.1/local_installers/cuda_13.0.1_580.82.07_linux.run"), + "130": ("13.0.2", "https://developer.download.nvidia.com/compute/cuda/13.0.2/local_installers/cuda_13.0.2_580.95.05_linux.run"), "129": ("12.9.0", "https://developer.download.nvidia.com/compute/cuda/12.9.1/local_installers/cuda_12.9.1_575.57.08_linux.run"), "128": ("12.8.1", "https://developer.download.nvidia.com/compute/cuda/12.8.1/local_installers/cuda_12.8.1_570.124.06_linux.run"), # (Build with nvcc 12.8 on linux even when building for 12.6 to avoid seg fault in Flash3 build) From 3b8fcb262bd127e3544c8a9d9f2fedd111fa19b8 Mon Sep 17 00:00:00 2001 From: Johnny Date: Thu, 30 Oct 2025 15:01:15 +0100 Subject: [PATCH 20/22] Modify CUDA version check for Windows platform --- setup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index df4a449533..ee681c7d29 100644 --- a/setup.py +++ b/setup.py @@ -283,7 +283,8 @@ def get_flash_attention3_nvcc_archs_flags(cuda_version: int): return [] if cuda_version < 1203: return [] - if cuda_version >= 1300: +if ((sys.platform == "win32" or platform.system() == "Windows") + and cuda_version >= 1300): return [] archs_list = os.environ.get("TORCH_CUDA_ARCH_LIST") if archs_list is None: From 7281508d1612a93bb29d3e5da49e6fc5464a7290 Mon Sep 17 00:00:00 2001 From: Johnny Date: Thu, 30 Oct 2025 15:13:49 +0100 Subject: [PATCH 21/22] Update CUDA version from 13.0.2 to 13.0.1 Due action runner in windows is not updates --- .github/actions/setup-build-cuda/action.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/actions/setup-build-cuda/action.yml b/.github/actions/setup-build-cuda/action.yml index 862bea7db9..b4766a3b13 100644 --- a/.github/actions/setup-build-cuda/action.yml +++ b/.github/actions/setup-build-cuda/action.yml @@ -27,7 +27,7 @@ runs: TORCH_CUDA_DEFAULT = "130" # since pytorch 2.9.0 # https://github.com/Jimver/cuda-toolkit/blob/master/src/links/linux-links.ts full_version, install_script = { - "130": ("13.0.2", "https://developer.download.nvidia.com/compute/cuda/13.0.2/local_installers/cuda_13.0.2_580.95.05_linux.run"), + "130": ("13.0.1", "https://developer.download.nvidia.com/compute/cuda/13.0.1/local_installers/cuda_13.0.1_580.82.07_linux.run"), "129": ("12.9.0", "https://developer.download.nvidia.com/compute/cuda/12.9.1/local_installers/cuda_12.9.1_575.57.08_linux.run"), "128": ("12.8.1", "https://developer.download.nvidia.com/compute/cuda/12.8.1/local_installers/cuda_12.8.1_570.124.06_linux.run"), # (Build with nvcc 12.8 on linux even when building for 12.6 to avoid seg fault in Flash3 build) From fc6b421ec5102be5eda4992a6fdc4f716332fbd9 Mon Sep 17 00:00:00 2001 From: Johnny Date: Thu, 30 Oct 2025 15:21:13 +0100 Subject: [PATCH 22/22] Update setup.py Co-authored-by: dan_the_3rd <43445237+danthe3rd@users.noreply.github.com> --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index ee681c7d29..1cac34c942 100644 --- a/setup.py +++ b/setup.py @@ -283,8 +283,8 @@ def get_flash_attention3_nvcc_archs_flags(cuda_version: int): return [] if cuda_version < 1203: return [] -if ((sys.platform == "win32" or platform.system() == "Windows") - and cuda_version >= 1300): + if ((sys.platform == "win32" or platform.system() == "Windows") + and cuda_version >= 1300): return [] archs_list = os.environ.get("TORCH_CUDA_ARCH_LIST") if archs_list is None: