diff --git a/.github/actions/setup-build-cuda/action.yml b/.github/actions/setup-build-cuda/action.yml
index 67d08ac7b0..b4766a3b13 100644
--- a/.github/actions/setup-build-cuda/action.yml
+++ b/.github/actions/setup-build-cuda/action.yml
@@ -24,9 +24,10 @@ runs:
         print(sys.version)
         cushort = "${{ inputs.toolkit_short_version }}"
         # Version uploaded to pypi (rather than PyTorch s3)
-        TORCH_CUDA_DEFAULT = "128"  # since pytorch 2.8.0
+        TORCH_CUDA_DEFAULT = "130"  # since pytorch 2.9.0
         # https://github.com/Jimver/cuda-toolkit/blob/master/src/links/linux-links.ts
         full_version, install_script = {
+            "130": ("13.0.1", "https://developer.download.nvidia.com/compute/cuda/13.0.1/local_installers/cuda_13.0.1_580.82.07_linux.run"),
             "129": ("12.9.0", "https://developer.download.nvidia.com/compute/cuda/12.9.1/local_installers/cuda_12.9.1_575.57.08_linux.run"),
             "128": ("12.8.1", "https://developer.download.nvidia.com/compute/cuda/12.8.1/local_installers/cuda_12.8.1_570.124.06_linux.run"),
             # (Build with nvcc 12.8 on linux even when building for 12.6 to avoid seg fault in Flash3 build)
@@ -52,7 +53,8 @@ runs:
     - name: Install cuda
       if: runner.os == 'Windows' && inputs.toolkit_type == 'cuda'
       id: cuda-toolkit
-      uses: Jimver/cuda-toolkit@v0.2.24
+      # Using N-Storm fork until https://github.com/Jimver/cuda-toolkit/issues/395 is resolved
+      uses: N-Storm/cuda-toolkit@v0.2.28
       with:
         cuda: ${{ steps.cuda_info.outputs.CUDA_VERSION }}
         method: network
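Note: the untouched `129` entry pairs full version `12.9.0` with the 12.9.1 installer URL; this diff leaves that mismatch as-is. For reference, a minimal Python sketch of the mapping the new `130` entry extends. How the action indexes the dict after the lines shown is an assumption (presumably a plain `[cushort]` lookup), and the URLs are abbreviated:

```python
# Sketch only: maps the CUDA "short version" to (full_version, installer_url).
TOOLKITS = {
    "130": ("13.0.1", ".../cuda_13.0.1_580.82.07_linux.run"),
    "129": ("12.9.0", ".../cuda_12.9.1_575.57.08_linux.run"),
    "128": ("12.8.1", ".../cuda_12.8.1_570.124.06_linux.run"),
}

cushort = "130"  # inputs.toolkit_short_version; "130" is the new default
full_version, install_script = TOOLKITS[cushort]
assert full_version == "13.0.1"
```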
"import torch; print(f'PyTorch version {torch.__version__}')" - + python -m pip install ninja scipy pytest pytest-html - name: Pre-build clean @@ -72,16 +72,16 @@ jobs: run: | export PATH=/opt/conda/envs/xformers/bin:$PATH export MAX_JOBS=20 - + python -m pip install -e ./_xformers --verbose python -m xformers.info - name: Run python tests run: | export PATH=/opt/conda/envs/xformers/bin:$PATH - + python -m pytest --html=test_mem_eff_attention.html --self-contained-html -rpfs ./_xformers/tests/test_mem_eff_attention.py - + - name: Archive logs if: '!cancelled()' uses: actions/upload-artifact@v4 diff --git a/.github/workflows/rocm_docker.yml b/.github/workflows/rocm_docker.yml index 31fc242a71..d774306c08 100644 --- a/.github/workflows/rocm_docker.yml +++ b/.github/workflows/rocm_docker.yml @@ -12,13 +12,13 @@ jobs: steps: - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 - + - name: Login to Docker Hub uses: docker/login-action@v3 with: username: ${{ vars.DOCKERHUB_USERNAME }} password: ${{ secrets.DOCKERHUB_TOKEN }} - + - name: Build and push uses: docker/build-push-action@v6 with: diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index 0be59490eb..57363a5018 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -29,16 +29,16 @@ jobs: environ = os.environ # All builds are python-version agnostic, - # and built with python 3.9 - PYTHON_VERSION = "3.9" + # and built with python 3.10 + PYTHON_VERSION = "3.10" # NOTE: Don't forget to update `upload_pt`'s matrix # when changing the CUDA/ROCM versions below! - CU_VERSIONS = ['126', '128', '129'] + CU_VERSIONS = ['126', '129', '130'] ROCM_VERSIONS = ['6.4'] include = [] for os in ['8-core-ubuntu', 'windows-8-core']: - for torch_version in ['2.8.0']: + for torch_version in ['2.9.0']: # CUDA builds for cuda_short_version in CU_VERSIONS: if cuda_short_version < "124" and "windows" in os: @@ -88,7 +88,7 @@ jobs: uses: ./.github/workflows/wheels_upload_pip.yml with: twine_username: __token__ - filter: "*torch2.8.0+cu128*" + filter: "*torch2.9.0+cu130*" execute: ${{ github.repository == 'facebookresearch/xformers' && github.event_name != 'pull_request' }} secrets: twine_password: ${{ secrets.PYPI_TOKEN }} @@ -100,13 +100,13 @@ jobs: matrix: suffix: - cu126 - - cu128 - cu129 + - cu130 - rocm6.4 uses: ./.github/workflows/wheels_upload_s3.yml with: aws_role: "arn:aws:iam::749337293305:role/pytorch_bot_uploader_role" s3_path: s3://pytorch/whl/${{ matrix.suffix }}/ aws_s3_cp_extra_args: --acl public-read - filter: "*torch2.8.0+${{ matrix.suffix }}*" + filter: "*torch2.9.0+${{ matrix.suffix }}*" execute: ${{ github.repository == 'facebookresearch/xformers' && github.ref_type == 'tag' }} diff --git a/.github/workflows/wheels_build.yml b/.github/workflows/wheels_build.yml index 654602a26b..f2b0440f62 100644 --- a/.github/workflows/wheels_build.yml +++ b/.github/workflows/wheels_build.yml @@ -53,9 +53,13 @@ jobs: run: shell: bash steps: - - if: contains(inputs.toolkit_type, 'cuda') && fromJSON(inputs.toolkit_short_version) >= 120 + - if: contains(inputs.toolkit_type, 'cuda') && fromJSON(inputs.toolkit_short_version) >= 120 && fromJSON(inputs.toolkit_short_version) < 130 run: | - echo "TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST 9.0a" >> ${GITHUB_ENV} + echo "TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST 8.0 9.0a" >> ${GITHUB_ENV} + + - if: contains(inputs.toolkit_type, 'cuda') && fromJSON(inputs.toolkit_short_version) >= 130 + run: | + echo "TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST 8.0 9.0a 10.0a 
diff --git a/.github/workflows/wheels_build.yml b/.github/workflows/wheels_build.yml
index 654602a26b..f2b0440f62 100644
--- a/.github/workflows/wheels_build.yml
+++ b/.github/workflows/wheels_build.yml
@@ -53,9 +53,13 @@ jobs:
       run:
         shell: bash
     steps:
-      - if: contains(inputs.toolkit_type, 'cuda') && fromJSON(inputs.toolkit_short_version) >= 120
+      - if: contains(inputs.toolkit_type, 'cuda') && fromJSON(inputs.toolkit_short_version) >= 120 && fromJSON(inputs.toolkit_short_version) < 130
         run: |
-          echo "TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST 9.0a" >> ${GITHUB_ENV}
+          echo "TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST 8.0 9.0a" >> ${GITHUB_ENV}
+
+      - if: contains(inputs.toolkit_type, 'cuda') && fromJSON(inputs.toolkit_short_version) >= 130
+        run: |
+          echo "TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST 8.0 9.0a 10.0a 10.3a 11.0a 12.0a 12.1a" >> ${GITHUB_ENV}
 
       - if: runner.os == 'Windows'
         run: git config --system core.longpaths true
diff --git a/.github/workflows/win-build.yml b/.github/workflows/win-build.yml
index bac84742c6..dd12adb98b 100644
--- a/.github/workflows/win-build.yml
+++ b/.github/workflows/win-build.yml
@@ -61,8 +61,8 @@ jobs:
       uses: ./.github/actions/setup-build-cuda
       with:
         toolkit_type: "cuda"
-        toolkit_short_version: "128"
-        python: "3.9"
+        toolkit_short_version: "130"
+        python: "3.10"
 
     - name: Remove internal code
       run: |
@@ -73,7 +73,7 @@ jobs:
 
     - name: Install build dependencies
       run: |
-        $PY -m pip install wheel setuptools ninja torch==2.8.0 -r requirements.txt --extra-index-url https://download.pytorch.org/whl/cu126
+        $PY -m pip install wheel setuptools ninja torch==2.9.0 -r requirements.txt --extra-index-url https://download.pytorch.org/whl/cu130
         git config --global --add safe.directory "*"
         $PY -c "import torch; print('torch', torch.__version__)"
         $PY -c "import torch; print('torch.cuda', torch.version.cuda)"
diff --git a/.gitignore b/.gitignore
index 978b6be3e0..a32f07f167 100644
--- a/.gitignore
+++ b/.gitignore
@@ -71,5 +71,3 @@ xformers/csrc/attention/hip_fmha/instances/*_hip.h
 xformers/csrc/attention/hip_decoder/*.cu
 xformers/csrc/attention/hip_decoder/*.hip
 xformers/csrc/attention/hip_decoder/*_hip.h
-
-
diff --git a/requirements.txt b/requirements.txt
index 399c304661..5ab1043984 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
 # Example requirement, can be anything that pip knows
 # install with `pip install -r requirements.txt`, and make sure that CI does the same
-torch >= 2.8
+torch >= 2.9
 numpy
diff --git a/setup.py b/setup.py
index 967732f639..1cac34c942 100644
--- a/setup.py
+++ b/setup.py
@@ -176,7 +176,9 @@ def get_flash_attention2_nvcc_archs_flags(cuda_version: int):
         return []
     # Figure out default archs to target
     DEFAULT_ARCHS_LIST = ""
-    if cuda_version >= 1208:
+    if cuda_version >= 1300:
+        DEFAULT_ARCHS_LIST = "8.0;8.6;9.0;10.0;11.0;12.0"
+    elif cuda_version >= 1208:
         DEFAULT_ARCHS_LIST = "8.0;8.6;9.0;10.0;12.0"
     elif cuda_version >= 1108:
         DEFAULT_ARCHS_LIST = "8.0;8.6;9.0"
@@ -281,9 +283,15 @@ def get_flash_attention3_nvcc_archs_flags(cuda_version: int):
         return []
     if cuda_version < 1203:
         return []
+    if ((sys.platform == "win32" or platform.system() == "Windows")
+            and cuda_version >= 1300):
+        return []
     archs_list = os.environ.get("TORCH_CUDA_ARCH_LIST")
     if archs_list is None:
-        if torch.cuda.get_device_capability("cuda") != (9, 0):
+        if torch.cuda.get_device_capability("cuda") != (
+            9,
+            0,
+        ) and torch.cuda.get_device_capability("cuda") != (8, 0):
             return []
         archs_list = "8.0 9.0a"
     nvcc_archs_flags = []
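In `get_flash_attention2_nvcc_archs_flags`, `cuda_version` packs major and minor as `major * 100 + minor` (CUDA 12.8 is 1208, CUDA 13.0 is 1300), so the new branch selects the CUDA 13 arch list, which adds 11.0, before falling through to the older defaults. A condensed, runnable paraphrase of the cascade:

```python
def default_fa2_archs(cuda_version: int) -> str:
    # cuda_version = major * 100 + minor, e.g. CUDA 12.8 -> 1208.
    if cuda_version >= 1300:
        return "8.0;8.6;9.0;10.0;11.0;12.0"
    elif cuda_version >= 1208:
        return "8.0;8.6;9.0;10.0;12.0"
    elif cuda_version >= 1108:
        return "8.0;8.6;9.0"
    return ""  # older toolkits: no default archs

assert default_fa2_archs(1300) == "8.0;8.6;9.0;10.0;11.0;12.0"
assert default_fa2_archs(1209) == "8.0;8.6;9.0;10.0;12.0"
```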
diff --git a/tests/readme_test_on_rocm.txt b/tests/readme_test_on_rocm.txt
index c21fd0d587..754fac7fe7 100644
--- a/tests/readme_test_on_rocm.txt
+++ b/tests/readme_test_on_rocm.txt
@@ -3,11 +3,9 @@
 
 2. verify testing for generic fmha inference on ROCM
 
-   #> pytest tests/test_mem_eff_attention.py::test_forward 
+   #> pytest tests/test_mem_eff_attention.py::test_forward
 
 3. verify testing for decoder fmha inference on ROCM
 
    #> pytest tests/test_mem_eff_attention.py::test_decoder
    #> pytest tests/test_mem_eff_attention.py::test_splitk_decoder
-
-
diff --git a/third_party/cutlass b/third_party/cutlass
index e9627ce55b..8afb19d904 160000
--- a/third_party/cutlass
+++ b/third_party/cutlass
@@ -1 +1 @@
-Subproject commit e9627ce55b42fd2599f58cd4396da9380954def0
+Subproject commit 8afb19d9047afc26816a046059afe66763e68aa5
diff --git a/third_party/flash-attention b/third_party/flash-attention
index c485eeade0..de1584b532 160000
--- a/third_party/flash-attention
+++ b/third_party/flash-attention
@@ -1 +1 @@
-Subproject commit c485eeade0c3ec9ce186c3640c52c9f1ce090b81
+Subproject commit de1584b5328321189a4d7832fe29bbd6813bf6ed
diff --git a/xformers/benchmarks/readme_benchmark_on_rocm.txt b/xformers/benchmarks/readme_benchmark_on_rocm.txt
index 9ae61f5294..cb64bb912d 100644
--- a/xformers/benchmarks/readme_benchmark_on_rocm.txt
+++ b/xformers/benchmarks/readme_benchmark_on_rocm.txt
@@ -8,10 +8,9 @@
 
 3. Benchmark for decoder fmha inference on ROCM
 
-   #> python xformers/benchmarks/benchmark_mem_eff_attn_decoder.py 
+   #> python xformers/benchmarks/benchmark_mem_eff_attn_decoder.py
 
 4. Other Benchmarks for fmha inference on ROCM
 
    #> python xformers/benchmarks/benchmark_attn_decoding.py
    #> python xformers/benchmarks/benchmark_mem_eff_attention_mqa.py
-
diff --git a/xformers/csrc/attention/hip_decoder/CMakeLists.txt b/xformers/csrc/attention/hip_decoder/CMakeLists.txt
index 97e2ab0b22..75e075b09e 100644
--- a/xformers/csrc/attention/hip_decoder/CMakeLists.txt
+++ b/xformers/csrc/attention/hip_decoder/CMakeLists.txt
@@ -36,14 +36,14 @@ set_target_properties(${exe_name} ${splitk_exe_name} PROPERTIES LINKER_LANGUAGE
 set_target_properties(${exe_name} ${splitk_exe_name} PROPERTIES POSITION_INDEPENDENT_CODE ON)
 set_target_properties(${exe_name} ${splitk_exe_name} PROPERTIES HIP_ARCHITECTURES ${GPU_TARGETS})
 
-target_compile_options(${exe_name} PUBLIC 
+target_compile_options(${exe_name} PUBLIC
   -fno-gpu-rdc
   $<$<COMPILE_LANGUAGE:HIP>:
   --save-temps
   >
 )
 
-target_compile_options(${splitk_exe_name} PUBLIC 
+target_compile_options(${splitk_exe_name} PUBLIC
   -fno-gpu-rdc
   $<$<COMPILE_LANGUAGE:HIP>:
   --save-temps
@@ -52,13 +52,13 @@ target_compile_options(${splitk_exe_name} PUBLIC
   >
 )
 
-target_include_directories(${exe_name} PUBLIC 
+target_include_directories(${exe_name} PUBLIC
   ${ck_include}                            # ck includes
   ${torch_include}                         # aten includes
   ${torch_include}/torch/csrc/api/include  # torch includes
 )
 
-target_include_directories(${splitk_exe_name} PUBLIC 
+target_include_directories(${splitk_exe_name} PUBLIC
   ${ck_include}                            # ck includes
   ${torch_include}                         # aten includes
   ${torch_include}/torch/csrc/api/include  # torch includes
@@ -93,14 +93,14 @@ target_link_libraries(${splitk_exe_name} PUBLIC
   amdhip64
 )
 
-target_compile_definitions(${exe_name} PUBLIC 
+target_compile_definitions(${exe_name} PUBLIC
   ATTN_FWD_DECODER_MAIN=1
   GLIBCXX_USE_CXX11_ABI=1
   __HIP_PLATFORM_HCC__=1
   USE_ROCM=1
 )
 
-target_compile_definitions(${splitk_exe_name} PUBLIC 
+target_compile_definitions(${splitk_exe_name} PUBLIC
   ATTN_FWD_SPLITK_DECODER_MAIN=1
   GLIBCXX_USE_CXX11_ABI=1
   __HIP_PLATFORM_HCC__=1
@@ -108,13 +108,13 @@ target_compile_definitions(${splitk_exe_name} PUBLIC
 )
 
 include(CMakePrintHelpers)
-cmake_print_properties(TARGETS ${exe_name} ${splitk_exe_name} PROPERTIES 
-  LINK_LIBRARIES 
-  LINK_DIRECTORIES 
-  INCLUDE_DIRECTORIES 
-  COMPILE_DEFINITIONS 
+cmake_print_properties(TARGETS ${exe_name} ${splitk_exe_name} PROPERTIES
+  LINK_LIBRARIES
+  LINK_DIRECTORIES
+  INCLUDE_DIRECTORIES
+  COMPILE_DEFINITIONS
   COMPILE_OPTIONS
   SOURCES
   HIP_ARCHITECTURES)
 
-rocm_install(TARGETS ${exe_name} ${splitk_exe_name})
\ No newline at end of file
+rocm_install(TARGETS ${exe_name} ${splitk_exe_name})
diff --git a/xformers/csrc/attention/hip_fmha/GENERATE_INSTANCES.md b/xformers/csrc/attention/hip_fmha/GENERATE_INSTANCES.md
index 829df66469..72cfc4f641 100644
--- a/xformers/csrc/attention/hip_fmha/GENERATE_INSTANCES.md
+++ b/xformers/csrc/attention/hip_fmha/GENERATE_INSTANCES.md
@@ -1,16 +1,16 @@
 # Instances generator
 
-   The instances generator is a simple python tool used to generate several hundred of instances (.cpp files) and their references (.h files). 
-   Without this tool, manually writing those instances and references will be very laborious and easy to get wrong. 
-
-   The instances generated by this scripts are divided into three categories visible from the scripts: 
+   The instances generator is a simple python tool used to generate several hundred instances (.cpp files) and their references (.h files).
+   Without this tool, manually writing those instances and references would be laborious and error-prone.
+
+   The instances generated by this script are divided into three categories, visible in the script:
   * Infer    -- which refers to instances for calling inference-only kernels
   * Forward  -- which refers to instances for calling training forward kernels
   * Backward -- which refers to instances for calling training backward kernels
-
-   The instance generator is for being used by the HIP fmha developers themselves. It is not supposed to be used by the xformers users for 
-   building xformers, since for xformers users, the instances are already well prepared as part of the xformers codes. 
+
+   The instance generator is meant to be used by the HIP fmha developers themselves. It is not intended for xformers users
+   building xformers, since the instances they need are already provided as part of the xformers code.
 
 ## how to use instance generator
 
@@ -21,7 +21,7 @@
    ```
 
 * To generate reduced instances (when headdim256 is not required)
-   ``` 
+   ```
    #> python xformers/csrc/attention/hip_fmha/generate_instances.py --ignore-hd256
    ```
 * More options except for `--ignore-hd256` could be added to suppport further customization in generating instances as required
@@ -29,5 +29,3 @@
 ## where the instances files are located
 
    The instances files and references files are always located under a folder `instances/` that is located under the same directory as the file `generate_instances.py` itself
-
-
diff --git a/xformers/csrc/sparse24/sparse24_gemm_sm90.cu b/xformers/csrc/sparse24/sparse24_gemm_sm90.cu
index cabdc7799c..303eca1991 100644
--- a/xformers/csrc/sparse24/sparse24_gemm_sm90.cu
+++ b/xformers/csrc/sparse24/sparse24_gemm_sm90.cu
@@ -96,10 +96,10 @@ struct SparseRowwiseKernel<cutlass::bfloat16_t> {
           float,
           ElementOut,
           cutlass::layout::RowMajor,
-          1,
+          8,
           ElementOut,
           cutlass::layout::RowMajor,
-          1,
+          8,
           cutlass::epilogue::TmaWarpSpecializedCooperative,
           EpilogueEVT>::CollectiveOp;
@@ -176,10 +176,10 @@ struct SparseRowwiseKernel<cutlass::half_t> {
           float,
           ElementOut,
           cutlass::layout::RowMajor,
-          1,
+          8,
           ElementOut,
           cutlass::layout::RowMajor,
-          1,
+          8,
           cutlass::epilogue::TmaWarpSpecializedCooperative,
           EpilogueEVT>::CollectiveOp;
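The `1 -> 8` changes above are the C/D operand alignments of the CUTLASS epilogue, counted in elements. Assuming `ElementOut` is a 16-bit type (bf16/fp16, as the two `SparseRowwiseKernel` specializations suggest), alignment 8 corresponds to one 128-bit access per store instead of scalar 16-bit stores; the implied trade-off, standard CUTLASS semantics rather than anything stated in this diff, is that output pointers and leading strides must now be 8-element aligned. The arithmetic:

```python
# CUTLASS alignment is in elements; ElementOut is bf16/fp16 (2 bytes).
elem_bytes = 2
for alignment in (1, 8):
    print(alignment, "->", alignment * elem_bytes * 8, "bit accesses")
# 1 -> 16 bit accesses (scalar), 8 -> 128 bit accesses (vectorized)
```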
diff --git a/xformers/ops/fmha/dispatch.py b/xformers/ops/fmha/dispatch.py
index 5908635dac..be0f75ead4 100644
--- a/xformers/ops/fmha/dispatch.py
+++ b/xformers/ops/fmha/dispatch.py
@@ -31,7 +31,7 @@ def _get_use_fa3() -> bool:
 
 def fa3_available() -> bool:
     has_cuda = torch.version.cuda is not None
-    is_90a = has_cuda and torch.cuda.get_device_capability() >= (9, 0)
+    is_90a = has_cuda and (8, 0) <= torch.cuda.get_device_capability() <= (9, 0)
     has_valid_flash3 = flash3._C_flashattention3 is not None  # pyre-ignore[16]
     return is_90a and has_valid_flash3
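Python compares capability tuples lexicographically, so the rewritten gate admits every capability from (8, 0) through (9, 0) inclusive; sm86 and sm89 pass too, not only the 8.0/9.0a targets the wheels are built for, and the variable keeps its now slightly misleading `is_90a` name. A quick check:

```python
# The new fa3_available() bound, evaluated over a few capabilities.
for cap in [(7, 5), (8, 0), (8, 6), (8, 9), (9, 0), (10, 0), (12, 0)]:
    print(cap, (8, 0) <= cap <= (9, 0))
# False, True, True, True, True, False, False
```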
diff --git a/xformers/ops/fmha/flash.py b/xformers/ops/fmha/flash.py
index 63e436698c..4956bcf25d 100644
--- a/xformers/ops/fmha/flash.py
+++ b/xformers/ops/fmha/flash.py
@@ -71,7 +71,7 @@
     FLASH_VERSION = flash_attn.__version__
     FLASH_VER_MIN = parse_version("2.7.1")
-    FLASH_VER_LAST = parse_version("2.8.3")  # last supported, inclusive
+    FLASH_VER_LAST = parse_version("2.8.4")  # last supported, inclusive
     flash_ver_parsed = parse_version(FLASH_VERSION)
     if (
         flash_ver_parsed < FLASH_VER_MIN or flash_ver_parsed > FLASH_VER_LAST
diff --git a/xformers/ops/fmha/flash3.py b/xformers/ops/fmha/flash3.py
index b4e55f41ba..770960f1b3 100644
--- a/xformers/ops/fmha/flash3.py
+++ b/xformers/ops/fmha/flash3.py
@@ -647,6 +647,14 @@ class FwOp(AttentionFwOpBase):
     @classmethod
     def not_supported_reasons(cls, d: Inputs) -> List[str]:
         reasons = super(FwOp, cls).not_supported_reasons(d)
+        device_type = d.query.device.type
+        if device_type == "cuda" and (torch.version.hip is None):
+            device_capability = torch.cuda.get_device_capability(d.device)
+            if device_capability > cls.CUDA_MINIMUM_COMPUTE_CAPABILITY:
+                reasons.append(
+                    f"requires device with capability == {cls.CUDA_MINIMUM_COMPUTE_CAPABILITY} "
+                    f"but your GPU has capability {device_capability} (too new)"
+                )
         check_lastdim_alignment_stride1(reasons, "query", d.query, 8)
         check_lastdim_alignment_stride1(reasons, "key", d.value, 8)
         check_lastdim_alignment_stride1(reasons, "value", d.value, 8)
@@ -801,6 +809,14 @@ class BwOp(AttentionBwOpBase):
     @classmethod
     def not_supported_reasons(cls, d: Inputs) -> List[str]:
         reasons = super(BwOp, cls).not_supported_reasons(d)
+        device_type = d.query.device.type
+        if device_type == "cuda" and (torch.version.hip is None):
+            device_capability = torch.cuda.get_device_capability(d.device)
+            if device_capability > cls.CUDA_MINIMUM_COMPUTE_CAPABILITY:
+                reasons.append(
+                    f"requires device with capability == {cls.CUDA_MINIMUM_COMPUTE_CAPABILITY} "
+                    f"but your GPU has capability {device_capability} (too new)"
+                )
         check_lastdim_alignment_stride1(reasons, "query", d.query, 8)
         check_lastdim_alignment_stride1(reasons, "key", d.value, 8)
         check_lastdim_alignment_stride1(reasons, "value", d.value, 8)
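Both hunks above add the same "too new" gate to `not_supported_reasons`. Incidentally, the surrounding context lines pass `d.value` for the "key" alignment check, which looks like a pre-existing copy-paste slip left untouched by this diff. A self-contained sketch of the gate's behaviour; the `(9, 0)` threshold below is a placeholder, since the actual value of `cls.CUDA_MINIMUM_COMPUTE_CAPABILITY` is not shown here:

```python
from typing import Optional, Tuple

# Placeholder for cls.CUDA_MINIMUM_COMPUTE_CAPABILITY (value assumed).
MIN_CAPABILITY: Tuple[int, int] = (9, 0)

def too_new_reason(device_capability: Tuple[int, int]) -> Optional[str]:
    # Mirrors the check added to FwOp/BwOp.not_supported_reasons.
    if device_capability > MIN_CAPABILITY:
        return (
            f"requires device with capability == {MIN_CAPABILITY} "
            f"but your GPU has capability {device_capability} (too new)"
        )
    return None

print(too_new_reason((12, 0)))  # flagged: newer than the FA3 build targets
print(too_new_reason((9, 0)))   # None: supported
```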