From ec608dfc03daf4a059be5ebf42aaf9a660322574 Mon Sep 17 00:00:00 2001 From: Paul Taylor <178183+trxcllnt@users.noreply.github.com> Date: Thu, 20 Nov 2025 09:16:20 -0800 Subject: [PATCH] Use `sccache-dist` build cluster for conda and wheel builds (#341) ## Description RAPIDS has deployed an autoscaling cloud build cluster that can be used to accelerate building large RAPIDS projects. This PR updates the conda and wheel builds to use the build cluster. This contributes to https://github.com/rapidsai/build-planning/issues/228. --- .devcontainer/Dockerfile | 1 + .github/workflows/build.yaml | 9 ++++++++ .github/workflows/pr.yaml | 13 ++++++++++++ .github/workflows/test.yaml | 4 ++++ ci/build_cpp.sh | 4 ++-- ci/build_python.sh | 7 +++--- ci/build_wheel.sh | 6 +++++- cmake/rapids_config.cmake | 3 +++ conda/recipes/libwholegraph/recipe.yaml | 26 +++++++++++++++++------ conda/recipes/pylibwholegraph/recipe.yaml | 25 +++++++++++++++++----- 10 files changed, 80 insertions(+), 18 deletions(-) diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index 7053dd27..9b070b49 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -40,6 +40,7 @@ ENV HISTFILE="/home/coder/.cache/._bash_history" ENV AWS_ROLE_ARN="arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs" ENV SCCACHE_REGION="us-east-2" ENV SCCACHE_BUCKET="rapids-sccache-devs" +ENV SCCACHE_S3_USE_PREPROCESSOR_CACHE_MODE=true ENV SCCACHE_IDLE_TIMEOUT=0 ### diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index ee76c240..b0d27477 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -39,8 +39,10 @@ jobs: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} date: ${{ inputs.date }} + node_type: cpu8 script: ci/build_cpp.sh sha: ${{ inputs.sha }} + sccache-dist-token-secret-name: GIST_REPO_READ_ORG_GITHUB_TOKEN python-build: needs: [cpp-build] secrets: inherit @@ -51,6 +53,7 @@ jobs: date: ${{ inputs.date }} script: ci/build_python.sh sha: ${{ inputs.sha }} + sccache-dist-token-secret-name: GIST_REPO_READ_ORG_GITHUB_TOKEN docs-build: needs: cpp-build secrets: inherit @@ -82,10 +85,12 @@ jobs: branch: ${{ inputs.branch }} sha: ${{ inputs.sha }} date: ${{ inputs.date }} + node_type: cpu8 script: ci/build_wheel_cugraph-pyg.sh package-name: cugraph-pyg package-type: python pure-wheel: true + sccache-dist-token-secret-name: GIST_REPO_READ_ORG_GITHUB_TOKEN wheel-publish-cugraph-pyg: needs: wheel-build-cugraph-pyg secrets: inherit @@ -105,11 +110,13 @@ jobs: branch: ${{ inputs.branch }} sha: ${{ inputs.sha }} date: ${{ inputs.date }} + node_type: cpu8 script: ci/build_wheel_libwholegraph.sh package-name: libwholegraph package-type: cpp # build for every combination of arch and CUDA version, but only for the latest Python matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber))) + sccache-dist-token-secret-name: GIST_REPO_READ_ORG_GITHUB_TOKEN wheel-build-pylibwholegraph: needs: wheel-build-libwholegraph secrets: inherit @@ -119,9 +126,11 @@ jobs: branch: ${{ inputs.branch }} sha: ${{ inputs.sha }} date: ${{ inputs.date }} + node_type: cpu8 script: ci/build_wheel_pylibwholegraph.sh package-name: pylibwholegraph package-type: python + sccache-dist-token-secret-name: GIST_REPO_READ_ORG_GITHUB_TOKEN wheel-publish-libwholegraph: needs: wheel-build-libwholegraph secrets: inherit diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 24ea205b..b6403091 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -107,7 +107,9 @@ jobs: uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@release/25.12 with: build_type: pull-request + node_type: cpu8 script: ci/build_cpp.sh + sccache-dist-token-secret-name: GIST_REPO_READ_ORG_GITHUB_TOKEN conda-cpp-tests: needs: [conda-cpp-build, changed-files] secrets: inherit @@ -116,6 +118,7 @@ jobs: with: build_type: pull-request script: ci/test_cpp.sh + sccache-dist-token-secret-name: GIST_REPO_READ_ORG_GITHUB_TOKEN conda-python-build: needs: conda-cpp-build secrets: inherit @@ -123,6 +126,7 @@ jobs: with: build_type: pull-request script: ci/build_python.sh + sccache-dist-token-secret-name: GIST_REPO_READ_ORG_GITHUB_TOKEN conda-notebook-tests: needs: [conda-python-build, changed-files] secrets: inherit @@ -145,6 +149,7 @@ jobs: # ref: https://github.com/pytorch/pytorch/issues/159779 matrix_filter: map(select(.ARCH == "amd64" and (.CUDA_VER | startswith("12")))) script: ci/test_python.sh + sccache-dist-token-secret-name: GIST_REPO_READ_ORG_GITHUB_TOKEN docs-build: needs: conda-cpp-build secrets: inherit @@ -161,19 +166,23 @@ jobs: with: build_type: pull-request script: ci/build_wheel_libwholegraph.sh + node_type: cpu8 package-name: libwholegraph package-type: cpp # build for every combination of arch and CUDA version, but only for the latest Python matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber))) + sccache-dist-token-secret-name: GIST_REPO_READ_ORG_GITHUB_TOKEN wheel-build-pylibwholegraph: needs: [checks, wheel-build-libwholegraph] secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@release/25.12 with: build_type: pull-request + node_type: cpu8 script: ci/build_wheel_pylibwholegraph.sh package-name: pylibwholegraph package-type: python + sccache-dist-token-secret-name: GIST_REPO_READ_ORG_GITHUB_TOKEN wheel-tests-pylibwholegraph: needs: [wheel-build-pylibwholegraph, changed-files] secrets: inherit @@ -183,6 +192,7 @@ jobs: build_type: pull-request script: ci/test_wheel_pylibwholegraph.sh matrix_filter: map(select(.ARCH == "amd64")) + sccache-dist-token-secret-name: GIST_REPO_READ_ORG_GITHUB_TOKEN wheel-build-cugraph-pyg: needs: checks secrets: inherit @@ -191,10 +201,12 @@ jobs: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) build_type: pull-request + node_type: cpu8 script: ci/build_wheel_cugraph-pyg.sh package-name: cugraph-pyg package-type: python pure-wheel: true + sccache-dist-token-secret-name: GIST_REPO_READ_ORG_GITHUB_TOKEN wheel-tests-cugraph-pyg: needs: [wheel-build-pylibwholegraph, wheel-build-cugraph-pyg, changed-files] secrets: inherit @@ -204,3 +216,4 @@ jobs: build_type: pull-request script: ci/test_wheel_cugraph-pyg.sh matrix_filter: map(select(.ARCH == "amd64")) + sccache-dist-token-secret-name: GIST_REPO_READ_ORG_GITHUB_TOKEN diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index cdaf0147..ea88dcef 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -32,6 +32,7 @@ jobs: date: ${{ inputs.date }} script: ci/test_cpp.sh sha: ${{ inputs.sha }} + sccache-dist-token-secret-name: GIST_REPO_READ_ORG_GITHUB_TOKEN conda-notebook-tests: secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@release/25.12 @@ -56,6 +57,7 @@ jobs: # TODO: remove the CUDA 13 exclusion here once there are pytorch CUDA 13 packages # ref: https://github.com/pytorch/pytorch/issues/159779 matrix_filter: map(select(.ARCH == "amd64" and (.CUDA_VER | startswith("12")))) + sccache-dist-token-secret-name: GIST_REPO_READ_ORG_GITHUB_TOKEN wheel-tests-pylibwholegraph: secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@release/25.12 @@ -66,6 +68,7 @@ jobs: sha: ${{ inputs.sha }} script: ci/test_wheel_pylibwholegraph.sh matrix_filter: map(select(.ARCH == "amd64")) + sccache-dist-token-secret-name: GIST_REPO_READ_ORG_GITHUB_TOKEN wheel-tests-cugraph-pyg: secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@release/25.12 @@ -76,3 +79,4 @@ jobs: sha: ${{ inputs.sha }} script: ci/test_wheel_cugraph-pyg.sh matrix_filter: map(select(.ARCH == "amd64")) + sccache-dist-token-secret-name: GIST_REPO_READ_ORG_GITHUB_TOKEN diff --git a/ci/build_cpp.sh b/ci/build_cpp.sh index 4f739b0f..2e0dde1b 100755 --- a/ci/build_cpp.sh +++ b/ci/build_cpp.sh @@ -5,7 +5,6 @@ set -euo pipefail source rapids-configure-sccache - source rapids-date-string export CMAKE_GENERATOR=Ninja @@ -14,7 +13,7 @@ rapids-print-env rapids-logger "Begin cpp build" -sccache --zero-stats +sccache --stop-server 2>/dev/null || true RAPIDS_PACKAGE_VERSION=$(rapids-generate-version) export RAPIDS_PACKAGE_VERSION @@ -30,3 +29,4 @@ rattler-build build --recipe conda/recipes/libwholegraph \ "${RATTLER_CHANNELS[@]}" sccache --show-adv-stats +sccache --stop-server >/dev/null 2>&1 || true diff --git a/ci/build_python.sh b/ci/build_python.sh index 320b0c86..c7331969 100755 --- a/ci/build_python.sh +++ b/ci/build_python.sh @@ -5,7 +5,6 @@ set -euo pipefail source rapids-configure-sccache - source rapids-date-string export CMAKE_GENERATOR=Ninja @@ -27,7 +26,7 @@ rapids-logger "Prepending channel ${CPP_CHANNEL} to RATTLER_CHANNELS" RATTLER_CHANNELS=("--channel" "${CPP_CHANNEL}" "${RATTLER_CHANNELS[@]}") # TODO: Remove `--test skip` flags once importing on a CPU node works correctly -sccache --zero-stats +sccache --stop-server 2>/dev/null || true rapids-logger "Building pylibwholegraph" @@ -40,7 +39,7 @@ rattler-build build --recipe conda/recipes/pylibwholegraph \ "${RATTLER_CHANNELS[@]}" sccache --show-adv-stats -sccache --zero-stats +sccache --stop-server >/dev/null 2>&1 || true rapids-logger "Building cugraph-pyg" @@ -53,7 +52,7 @@ rattler-build build --recipe conda/recipes/cugraph-pyg \ "${RATTLER_CHANNELS[@]}" sccache --show-adv-stats -sccache --zero-stats +sccache --stop-server >/dev/null 2>&1 || true # remove build_cache directory to avoid uploading the entire source tree # tracked in https://github.com/prefix-dev/rattler-build/issues/1424 diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh index 9531de3e..47170a7a 100755 --- a/ci/build_wheel.sh +++ b/ci/build_wheel.sh @@ -30,13 +30,16 @@ fi source rapids-configure-sccache source rapids-date-string +export SCCACHE_S3_PREPROCESSOR_CACHE_KEY_PREFIX="${package_name}/${RAPIDS_CONDA_ARCH}/cuda${RAPIDS_CUDA_VERSION%%.*}/wheel/preprocessor-cache" +export SCCACHE_S3_USE_PREPROCESSOR_CACHE_MODE=true + rapids-generate-version > ./VERSION RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen "${RAPIDS_CUDA_VERSION}")" cd "${package_dir}" -sccache --zero-stats +sccache --stop-server 2>/dev/null || true rapids-logger "Building '${package_name}' wheel" rapids-pip-retry wheel \ @@ -47,6 +50,7 @@ rapids-pip-retry wheel \ . sccache --show-adv-stats +sccache --stop-server >/dev/null 2>&1 || true # pure-python packages should be marked as pure, and not have auditwheel run on them. if [[ ${package_name} == "cugraph-pyg" ]]; then diff --git a/cmake/rapids_config.cmake b/cmake/rapids_config.cmake index 768e2487..9cda1f26 100644 --- a/cmake/rapids_config.cmake +++ b/cmake/rapids_config.cmake @@ -35,3 +35,6 @@ if(NOT rapids-cmake-branch) set(rapids-cmake-branch "${_rapids_branch}") endif() include("${CMAKE_CURRENT_LIST_DIR}/RAPIDS.cmake") + +# Don't use sccache-dist for CMake's compiler tests +set(ENV{SCCACHE_NO_DIST_COMPILE} "1") diff --git a/conda/recipes/libwholegraph/recipe.yaml b/conda/recipes/libwholegraph/recipe.yaml index d0ca3b7b..abc1b524 100644 --- a/conda/recipes/libwholegraph/recipe.yaml +++ b/conda/recipes/libwholegraph/recipe.yaml @@ -33,18 +33,32 @@ cache: - AWS_ACCESS_KEY_ID - AWS_SECRET_ACCESS_KEY - AWS_SESSION_TOKEN + - SCCACHE_DIST_AUTH_TOKEN env: CMAKE_C_COMPILER_LAUNCHER: ${{ env.get("CMAKE_C_COMPILER_LAUNCHER") }} CMAKE_CUDA_COMPILER_LAUNCHER: ${{ env.get("CMAKE_CUDA_COMPILER_LAUNCHER") }} CMAKE_CXX_COMPILER_LAUNCHER: ${{ env.get("CMAKE_CXX_COMPILER_LAUNCHER") }} CMAKE_GENERATOR: ${{ env.get("CMAKE_GENERATOR") }} - PARALLEL_LEVEL: ${{ env.get("PARALLEL_LEVEL") }} - SCCACHE_BUCKET: ${{ env.get("SCCACHE_BUCKET") }} - SCCACHE_IDLE_TIMEOUT: ${{ env.get("SCCACHE_IDLE_TIMEOUT") }} - SCCACHE_REGION: ${{ env.get("SCCACHE_REGION") }} - SCCACHE_S3_USE_SSL: ${{ env.get("SCCACHE_S3_USE_SSL") }} - SCCACHE_S3_NO_CREDENTIALS: ${{ env.get("SCCACHE_S3_NO_CREDENTIALS") }} + NVCC_APPEND_FLAGS: ${{ env.get("NVCC_APPEND_FLAGS", default="") }} + PARALLEL_LEVEL: ${{ env.get("PARALLEL_LEVEL", default="8") }} + RAPIDS_ARTIFACTS_DIR: ${{ env.get("RAPIDS_ARTIFACTS_DIR", default="") }} + SCCACHE_BUCKET: ${{ env.get("SCCACHE_BUCKET", default="") }} + SCCACHE_DIST_AUTH_TYPE: ${{ env.get("SCCACHE_DIST_AUTH_TYPE", default="token") }} + SCCACHE_DIST_FALLBACK_TO_LOCAL_COMPILE: ${{ env.get("SCCACHE_DIST_FALLBACK_TO_LOCAL_COMPILE", default="false") }} + SCCACHE_DIST_MAX_RETRIES: ${{ env.get("SCCACHE_DIST_MAX_RETRIES", default="inf") }} + SCCACHE_DIST_REQUEST_TIMEOUT: ${{ env.get("SCCACHE_DIST_REQUEST_TIMEOUT", default="7140") }} + SCCACHE_DIST_SCHEDULER_URL: ${{ env.get("SCCACHE_DIST_SCHEDULER_URL", default="") }} + SCCACHE_ERROR_LOG: ${{ env.get("SCCACHE_ERROR_LOG", default="/tmp/sccache.log") }} + SCCACHE_IDLE_TIMEOUT: ${{ env.get("SCCACHE_IDLE_TIMEOUT", default="0") }} + SCCACHE_NO_CACHE: ${{ env.get("SCCACHE_NO_CACHE", default="") }} + SCCACHE_RECACHE: ${{ env.get("SCCACHE_RECACHE", default="") }} + SCCACHE_REGION: ${{ env.get("SCCACHE_REGION", default="") }} SCCACHE_S3_KEY_PREFIX: libwholegraph/${{ env.get("RAPIDS_CONDA_ARCH") }}/cuda${{ cuda_major }} + SCCACHE_S3_NO_CREDENTIALS: ${{ env.get("SCCACHE_S3_NO_CREDENTIALS", default="false") }} + SCCACHE_S3_PREPROCESSOR_CACHE_KEY_PREFIX: libwholegraph/${{ env.get("RAPIDS_CONDA_ARCH") }}/cuda${{ cuda_major }}/conda/preprocessor-cache + SCCACHE_S3_USE_PREPROCESSOR_CACHE_MODE: ${{ env.get("SCCACHE_S3_USE_PREPROCESSOR_CACHE_MODE", default="true") }} + SCCACHE_S3_USE_SSL: ${{ env.get("SCCACHE_S3_USE_SSL", default="true") }} + SCCACHE_SERVER_LOG: ${{ env.get("SCCACHE_SERVER_LOG", default="sccache=debug") }} requirements: build: diff --git a/conda/recipes/pylibwholegraph/recipe.yaml b/conda/recipes/pylibwholegraph/recipe.yaml index aaff2f6d..44043d20 100644 --- a/conda/recipes/pylibwholegraph/recipe.yaml +++ b/conda/recipes/pylibwholegraph/recipe.yaml @@ -28,17 +28,32 @@ build: - AWS_ACCESS_KEY_ID - AWS_SECRET_ACCESS_KEY - AWS_SESSION_TOKEN + - SCCACHE_DIST_AUTH_TOKEN env: CMAKE_C_COMPILER_LAUNCHER: ${{ env.get("CMAKE_C_COMPILER_LAUNCHER") }} CMAKE_CUDA_COMPILER_LAUNCHER: ${{ env.get("CMAKE_CUDA_COMPILER_LAUNCHER") }} CMAKE_CXX_COMPILER_LAUNCHER: ${{ env.get("CMAKE_CXX_COMPILER_LAUNCHER") }} CMAKE_GENERATOR: ${{ env.get("CMAKE_GENERATOR") }} - SCCACHE_BUCKET: ${{ env.get("SCCACHE_BUCKET") }} - SCCACHE_IDLE_TIMEOUT: ${{ env.get("SCCACHE_IDLE_TIMEOUT") }} - SCCACHE_REGION: ${{ env.get("SCCACHE_REGION") }} - SCCACHE_S3_USE_SSL: ${{ env.get("SCCACHE_S3_USE_SSL") }} - SCCACHE_S3_NO_CREDENTIALS: ${{ env.get("SCCACHE_S3_NO_CREDENTIALS") }} + NVCC_APPEND_FLAGS: ${{ env.get("NVCC_APPEND_FLAGS", default="") }} + PARALLEL_LEVEL: ${{ env.get("PARALLEL_LEVEL", default="8") }} + RAPIDS_ARTIFACTS_DIR: ${{ env.get("RAPIDS_ARTIFACTS_DIR", default="") }} + SCCACHE_BUCKET: ${{ env.get("SCCACHE_BUCKET", default="") }} + SCCACHE_DIST_AUTH_TYPE: ${{ env.get("SCCACHE_DIST_AUTH_TYPE", default="token") }} + SCCACHE_DIST_FALLBACK_TO_LOCAL_COMPILE: ${{ env.get("SCCACHE_DIST_FALLBACK_TO_LOCAL_COMPILE", default="false") }} + SCCACHE_DIST_MAX_RETRIES: ${{ env.get("SCCACHE_DIST_MAX_RETRIES", default="inf") }} + SCCACHE_DIST_REQUEST_TIMEOUT: ${{ env.get("SCCACHE_DIST_REQUEST_TIMEOUT", default="7140") }} + SCCACHE_DIST_SCHEDULER_URL: ${{ env.get("SCCACHE_DIST_SCHEDULER_URL", default="") }} + SCCACHE_ERROR_LOG: ${{ env.get("SCCACHE_ERROR_LOG", default="/tmp/sccache.log") }} + SCCACHE_IDLE_TIMEOUT: ${{ env.get("SCCACHE_IDLE_TIMEOUT", default="0") }} + SCCACHE_NO_CACHE: ${{ env.get("SCCACHE_NO_CACHE", default="") }} + SCCACHE_RECACHE: ${{ env.get("SCCACHE_RECACHE", default="") }} + SCCACHE_REGION: ${{ env.get("SCCACHE_REGION", default="") }} SCCACHE_S3_KEY_PREFIX: pylibwholegraph/${{ env.get("RAPIDS_CONDA_ARCH") }}/cuda${{ cuda_major }} + SCCACHE_S3_NO_CREDENTIALS: ${{ env.get("SCCACHE_S3_NO_CREDENTIALS", default="false") }} + SCCACHE_S3_PREPROCESSOR_CACHE_KEY_PREFIX: pylibwholegraph/${{ env.get("RAPIDS_CONDA_ARCH") }}/cuda${{ cuda_major }}/conda/preprocessor-cache + SCCACHE_S3_USE_PREPROCESSOR_CACHE_MODE: ${{ env.get("SCCACHE_S3_USE_PREPROCESSOR_CACHE_MODE", default="true") }} + SCCACHE_S3_USE_SSL: ${{ env.get("SCCACHE_S3_USE_SSL", default="true") }} + SCCACHE_SERVER_LOG: ${{ env.get("SCCACHE_SERVER_LOG", default="sccache=debug") }} requirements: build: