Skip to content

increase nofile ulimit if using sccache-dist #2

increase nofile ulimit if using sccache-dist

increase nofile ulimit if using sccache-dist #2

Workflow file for this run

# Reusable workflow: computes a test matrix, then runs a caller-supplied script
# in a GPU CI container once per matrix entry.
name: Test RAPIDS wheels

Check failure on line 1 in .github/workflows/wheels-test.yaml

View workflow run for this annotation

GitHub Actions / .github/workflows/wheels-test.yaml

Invalid workflow file

(Line: 71, Col: 9): Required property is missing: type
# This workflow has a single trigger: it is invoked by other workflows via
# 'workflow_call'. Everything below is the caller-facing input contract.
on:
  workflow_call:
    inputs:
      branch:
        description: |
          Git branch the workflow run targets.
          This is required even when 'sha' is provided because it is also used for organizing artifacts.
        type: string
      date:
        description: "Date (YYYY-MM-DD) this run is for. Used to organize artifacts produced by nightly builds"
        type: string
      sha:
        description: "Full git commit SHA to check out"
        type: string
      repo:
        description: "Git repo to check out, in '{org}/{repo}' form, e.g. 'rapidsai/cudf'"
        type: string
      build_type:
        description: "One of: [branch, nightly, pull-request]"
        required: true
        type: string
      matrix_type:
        description: "One of: [auto, nightly, pull-request]. 'auto' means 'choose a value based on what's provided via build_type'."
        required: false
        type: string
        default: "auto"
      script:
        type: string
        required: true
        description: "Shell code to be executed in a step. Ideally this should just invoke a script managed in the repo the workflow runs from, like 'ci/test_wheel.sh'."
      matrix_filter:
        description: |
          jq expression which modifies the matrix.
          For example, 'map(select(.ARCH == "amd64"))' to achieve "only run amd64 jobs".
        type: string
        default: "."
      container-options:
        description: |
          Command-line arguments passed to 'docker run' when starting the container this workflow runs in.
          This should be provided as a single string to be inlined into 'docker run', not an array.
          For example, '--quiet --ulimit nofile=2048'.
        required: false
        type: string
        default: "-e _NOOP"
      test_summary_show:
        description: |
          Sets the 'show:' input to the test-summary/action third-party action.
          The default, 'fail', means "only show failing tests in the summary in the GitHub UI".
          See https://github.com/test-summary/action?tab=readme-ov-file#options for a list of
          available options.
        required: false
        type: string
        default: "fail"
      rapids-aux-secret-1:
        required: false
        type: string
        default: ''
        description: |
          The NAME (not value) of a GitHub secret in the calling repo.
          This allows callers of the workflow to make a single secret available in the job's
          environment, via environment variable `RAPIDS_AUX_SECRET_1`.
      build_workflow_name:
        description: |
          Name of a workflow file that produced artifacts to be downloaded in this run.
          If not set (the default), artifact-handling scripts use RAPIDS-conventional defaults (like "build.yaml" when "build_type == nightly").
        required: false
        type: string
      # Every workflow_call input must declare a 'type'; without it GitHub rejects
      # the whole file ("Required property is missing: type"). The default is a
      # bare integer, so the input is typed as a number.
      sccache-dist-request-timeout:
        type: number
        required: false
        default: 7140
        description: |
          The maximum time (in seconds) the sccache client should wait for a distributed compilation to complete.
      sccache-dist-token-secret-name:
        type: string
        required: false
        description: |
          The name of the secret that contains the token used to authenticate with the RAPIDS Build Engineering sccache-dist build cluster.
      alternative-gh-token-secret-name:
        type: string
        required: false
        description: |
          If provided, should contain the name of a secret in the repo which holds a GitHub API token.
          When this is non-empty, that secret's value is used in place of the default repo-level token
          anywhere that environment variable GH_TOKEN is set. This is especially useful for downloading
          artifacts from other private repos, which repo tokens do not have access to.
# All 'run:' steps use bash unless a step sets its own shell.
defaults:
  run:
    shell: bash

# Workflow-level token permissions. Every scope is listed explicitly, so the
# ones this workflow does not use are visibly set to 'none'.
permissions:
  actions: read
  checks: none
  contents: read
  deployments: none
  discussions: none
  id-token: write
  issues: none
  packages: read
  pages: none
  pull-requests: read
  repository-projects: none
  security-events: none
  statuses: none
jobs:
  # Chooses the test matrix for this run and publishes it as a JSON job output.
  compute-matrix:
    runs-on: ubuntu-latest
    outputs:
      MATRIX: ${{ steps.compute-matrix.outputs.MATRIX }}
    env:
      BUILD_TYPE: ${{ inputs.build_type }}
      MATRIX_TYPE: ${{ inputs.matrix_type }}
    steps:
      # Stop immediately if build_type or matrix_type is outside the accepted set.
      - name: Validate Inputs
        run: |
          if [[ "$BUILD_TYPE" != "branch" ]] && [[ "$BUILD_TYPE" != "nightly" ]] && [[ "$BUILD_TYPE" != "pull-request" ]]; then
            echo "Invalid build_type! Must be one of 'branch', 'nightly', or 'pull-request'."
            exit 1
          fi
          if [[ "$MATRIX_TYPE" != "auto" ]] && [[ "$MATRIX_TYPE" != "nightly" ]] && [[ "$MATRIX_TYPE" != "pull-request" ]]; then
            echo "Invalid matrix_type! Must be one of 'auto', 'nightly', or 'pull-request'."
            exit 1
          fi
      # Resolves 'auto' to a concrete matrix name (branch builds reuse the nightly
      # matrix), extracts that matrix from MATRICES with yq, applies the caller's
      # jq filter, and writes the result to the step output. An empty matrix
      # after filtering is treated as an error.
      - name: Compute test matrix
        id: compute-matrix
        env:
          MATRIX_FILTER: ${{ inputs.matrix_filter }}
        run: |
          set -eo pipefail
          # please keep the matrices sorted in ascending order by the following:
          #
          # [ARCH, PY_VER, CUDA_VER, LINUX_VER, GPU, DRIVER, DEPENDENCIES]
          #
          export MATRICES="
          pull-request:
          # amd64
          - { ARCH: 'amd64', PY_VER: '3.10', CUDA_VER: '12.9.1', LINUX_VER: 'ubuntu22.04', GPU: 'l4', DRIVER: 'latest', DEPENDENCIES: 'oldest' }
          - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.0.1', LINUX_VER: 'ubuntu24.04', GPU: 'l4', DRIVER: 'latest', DEPENDENCIES: 'latest' }
          # arm64
          - { ARCH: 'arm64', PY_VER: '3.13', CUDA_VER: '12.9.1', LINUX_VER: 'ubuntu22.04', GPU: 'a100', DRIVER: 'latest', DEPENDENCIES: 'latest' }
          - { ARCH: 'arm64', PY_VER: '3.13', CUDA_VER: '13.0.1', LINUX_VER: 'ubuntu22.04', GPU: 'l4', DRIVER: 'latest', DEPENDENCIES: 'latest' }
          nightly:
          # amd64
          - { ARCH: 'amd64', PY_VER: '3.10', CUDA_VER: '12.9.1', LINUX_VER: 'rockylinux8', GPU: 'l4', DRIVER: 'latest', DEPENDENCIES: 'oldest' }
          - { ARCH: 'amd64', PY_VER: '3.11', CUDA_VER: '12.9.1', LINUX_VER: 'ubuntu22.04', GPU: 'l4', DRIVER: 'earliest', DEPENDENCIES: 'latest' }
          - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.9.1', LINUX_VER: 'ubuntu24.04', GPU: 'h100', DRIVER: 'latest', DEPENDENCIES: 'latest' }
          - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '13.0.1', LINUX_VER: 'ubuntu24.04', GPU: 'h100', DRIVER: 'latest', DEPENDENCIES: 'latest' }
          # arm64
          - { ARCH: 'arm64', PY_VER: '3.10', CUDA_VER: '12.9.1', LINUX_VER: 'rockylinux8', GPU: 'a100', DRIVER: 'latest', DEPENDENCIES: 'oldest' }
          - { ARCH: 'arm64', PY_VER: '3.12', CUDA_VER: '12.2.2', LINUX_VER: 'ubuntu22.04', GPU: 'l4', DRIVER: 'latest', DEPENDENCIES: 'latest' }
          - { ARCH: 'arm64', PY_VER: '3.13', CUDA_VER: '13.0.1', LINUX_VER: 'ubuntu24.04', GPU: 'l4', DRIVER: 'latest', DEPENDENCIES: 'latest' }
          "
          # only overwrite MATRIX_TYPE if it was set to 'auto'
          if [[ "${MATRIX_TYPE}" == "auto" ]]; then
            if [[ "${BUILD_TYPE}" == "branch" ]]; then
              # Use the nightly matrix for branch tests
              MATRIX_TYPE="nightly"
            else
              MATRIX_TYPE="${BUILD_TYPE}"
            fi
          fi
          export MATRIX_TYPE
          TEST_MATRIX=$(yq -n 'env(MATRICES) | .[strenv(MATRIX_TYPE)]')
          export TEST_MATRIX
          MATRIX="$(
            yq -n -o json 'env(TEST_MATRIX)' | \
            jq -c "${MATRIX_FILTER} | if (. | length) > 0 then {include: .} else \"Error: Empty matrix\n\" | halt_error(1) end"
          )"
          echo "MATRIX=${MATRIX}" | tee --append "${GITHUB_OUTPUT}"

  # Runs the caller's script once per matrix entry, inside a citestwheel
  # container on a GPU runner selected from the entry's ARCH / GPU / DRIVER.
  test:
    name: ${{ matrix.CUDA_VER }}, ${{ matrix.PY_VER }}, ${{ matrix.ARCH }}, ${{ matrix.LINUX_VER }}, ${{ matrix.GPU }}, ${{ matrix.DRIVER }}-driver, ${{ matrix.DEPENDENCIES }}-deps
    needs: compute-matrix
    strategy:
      # Let the remaining matrix entries finish even if one of them fails.
      fail-fast: false
      matrix: ${{ fromJSON(needs.compute-matrix.outputs.MATRIX) }}
    runs-on: "linux-${{ matrix.ARCH }}-gpu-${{ matrix.GPU }}-${{ matrix.DRIVER }}-1"
    container:
      image: "rapidsai/citestwheel:25.12-cuda${{ matrix.CUDA_VER }}-${{ matrix.LINUX_VER }}-py${{ matrix.PY_VER }}"
      options: ${{ inputs.container-options }}
      env:
        NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }} # GPU jobs must set this container env variable
        RAPIDS_BUILD_TYPE: ${{ inputs.build_type }}
    env:
      RAPIDS_ARTIFACTS_DIR: ${{ github.workspace }}/artifacts
      RAPIDS_DEPENDENCIES: ${{ matrix.DEPENDENCIES }}
      RAPIDS_TESTS_DIR: ${{ github.workspace }}/test-results
    steps:
      # Assume the AWS role configured in repository variables for this job.
      - uses: aws-actions/configure-aws-credentials@00943011d9042930efac3dcd3a170e4273319bc8 # v5.1.0
        with:
          role-to-assume: ${{ vars.AWS_ROLE_ARN }}
          aws-region: ${{ vars.AWS_REGION }}
          role-duration-seconds: 43200 # 12h
      - name: Run nvidia-smi to make sure GPU is working
        run: nvidia-smi
      # Check out the requested repo at the requested commit, with full history
      # and without leaving credentials behind in the git config.
      - name: checkout code repo
        uses: actions/checkout@v5
        with:
          repository: ${{ inputs.repo }}
          ref: ${{ inputs.sha }}
          fetch-depth: 0 # unshallow fetch for setuptools-scm
          persist-credentials: false
      # Optional and best-effort: only runs when telemetry is enabled, and a
      # failure here does not fail the job.
      - name: Telemetry setup
        if: ${{ vars.TELEMETRY_ENABLED == 'true' }}
        continue-on-error: true
        uses: rapidsai/shared-actions/telemetry-dispatch-setup@main
        with:
          extra_attributes: "rapids.PACKAGER=wheel,rapids.CUDA_VER=${{ matrix.CUDA_VER }},rapids.PY_VER=${{ matrix.PY_VER }},rapids.ARCH=${{ matrix.ARCH }},rapids.LINUX_VER=${{ matrix.LINUX_VER }},rapids.GPU=${{ matrix.GPU }},rapids.DRIVER=${{ matrix.DRIVER }},rapids.DEPENDENCIES=${{ matrix.DEPENDENCIES }}"
        env:
          # The alternative token is not needed here: github.token is sufficient and narrower in scope.
          GH_TOKEN: ${{ github.token }}
      - name: Standardize repository information
        uses: rapidsai/shared-actions/rapids-github-info@main
        with:
          repo: ${{ inputs.repo }}
          branch: ${{ inputs.branch }}
          date: ${{ inputs.date }}
          sha: ${{ inputs.sha }}
          build_workflow_name: ${{ inputs.build_workflow_name }}
      # Best-effort: a failure to set up the proxy cache does not fail the job.
      - name: Setup proxy cache
        continue-on-error: true
        uses: nv-gha-runners/setup-proxy-cache@main
      # Install latest rapidsai/sccache client and configure sccache-dist.
      # Skipped unless the caller supplied the name of the token secret.
      # NOTE(review): the action is referenced by a feature-branch ref
      # (fea/setup-sccache-dist), not main or a commit SHA - confirm before merging.
      # NOTE(review): cache-slug starts with "conda-" although this workflow
      # tests wheels - confirm the prefix is intended.
      - name: Setup sccache-dist
        if: ${{ inputs.sccache-dist-token-secret-name != '' }}
        uses: rapidsai/shared-actions/setup-sccache-dist@fea/setup-sccache-dist
        with:
          auth: "${{ secrets[inputs.sccache-dist-token-secret-name] }}" # zizmor: ignore[overprovisioned-secrets]
          cache-slug: "conda-py${{matrix.PY_VER}}-cuda${{matrix.CUDA_VER}}-${{matrix.ARCH}}"
          log-file: "${{ env.RAPIDS_ARTIFACTS_DIR }}/sccache.log"
          request-timeout: ${{ inputs.sccache-dist-request-timeout }}
        env:
          AWS_REGION: "${{env.AWS_REGION}}"
          AWS_ACCESS_KEY_ID: "${{env.AWS_ACCESS_KEY_ID}}"
          AWS_SECRET_ACCESS_KEY: "${{env.AWS_SECRET_ACCESS_KEY}}"
      # Per the docs at https://docs.github.com/en/rest/rate-limit/rate-limit?apiVersion=2022-11-28#get-rate-limit-status-for-the-authenticated-user,
      # querying '/rate_limit' should not itself count against any rate limits.
      #
      # The gh CLI is pre-installed on GitHub-hosted runners but may be missing on
      # self-hosted ones, so the check is skipped when 'gh' is not found.
      - name: Check GitHub API rate limits
        env:
          # NEEDS alternative-gh-token-secret-name: the limits reported must be those of the token
          # used for upload/download, which may draw from a different rate-limit pool than the repo token.
          GH_TOKEN: ${{ inputs.alternative-gh-token-secret-name && secrets[inputs.alternative-gh-token-secret-name] || github.token }} # zizmor: ignore[overprovisioned-secrets]
        run: |
          if ! type gh >/dev/null; then
            echo "'gh' CLI is not installed... skipping rate-limits check"
          else
            gh api /rate_limit | jq .
          fi
      # When an sccache-dist token secret name was supplied, raise the soft
      # open-files limit to the hard limit, then execute the caller's script.
      - name: Run tests
        env:
          INPUTS_SCRIPT: "${{ inputs.script }}"
          SCCACHE_DIST_TOKEN_NAME: "${{ inputs.sccache-dist-token-secret-name }}"
          # NEEDS alternative-gh-token-secret-name: the script may require a token with more permissions.
          GH_TOKEN: ${{ inputs.alternative-gh-token-secret-name && secrets[inputs.alternative-gh-token-secret-name] || github.token }} # zizmor: ignore[overprovisioned-secrets]
          RAPIDS_AUX_SECRET_1: ${{ inputs.rapids-aux-secret-1 != '' && secrets[inputs.rapids-aux-secret-1] || '' }} # zizmor: ignore[overprovisioned-secrets]
        run: |
          if test -n "${SCCACHE_DIST_TOKEN_NAME:+x}"; then ulimit -n "$(ulimit -Hn)"; fi
          $INPUTS_SCRIPT
      # Runs regardless of the outcome of earlier steps.
      - name: Generate test report
        if: always()
        uses: test-summary/action@31493c76ec9e7aa675f1585d3ed6f1da69269a86 # v2.4
        with:
          paths: "${{ env.RAPIDS_TESTS_DIR }}/*.xml"
          show: ${{ inputs.test_summary_show }}
      # Runs after failures too, but not when the run was cancelled.
      - name: Upload additional artifacts
        if: "!cancelled()"
        run: rapids-upload-artifacts-dir "cuda${RAPIDS_CUDA_VERSION%%.*}_$(arch)_py${RAPIDS_PY_VERSION//.}"
      # Optional and best-effort, like the telemetry setup step.
      - name: Telemetry upload attributes
        if: ${{ vars.TELEMETRY_ENABLED == 'true' }}
        continue-on-error: true
        uses: rapidsai/shared-actions/telemetry-dispatch-stash-job-artifacts@main
        env:
          # The alternative token is not needed here: github.token is sufficient and narrower in scope.
          GH_TOKEN: ${{ github.token }}