Skip to content

Commit 2dbc4ab

Browse files
authored
Don't restart the sccache process when refreshing credentials (#595)
Our sccache fork [now watches](rapidsai/sccache@eeb723c) for changes to `~/.aws/{config,credentials}` and recreates its S3 storage backend as necessary. This means we no longer have to kill and restart the `sccache` client daemon when reloading credentials, so builds that are in progress when the cron job refreshes the temporary S3 credentials are no longer killed.
1 parent b9ea85a commit 2dbc4ab

File tree

13 files changed

+407
-42
lines changed

13 files changed

+407
-42
lines changed

.devcontainer/rapids.Dockerfile

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -52,8 +52,7 @@ ENV LIBCUDF_KERNEL_CACHE_PATH="/home/coder/cudf/cpp/build/${PYTHON_PACKAGE_MANAG
5252
ENV AWS_ROLE_ARN="arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs"
5353
ENV SCCACHE_REGION="us-east-2"
5454
ENV SCCACHE_BUCKET="rapids-sccache-devs"
55-
# 2hr (1 minute longer than sccache-dist request timeout)
56-
ENV SCCACHE_IDLE_TIMEOUT=7200
55+
ENV SCCACHE_IDLE_TIMEOUT=0
5756

5857
###
5958
# sccache-dist configuration
@@ -64,13 +63,8 @@ ENV DEVCONTAINER_UTILS_ENABLE_SCCACHE_DIST=1
6463
ENV SCCACHE_DIST_FALLBACK_TO_LOCAL_COMPILE=true
6564
# Retry transient errors 4 times (for a total of 5 attempts)
6665
ENV SCCACHE_DIST_MAX_RETRIES=4
67-
ENV SCCACHE_DIST_CONNECT_TIMEOUT=30
68-
ENV SCCACHE_DIST_CONNECTION_POOL=false
6966
# 1hr 59min (to accommodate debug builds)
7067
ENV SCCACHE_DIST_REQUEST_TIMEOUT=7140
71-
ENV SCCACHE_DIST_KEEPALIVE_ENABLED=true
72-
ENV SCCACHE_DIST_KEEPALIVE_INTERVAL=20
73-
ENV SCCACHE_DIST_KEEPALIVE_TIMEOUT=600
7468
ENV SCCACHE_DIST_URL="https://${TARGETARCH}.linux.sccache.rapids.nvidia.com"
7569

7670
# Build as much in parallel as possible

features/src/utils/devcontainer-feature.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
{
22
"name": "devcontainer-utils",
33
"id": "utils",
4-
"version": "25.12.0",
4+
"version": "25.12.1",
55
"description": "A feature to install RAPIDS devcontainer utility scripts",
66
"containerEnv": {
77
"BASH_ENV": "/etc/bash.bash_env"

features/src/utils/opt/devcontainer/bin/creds/s3/gh/generate.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ _creds_github_generate() {
2929
. devcontainer-utils-init-github-cli;
3030
3131
# Check whether the user is in one of the allowed GitHub orgs
32-
local allowed_orgs="${AWS_GITHUB_ORGS:-${VAULT_GITHUB_ORGS:-nvidia nv-morpheus nv-legate rapids}}";
32+
local allowed_orgs="${AWS_GITHUB_ORGS:-${VAULT_GITHUB_ORGS:-nvidia nv-morpheus nv-legate rapidsai}}";
3333
allowed_orgs="${allowed_orgs// /|}";
3434
allowed_orgs="${allowed_orgs//;/|}";
3535
allowed_orgs="${allowed_orgs//,/|}";

features/src/utils/opt/devcontainer/bin/creds/s3/propagate.sh

Lines changed: 13 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -8,28 +8,25 @@ _creds_s3_propagate() {
88
. devcontainer-utils-debug-output 'devcontainer_utils_debug' 'creds-s3 creds-s3-propagate';
99

1010
if ! command -V sccache >/dev/null 2>&1; then
11-
return;
11+
return 1;
1212
fi
1313

14-
local num_restarts="0";
15-
16-
devcontainer-utils-stop-sccache --kill-all;
17-
18-
while true; do
19-
20-
if devcontainer-utils-start-sccache >/dev/null; then
21-
if [ "${num_restarts}" -gt "0" ]; then echo "Success!"; fi
22-
exit 0;
14+
seq 0 20 | while read -r num_restarts; do
15+
if devcontainer-utils-creds-s3-test; then
16+
if test "$num_restarts" -gt 0; then
17+
echo "Success!";
18+
fi
19+
return 0;
2320
fi
2421

25-
if [ "${num_restarts}" -ge "20" ]; then
26-
if [ "${num_restarts}" -gt "0" ]; then echo "Skipping."; fi
27-
exit 1;
22+
if test "$num_restarts" -ge 20; then
23+
if test "$num_restarts" -gt 0; then
24+
echo "Skipping.";
25+
fi
26+
return 1;
2827
fi
2928

30-
num_restarts="$((num_restarts + 1))";
31-
32-
if [ "${num_restarts}" -eq "1" ]; then
29+
if test "$num_restarts" -eq 0; then
3330
echo -n "Waiting for AWS S3 credentials to propagate... ";
3431
fi
3532

features/src/utils/opt/devcontainer/bin/creds/s3/test.sh

Lines changed: 34 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -9,33 +9,56 @@ _creds_s3_test() {
99
# shellcheck disable=SC1091
1010
. devcontainer-utils-debug-output 'devcontainer_utils_debug' 'creds-s3 creds-s3-test';
1111

12-
if ! command -V sccache >/dev/null 2>&1; then exit 1; fi
12+
if ! command -V sccache >/dev/null 2>&1; then
13+
return 1;
14+
fi
1315

1416
if test -f ~/.aws/stamp; then
1517
local -r now="$(date '+%s')";
1618
local -r stamp="$(cat ~/.aws/stamp)";
1719
local ttl="${VAULT_S3_TTL:-"43200"}";
1820
if [ $((now - stamp)) -ge "${ttl%s}" ]; then
19-
exit 1;
21+
return 1;
2022
fi
2123
fi
2224

2325
local bucket="${SCCACHE_BUCKET:-"$(sed -n 's/bucket=//p' ~/.aws/config 2>/dev/null)"}";
24-
if [ -z "${bucket:-}" ]; then exit 1; fi
26+
if ! test -n "${bucket:+n}"; then
27+
return 1;
28+
fi
2529

2630
local region="${SCCACHE_REGION:-"${AWS_DEFAULT_REGION:-"$(sed -n 's/region=//p' ~/.aws/config 2>/dev/null)"}"}";
2731
local aws_access_key_id="${AWS_ACCESS_KEY_ID:-"$(sed -n 's/aws_access_key_id=//p' ~/.aws/credentials 2>/dev/null)"}";
2832
local aws_session_token="${AWS_SESSION_TOKEN:-"$(sed -n 's/aws_session_token=//p' ~/.aws/credentials 2>/dev/null)"}";
2933
local aws_secret_access_key="${AWS_SECRET_ACCESS_KEY:-"$(sed -n 's/aws_secret_access_key=//p' ~/.aws/credentials 2>/dev/null)"}";
3034

31-
AWS_PROFILE=none \
32-
SCCACHE_BUCKET="${bucket:-}" \
33-
SCCACHE_REGION="${region:-}" \
34-
AWS_ACCESS_KEY_ID="${aws_access_key_id:-}" \
35-
AWS_SESSION_TOKEN="${aws_session_token:-}" \
36-
AWS_SECRET_ACCESS_KEY="${aws_secret_access_key:-}" \
37-
devcontainer-utils-start-sccache >/dev/null \
38-
&& sccache --show-stats 2>/dev/null | grep -qE 'Cache location \s+ s3';
35+
devcontainer-utils-stop-sccache --kill -p 4220 || true;
36+
37+
local result=0;
38+
39+
if ! AWS_PROFILE=none \
40+
SCCACHE_BUCKET="${bucket:-}" \
41+
SCCACHE_REGION="${region:-}" \
42+
AWS_ACCESS_KEY_ID="${aws_access_key_id:-}" \
43+
AWS_SESSION_TOKEN="${aws_session_token:-}" \
44+
AWS_SECRET_ACCESS_KEY="${aws_secret_access_key:-}" \
45+
timeout --preserve-status --kill-after=1m 30s \
46+
devcontainer-utils-start-sccache -p 4220 >/dev/null 2>&1; then
47+
result=1;
48+
elif ! SCCACHE_SERVER_PORT=4220 sccache --show-stats 2>/dev/null \
49+
| grep -qE 'Cache location \s+ s3'; then
50+
result=1;
51+
fi
52+
53+
devcontainer-utils-stop-sccache --kill -p 4220 || true;
54+
55+
if test "$result" -eq 0; then
56+
local logfile="${SCCACHE_ERROR_LOG:-/tmp/sccache.log}";
57+
logfile="$(dirname "$logfile")/$(basename -s .log "$logfile").4220.log";
58+
rm -f "${logfile}";
59+
fi
60+
61+
return "$result";
3962
}
4063

4164
_creds_s3_test "$@";

features/src/utils/opt/devcontainer/bin/creds/s3/vault/generate.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ _creds_vault_generate() {
3535
fi
3636

3737
# Check whether the user is in one of the allowed GitHub orgs
38-
local allowed_orgs="${VAULT_GITHUB_ORGS:-nvidia nv-morpheus nv-legate rapids}";
38+
local allowed_orgs="${VAULT_GITHUB_ORGS:-nvidia nv-morpheus nv-legate rapidsai}";
3939
allowed_orgs="${allowed_orgs// /|}";
4040
allowed_orgs="${allowed_orgs//;/|}";
4141
allowed_orgs="${allowed_orgs//,/|}";

features/src/utils/opt/devcontainer/bin/post-attach-command.sh

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,5 +37,8 @@ if ! test -n "${SKIP_DEVCONTAINER_UTILS_POST_ATTACH_COMMAND:+x}"; then
3737
# Update ~/.config/sccache/config to use gh token auth
3838
devcontainer-utils-init-sccache-dist --enable-sccache-dist-with-github-auth;
3939
fi
40+
elif command -V sccache >/dev/null 2>&1; then
41+
# Start the sccache client
42+
devcontainer-utils-start-sccache --kill-all;
4043
fi
4144
fi

features/src/utils/opt/devcontainer/bin/post-create-command.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ fi
3434
# Randomize the sccache server port in case the container is launched with --network=host
3535
if ! test -n "${SCCACHE_SERVER_PORT:+x}"; then
3636
reset_envvar SCCACHE_SERVER_PORT;
37-
override_envvar SCCACHE_SERVER_PORT "$((4220 + $RANDOM % 4999))";
37+
override_envvar SCCACHE_SERVER_PORT "$((4226 + $RANDOM % 4999))";
3838
fi
3939

4040
sudo mkdir -m 0777 -p /var/log/devcontainer-utils;

features/src/utils/opt/devcontainer/bin/sccache/dist/init.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ _init_sccache_dist() {
7575
done
7676

7777
# Restart the sccache client with the new configuration
78-
devcontainer-utils-start-sccache;
78+
devcontainer-utils-start-sccache --kill-all;
7979

8080
# Verify sccache-dist status and configuration
8181
if sccache --dist-status 2>/dev/null | jq -er '.SchedulerStatus? != null' >/dev/null 2>&1; then

features/src/utils/opt/devcontainer/bin/sccache/start.sh

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,10 @@
66
# Start the sccache server in the foreground or background.
77
#
88
# Boolean options:
9-
# -h,--help Print this text.
109
# -f,--foreground Start the sccache server in the foreground.
10+
# -h,--help Print this text.
11+
# -k,--kill SIGKILL the existing sccache server on the given port.
12+
# -a,--kill-all SIGKILL all sccache processes.
1113
#
1214
# Options that require values:
1315
# -p,--port <port> Start the sccache server on <port>.
@@ -29,7 +31,7 @@ _start_sccache() {
2931
local pidfile="/tmp/sccache.${sccache_port}.pid";
3032

3133
# Stop any existing server
32-
devcontainer-utils-stop-sccache -p "${sccache_port}";
34+
devcontainer-utils-stop-sccache "$@";
3335

3436
local logfile="${SCCACHE_ERROR_LOG:-/tmp/sccache.log}";
3537
local log_lvl="${SCCACHE_LOG:-${SCCACHE_SERVER_LOG-}}";

0 commit comments

Comments
 (0)