Skip to content

Commit ee8b9a1

Browse files
committed
feat: add graceful shutdown support
also fixed 2 bugs around resetting nodes
1 parent af23268 commit ee8b9a1

File tree

23 files changed

+864
-76
lines changed

23 files changed

+864
-76
lines changed

.github/workflows/operator-ci.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ on:
4949
env:
5050
REGISTRY: ghcr.io
5151
IMAGE_NAME: ${{ github.repository }}
52-
GO_VERSION: 1.23.7
52+
GO_VERSION: 1.23.8
5353
PLATFORMS: linux/amd64,linux/arm64
5454

5555
# There is a single job in this workflow. It's configured to run on the latest available version of Ubuntu.

agent/skyhook-agent/src/skyhook_agent/controller.py

Lines changed: 35 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -33,13 +33,26 @@
3333
import os
3434
import shutil
3535
import glob
36+
import signal
3637

3738
from skyhook_agent.step import Step, UpgradeStep, Idempotence, Mode, CHECK_TO_APPLY
3839
from skyhook_agent import interrupts, config
3940
from typing import List
4041

4142
import logging as logger
4243

44+
# Global flag to track if we received SIGTERM
45+
received_sigterm = False
46+
47+
def sigterm_handler(signum, frame):
48+
"""Handle SIGTERM by setting a global flag and logging the event"""
49+
global received_sigterm
50+
received_sigterm = True
51+
logger.info("Received SIGTERM signal - initiating graceful shutdown")
52+
53+
# Register the SIGTERM handler
54+
signal.signal(signal.SIGTERM, sigterm_handler)
55+
4356
class SkyhookValidationError(Exception):
4457
pass
4558

@@ -414,7 +427,11 @@ def remove_flags(step_data: dict[Mode, list[Step|UpgradeStep]], config_data: dic
414427
if os.path.exists(flag_file): # Check if the file exists before trying to remove it
415428
os.remove(flag_file)
416429

417-
def main(mode: Mode, root_mount: str, copy_dir: str, interrupt_data: None|str, always_run_step=False):
430+
def main(mode: Mode, root_mount: str, copy_dir: str, interrupt_data: None|str, always_run_step=False) -> bool:
431+
'''
432+
returns True if there is a failure in the steps, otherwise returns False
433+
'''
434+
418435
if mode not in set(map(str, Mode)):
419436
logger.warning(f"This version of the Agent doesn't support the {mode} mode. Options are: {','.join(map(str, Mode))}.")
420437
return False
@@ -448,9 +465,19 @@ def main(mode: Mode, root_mount: str, copy_dir: str, interrupt_data: None|str, a
448465
if not os.path.exists(f"{root_mount}/{copy_dir}/configmaps/{f}"):
449466
raise SkyhookValidationError(f"Expected config file {f} not found in configmaps directory.")
450467

451-
return agent_main(mode, root_mount, copy_dir, config_data, interrupt_data, always_run_step)
468+
try:
469+
return agent_main(mode, root_mount, copy_dir, config_data, interrupt_data, always_run_step)
470+
except Exception as e:
471+
if received_sigterm:
472+
logger.info("Gracefully shutting down due to SIGTERM")
473+
# Perform any cleanup if needed
474+
return True
475+
raise
452476

453-
def agent_main(mode: Mode, root_mount: str, copy_dir: str, config_data: dict, interrupt_data: None|str, always_run_step=False):
477+
def agent_main(mode: Mode, root_mount: str, copy_dir: str, config_data: dict, interrupt_data: None|str, always_run_step=False) -> bool:
478+
'''
479+
returns True if there is a failure in the steps, otherwise returns False
480+
'''
454481

455482
# Pull out step_data so it matches with existing code
456483
step_data = config_data["modes"]
@@ -464,6 +491,11 @@ def agent_main(mode: Mode, root_mount: str, copy_dir: str, config_data: dict, in
464491
logger.warning(f" There are no {mode} steps defined. This will be ran as a no-op.")
465492

466493
for step in step_data.get(mode, []):
494+
# Check for SIGTERM
495+
if received_sigterm:
496+
logger.info("SIGTERM received, stopping step execution")
497+
return True
498+
467499
# Make the flag file without the host path argument (first one). This is because in operator world
468500
# the host path is going to change every time the Skyhook Custom Resource changes so it would
469501
# look like a step hasn't been run when in fact it had.

chart/templates/skyhook-crd.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -349,6 +349,10 @@ spec:
349349
- name
350350
type: object
351351
type: array
352+
gracefulShutdown:
353+
description: GracefulShutdown is the graceful shutdown timeout
354+
for the package, if not set, uses k8s default
355+
type: string
352356
image:
353357
description: Image is the container image to run. Do not included
354358
the tag, that is set in the version.

containers/agentless/entrypoint.sh

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,13 @@
2020
# LICENSE END
2121
#
2222

23+
# Handle SIGTERM gracefully
24+
cleanup() {
25+
echo "Received SIGTERM signal, shutting down gracefully..."
26+
sleep 3
27+
exit 0
28+
}
29+
trap cleanup SIGTERM
2330

2431
SLEEP_LEN=${SLEEP_LEN:-$(($RANDOM % 5 + 5))}
2532

containers/ci.Dockerfile

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,10 @@
1818
# LICENSE END
1919
#
2020

21+
## this container is not used in GitHub CI; it dates from before we open sourced this project.
22+
## we should move to doing something like this in the GitHub Actions workflow, to save time by not installing all the deps on every run
23+
## but for now this is just here for when we get to that
24+
2125
ARG GO_VERSION
2226

2327
FROM golang:${GO_VERSION}-bookworm as builder
Lines changed: 165 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,165 @@
1+
#
2+
# LICENSE START
3+
#
4+
# Copyright (c) NVIDIA CORPORATION. All rights reserved.
5+
#
6+
# Licensed under the Apache License, Version 2.0 (the "License");
7+
# you may not use this file except in compliance with the License.
8+
# You may obtain a copy of the License at
9+
#
10+
# http://www.apache.org/licenses/LICENSE-2.0
11+
#
12+
# Unless required by applicable law or agreed to in writing, software
13+
# distributed under the License is distributed on an "AS IS" BASIS,
14+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
# See the License for the specific language governing permissions and
16+
# limitations under the License.
17+
#
18+
# LICENSE END
19+
#
20+
---
21+
kind: Pod
22+
apiVersion: v1
23+
metadata:
24+
namespace: skyhook
25+
labels:
26+
skyhook.nvidia.com/name: cleanup-pods
27+
skyhook.nvidia.com/package: aa-1.2.3
28+
annotations:
29+
("skyhook.nvidia.com/package" && parse_json("skyhook.nvidia.com/package")):
30+
{
31+
"name": "aa",
32+
"version": "1.2.3",
33+
"skyhook": "cleanup-pods",
34+
"stage": "apply",
35+
"image": "ghcr.io/nvidia/skyhook/agentless"
36+
}
37+
ownerReferences:
38+
- apiVersion: skyhook.nvidia.com/v1alpha1
39+
kind: Skyhook
40+
name: cleanup-pods
41+
spec:
42+
nodeName: kind-worker
43+
initContainers:
44+
- name: aa-init
45+
- name: aa-apply
46+
args:
47+
([0]): apply
48+
([1]): /root
49+
(length(@)): 3
50+
- name: aa-applycheck
51+
args:
52+
([0]): apply-check
53+
([1]): /root
54+
(length(@)): 3
55+
---
56+
kind: Pod
57+
apiVersion: v1
58+
metadata:
59+
namespace: skyhook
60+
labels:
61+
skyhook.nvidia.com/name: cleanup-pods
62+
skyhook.nvidia.com/package: aa-1.2.3
63+
annotations:
64+
("skyhook.nvidia.com/package" && parse_json("skyhook.nvidia.com/package")):
65+
{
66+
"name": "aa",
67+
"version": "1.2.3",
68+
"skyhook": "cleanup-pods",
69+
"stage": "config",
70+
"image": "ghcr.io/nvidia/skyhook/agentless"
71+
}
72+
ownerReferences:
73+
- apiVersion: skyhook.nvidia.com/v1alpha1
74+
kind: Skyhook
75+
name: cleanup-pods
76+
spec:
77+
nodeName: kind-worker
78+
initContainers:
79+
- name: aa-init
80+
- name: aa-config
81+
args:
82+
([0]): config
83+
([1]): /root
84+
(length(@)): 3
85+
- name: aa-configcheck
86+
args:
87+
([0]): config-check
88+
([1]): /root
89+
(length(@)): 3
90+
---
91+
kind: Pod
92+
apiVersion: v1
93+
metadata:
94+
namespace: skyhook
95+
labels:
96+
skyhook.nvidia.com/name: cleanup-pods
97+
skyhook.nvidia.com/package: bb-1.2
98+
annotations:
99+
("skyhook.nvidia.com/package" && parse_json("skyhook.nvidia.com/package")):
100+
{
101+
"name": "bb",
102+
"version": "1.2",
103+
"skyhook": "cleanup-pods",
104+
"stage": "apply",
105+
"image": "ghcr.io/nvidia/skyhook/agentless"
106+
}
107+
ownerReferences:
108+
- apiVersion: skyhook.nvidia.com/v1alpha1
109+
kind: Skyhook
110+
name: cleanup-pods
111+
spec:
112+
nodeName: kind-worker
113+
initContainers:
114+
- name: bb-init
115+
- name: bb-apply
116+
args:
117+
([0]): apply
118+
([1]): /root
119+
(length(@)): 3
120+
- name: bb-applycheck
121+
args:
122+
([0]): apply-check
123+
([1]): /root
124+
(length(@)): 3
125+
---
126+
apiVersion: skyhook.nvidia.com/v1alpha1
127+
kind: Skyhook
128+
metadata:
129+
name: cleanup-pods
130+
status:
131+
status: erroring
132+
observedGeneration: 4
133+
nodeState:
134+
kind-worker:
135+
aa|1.2.3:
136+
name: aa
137+
state: complete
138+
version: '1.2.3'
139+
image: ghcr.io/nvidia/skyhook/agentless
140+
stage: config
141+
bb|1.2:
142+
name: bb
143+
state: erroring
144+
version: '1.2'
145+
image: ghcr.io/nvidia/skyhook/agentless
146+
stage: apply
147+
kind-worker2:
148+
aa|1.2.3:
149+
name: aa
150+
state: complete
151+
version: '1.2.3'
152+
image: ghcr.io/nvidia/skyhook/agentless
153+
stage: config
154+
bb|1.2:
155+
name: bb
156+
state: erroring
157+
version: '1.2'
158+
image: ghcr.io/nvidia/skyhook/agentless
159+
stage: config
160+
161+
nodeStatus:
162+
# grab values: there should be two, and both are erroring
163+
(values(@)):
164+
- erroring
165+
- erroring
Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
#
2+
# LICENSE START
3+
#
4+
# Copyright (c) NVIDIA CORPORATION. All rights reserved.
5+
#
6+
# Licensed under the Apache License, Version 2.0 (the "License");
7+
# you may not use this file except in compliance with the License.
8+
# You may obtain a copy of the License at
9+
#
10+
# http://www.apache.org/licenses/LICENSE-2.0
11+
#
12+
# Unless required by applicable law or agreed to in writing, software
13+
# distributed under the License is distributed on an "AS IS" BASIS,
14+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
# See the License for the specific language governing permissions and
16+
# limitations under the License.
17+
#
18+
# LICENSE END
19+
#
20+
21+
---
22+
kind: Pod
23+
apiVersion: v1
24+
metadata:
25+
namespace: skyhook
26+
labels:
27+
skyhook.nvidia.com/name: cleanup-pods
28+
skyhook.nvidia.com/package: bb-1.2
29+
annotations:
30+
("skyhook.nvidia.com/package" && parse_json("skyhook.nvidia.com/package")):
31+
{
32+
"name": "bb",
33+
"version": "1.2",
34+
"skyhook": "cleanup-pods",
35+
"stage": "config",
36+
"image": "ghcr.io/nvidia/skyhook/agentless"
37+
}
38+
ownerReferences:
39+
- apiVersion: skyhook.nvidia.com/v1alpha1
40+
kind: Skyhook
41+
name: cleanup-pods
42+
spec:
43+
terminationGracePeriodSeconds: 46
44+
initContainers:
45+
- name: bb-init
46+
- name: bb-config
47+
args:
48+
([0]): config
49+
([1]): /root
50+
(length(@)): 3
51+
- name: bb-configcheck
52+
args:
53+
([0]): config-check
54+
([1]): /root
55+
(length(@)): 3
56+
---
57+
apiVersion: skyhook.nvidia.com/v1alpha1
58+
kind: Skyhook
59+
metadata:
60+
name: cleanup-pods
61+
status:
62+
status: complete
63+
observedGeneration: 4
64+
nodeState:
65+
(values(@)):
66+
- aa|1.2.3:
67+
image: ghcr.io/nvidia/skyhook/agentless
68+
name: aa
69+
stage: config
70+
state: complete
71+
version: 1.2.3
72+
- bb|1.2:
73+
image: ghcr.io/nvidia/skyhook/agentless
74+
name: bb
75+
stage: config
76+
state: complete
77+
version: "1.2"
78+
nodeStatus:
79+
# grab values: there should be two, and both are complete
80+
(values(@)):
81+
- complete
82+
- complete

0 commit comments

Comments
 (0)