Skip to content

Commit 5a35ede

Browse files
committed
feat: Add post-upgrade hook to clean up deprecated node conditions
1 parent f950fb0 commit 5a35ede

File tree

3 files changed

+251
-0
lines changed

3 files changed

+251
-0
lines changed
Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
#!/bin/bash
2+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
set -e

# Nothing to clean up unless a comma-separated list of condition types was
# supplied through DEPRECATED_CONDITIONS; an empty or unset value is a no-op.
case "${DEPRECATED_CONDITIONS}" in
  "")
    echo "No deprecated conditions configured"
    exit 0
    ;;
esac

# Break the list apart on commas; process_node iterates over this array.
IFS=',' read -ra conditions <<< "$DEPRECATED_CONDITIONS"
echo "Node Condition Cleanup: Removing ${conditions[*]}"

# A single kubectl call yields all node names as one whitespace-separated
# string, which is later word-split by the dispatch loop.
nodes=$(kubectl get nodes -o jsonpath='{.items[*].metadata.name}')
# Count the names with wc; tr drops any padding wc prints around the number.
node_count=$(printf '%s\n' "$nodes" | wc -w | tr -d ' ')
echo "Processing $node_count nodes (up to 6 in parallel)..."
29+
30+
#######################################
# Remove every deprecated condition type from one node's status.
# Globals:
#   conditions (read) - array of condition type names to delete
# Arguments:
#   $1 - node name
# Outputs:
#   One progress line on stdout when the node has no conditions or was
#   patched; failure details on stderr.
# Returns:
#   0 when the node needed no change or was patched successfully,
#   1 when the node could not be fetched, parsed or patched.
#######################################
process_node() {
  local node=$1
  local node_json current_conditions

  # Capture kubectl's output first so its exit status is not hidden behind
  # jq's in a pipeline. kubectl's stderr is left untouched so the reason for
  # a failed lookup shows up in the log.
  if ! node_json=$(kubectl get node "$node" -o json); then
    echo "$node: failed to fetch node" >&2
    return 1
  fi

  # Data is piped (not passed via here-string) so no temp file is needed.
  if ! current_conditions=$(printf '%s' "$node_json" | jq -c '.status.conditions // []') \
      || [ -z "$current_conditions" ]; then
    echo "$node: failed to parse node conditions" >&2
    return 1
  fi

  if [ "$current_conditions" = "[]" ]; then
    echo "- $node: no conditions found"
    return 0
  fi

  # Walk the configured types, dropping each one that is present on the node.
  local filtered_conditions="$current_conditions"
  local found_conditions=()
  local condition exists

  for condition in "${conditions[@]}"; do
    if ! exists=$(printf '%s' "$current_conditions" \
        | jq --arg type "$condition" 'any(.[]; .type == $type)'); then
      echo "$node: failed to inspect condition $condition" >&2
      return 1
    fi

    if [ "$exists" = "true" ]; then
      found_conditions+=("$condition")
      # Abort instead of falling back to the unfiltered list: otherwise the
      # condition would be reported as removed while still being written back.
      if ! filtered_conditions=$(printf '%s' "$filtered_conditions" \
          | jq -c --arg type "$condition" '[.[] | select(.type != $type)]'); then
        echo "$node: failed to filter condition $condition" >&2
        return 1
      fi
    fi
  done

  # Nothing to remove on this node: succeed without touching it.
  if [ ${#found_conditions[@]} -eq 0 ]; then
    return 0
  fi

  # Write the remaining conditions back in one request against the status
  # subresource.
  # NOTE(review): this replaces the whole list with a snapshot read earlier;
  # a condition update landing between the read and this patch would be
  # overwritten - confirm that window is acceptable.
  local patch_output
  if patch_output=$(kubectl patch node "$node" --type=json -p="[{\"op\":\"replace\",\"path\":\"/status/conditions\",\"value\":$filtered_conditions}]" --subresource=status 2>&1); then
    echo "$node: removed ${#found_conditions[@]} condition(s): ${found_conditions[*]}"
  else
    echo "$node: failed to remove conditions - $patch_output" >&2
    return 1
  fi
}
70+
71+
# Fan process_node out over all nodes with a bounded number of concurrent
# workers, then fail the script if any worker reported an error.
active_jobs=0
max_parallel=6
processed=0
failed=0
pids=()

# Word splitting on $nodes is intentional: it holds whitespace-separated names.
for node in $nodes; do
  process_node "$node" &
  pids+=("$!")
  active_jobs=$((active_jobs + 1))
  processed=$((processed + 1))

  # Throttle only: block until some worker finishes. Exit statuses are
  # collected per PID after the loop, so the result here is ignored on purpose.
  if [ "$active_jobs" -ge "$max_parallel" ]; then
    wait -n 2>/dev/null || true
    active_jobs=$((active_jobs - 1))
  fi

  if [ $((processed % 10)) -eq 0 ]; then
    echo "[$processed/$node_count nodes started]"
  fi
done

# Collect every worker's exit status. Capturing it with `|| rc=$?` keeps
# `set -e` from aborting on the first failed worker.
for pid in "${pids[@]}"; do
  rc=0
  wait "$pid" || rc=$?
  # 127 means bash has no status for that PID, not that the worker failed.
  if [ "$rc" -ne 0 ] && [ "$rc" -ne 127 ]; then
    failed=$((failed + 1))
  fi
done

if [ "$failed" -gt 0 ]; then
  echo "Cleanup failed for $failed of $node_count nodes" >&2
  exit 1
fi

echo "Cleanup completed for $node_count nodes"
Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,142 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
{{- if .Values.nodeConditionCleanup.enabled }}
16+
---
17+
# Identity used by the node-condition-cleanup Job's pod.
kind: ServiceAccount
apiVersion: v1
metadata:
  namespace: {{ .Release.Namespace }}
  name: {{ include "nvsentinel.fullname" . }}-node-condition-cleanup
  annotations:
    # Helm hook settings
    "helm.sh/hook": "post-upgrade"
    "helm.sh/hook-weight": "-5"
    "helm.sh/hook-delete-policy": "before-hook-creation,hook-succeeded,hook-failed"
    # Argo CD hook settings
    "argocd.argoproj.io/hook": "PostSync"
    "argocd.argoproj.io/hook-delete-policy": "BeforeHookCreation"
  labels:
    {{- include "nvsentinel.labels" . | nindent 4 }}
30+
---
31+
# Permissions for the node-condition-cleanup Job, limited to the API calls
# the cleanup script issues through kubectl.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: {{ include "nvsentinel.fullname" . }}-node-condition-cleanup
  labels:
    {{- include "nvsentinel.labels" . | nindent 4 }}
  annotations:
    helm.sh/hook: post-upgrade
    helm.sh/hook-weight: "-5"
    helm.sh/hook-delete-policy: before-hook-creation,hook-succeeded,hook-failed
    argocd.argoproj.io/hook: PostSync
    argocd.argoproj.io/hook-delete-policy: BeforeHookCreation
rules:
  # `kubectl get nodes` (list) and `kubectl get node <name>` (get).
  # The script never patches the node object itself, so `patch` is not granted.
  - apiGroups:
      - ""
    resources:
      - nodes
    verbs:
      - get
      - list
  # `kubectl patch node <name> --subresource=status` sends a PATCH to the
  # status subresource; `update` is not used by the script and is not granted.
  - apiGroups:
      - ""
    resources:
      - nodes/status
    verbs:
      - get
      - patch
60+
---
61+
# Binds the cleanup ServiceAccount to the ClusterRole that carries the same
# "-node-condition-cleanup" name suffix.
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: {{ include "nvsentinel.fullname" . }}-node-condition-cleanup
  annotations:
    # Helm hook settings
    "helm.sh/hook": "post-upgrade"
    "helm.sh/hook-weight": "-5"
    "helm.sh/hook-delete-policy": "before-hook-creation,hook-succeeded,hook-failed"
    # Argo CD hook settings
    "argocd.argoproj.io/hook": "PostSync"
    "argocd.argoproj.io/hook-delete-policy": "BeforeHookCreation"
  labels:
    {{- include "nvsentinel.labels" . | nindent 4 }}
subjects:
  - kind: ServiceAccount
    namespace: {{ .Release.Namespace }}
    name: {{ include "nvsentinel.fullname" . }}-node-condition-cleanup
roleRef:
  kind: ClusterRole
  name: {{ include "nvsentinel.fullname" . }}-node-condition-cleanup
  apiGroup: rbac.authorization.k8s.io
81+
---
82+
# Job that runs files/node-condition-cleanup.sh with the configured list of
# deprecated condition types.
kind: Job
apiVersion: batch/v1
metadata:
  namespace: {{ .Release.Namespace }}
  name: {{ include "nvsentinel.fullname" . }}-node-condition-cleanup
  annotations:
    # Helm hook settings
    "helm.sh/hook": "post-upgrade"
    "helm.sh/hook-weight": "-5"
    "helm.sh/hook-delete-policy": "before-hook-creation,hook-succeeded,hook-failed"
    # Argo CD hook settings
    "argocd.argoproj.io/hook": "PostSync"
    "argocd.argoproj.io/hook-delete-policy": "BeforeHookCreation"
  labels:
    {{- include "nvsentinel.labels" . | nindent 4 }}
spec:
  activeDeadlineSeconds: 300
  backoffLimit: 3
  template:
    metadata:
      labels:
        {{- include "nvsentinel.selectorLabels" . | nindent 8 }}
        app.kubernetes.io/component: node-condition-cleanup
    spec:
      restartPolicy: OnFailure
      serviceAccountName: {{ include "nvsentinel.fullname" . }}-node-condition-cleanup
      {{- with .Values.global.imagePullSecrets }}
      imagePullSecrets:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      {{- with .Values.nodeConditionCleanup.nodeSelector }}
      nodeSelector:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      {{- with .Values.nodeConditionCleanup.tolerations }}
      tolerations:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      containers:
      - name: cleanup
        image: {{ .Values.nodeConditionCleanup.image.repository }}:{{ .Values.nodeConditionCleanup.image.tag }}
        imagePullPolicy: {{ .Values.nodeConditionCleanup.image.pullPolicy }}
        env:
        # Comma-joined list consumed by the script.
        - name: DEPRECATED_CONDITIONS
          value: {{ .Values.nodeConditionCleanup.deprecatedConditions | join "," | quote }}
        command:
        - /bin/bash
        - -c
        - |
{{ .Files.Get "files/node-condition-cleanup.sh" | indent 10 }}
        resources:
          {{- toYaml .Values.nodeConditionCleanup.resources | nindent 10 }}
        securityContext:
          runAsNonRoot: true
          runAsUser: 65534
          readOnlyRootFilesystem: true
          allowPrivilegeEscalation: false
          capabilities:
            drop: ["ALL"]
          seccompProfile:
            type: RuntimeDefault
142+
{{- end }}

distros/kubernetes/nvsentinel/values.yaml

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,3 +105,21 @@ platformConnector:
105105
- "cloud.google.com/gce-topology-subblock"
106106

107107
socketPath: "/var/run/nvsentinel.sock"
108+
109+
# Node condition cleanup hook configuration
nodeConditionCleanup:
  # Render the cleanup hook resources (ServiceAccount, RBAC and Job).
  enabled: false
  # Condition types to delete from every node's status.
  deprecatedConditions: []
  # - "OldConditionType1"
  # - "OldConditionType2"
  # Image that runs the cleanup script; the script calls kubectl and jq
  # through /bin/bash, so the image has to provide all three.
  image:
    repository: docker.io/bitnamilegacy/kubectl
    tag: "1.30.6"
    pullPolicy: IfNotPresent
  # Optional scheduling constraints for the cleanup pod. The Job template
  # only emits these fields when they are non-empty.
  nodeSelector: {}
  tolerations: []
  resources:
    limits:
      cpu: 100m
      memory: 128Mi
    requests:
      cpu: 50m
      memory: 64Mi

0 commit comments

Comments
 (0)