Skip to content

Commit 10a4c13

Browse files
Merge branch 'main' into HIPPO-1446
2 parents 6090a48 + de088fb commit 10a4c13

File tree

3 files changed

+273
-0
lines changed

3 files changed

+273
-0
lines changed
Lines changed: 113 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,113 @@
1+
#!/bin/bash
2+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
# Abort as soon as a top-level command fails (for example the node listing).
set -o errexit

# DEPRECATED_CONDITIONS carries a comma-separated list of condition types.
# When it is empty there is nothing to clean up.
if [[ -z "$DEPRECATED_CONDITIONS" ]]; then
  printf '%s\n' "No deprecated conditions configured"
  exit 0
fi

# Split the list on commas into the global `conditions` array.
IFS=',' read -r -a conditions <<< "$DEPRECATED_CONDITIONS"
printf '%s\n' "Node Condition Cleanup: Removing ${conditions[*]}"

# Whitespace-separated names of all nodes, and how many there are.
nodes=$(kubectl get nodes -o jsonpath='{.items[*].metadata.name}')
node_count=$(printf '%s\n' "$nodes" | wc -w | tr -d ' ')
printf '%s\n' "Processing $node_count nodes (up to 5 in parallel)..."
29+
30+
#######################################
# Remove every deprecated condition type from one node's status.
#
# Reads the node, drops the conditions whose type is listed in the global
# `conditions` array and writes the remaining list back with a JSON patch on
# the status subresource. A patch rejected with a "conflict" error is retried
# (after refetching the node) with exponential backoff.
#
# Globals:   conditions (read) - condition types to remove
# Arguments: $1 - node name
# Outputs:   one status line on stdout for handled nodes; errors on stderr
# Returns:   0 when nothing had to be removed or the patch succeeded,
#            1 when the node could not be read, filtered or patched
#######################################
process_node() {
  local node=$1
  local -r MAX_RETRIES=3
  local -r BASE_DELAY=0.1
  local retry=0
  local node_json current_conditions filtered_conditions
  local patch_output delay condition
  local found_conditions

  while true; do
    # Refetch on every attempt so a retry works from the latest state.
    # The fetch is checked on its own: inside a `kubectl | jq` pipeline a
    # kubectl failure is hidden behind jq's exit status, and the node would
    # be treated as clean without ever having been inspected.
    if ! node_json=$(kubectl get node "$node" -o json); then
      echo "$node: failed to fetch node" >&2
      return 1
    fi
    if ! current_conditions=$(printf '%s\n' "$node_json" \
      | jq -c '.status.conditions // []'); then
      echo "$node: failed to parse node conditions" >&2
      return 1
    fi

    if [ "$current_conditions" = "[]" ]; then
      # Only announce this on the first pass, not after a retry.
      if [ "$retry" -eq 0 ]; then
        echo "- $node: no conditions found"
      fi
      return 0
    fi

    filtered_conditions=$current_conditions
    found_conditions=()
    for condition in "${conditions[@]}"; do
      # jq -e turns the boolean result into the exit status.
      if printf '%s\n' "$current_conditions" \
        | jq -e --arg type "$condition" 'any(.[]; .type == $type)' \
          >/dev/null 2>&1; then
        found_conditions+=("$condition")
        # Fail instead of silently keeping the unfiltered list, which would
        # report a condition as removed while it is still being written back.
        if ! filtered_conditions=$(printf '%s\n' "$filtered_conditions" \
          | jq -c --arg type "$condition" '[.[] | select(.type != $type)]'); then
          echo "$node: failed to filter out condition $condition" >&2
          return 1
        fi
      fi
    done

    # None of the deprecated types is present: nothing to patch.
    if [ "${#found_conditions[@]}" -eq 0 ]; then
      return 0
    fi

    # NOTE(review): the patch replaces the whole conditions list computed from
    # the read above; confirm that overwriting condition updates made by other
    # writers in between is acceptable.
    if patch_output=$(kubectl patch node "$node" --type=json \
      -p="[{\"op\":\"replace\",\"path\":\"/status/conditions\",\"value\":$filtered_conditions}]" \
      --subresource=status 2>&1); then
      echo "$node: removed ${#found_conditions[@]} condition(s): ${found_conditions[*]}"
      return 0
    fi

    # Anything other than a conflict is not worth retrying.
    if ! printf '%s\n' "$patch_output" | grep -qi "conflict"; then
      echo "$node: patch failed - $patch_output" >&2
      return 1
    fi

    retry=$((retry + 1))
    if [ "$retry" -ge "$MAX_RETRIES" ]; then
      echo "$node: conflict persists after $MAX_RETRIES attempts - $patch_output" >&2
      return 1
    fi

    # Exponential backoff: BASE_DELAY, then twice that, ...
    delay=$(awk -v base="$BASE_DELAY" -v n="$retry" \
      'BEGIN { print base * (2 ^ (n - 1)) }')
    sleep "$delay"
  done
}
92+
93+
#######################################
# Run process_node (defined earlier in this script) for every given node,
# keeping at most max_parallel workers in flight, and report whether any
# worker failed.
#
# Arguments: node names, one per argument
# Outputs:   a progress line every 10 started nodes and a completion line on
#            stdout; a failure summary on stderr
# Returns:   0 when every worker succeeded, 1 when at least one failed
#######################################
cleanup_all_nodes() {
  local -r max_parallel=5
  local -r total=$#
  local processed=0
  local failed=0
  local node pid
  local pids=()

  for node in "$@"; do
    process_node "$node" &
    pids+=("$!")
    processed=$((processed + 1))

    # Throttle: once the window is full, reap the oldest worker. Waiting on
    # an explicit PID (instead of a bare `wait`, which always returns 0) is
    # what makes the worker's exit status visible here.
    if [ "${#pids[@]}" -ge "$max_parallel" ]; then
      wait "${pids[0]}" || failed=$((failed + 1))
      pids=("${pids[@]:1}")
    fi

    if [ $((processed % 10)) -eq 0 ]; then
      echo "[$processed/$total nodes started]"
    fi
  done

  # Collect the workers that are still outstanding.
  for pid in "${pids[@]}"; do
    wait "$pid" || failed=$((failed + 1))
  done

  if [ "$failed" -gt 0 ]; then
    echo "Cleanup failed for $failed of $total nodes" >&2
    return 1
  fi
  echo "Cleanup completed for $total nodes"
}

# $nodes is a whitespace-separated list of names, so splitting is intended.
# shellcheck disable=SC2086
cleanup_all_nodes $nodes || exit 1
Lines changed: 142 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,142 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
{{ if .Values.nodeConditionCleanup.enabled }}
16+
---
17+
# ServiceAccount used by the node-condition-cleanup Job in this template
# (the Job's serviceAccountName refers to the same name).
# It is declared as a Helm post-upgrade hook and as an Argo CD PostSync hook,
# so it is created as part of the hook run rather than as a regular release
# resource.
apiVersion: v1
18+
kind: ServiceAccount
19+
metadata:
20+
name: {{ include "nvsentinel.fullname" . }}-node-condition-cleanup
21+
namespace: {{ .Release.Namespace }}
22+
labels:
23+
{{- include "nvsentinel.labels" . | nindent 4 }}
24+
annotations:
25+
helm.sh/hook: post-upgrade
26+
# Same weight as the other hook resources in this template.
helm.sh/hook-weight: "-5"
27+
# Delete a leftover copy before the next hook run, and delete this one once
# the hook has succeeded or failed.
helm.sh/hook-delete-policy: before-hook-creation,hook-succeeded,hook-failed
28+
argocd.argoproj.io/hook: PostSync
29+
argocd.argoproj.io/hook-delete-policy: BeforeHookCreation
30+
---
31+
# ClusterRole with the node permissions for the cleanup Job's service account.
# Cluster-scoped, hence no namespace in the metadata.
# Carries the same Helm/Argo CD hook annotations as the other resources in
# this template.
apiVersion: rbac.authorization.k8s.io/v1
32+
kind: ClusterRole
33+
metadata:
34+
name: {{ include "nvsentinel.fullname" . }}-node-condition-cleanup
35+
labels:
36+
{{- include "nvsentinel.labels" . | nindent 4 }}
37+
annotations:
38+
helm.sh/hook: post-upgrade
39+
helm.sh/hook-weight: "-5"
40+
helm.sh/hook-delete-policy: before-hook-creation,hook-succeeded,hook-failed
41+
argocd.argoproj.io/hook: PostSync
42+
argocd.argoproj.io/hook-delete-policy: BeforeHookCreation
43+
rules:
44+
# Core API group: read and patch Node objects. The cleanup script lists all
# nodes and fetches each node individually.
- apiGroups:
45+
- ""
46+
resources:
47+
- nodes
48+
verbs:
49+
- get
50+
- list
51+
- patch
52+
# Core API group: the status subresource of nodes, which the cleanup script
# patches (kubectl patch with --subresource=status).
- apiGroups:
53+
- ""
54+
resources:
55+
- nodes/status
56+
verbs:
57+
- get
58+
- patch
59+
- update
60+
---
61+
# ClusterRoleBinding that grants the node-condition-cleanup ClusterRole to the
# node-condition-cleanup ServiceAccount in the release namespace.
# Carries the same Helm/Argo CD hook annotations as the other resources in
# this template.
apiVersion: rbac.authorization.k8s.io/v1
62+
kind: ClusterRoleBinding
63+
metadata:
64+
name: {{ include "nvsentinel.fullname" . }}-node-condition-cleanup
65+
labels:
66+
{{- include "nvsentinel.labels" . | nindent 4 }}
67+
annotations:
68+
helm.sh/hook: post-upgrade
69+
helm.sh/hook-weight: "-5"
70+
helm.sh/hook-delete-policy: before-hook-creation,hook-succeeded,hook-failed
71+
argocd.argoproj.io/hook: PostSync
72+
argocd.argoproj.io/hook-delete-policy: BeforeHookCreation
73+
# The role being granted.
roleRef:
74+
apiGroup: rbac.authorization.k8s.io
75+
kind: ClusterRole
76+
name: {{ include "nvsentinel.fullname" . }}-node-condition-cleanup
77+
# The identity receiving the role.
subjects:
78+
- kind: ServiceAccount
79+
name: {{ include "nvsentinel.fullname" . }}-node-condition-cleanup
80+
namespace: {{ .Release.Namespace }}
81+
---
82+
# Job that runs the node condition cleanup script
# (files/node-condition-cleanup.sh, embedded below) once per hook run.
# It is declared as a Helm post-upgrade hook and an Argo CD PostSync hook and
# runs under the node-condition-cleanup ServiceAccount.
# The list of condition types to remove is passed to the script through the
# DEPRECATED_CONDITIONS environment variable.
apiVersion: batch/v1
83+
kind: Job
84+
metadata:
85+
name: {{ include "nvsentinel.fullname" . }}-node-condition-cleanup
86+
namespace: {{ .Release.Namespace }}
87+
labels:
88+
{{- include "nvsentinel.labels" . | nindent 4 }}
89+
annotations:
90+
helm.sh/hook: post-upgrade
91+
helm.sh/hook-weight: "-5"
92+
helm.sh/hook-delete-policy: before-hook-creation,hook-succeeded,hook-failed
93+
argocd.argoproj.io/hook: PostSync
94+
argocd.argoproj.io/hook-delete-policy: BeforeHookCreation
95+
spec:
96+
# Retry budget and overall time limit (in seconds) for the Job.
backoffLimit: 3
97+
activeDeadlineSeconds: 300
98+
template:
99+
metadata:
100+
labels:
101+
{{- include "nvsentinel.selectorLabels" . | nindent 8 }}
102+
app.kubernetes.io/component: node-condition-cleanup
103+
spec:
104+
serviceAccountName: {{ include "nvsentinel.fullname" . }}-node-condition-cleanup
105+
restartPolicy: OnFailure
106+
# The three sections below are rendered only when the corresponding value is
# set (each is wrapped in a "with" block).
{{- with .Values.global.imagePullSecrets }}
107+
imagePullSecrets:
108+
{{- toYaml . | nindent 8 }}
109+
{{- end }}
110+
{{- with .Values.nodeConditionCleanup.nodeSelector }}
111+
nodeSelector:
112+
{{- toYaml . | nindent 8 }}
113+
{{- end }}
114+
{{- with .Values.nodeConditionCleanup.tolerations }}
115+
tolerations:
116+
{{- toYaml . | nindent 8 }}
117+
{{- end }}
118+
containers:
119+
- name: cleanup
120+
image: {{ .Values.nodeConditionCleanup.image.repository }}:{{ .Values.nodeConditionCleanup.image.tag }}
121+
imagePullPolicy: {{ .Values.nodeConditionCleanup.image.pullPolicy }}
122+
env:
123+
# The deprecatedConditions list joined with commas; the script splits it
# again on commas.
- name: DEPRECATED_CONDITIONS
124+
value: {{ .Values.nodeConditionCleanup.deprecatedConditions | join "," | quote }}
125+
# The script text is inlined as the argument of "bash -c".
command:
126+
- /bin/bash
127+
- -c
128+
- |
129+
{{ .Files.Get "files/node-condition-cleanup.sh" | indent 10 }}
130+
resources:
131+
{{- toYaml .Values.nodeConditionCleanup.resources | nindent 10 }}
132+
# Hardened container: non-root user, no privilege escalation, all
# capabilities dropped, default seccomp profile.
# NOTE(review): the root filesystem is read-only and no writable volume is
# mounted in this template; confirm that bash and kubectl need no scratch
# space (for example under /tmp) in the chosen image.
securityContext:
133+
allowPrivilegeEscalation: false
134+
runAsNonRoot: true
135+
runAsUser: 65534
136+
readOnlyRootFilesystem: true
137+
capabilities:
138+
drop:
139+
- ALL
140+
seccompProfile:
141+
type: RuntimeDefault
142+
{{- end }}

distros/kubernetes/nvsentinel/values.yaml

Lines changed: 18 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -105,3 +105,21 @@ platformConnector:
105105
- "cloud.google.com/gce-topology-subblock"
106106

107107
socketPath: "/var/run/nvsentinel.sock"
108+
109+
# Node condition cleanup hook configuration.
# Controls the post-upgrade hook Job that removes deprecated condition types
# from node status. The hook template also reads the optional keys
# nodeConditionCleanup.nodeSelector and nodeConditionCleanup.tolerations,
# which have no default here.
110+
nodeConditionCleanup:
111+
# When false, none of the hook resources are rendered.
enabled: false
112+
# Condition types to remove; passed to the cleanup script as a
# comma-separated list. With an empty list the script exits without
# touching any node.
deprecatedConditions: []
113+
# - "OldConditionType1"
114+
# - "OldConditionType2"
115+
# Container image that runs the cleanup script. The script invokes bash,
# kubectl, jq and awk, so the image has to provide them.
image:
116+
repository: docker.io/bitnamilegacy/kubectl
117+
tag: "1.30.6"
118+
pullPolicy: IfNotPresent
119+
# Resource requests and limits for the cleanup container.
resources:
120+
limits:
121+
cpu: 100m
122+
memory: 128Mi
123+
requests:
124+
cpu: 50m
125+
memory: 64Mi

0 commit comments

Comments
 (0)