Skip to content

Commit 44a0934

Browse files
authored
Merge pull request #188 from ArangoGutierrez/prune
Enable automated Prune via CRD NodeFeatureDiscoveries
2 parents 3e940ba + 8bea7b8 commit 44a0934

11 files changed

+311
-1
lines changed

api/v1/nodefeaturediscovery_types.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,13 @@ type NodeFeatureDiscoverySpec struct {
6363
// worker.
6464
// +optional
6565
WorkerConfig ConfigMap `json:"workerConfig"`
66+
67+
// PruneOnDelete defines whether the NFD-master prune should be
68+
// enabled or not. If enabled, the Operator will deploy an NFD-Master prune
69+
// job that will remove all NFD labels (and other NFD-managed assets such
70+
// as annotations, extended resources and taints) from the cluster nodes.
71+
// +optional
72+
PruneOnDelete bool `json:"prunerOnDelete"`
6673
}
6774

6875
// OperandSpec describes configuration options for the operand
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
apiVersion: v1
2+
kind: ServiceAccount
3+
metadata:
4+
name: nfd-prune
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
apiVersion: rbac.authorization.k8s.io/v1
2+
kind: ClusterRole
3+
metadata:
4+
name: nfd-prune
5+
rules:
6+
- apiGroups:
7+
- ""
8+
resources:
9+
- nodes
10+
verbs:
11+
- get
12+
- patch
13+
- update
14+
- list
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
apiVersion: rbac.authorization.k8s.io/v1
2+
kind: ClusterRoleBinding
3+
metadata:
4+
name: nfd-prune
5+
roleRef:
6+
apiGroup: rbac.authorization.k8s.io
7+
kind: ClusterRole
8+
name: nfd-prune
9+
subjects:
10+
- kind: ServiceAccount
11+
name: nfd-prune
12+
namespace: node-feature-discovery-operator
13+
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
# Job that runs "nfd-master -prune" once to remove NFD-managed assets from the
# cluster nodes. It prefers (and tolerates) master/control-plane nodes.
apiVersion: batch/v1
kind: Job
metadata:
  labels:
    app: nfd
  name: nfd-prune
spec:
  completions: 1
  template:
    metadata:
      labels:
        app: nfd-prune
    spec:
      affinity:
        nodeAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
            - preference:
                matchExpressions:
                  - key: node-role.kubernetes.io/master
                    operator: In
                    values:
                      - ""
              weight: 1
            - preference:
                matchExpressions:
                  - key: node-role.kubernetes.io/control-plane
                    operator: In
                    values:
                      - ""
              weight: 1
      containers:
        - args:
            - -prune
          command:
            - nfd-master
          env:
            - name: NODE_NAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
          # Placeholder value; the operator's Job control function overwrites
          # the image with the configured operand image before creation.
          image: $(NODE_FEATURE_DISCOVERY_IMAGE)
          imagePullPolicy: Always
          name: nfd-master
          securityContext:
            allowPrivilegeEscalation: false
            capabilities:
              drop:
                - ALL
            readOnlyRootFilesystem: true
            runAsNonRoot: true
      restartPolicy: Never
      # Run under the dedicated nfd-prune ServiceAccount: it is the subject of
      # the nfd-prune ClusterRoleBinding that grants access to nodes.
      serviceAccount: nfd-prune
      tolerations:
        - effect: NoSchedule
          key: node-role.kubernetes.io/master
          operator: Equal
          value: ""
        - effect: NoSchedule
          key: node-role.kubernetes.io/control-plane
          operator: Equal
          value: ""

config/crd/bases/nfd.kubernetes.io_nodefeaturediscoveries.yaml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ kind: CustomResourceDefinition
44
metadata:
55
annotations:
66
controller-gen.kubebuilder.io/version: v0.8.0
7+
api-approved.kubernetes.io: unapproved, experimental-only
78
creationTimestamp: null
89
name: nodefeaturediscoveries.nfd.kubernetes.io
910
spec:
@@ -71,6 +72,13 @@ spec:
7172
listens for incoming requests.
7273
type: integer
7374
type: object
75+
prunerOnDelete:
76+
description: PruneOnDelete defines whether the NFD-master prune should
77+
be enabled or not. If enabled, the Operator will deploy an NFD-Master
78+
prune job that will remove all NFD labels (and other NFD-managed
79+
assets such as annotations, extended resources and taints) from
80+
the cluster nodes.
81+
type: boolean
7482
resourceLabels:
7583
description: ResourceLabels defines the list of features to be advertised
7684
as extended resources instead of labels.

controllers/nodefeaturediscovery_controls.go

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ import (
2222
"strings"
2323

2424
appsv1 "k8s.io/api/apps/v1"
25+
batchv1 "k8s.io/api/batch/v1"
2526
corev1 "k8s.io/api/core/v1"
2627
rbacv1 "k8s.io/api/rbac/v1"
2728
"k8s.io/apimachinery/pkg/api/errors"
@@ -528,6 +529,67 @@ func Deployment(n NFD) (ResourceStatus, error) {
528529
return Ready, nil
529530
}
530531

532+
// Job checks the readiness of a Job and creates one if it doesn't exist
533+
func Job(n NFD) (ResourceStatus, error) {
534+
// state represents the resource's 'control' function index
535+
state := n.idx
536+
537+
// It is assumed that the index has already been verified to be a
538+
// Job object, so let's get the resource's Job object
539+
obj := n.resources[state].Job
540+
541+
// Update the NFD operand image
542+
obj.Spec.Template.Spec.Containers[0].Image = n.ins.Spec.Operand.ImagePath()
543+
544+
// Update the image pull policy
545+
if n.ins.Spec.Operand.ImagePullPolicy != "" {
546+
obj.Spec.Template.Spec.Containers[0].ImagePullPolicy = n.ins.Spec.Operand.ImagePolicy(n.ins.Spec.Operand.ImagePullPolicy)
547+
}
548+
549+
// Set namespace based on the NFD namespace. (And again,
550+
// it is assumed that the Namespace has already been
551+
// determined before this function was called.)
552+
obj.SetNamespace(n.ins.GetNamespace())
553+
554+
// found states if the Job was found
555+
found := &batchv1.Job{}
556+
557+
klog.InfoS("Looking for Job", "name", obj.Name, "namespace", obj.Namespace)
558+
559+
// SetControllerReference sets the owner as a Controller OwnerReference
560+
// and is used for garbage collection of the controlled object. It is
561+
// also used to reconcile the owner object on changes to the controlled
562+
// object. If we cannot set the owner, then return NotReady
563+
if err := controllerutil.SetControllerReference(n.ins, &obj, n.rec.Scheme); err != nil {
564+
return NotReady, err
565+
}
566+
567+
// Look for the Job to see if it exists, and if so, check if it's
568+
// Ready/NotReady. If the Job does not exist, then attempt to
569+
// create it
570+
err := n.rec.Client.Get(context.TODO(), types.NamespacedName{Namespace: obj.Namespace, Name: obj.Name}, found)
571+
if err != nil && errors.IsNotFound(err) {
572+
klog.InfoS("Job not found, creating", "name", obj.Name, "namespace", obj.Namespace)
573+
err = n.rec.Client.Create(context.TODO(), &obj)
574+
if err != nil {
575+
klog.ErrorS(err, "Couldn't create Job", "name", obj.Name, "namespace", obj.Namespace)
576+
return NotReady, err
577+
}
578+
return Ready, nil
579+
} else if err != nil {
580+
return NotReady, err
581+
}
582+
583+
// If we found the Job, and is Ready, then we're done
584+
if found.Status.Active > 0 {
585+
return NotReady, nil
586+
} else if found.Status.Failed > 0 {
587+
return NotReady, fmt.Errorf("prune Job failed")
588+
}
589+
590+
return Ready, nil
591+
}
592+
531593
// Service checks if a Service exists and creates one if it doesn't exist
532594
func Service(n NFD) (ResourceStatus, error) {
533595
// state represents the resource's 'control' function index

controllers/nodefeaturediscovery_finalizers.go

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,16 @@ func (r *NodeFeatureDiscoveryReconciler) finalizeNFDOperand(ctx context.Context,
4949
return ctrl.Result{RequeueAfter: 10 * time.Second}, nil
5050
}
5151

52+
if instance.Spec.PruneOnDelete {
53+
klog.Info("Deleting NFD labels and NodeFeature CRs from cluster")
54+
if err := deployPrune(ctx, r, instance); err != nil {
55+
klog.Error(err, "Failed to delete NFD labels and NodeFeature CRs from cluster")
56+
return ctrl.Result{}, err
57+
}
58+
} else {
59+
klog.Warning("PruneOnDelete is disabled, NFD labels and NodeFeature CRs will not be deleted from cluster")
60+
}
61+
5262
// If all components are deleted, then remove the finalizer
5363
klog.Info("Secondary check passed. Removing finalizer if it exists.")
5464
if r.hasFinalizer(instance, finalizer) {
@@ -363,6 +373,100 @@ func (r *NodeFeatureDiscoveryReconciler) doComponentsExist(ctx context.Context,
363373
return false
364374
}
365375

376+
// deployPrune deploys nfd-master with --prune option
377+
// to remove labels and NodeFeature CRs
378+
func deployPrune(ctx context.Context, r *NodeFeatureDiscoveryReconciler, instance *nfdv1.NodeFeatureDiscovery) error {
379+
res, ctrl := addResourcesControls("/opt/nfd/prune")
380+
n := NFD{
381+
rec: r,
382+
ins: instance,
383+
idx: 0,
384+
}
385+
386+
n.controls = append(n.controls, ctrl)
387+
n.resources = append(n.resources, res)
388+
389+
// Run through all control functions, return an error on any NotReady resource.
390+
for {
391+
err := n.step()
392+
if err != nil {
393+
return err
394+
}
395+
if n.last() {
396+
break
397+
}
398+
}
399+
400+
// wait until job is finished and then delete it
401+
err := wait.Poll(RetryInterval, time.Minute*3, func() (done bool, err error) {
402+
job, err := r.getJob(ctx, instance.ObjectMeta.Namespace, nfdPruneApp)
403+
if err != nil {
404+
return false, err
405+
}
406+
if job.Status.Succeeded > 0 {
407+
return true, nil
408+
}
409+
return false, nil
410+
})
411+
if err != nil {
412+
return err
413+
}
414+
415+
// delete job and RBAC objects
416+
// Attempt to delete the Job
417+
err = wait.Poll(RetryInterval, Timeout, func() (done bool, err error) {
418+
err = r.deleteJob(ctx, instance.ObjectMeta.Namespace, nfdPruneApp)
419+
if err != nil {
420+
return false, interpretError(err, "Prune Job")
421+
}
422+
klog.Info("nfd-prune Job resource has been deleted.")
423+
return true, nil
424+
})
425+
if err != nil {
426+
return err
427+
}
428+
// Attempt to delete the ServiceAccount
429+
err = wait.Poll(RetryInterval, Timeout, func() (done bool, err error) {
430+
err = r.deleteServiceAccount(ctx, instance.ObjectMeta.Namespace, nfdPruneApp)
431+
if err != nil {
432+
return false, interpretError(err, "Prune ServiceAccount")
433+
}
434+
klog.Info("nfd-prune ServiceAccount resource has been deleted.")
435+
return true, nil
436+
})
437+
if err != nil {
438+
return err
439+
}
440+
441+
// Attempt to delete the ClusterRole
442+
err = wait.Poll(RetryInterval, Timeout, func() (done bool, err error) {
443+
err = r.deleteClusterRole(ctx, instance.ObjectMeta.Namespace, nfdPruneApp)
444+
if err != nil {
445+
return false, interpretError(err, "Prune ClusterRole")
446+
}
447+
klog.Info("nfd-prune ClusterRole resource has been deleted.")
448+
return true, nil
449+
})
450+
if err != nil {
451+
return err
452+
}
453+
454+
// Attempt to delete the ClusterRoleBinding
455+
err = wait.Poll(RetryInterval, Timeout, func() (done bool, err error) {
456+
err = r.deleteClusterRoleBinding(ctx, instance.ObjectMeta.Namespace, nfdPruneApp)
457+
if err != nil {
458+
return false, interpretError(err, "Prune ClusterRoleBinding")
459+
}
460+
klog.Info("nfd-prune ClusterRoleBinding resource has been deleted.")
461+
return true, nil
462+
})
463+
if err != nil {
464+
return err
465+
}
466+
467+
return nil
468+
}
469+
366470
// interpretError determines if a resource has already been
367471
// (successfully) deleted
368472
func interpretError(err error, resourceName string) error {

controllers/nodefeaturediscovery_resources.go

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ import (
2424
"strings"
2525

2626
appsv1 "k8s.io/api/apps/v1"
27+
batchv1 "k8s.io/api/batch/v1"
2728
corev1 "k8s.io/api/core/v1"
2829
rbacv1 "k8s.io/api/rbac/v1"
2930
k8serrors "k8s.io/apimachinery/pkg/api/errors"
@@ -46,6 +47,7 @@ type Resources struct {
4647
ClusterRoleBinding rbacv1.ClusterRoleBinding
4748
ConfigMap corev1.ConfigMap
4849
DaemonSet appsv1.DaemonSet
50+
Job batchv1.Job
4951
Deployment appsv1.Deployment
5052
Pod corev1.Pod
5153
Service corev1.Service
@@ -143,6 +145,10 @@ func addResourcesControls(path string) (Resources, controlFunc) {
143145
_, _, err := s.Decode(m, nil, &res.Deployment)
144146
panicIfError(err)
145147
ctrl = append(ctrl, Deployment)
148+
case "Job":
149+
_, _, err := s.Decode(m, nil, &res.Job)
150+
panicIfError(err)
151+
ctrl = append(ctrl, Job)
146152
case "Service":
147153
_, _, err := s.Decode(m, nil, &res.Service)
148154
panicIfError(err)
@@ -184,6 +190,13 @@ func (r *NodeFeatureDiscoveryReconciler) getDeployment(ctx context.Context, name
184190
return d, err
185191
}
186192

193+
// getJob gets one of the NFD Operand's Job
194+
func (r *NodeFeatureDiscoveryReconciler) getJob(ctx context.Context, namespace string, name string) (*batchv1.Job, error) {
195+
j := &batchv1.Job{}
196+
err := r.Get(ctx, client.ObjectKey{Namespace: namespace, Name: name}, j)
197+
return j, err
198+
}
199+
187200
// getConfigMap gets one of the NFD Operand's ConfigMap
188201
func (r *NodeFeatureDiscoveryReconciler) getConfigMap(ctx context.Context, namespace string, name string) (*corev1.ConfigMap, error) {
189202
cm := &corev1.ConfigMap{}
@@ -290,6 +303,22 @@ func (r *NodeFeatureDiscoveryReconciler) deleteDeployment(ctx context.Context, n
290303
return r.Delete(context.TODO(), d)
291304
}
292305

306+
// deleteJob deletes Operand job
307+
func (r *NodeFeatureDiscoveryReconciler) deleteJob(ctx context.Context, namespace string, name string) error {
308+
j, err := r.getJob(ctx, namespace, name)
309+
310+
// Do not return an error if the object has already been deleted
311+
if k8serrors.IsNotFound(err) {
312+
return nil
313+
}
314+
315+
if err != nil {
316+
return err
317+
}
318+
319+
return r.Delete(context.TODO(), j)
320+
}
321+
293322
// deleteService deletes the NFD Operand's Service
294323
func (r *NodeFeatureDiscoveryReconciler) deleteService(ctx context.Context, namespace string, name string) error {
295324
svc, err := r.getService(ctx, namespace, name)

controllers/nodefeaturediscovery_status.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ const (
3333
nfdWorkerApp string = "nfd-worker"
3434
nfdMasterApp string = "nfd-master"
3535
nfdTopologyUpdaterApp string = "nfd-topology-updater"
36+
nfdPruneApp string = "nfd-prune"
3637
)
3738

3839
const (

0 commit comments

Comments
 (0)