Skip to content

Commit 78a1cc8

Browse files
feat: add NIC Configuration Operator as a NCP state (Mellanox#1373)
2 parents 4df6177 + 03e17b7 commit 78a1cc8

28 files changed

+2470
-5
lines changed

api/v1alpha1/nicclusterpolicy_types.go

Lines changed: 41 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -288,6 +288,43 @@ type DOCATelemetryServiceSpec struct {
288288
Config *DOCATelemetryServiceConfig `json:"config"`
289289
}
290290

291+
// NicFirmwareStorageSpec contains configuration for the NIC firmware storage
292+
type NicFirmwareStorageSpec struct {
293+
// Create specifies whether to create a new PVC or use an existing one
294+
// +kubebuilder:default:=true
295+
Create bool `json:"create,omitempty"`
296+
// PVCName is the name of the PVC to mount as NIC Firmware storage. Default value: "nic-fw-storage-pvc"
297+
// +kubebuilder:validation:MinLength=1
298+
// +kubebuilder:validation:MaxLength=63
299+
// +kubebuilder:validation:Pattern=`^[a-z0-9]([-a-z0-9]*[a-z0-9])?$`
300+
// +kubebuilder:default:="nic-fw-storage-pvc"
301+
PVCName string `json:"pvcName,omitempty"`
302+
// StorageClassName is the name of a storage class to be used to store NIC FW binaries during NIC FW upgrade.
303+
// If not provided, the cluster-default storage class will be used
304+
// +kubebuilder:validation:MinLength=1
305+
// +kubebuilder:validation:MaxLength=63
306+
// +kubebuilder:validation:Pattern=`^[a-z0-9]([-a-z0-9]*[a-z0-9])?$`
307+
StorageClassName string `json:"storageClassName,omitempty"`
308+
// AvailableStorageSize is the storage size for the NIC Configuration Operator to request. Default value: 1Gi
309+
// +kubebuilder:validation:Pattern=`^(\d+)(Ei|Pi|Ti|Gi|Mi|Ki)$`
310+
// +kubebuilder:default:="1Gi"
311+
AvailableStorageSize string `json:"availableStorageSize,omitempty"`
312+
}
313+
314+
// NicConfigurationOperatorSpec is the configuration for NIC Configuration Operator
315+
type NicConfigurationOperatorSpec struct {
316+
// Image information for nic-configuration-operator
317+
Operator *ImageSpec `json:"operator"`
318+
// Image information for nic-configuration-daemon
319+
ConfigurationDaemon *ImageSpec `json:"configurationDaemon"`
320+
// NicFirmwareStorage contains configuration for the NIC firmware storage
321+
NicFirmwareStorage *NicFirmwareStorageSpec `json:"nicFirmwareStorage,omitempty"`
322+
// LogLevel sets the verbosity level of the logs. info|debug
323+
// +kubebuilder:validation:Enum={"info", "debug"}
324+
// +kubebuilder:default:="info"
325+
LogLevel string `json:"logLevel,omitempty"`
326+
}
327+
291328
// NicClusterPolicySpec defines the desired state of NicClusterPolicy
292329
type NicClusterPolicySpec struct {
293330
// OFEDDriver is a specialized driver for NVIDIA NICs which can replace the inbox driver that comes with an OS.
@@ -325,6 +362,10 @@ type NicClusterPolicySpec struct {
325362
// DOCATelemetryService exposes telemetry from NVIDIA networking components to prometheus.
326363
// See: https://docs.nvidia.com/doca/sdk/nvidia+doca+telemetry+service+guide/index.html
327364
DOCATelemetryService *DOCATelemetryServiceSpec `json:"docaTelemetryService,omitempty"`
365+
//nolint:lll
366+
// NicConfigurationOperator provides Kubernetes CRD API to allow FW configuration on NVIDIA NICs in a coordinated manner
367+
// See: https://github.com/Mellanox/nic-configuration-operator
368+
NicConfigurationOperator *NicConfigurationOperatorSpec `json:"nicConfigurationOperator,omitempty"`
328369
// NodeAffinity rules to inject to the DaemonSets objects that are managed by the operator
329370
NodeAffinity *v1.NodeAffinity `json:"nodeAffinity,omitempty"`
330371
// Tolerations to inject to the DaemonSets objects that are managed by the operator

api/v1alpha1/zz_generated.deepcopy.go

Lines changed: 50 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

config/crd/bases/mellanox.com_nicclusterpolicies.yaml

Lines changed: 180 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -216,6 +216,186 @@ spec:
216216
- ufmSecret
217217
- version
218218
type: object
219+
nicConfigurationOperator:
220+
description: |-
221+
NicConfigurationOperator provides Kubernetes CRD API to allow FW configuration on NVIDIA NICs in a coordinated manner
222+
See: https://github.com/Mellanox/nic-configuration-operator
223+
properties:
224+
configurationDaemon:
225+
description: Image information for nic-configuration-daemon
226+
properties:
227+
containerResources:
228+
description: ResourceRequirements describes the compute resource
229+
requirements
230+
items:
231+
description: ResourceRequirements describes the compute
232+
resource requirements.
233+
properties:
234+
limits:
235+
additionalProperties:
236+
anyOf:
237+
- type: integer
238+
- type: string
239+
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
240+
x-kubernetes-int-or-string: true
241+
description: |-
242+
Limits describes the maximum amount of compute resources allowed.
243+
More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/
244+
type: object
245+
name:
246+
description: Name of the container the requirements
247+
are set for
248+
type: string
249+
requests:
250+
additionalProperties:
251+
anyOf:
252+
- type: integer
253+
- type: string
254+
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
255+
x-kubernetes-int-or-string: true
256+
description: |-
257+
Requests describes the minimum amount of compute resources required.
258+
If Requests is omitted for a container, it defaults to Limits if that is explicitly specified,
259+
otherwise to an implementation-defined value. Requests cannot exceed Limits.
260+
More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/
261+
type: object
262+
required:
263+
- name
264+
type: object
265+
type: array
266+
image:
267+
description: Name of the image
268+
pattern: '[a-zA-Z0-9\-]+'
269+
type: string
270+
imagePullSecrets:
271+
default: []
272+
description: |-
273+
ImagePullSecrets is an optional list of references to secrets in the same
274+
namespace to use for pulling the image
275+
items:
276+
type: string
277+
type: array
278+
repository:
279+
description: Address of the registry that stores the image
280+
pattern: '[a-zA-Z0-9\.\-\/]+'
281+
type: string
282+
version:
283+
description: Version of the image to use
284+
type: string
285+
required:
286+
- image
287+
- repository
288+
- version
289+
type: object
290+
logLevel:
291+
default: info
292+
description: LogLevel sets the verbosity level of the logs. info|debug
293+
enum:
294+
- info
295+
- debug
296+
type: string
297+
nicFirmwareStorage:
298+
description: NicFirmwareStorage contains configuration for the
299+
NIC firmware storage
300+
properties:
301+
availableStorageSize:
302+
default: 1Gi
303+
description: 'storage size for the NIC Configuration Operator
304+
to request. Default value: 1Gi'
305+
pattern: ^(\d+)(Ei|Pi|Ti|Gi|Mi|Ki)$
306+
type: string
307+
create:
308+
default: true
309+
description: Create specifies whether to create a new PVC
310+
or use an existing one
311+
type: boolean
312+
pvcName:
313+
default: nic-fw-storage-pvc
314+
description: 'PVCName is the name of the PVC to mount as NIC
315+
Firmware storage. Default value: "nic-fw-storage-pvc"'
316+
maxLength: 63
317+
minLength: 1
318+
pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?$
319+
type: string
320+
storageClassName:
321+
description: |-
322+
StorageClassName is the name of a storage class to be used to store NIC FW binaries during NIC FW upgrade.
323+
If not provided, the cluster-default storage class will be used
324+
maxLength: 63
325+
minLength: 1
326+
pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?$
327+
type: string
328+
type: object
329+
operator:
330+
description: Image information for nic-configuration-operator
331+
properties:
332+
containerResources:
333+
description: ResourceRequirements describes the compute resource
334+
requirements
335+
items:
336+
description: ResourceRequirements describes the compute
337+
resource requirements.
338+
properties:
339+
limits:
340+
additionalProperties:
341+
anyOf:
342+
- type: integer
343+
- type: string
344+
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
345+
x-kubernetes-int-or-string: true
346+
description: |-
347+
Limits describes the maximum amount of compute resources allowed.
348+
More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/
349+
type: object
350+
name:
351+
description: Name of the container the requirements
352+
are set for
353+
type: string
354+
requests:
355+
additionalProperties:
356+
anyOf:
357+
- type: integer
358+
- type: string
359+
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
360+
x-kubernetes-int-or-string: true
361+
description: |-
362+
Requests describes the minimum amount of compute resources required.
363+
If Requests is omitted for a container, it defaults to Limits if that is explicitly specified,
364+
otherwise to an implementation-defined value. Requests cannot exceed Limits.
365+
More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/
366+
type: object
367+
required:
368+
- name
369+
type: object
370+
type: array
371+
image:
372+
description: Name of the image
373+
pattern: '[a-zA-Z0-9\-]+'
374+
type: string
375+
imagePullSecrets:
376+
default: []
377+
description: |-
378+
ImagePullSecrets is an optional list of references to secrets in the same
379+
namespace to use for pulling the image
380+
items:
381+
type: string
382+
type: array
383+
repository:
384+
description: Address of the registry that stores the image
385+
pattern: '[a-zA-Z0-9\.\-\/]+'
386+
type: string
387+
version:
388+
description: Version of the image to use
389+
type: string
390+
required:
391+
- image
392+
- repository
393+
- version
394+
type: object
395+
required:
396+
- configurationDaemon
397+
- operator
398+
type: object
219399
nicFeatureDiscovery:
220400
description: |-
221401
NicFeatureDiscovery works with NodeFeatureDiscovery to expose information about NVIDIA NICs.

config/rbac/role.yaml

Lines changed: 47 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -117,6 +117,41 @@ rules:
117117
- get
118118
- list
119119
- watch
120+
- apiGroups:
121+
- configuration.net.nvidia.com
122+
resources:
123+
- nicconfigurationtemplates
124+
- nicdevices
125+
- nicfirmwaresources
126+
- nicfirmwaretemplates
127+
verbs:
128+
- create
129+
- delete
130+
- get
131+
- list
132+
- patch
133+
- update
134+
- watch
135+
- apiGroups:
136+
- configuration.net.nvidia.com
137+
resources:
138+
- nicconfigurationtemplates/finalizers
139+
- nicdevices/finalizers
140+
- nicfirmwaresources/finalizers
141+
- nicfirmwaretemplates/finalizers
142+
verbs:
143+
- update
144+
- apiGroups:
145+
- configuration.net.nvidia.com
146+
resources:
147+
- nicconfigurationtemplates/status
148+
- nicdevices/status
149+
- nicfirmwaresources/status
150+
- nicfirmwaretemplates/status
151+
verbs:
152+
- get
153+
- patch
154+
- update
120155
- apiGroups:
121156
- coordination.k8s.io
122157
resources:
@@ -158,6 +193,18 @@ rules:
158193
- patch
159194
- update
160195
- watch
196+
- apiGroups:
197+
- maintenance.nvidia.com
198+
resources:
199+
- nodemaintenances
200+
verbs:
201+
- create
202+
- delete
203+
- get
204+
- list
205+
- patch
206+
- update
207+
- watch
161208
- apiGroups:
162209
- mellanox.com
163210
resources:

controllers/nicclusterpolicy_controller.go

Lines changed: 14 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -72,6 +72,7 @@ type NicClusterPolicyReconciler struct {
7272
// +kubebuilder:rbac:groups="",resources=nodes,verbs=get;list;watch;update;patch
7373
// +kubebuilder:rbac:groups="",resources=pods,verbs=list
7474
// +kubebuilder:rbac:groups="",resources=pods/eviction,verbs=create;delete;get;list;patch;update;watch
75+
// +kubebuilder:rbac:groups="",resources=configmaps,verbs=get
7576
// +kubebuilder:rbac:groups=apps,resources=deployments;daemonsets;replicasets;statefulsets,verbs=get;list;watch;create;update;patch;delete
7677
// +kubebuilder:rbac:groups=apps,resources=deployments/finalizers,verbs=update
7778
// +kubebuilder:rbac:groups=monitoring.coreos.com,resources=servicemonitors,verbs=get;list;watch;create
@@ -89,6 +90,19 @@ type NicClusterPolicyReconciler struct {
8990
// +kubebuilder:rbac:groups=admissionregistration.k8s.io,resources=validatingwebhookconfigurations,verbs=get;list;watch;create;update;patch;delete
9091
// +kubebuilder:rbac:groups=image.openshift.io,resources=imagestreams,verbs=get;list;watch
9192
// +kubebuilder:rbac:groups=k8s.cni.cncf.io,resources=network-attachment-definitions,verbs=get;list;watch;create;update;patch;delete
93+
// +kubebuilder:rbac:groups=configuration.net.nvidia.com,resources=nicconfigurationtemplates,verbs=get;list;watch;create;update;patch;delete
94+
// +kubebuilder:rbac:groups=configuration.net.nvidia.com,resources=nicconfigurationtemplates/status,verbs=get;update;patch
95+
// +kubebuilder:rbac:groups=configuration.net.nvidia.com,resources=nicconfigurationtemplates/finalizers,verbs=update
96+
// +kubebuilder:rbac:groups=configuration.net.nvidia.com,resources=nicdevices/status,verbs=get;update;patch
97+
// +kubebuilder:rbac:groups=configuration.net.nvidia.com,resources=nicdevices,verbs=get;list;watch;create;update;patch;delete
98+
// +kubebuilder:rbac:groups=configuration.net.nvidia.com,resources=nicdevices/finalizers,verbs=update
99+
// +kubebuilder:rbac:groups=configuration.net.nvidia.com,resources=nicfirmwaretemplates,verbs=get;list;watch;create;update;patch;delete
100+
// +kubebuilder:rbac:groups=configuration.net.nvidia.com,resources=nicfirmwaretemplates/status,verbs=get;update;patch
101+
// +kubebuilder:rbac:groups=configuration.net.nvidia.com,resources=nicfirmwaretemplates/finalizers,verbs=update
102+
// +kubebuilder:rbac:groups=configuration.net.nvidia.com,resources=nicfirmwaresources/status,verbs=get;update;patch
103+
// +kubebuilder:rbac:groups=configuration.net.nvidia.com,resources=nicfirmwaresources,verbs=get;list;watch;create;update;patch;delete
104+
// +kubebuilder:rbac:groups=configuration.net.nvidia.com,resources=nicfirmwaresources/finalizers,verbs=update
105+
// +kubebuilder:rbac:groups=maintenance.nvidia.com,resources=nodemaintenances,verbs=get;list;watch;create;update;patch;delete
92106

93107
// Reconcile is part of the main kubernetes reconciliation loop which aims to
94108
// move the current state of the cluster closer to the desired state.

0 commit comments

Comments
 (0)