diff --git a/README.md b/README.md index dc90828..29db8c1 100644 --- a/README.md +++ b/README.md @@ -236,3 +236,8 @@ To indicate when NIC configuration is in progress to the pods that depend on it, To use this mechanism, the next pods in the pipeline can add `nvidia.com/operator.nic-configuration.wait=false` to their node label selectors. That way, they will automatically be evicted from the node when the NICs are being configured. The NIC Configuration Daemon itself relies on the `network.nvidia.com/operator.mofed.wait=false` label to be present on the node as it requires the DOCA-OFED driver to be running for some of the configurations. + +## Feature flags +Feature flags can be enabled via environment variables in the helm chart or NVIDIA Network Operator's NicClusterPolicy. +Supported flags: +* `FW_RESET_AFTER_CONFIG_UPDATE`=`true`: explicitely reset the NIC's Firmware before the reboot and after updating its non-volatile configuration. Might be required on DGX servers where configuration update is not successfully applied after the warm reboot. \ No newline at end of file diff --git a/deployment/nic-configuration-operator-chart/templates/config-daemon.yaml b/deployment/nic-configuration-operator-chart/templates/config-daemon.yaml index 86bead0..991b585 100644 --- a/deployment/nic-configuration-operator-chart/templates/config-daemon.yaml +++ b/deployment/nic-configuration-operator-chart/templates/config-daemon.yaml @@ -43,6 +43,9 @@ spec: valueFrom: fieldRef: fieldPath: metadata.namespace + {{- if .Values.configDaemon.env }} + {{- toYaml .Values.configDaemon.env | nindent 12 }} + {{- end }} {{- if .Values.logLevel}} - name: LOG_LEVEL value: {{ .Values.logLevel }} diff --git a/deployment/nic-configuration-operator-chart/values.yaml b/deployment/nic-configuration-operator-chart/values.yaml index 6d39907..e2df6cc 100644 --- a/deployment/nic-configuration-operator-chart/values.yaml +++ b/deployment/nic-configuration-operator-chart/values.yaml @@ -50,6 +50,11 @@ configDaemon: name: nic-configuration-operator-daemon # -- image tag to use for the config daemon image tag: latest + # -- environment variables for the config daemon + # env: + # -- feature gate to enable FW reset after nv config update + # - name: FW_RESET_AFTER_CONFIG_UPDATE + # value: "true" # -- node selector for the config daemon nodeSelector: network.nvidia.com/operator.mofed.wait: "false" diff --git a/internal/controller/nicdevice_controller.go b/internal/controller/nicdevice_controller.go index 0f5e809..51421fb 100644 --- a/internal/controller/nicdevice_controller.go +++ b/internal/controller/nicdevice_controller.go @@ -19,6 +19,7 @@ import ( "context" "errors" "fmt" + "os" "reflect" "sync" "time" @@ -483,6 +484,18 @@ func (r *NicDeviceReconciler) applyNvConfig(ctx context.Context, status *nicDevi return err } + // On some platforms, explicit FW reset is required before reboot to apply the changes to NV spec + featureGate := os.Getenv(consts.FEATURE_GATE_FW_RESET_AFTER_CONFIG_UPDATE) + if featureGate == consts.LabelValueTrue { + log.Log.Info("Feature gate FW_RESET_AFTER_CONFIG_UPDATE is enabled, resetting NIC firmware before reboot", "device", status.device.Name) + err = r.ConfigurationManager.ResetNicFirmware(ctx, status.device) + if err != nil { + log.Log.Error(err, "failed to reset NIC firmware before reboot", "device", status.device.Name) + return err + } + log.Log.Info("NIC firmware reset successful", "device", status.device.Name) + } + status.rebootRequired = rebootRequired return nil diff --git a/internal/controller/nicdevice_controller_test.go b/internal/controller/nicdevice_controller_test.go index 1fe84ab..110b457 100644 --- a/internal/controller/nicdevice_controller_test.go +++ b/internal/controller/nicdevice_controller_test.go @@ -18,6 +18,7 @@ package controller import ( "context" "errors" + "os" "sync" "time" @@ -414,6 +415,66 @@ var _ = Describe("NicDeviceReconciler", func() { maintenanceManager.AssertCalled(GinkgoT(), "ReleaseMaintenance", mock.Anything) maintenanceManager.AssertExpectations(GinkgoT()) }) + //nolint:errcheck + It("Should reset FW after nv config update if feature gate FW_RESET_AFTER_CONFIG_UPDATE is enabled", func() { + os.Setenv(consts.FEATURE_GATE_FW_RESET_AFTER_CONFIG_UPDATE, consts.LabelValueTrue) + defer os.Unsetenv(consts.FEATURE_GATE_FW_RESET_AFTER_CONFIG_UPDATE) + + configurationManager.On("ValidateDeviceNvSpec", mock.Anything, mock.Anything).Return(true, true, nil) + configurationManager.On("ApplyDeviceNvSpec", mock.Anything, mock.Anything).Return(true, nil) + configurationManager.On("ResetNicFirmware", mock.Anything, mock.Anything).Return(nil) + maintenanceManager.On("ScheduleMaintenance", mock.Anything).Return(nil) + maintenanceManager.On("MaintenanceAllowed", mock.Anything).Return(true, nil) + + createDevice(false, nil) + + device := &v1alpha1.NicDevice{} + Expect(k8sClient.Get(ctx, k8sTypes.NamespacedName{Name: deviceName, Namespace: namespaceName}, device)).To(Succeed()) + _, err := json.Marshal(device.Spec) + Expect(err).NotTo(HaveOccurred()) + + Eventually(getDeviceConditions, timeout).Should(testutils.MatchCondition(metav1.Condition{ + Type: consts.ConfigUpdateInProgressCondition, + Status: metav1.ConditionTrue, + Reason: consts.UpdateStartedReason, + })) + + Eventually(getDeviceConditions, timeout).Should(testutils.MatchCondition(metav1.Condition{ + Type: consts.ConfigUpdateInProgressCondition, + Status: metav1.ConditionTrue, + Reason: consts.PendingRebootReason, + })) + + configurationManager.AssertCalled(GinkgoT(), "ResetNicFirmware", mock.Anything, mock.Anything) + }) + It("Should NOT reset FW after nv config update if feature gate FW_RESET_AFTER_CONFIG_UPDATE is NOT enabled", func() { + configurationManager.On("ValidateDeviceNvSpec", mock.Anything, mock.Anything).Return(true, true, nil) + configurationManager.On("ApplyDeviceNvSpec", mock.Anything, mock.Anything).Return(true, nil) + configurationManager.On("ResetNicFirmware", mock.Anything, mock.Anything).Return(nil) + maintenanceManager.On("ScheduleMaintenance", mock.Anything).Return(nil) + maintenanceManager.On("MaintenanceAllowed", mock.Anything).Return(true, nil) + + createDevice(false, nil) + + device := &v1alpha1.NicDevice{} + Expect(k8sClient.Get(ctx, k8sTypes.NamespacedName{Name: deviceName, Namespace: namespaceName}, device)).To(Succeed()) + _, err := json.Marshal(device.Spec) + Expect(err).NotTo(HaveOccurred()) + + Eventually(getDeviceConditions, timeout).Should(testutils.MatchCondition(metav1.Condition{ + Type: consts.ConfigUpdateInProgressCondition, + Status: metav1.ConditionTrue, + Reason: consts.UpdateStartedReason, + })) + + Eventually(getDeviceConditions, timeout).Should(testutils.MatchCondition(metav1.Condition{ + Type: consts.ConfigUpdateInProgressCondition, + Status: metav1.ConditionTrue, + Reason: consts.PendingRebootReason, + })) + + configurationManager.AssertNotCalled(GinkgoT(), "ResetNicFirmware", mock.Anything, mock.Anything) + }) It("Should keep in UpdateStarted status if maintenance fails to schedule", func() { errorText := "maintenance request failed" configurationManager.On("ValidateDeviceNvSpec", mock.Anything, mock.Anything).Return(true, false, nil) diff --git a/pkg/consts/consts.go b/pkg/consts/consts.go index ac0a302..b8aa82d 100644 --- a/pkg/consts/consts.go +++ b/pkg/consts/consts.go @@ -141,4 +141,6 @@ const ( NodeNicConfigurationWaitLabel = "network.nvidia.com/operator.nic-configuration.wait" LabelValueTrue = "true" LabelValueFalse = "false" + + FEATURE_GATE_FW_RESET_AFTER_CONFIG_UPDATE = "FW_RESET_AFTER_CONFIG_UPDATE" )