Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -236,3 +236,8 @@ To indicate when NIC configuration is in progress to the pods that depend on it,
To use this mechanism, the next pods in the pipeline can add `nvidia.com/operator.nic-configuration.wait=false` to their node label selectors. That way, they will automatically be evicted from the node when the NICs are being configured.

The NIC Configuration Daemon itself relies on the `network.nvidia.com/operator.mofed.wait=false` label to be present on the node as it requires the DOCA-OFED driver to be running for some of the configurations.

## Feature flags
Feature flags can be enabled via environment variables set in the Helm chart or in the NVIDIA Network Operator's NicClusterPolicy.
Supported flags:
* `FW_RESET_AFTER_CONFIG_UPDATE`=`true`: explicitly reset the NIC's firmware after updating its non-volatile configuration and before the reboot. Might be required on DGX servers where the configuration update is not successfully applied after a warm reboot.
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,9 @@ spec:
valueFrom:
fieldRef:
fieldPath: metadata.namespace
{{- if .Values.configDaemon.env }}
{{- toYaml .Values.configDaemon.env | nindent 12 }}
{{- end }}
{{- if .Values.logLevel}}
- name: LOG_LEVEL
value: {{ .Values.logLevel }}
Expand Down
5 changes: 5 additions & 0 deletions deployment/nic-configuration-operator-chart/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,11 @@ configDaemon:
name: nic-configuration-operator-daemon
# -- image tag to use for the config daemon image
tag: latest
# -- environment variables for the config daemon
# env:
# -- feature gate to enable FW reset after nv config update
# - name: FW_RESET_AFTER_CONFIG_UPDATE
# value: "true"
# -- node selector for the config daemon
nodeSelector:
network.nvidia.com/operator.mofed.wait: "false"
Expand Down
13 changes: 13 additions & 0 deletions internal/controller/nicdevice_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import (
"context"
"errors"
"fmt"
"os"
"reflect"
"sync"
"time"
Expand Down Expand Up @@ -483,6 +484,18 @@ func (r *NicDeviceReconciler) applyNvConfig(ctx context.Context, status *nicDevi
return err
}

// On some platforms, explicit FW reset is required before reboot to apply the changes to NV spec
featureGate := os.Getenv(consts.FEATURE_GATE_FW_RESET_AFTER_CONFIG_UPDATE)
if featureGate == consts.LabelValueTrue {
log.Log.Info("Feature gate FW_RESET_AFTER_CONFIG_UPDATE is enabled, resetting NIC firmware before reboot", "device", status.device.Name)
err = r.ConfigurationManager.ResetNicFirmware(ctx, status.device)
if err != nil {
log.Log.Error(err, "failed to reset NIC firmware before reboot", "device", status.device.Name)
return err
}
log.Log.Info("NIC firmware reset successful", "device", status.device.Name)
}

status.rebootRequired = rebootRequired

return nil
Expand Down
61 changes: 61 additions & 0 deletions internal/controller/nicdevice_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ package controller
import (
"context"
"errors"
"os"
"sync"
"time"

Expand Down Expand Up @@ -414,6 +415,66 @@ var _ = Describe("NicDeviceReconciler", func() {
maintenanceManager.AssertCalled(GinkgoT(), "ReleaseMaintenance", mock.Anything)
maintenanceManager.AssertExpectations(GinkgoT())
})
// errcheck is suppressed for this spec: the return values of os.Setenv and of
// the deferred os.Unsetenv below are not checked.
//nolint:errcheck
It("Should reset FW after nv config update if feature gate FW_RESET_AFTER_CONFIG_UPDATE is enabled", func() {
// Turn the feature gate on through the process environment and remove the
// variable again when this spec returns.
os.Setenv(consts.FEATURE_GATE_FW_RESET_AFTER_CONFIG_UPDATE, consts.LabelValueTrue)
defer os.Unsetenv(consts.FEATURE_GATE_FW_RESET_AFTER_CONFIG_UPDATE)

// Stub every manager call with a successful result.
// NOTE(review): the two booleans returned by ValidateDeviceNvSpec and the one
// returned by ApplyDeviceNvSpec presumably mean "config update required" /
// "reboot required" — confirm against the ConfigurationManager interface.
configurationManager.On("ValidateDeviceNvSpec", mock.Anything, mock.Anything).Return(true, true, nil)
configurationManager.On("ApplyDeviceNvSpec", mock.Anything, mock.Anything).Return(true, nil)
configurationManager.On("ResetNicFirmware", mock.Anything, mock.Anything).Return(nil)
maintenanceManager.On("ScheduleMaintenance", mock.Anything).Return(nil)
maintenanceManager.On("MaintenanceAllowed", mock.Anything).Return(true, nil)

createDevice(false, nil)

// Read the device back from the API server and check that its spec can be
// marshalled; the marshalled bytes themselves are discarded.
device := &v1alpha1.NicDevice{}
Expect(k8sClient.Get(ctx, k8sTypes.NamespacedName{Name: deviceName, Namespace: namespaceName}, device)).To(Succeed())
_, err := json.Marshal(device.Spec)
Expect(err).NotTo(HaveOccurred())

// The ConfigUpdateInProgress condition is expected to report UpdateStarted ...
Eventually(getDeviceConditions, timeout).Should(testutils.MatchCondition(metav1.Condition{
Type: consts.ConfigUpdateInProgressCondition,
Status: metav1.ConditionTrue,
Reason: consts.UpdateStartedReason,
}))

// ... and later PendingReboot.
Eventually(getDeviceConditions, timeout).Should(testutils.MatchCondition(metav1.Condition{
Type: consts.ConfigUpdateInProgressCondition,
Status: metav1.ConditionTrue,
Reason: consts.PendingRebootReason,
}))

// With the gate enabled, the firmware reset must have been requested.
configurationManager.AssertCalled(GinkgoT(), "ResetNicFirmware", mock.Anything, mock.Anything)
})
It("Should NOT reset FW after nv config update if feature gate FW_RESET_AFTER_CONFIG_UPDATE is NOT enabled", func() {
// NOTE(review): this spec neither sets nor clears FW_RESET_AFTER_CONFIG_UPDATE,
// so it relies on the variable being absent from the ambient environment of
// the test process — confirm that is guaranteed where the suite runs.

// Stub every manager call with a successful result. ResetNicFirmware is
// stubbed as well, although the final assertion expects it never to be called.
configurationManager.On("ValidateDeviceNvSpec", mock.Anything, mock.Anything).Return(true, true, nil)
configurationManager.On("ApplyDeviceNvSpec", mock.Anything, mock.Anything).Return(true, nil)
configurationManager.On("ResetNicFirmware", mock.Anything, mock.Anything).Return(nil)
maintenanceManager.On("ScheduleMaintenance", mock.Anything).Return(nil)
maintenanceManager.On("MaintenanceAllowed", mock.Anything).Return(true, nil)

createDevice(false, nil)

// Read the device back from the API server and check that its spec can be
// marshalled; the marshalled bytes themselves are discarded.
device := &v1alpha1.NicDevice{}
Expect(k8sClient.Get(ctx, k8sTypes.NamespacedName{Name: deviceName, Namespace: namespaceName}, device)).To(Succeed())
_, err := json.Marshal(device.Spec)
Expect(err).NotTo(HaveOccurred())

// The ConfigUpdateInProgress condition is expected to report UpdateStarted ...
Eventually(getDeviceConditions, timeout).Should(testutils.MatchCondition(metav1.Condition{
Type: consts.ConfigUpdateInProgressCondition,
Status: metav1.ConditionTrue,
Reason: consts.UpdateStartedReason,
}))

// ... and later PendingReboot.
Eventually(getDeviceConditions, timeout).Should(testutils.MatchCondition(metav1.Condition{
Type: consts.ConfigUpdateInProgressCondition,
Status: metav1.ConditionTrue,
Reason: consts.PendingRebootReason,
}))

// Without the gate, no firmware reset may have been requested.
configurationManager.AssertNotCalled(GinkgoT(), "ResetNicFirmware", mock.Anything, mock.Anything)
})
It("Should keep in UpdateStarted status if maintenance fails to schedule", func() {
errorText := "maintenance request failed"
configurationManager.On("ValidateDeviceNvSpec", mock.Anything, mock.Anything).Return(true, false, nil)
Expand Down
2 changes: 2 additions & 0 deletions pkg/consts/consts.go
Original file line number Diff line number Diff line change
Expand Up @@ -141,4 +141,6 @@ const (
NodeNicConfigurationWaitLabel = "network.nvidia.com/operator.nic-configuration.wait"
LabelValueTrue = "true"
LabelValueFalse = "false"

FEATURE_GATE_FW_RESET_AFTER_CONFIG_UPDATE = "FW_RESET_AFTER_CONFIG_UPDATE"
)
Loading