Skip to content

Commit 0240c3d

Browse files
committed
Support FW_RESET_AFTER_CONFIG_UPDATE feature flag
This flag will enable fw reset after applying nv config. This is required on some DGX server where a warm reboot is not enough to apply the configuration successfully Signed-off-by: Alexander Maslennikov <[email protected]>
1 parent 323096a commit 0240c3d

File tree

6 files changed

+89
-0
lines changed

6 files changed

+89
-0
lines changed

README.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -236,3 +236,8 @@ To indicate when NIC configuration is in progress to the pods that depend on it,
236236
To use this mechanism, the next pods in the pipeline can add `nvidia.com/operator.nic-configuration.wait=false` to their node label selectors. That way, they will automatically be evicted from the node when the NICs are being configured.
237237

238238
The NIC Configuration Daemon itself relies on the `network.nvidia.com/operator.mofed.wait=false` label to be present on the node as it requires the DOCA-OFED driver to be running for some of the configurations.
239+
240+
## Feature flags
241+
Feature flags can be enabled via environment variables in the helm chart or NVIDIA Network Operator's NicClusterPolicy.
242+
Supported flags:
243+
* `FW_RESET_AFTER_CONFIG_UPDATE`=`true`: explicitely reset the NIC's Firmware before the reboot and after updating its non-volatile configuration. Might be required on DGX servers where configuration update is not successfully applied after the warm reboot.

deployment/nic-configuration-operator-chart/templates/config-daemon.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,9 @@ spec:
4343
valueFrom:
4444
fieldRef:
4545
fieldPath: metadata.namespace
46+
{{- if .Values.configDaemon.env }}
47+
{{- toYaml .Values.configDaemon.env | nindent 12 }}
48+
{{- end }}
4649
{{- if .Values.logLevel}}
4750
- name: LOG_LEVEL
4851
value: {{ .Values.logLevel }}

deployment/nic-configuration-operator-chart/values.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,11 @@ configDaemon:
5050
name: nic-configuration-operator-daemon
5151
# -- image tag to use for the config daemon image
5252
tag: latest
53+
# -- environment variables for the config daemon
54+
# env:
55+
# -- feature gate to enable FW reset after nv config update
56+
# - name: FW_RESET_AFTER_CONFIG_UPDATE
57+
# value: "true"
5358
# -- node selector for the config daemon
5459
nodeSelector:
5560
network.nvidia.com/operator.mofed.wait: "false"

internal/controller/nicdevice_controller.go

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ import (
1919
"context"
2020
"errors"
2121
"fmt"
22+
"os"
2223
"reflect"
2324
"sync"
2425
"time"
@@ -483,6 +484,18 @@ func (r *NicDeviceReconciler) applyNvConfig(ctx context.Context, status *nicDevi
483484
return err
484485
}
485486

487+
// On some platforms, explicit FW reset is required before reboot to apply the changes to NV spec
488+
featureGate := os.Getenv(consts.FEATURE_GATE_FW_RESET_AFTER_CONFIG_UPDATE)
489+
if featureGate == consts.LabelValueTrue {
490+
log.Log.Info("Feature gate FW_RESET_AFTER_CONFIG_UPDATE is enabled, resetting NIC firmware before reboot", "device", status.device.Name)
491+
err = r.ConfigurationManager.ResetNicFirmware(ctx, status.device)
492+
if err != nil {
493+
log.Log.Error(err, "failed to reset NIC firmware before reboot", "device", status.device.Name)
494+
return err
495+
}
496+
log.Log.Info("NIC firmware reset successful", "device", status.device.Name)
497+
}
498+
486499
status.rebootRequired = rebootRequired
487500

488501
return nil

internal/controller/nicdevice_controller_test.go

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ package controller
1818
import (
1919
"context"
2020
"errors"
21+
"os"
2122
"sync"
2223
"time"
2324

@@ -414,6 +415,66 @@ var _ = Describe("NicDeviceReconciler", func() {
414415
maintenanceManager.AssertCalled(GinkgoT(), "ReleaseMaintenance", mock.Anything)
415416
maintenanceManager.AssertExpectations(GinkgoT())
416417
})
418+
//nolint:errcheck
419+
It("Should reset FW after nv config update if feature gate FW_RESET_AFTER_CONFIG_UPDATE is enabled", func() {
420+
os.Setenv(consts.FEATURE_GATE_FW_RESET_AFTER_CONFIG_UPDATE, consts.LabelValueTrue)
421+
defer os.Unsetenv(consts.FEATURE_GATE_FW_RESET_AFTER_CONFIG_UPDATE)
422+
423+
configurationManager.On("ValidateDeviceNvSpec", mock.Anything, mock.Anything).Return(true, true, nil)
424+
configurationManager.On("ApplyDeviceNvSpec", mock.Anything, mock.Anything).Return(true, nil)
425+
configurationManager.On("ResetNicFirmware", mock.Anything, mock.Anything).Return(nil)
426+
maintenanceManager.On("ScheduleMaintenance", mock.Anything).Return(nil)
427+
maintenanceManager.On("MaintenanceAllowed", mock.Anything).Return(true, nil)
428+
429+
createDevice(false, nil)
430+
431+
device := &v1alpha1.NicDevice{}
432+
Expect(k8sClient.Get(ctx, k8sTypes.NamespacedName{Name: deviceName, Namespace: namespaceName}, device)).To(Succeed())
433+
_, err := json.Marshal(device.Spec)
434+
Expect(err).NotTo(HaveOccurred())
435+
436+
Eventually(getDeviceConditions, timeout).Should(testutils.MatchCondition(metav1.Condition{
437+
Type: consts.ConfigUpdateInProgressCondition,
438+
Status: metav1.ConditionTrue,
439+
Reason: consts.UpdateStartedReason,
440+
}))
441+
442+
Eventually(getDeviceConditions, timeout).Should(testutils.MatchCondition(metav1.Condition{
443+
Type: consts.ConfigUpdateInProgressCondition,
444+
Status: metav1.ConditionTrue,
445+
Reason: consts.PendingRebootReason,
446+
}))
447+
448+
configurationManager.AssertCalled(GinkgoT(), "ResetNicFirmware", mock.Anything, mock.Anything)
449+
})
450+
It("Should NOT reset FW after nv config update if feature gate FW_RESET_AFTER_CONFIG_UPDATE is NOT enabled", func() {
451+
configurationManager.On("ValidateDeviceNvSpec", mock.Anything, mock.Anything).Return(true, true, nil)
452+
configurationManager.On("ApplyDeviceNvSpec", mock.Anything, mock.Anything).Return(true, nil)
453+
configurationManager.On("ResetNicFirmware", mock.Anything, mock.Anything).Return(nil)
454+
maintenanceManager.On("ScheduleMaintenance", mock.Anything).Return(nil)
455+
maintenanceManager.On("MaintenanceAllowed", mock.Anything).Return(true, nil)
456+
457+
createDevice(false, nil)
458+
459+
device := &v1alpha1.NicDevice{}
460+
Expect(k8sClient.Get(ctx, k8sTypes.NamespacedName{Name: deviceName, Namespace: namespaceName}, device)).To(Succeed())
461+
_, err := json.Marshal(device.Spec)
462+
Expect(err).NotTo(HaveOccurred())
463+
464+
Eventually(getDeviceConditions, timeout).Should(testutils.MatchCondition(metav1.Condition{
465+
Type: consts.ConfigUpdateInProgressCondition,
466+
Status: metav1.ConditionTrue,
467+
Reason: consts.UpdateStartedReason,
468+
}))
469+
470+
Eventually(getDeviceConditions, timeout).Should(testutils.MatchCondition(metav1.Condition{
471+
Type: consts.ConfigUpdateInProgressCondition,
472+
Status: metav1.ConditionTrue,
473+
Reason: consts.PendingRebootReason,
474+
}))
475+
476+
configurationManager.AssertNotCalled(GinkgoT(), "ResetNicFirmware", mock.Anything, mock.Anything)
477+
})
417478
It("Should keep in UpdateStarted status if maintenance fails to schedule", func() {
418479
errorText := "maintenance request failed"
419480
configurationManager.On("ValidateDeviceNvSpec", mock.Anything, mock.Anything).Return(true, false, nil)

pkg/consts/consts.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -141,4 +141,6 @@ const (
141141
NodeNicConfigurationWaitLabel = "network.nvidia.com/operator.nic-configuration.wait"
142142
LabelValueTrue = "true"
143143
LabelValueFalse = "false"
144+
145+
FEATURE_GATE_FW_RESET_AFTER_CONFIG_UPDATE = "FW_RESET_AFTER_CONFIG_UPDATE"
144146
)

0 commit comments

Comments
 (0)