Skip to content

Commit 244b9d1

Browse files
authored
Merge pull request #11 from souleb/fix-issue-4390974
fix: dpu host node should not stop when network is down
2 parents b969ff5 + 9b75ccb commit 244b9d1

File tree

2 files changed

+52
-24
lines changed

2 files changed

+52
-24
lines changed

go-controller/pkg/node/default_node_network_controller_test.go

Lines changed: 19 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@ import (
1111

1212
"github.com/ovn-org/ovn-kubernetes/go-controller/pkg/config"
1313
adminpolicybasedrouteclient "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/crd/adminpolicybasedroute/v1/apis/clientset/versioned/fake"
14-
"github.com/ovn-org/ovn-kubernetes/go-controller/pkg/kube"
1514
"github.com/ovn-org/ovn-kubernetes/go-controller/pkg/kube/mocks"
1615

1716
ovntest "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/testing"
@@ -776,20 +775,29 @@ func heartbeatDPUHostTest(app *cli.App, uplinkName, hostIP string) {
776775
_, err = kubeFakeClient.CoordinationV1().Leases(defaultLeaseNS).Update(context.Background(), lease, metav1.UpdateOptions{})
777776
Expect(err).NotTo(HaveOccurred())
778777

779-
//verify that error was reported
780-
err = <-errChan
781-
Expect(err).To(HaveOccurred())
778+
Eventually(func(g Gomega) {
779+
// check that the node is tainted
780+
node, err = kubeFakeClient.CoreV1().Nodes().Get(context.Background(), nodeName, metav1.GetOptions{})
781+
g.Expect(err).NotTo(HaveOccurred())
782+
g.Expect(node.Spec.Taints).To(HaveLen(1))
783+
g.Expect(node.Spec.Taints[0].Key).To(Equal(kapi.TaintNodeNetworkUnavailable))
784+
}).ShouldNot(HaveOccurred())
782785

783-
// check that the node is tainted
784-
node, err = kubeFakeClient.CoreV1().Nodes().Get(context.Background(), nodeName, metav1.GetOptions{})
786+
// update the lease to make it valid again
787+
lease, err = kubeFakeClient.CoordinationV1().Leases(defaultLeaseNS).Get(context.Background(), nodeName, metav1.GetOptions{})
785788
Expect(err).NotTo(HaveOccurred())
786-
Expect(node.Spec.Taints).To(HaveLen(1))
787-
Expect(node.Spec.Taints[0].Key).To(Equal(networkUnavailableTaintKey))
788-
789-
// remove the taint
790-
err = removeNodeNetworkUnavailableTaint(context.Background(), &kube.Kube{KClient: kubeFakeClient}, nodeName)
789+
lease.Spec.RenewTime = &metav1.MicroTime{Time: time.Now()}
790+
lease.Spec.LeaseDurationSeconds = ptr.To(int32(40))
791+
_, err = kubeFakeClient.CoordinationV1().Leases(defaultLeaseNS).Update(context.Background(), lease, metav1.UpdateOptions{})
791792
Expect(err).NotTo(HaveOccurred())
792793

794+
// check that the node is not tainted
795+
Eventually(func(g Gomega) {
796+
node, err = kubeFakeClient.CoreV1().Nodes().Get(context.Background(), nodeName, metav1.GetOptions{})
797+
g.Expect(err).NotTo(HaveOccurred())
798+
g.Expect(node.Spec.Taints).To(HaveLen(0))
799+
}).ShouldNot(HaveOccurred())
800+
793801
return nil
794802
}
795803

go-controller/pkg/node/heartbeat_dpu.go

Lines changed: 33 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -79,12 +79,13 @@ func (o IntervalOption) Apply(options *heartbeatOptions) {
7979
}
8080

8181
type heartbeat struct {
82-
nodeName string
83-
zone string
84-
client kube.Interface
85-
clientSet kubernetes.Interface
86-
lease *coordinationv1.Lease
87-
errChan chan error
82+
nodeName string
83+
zone string
84+
client kube.Interface
85+
clientSet kubernetes.Interface
86+
lease *coordinationv1.Lease
87+
errChan chan error
88+
taintMarker bool
8889
heartbeatOptions
8990
}
9091

@@ -209,7 +210,6 @@ func (h *heartbeat) runDPUHost(ctx context.Context) error {
209210
h.errChan <- nil
210211
return
211212
case <-ticker.C:
212-
var errs []error
213213
if err := wait.ExponentialBackoffWithContext(ctx,
214214
wait.Backoff{
215215
Duration: retryInterval,
@@ -228,13 +228,33 @@ func (h *heartbeat) runDPUHost(ctx context.Context) error {
228228
h.errChan <- nil
229229
return
230230
}
231-
errs = append(errs, fmt.Errorf("failed to check heartbeat lease: %w", err))
232-
if err := setNodeNetworkUnavailableTaint(ctx, h.client, h.nodeName); err != nil {
233-
klog.Errorf("Failed to set NetworkUnavailable taint: %v", err)
234-
errs = append(errs, err)
231+
if !h.taintMarker {
232+
var errs []error
233+
errs = append(errs, fmt.Errorf("failed to check heartbeat lease: %w", err))
234+
if err := setNodeNetworkUnavailableTaint(ctx, h.client, h.nodeName); err != nil {
235+
klog.Errorf("Failed to set NetworkUnavailable taint: %v", err)
236+
errs = append(errs, err)
237+
h.errChan <- kerrors.NewAggregate(errs)
238+
// if we cannot set the taint, we need to return
239+
// and exit the process, because pods can still be scheduled
240+
// on this node and we have an unhealthy node
241+
return
242+
}
243+
// if we set the taint, we need to set the marker
244+
// and move on until the next heartbeat
245+
h.taintMarker = true
246+
}
247+
continue
248+
}
249+
// if we are here, it means the heartbeat lease is valid
250+
// and we can remove the taint
251+
if h.taintMarker {
252+
if err := removeNodeNetworkUnavailableTaint(ctx, h.client, h.nodeName); err != nil {
253+
klog.Errorf("Failed to remove NetworkUnavailable taint: %v", err)
254+
h.errChan <- err
255+
return
235256
}
236-
h.errChan <- kerrors.NewAggregate(errs)
237-
return
257+
h.taintMarker = false
238258
}
239259
}
240260
}

0 commit comments

Comments
 (0)