Commit f3a77b3

Merge pull request #168 from winsopc/ib-k8s-ha
Add High Availability support with Kubernetes leader election
2 parents: bb697ba + 000e904

File tree

4 files changed: 243 additions & 20 deletions


deployment/ib-kubernetes.yaml

Lines changed: 34 additions & 2 deletions

@@ -16,6 +16,13 @@ rules:
 - apiGroups: ["k8s.cni.cncf.io"]
   resources: ["*"]
   verbs: ["get"]
+# Leader election permissions
+- apiGroups: ["coordination.k8s.io"]
+  resources: ["leases"]
+  verbs: ["get", "list", "create", "update", "patch", "watch"]
+- apiGroups: [""]
+  resources: ["events"]
+  verbs: ["create"]
 ---
 apiVersion: rbac.authorization.k8s.io/v1
 kind: ClusterRoleBinding
@@ -42,8 +49,11 @@ metadata:
 spec:
   progressDeadlineSeconds: 600
   strategy:
-    type: Recreate
-  replicas: 1
+    type: RollingUpdate
+    rollingUpdate:
+      maxUnavailable: 1
+      maxSurge: 1
+  replicas: 2
   selector:
     matchLabels:
       name: ib-kubernetes
@@ -82,6 +92,16 @@ spec:
                 operator: In
                 values:
                   - "linux"
+      # Pod anti-affinity to ensure pods are not scheduled on the same node
+      podAntiAffinity:
+        requiredDuringSchedulingIgnoredDuringExecution:
+          - labelSelector:
+              matchExpressions:
+                - key: name
+                  operator: In
+                  values:
+                    - ib-kubernetes
+            topologyKey: kubernetes.io/hostname
      containers:
        - name: ib-kubernetes
          image: mellanox/ib-kubernetes
@@ -92,6 +112,18 @@ spec:
              cpu: 100m
              memory: 300Mi
          env:
+           - name: K8S_NODE
+             valueFrom:
+               fieldRef:
+                 fieldPath: spec.nodeName
+           - name: POD_NAMESPACE
+             valueFrom:
+               fieldRef:
+                 fieldPath: metadata.namespace
+           - name: POD_UID
+             valueFrom:
+               fieldRef:
+                 fieldPath: metadata.uid
            - name: DAEMON_SM_PLUGIN
              valueFrom:
                configMapKeyRef:
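
With the new leases RBAC rule and the two-replica, anti-affinity Deployment, only one pod drives GUID assignment at a time while the other waits in standby. As a hedged usage sketch (not part of this commit), the current holder can be read back through the coordination API; it assumes a kubeconfig at the default location and the kube-system namespace, which is only the daemon's fallback, while the lease name comes from the daemon code below:

// Hypothetical inspection helper, not part of the PR: print which replica currently holds the lease.
package main

import (
    "context"
    "fmt"

    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/client-go/kubernetes"
    "k8s.io/client-go/tools/clientcmd"
)

func main() {
    // Build a clientset from the local kubeconfig; in-cluster config would work the same way.
    cfg, err := clientcmd.BuildConfigFromFlags("", clientcmd.RecommendedHomeFile)
    if err != nil {
        panic(err)
    }
    cs, err := kubernetes.NewForConfig(cfg)
    if err != nil {
        panic(err)
    }

    // Lease name comes from the daemon; "kube-system" is its fallback namespace (an assumption here).
    lease, err := cs.CoordinationV1().Leases("kube-system").Get(
        context.TODO(), "ib-kubernetes-leader", metav1.GetOptions{})
    if err != nil {
        panic(err)
    }
    if lease.Spec.HolderIdentity != nil {
        // The identity is "<node>_<pod-uid>" when both env vars are set.
        fmt.Println("current leader:", *lease.Spec.HolderIdentity)
    }
}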

pkg/daemon/daemon.go

Lines changed: 123 additions & 5 deletions

@@ -17,6 +17,7 @@
 package daemon
 
 import (
+    "context"
     "encoding/json"
     "fmt"
     "net"
@@ -31,8 +32,11 @@ import (
     "github.com/rs/zerolog/log"
     kapi "k8s.io/api/core/v1"
     kerrors "k8s.io/apimachinery/pkg/api/errors"
+    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
     "k8s.io/apimachinery/pkg/types"
     "k8s.io/apimachinery/pkg/util/wait"
+    "k8s.io/client-go/tools/leaderelection"
+    "k8s.io/client-go/tools/leaderelection/resourcelock"
 
     "github.com/Mellanox/ib-kubernetes/pkg/config"
     "github.com/Mellanox/ib-kubernetes/pkg/guid"
@@ -158,19 +162,131 @@ func NewDaemon() (Daemon, error) {
 }
 
 func (d *daemon) Run() {
+    ctx, cancel := context.WithCancel(context.Background())
+    defer cancel()
+
     // setup signal handling
     sigChan := make(chan os.Signal, 1)
     signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)
 
-    // Init the guid pool
+    // Use node name + Pod UID for stable and unique leader identity
+    nodeName := os.Getenv("K8S_NODE")
+    if nodeName == "" {
+        log.Warn().Msg("K8S_NODE environment variable not set, falling back to hostname")
+        var err error
+        nodeName, err = os.Hostname()
+        if err != nil {
+            log.Error().Msgf("Failed to get hostname: %v", err)
+            return
+        }
+    }
+
+    podUID := os.Getenv("POD_UID")
+    var identity string
+    if podUID == "" {
+        log.Warn().Msg("POD_UID environment variable not set, falling back to node name only")
+        identity = nodeName
+    } else {
+        identity = nodeName + "_" + podUID
+    }
+
+    // Get the namespace where this pod is running
+    namespace := os.Getenv("POD_NAMESPACE")
+    if namespace == "" {
+        log.Warn().Msg("POD_NAMESPACE environment variable not set, falling back to 'kube-system'")
+        namespace = "kube-system"
+    }
+
+    log.Info().Msgf("Starting leader election in namespace: %s with identity: %s", namespace, identity)
+
+    // Create leader election configuration
+    lock := &resourcelock.LeaseLock{
+        LeaseMeta: metav1.ObjectMeta{
+            Name:      "ib-kubernetes-leader",
+            Namespace: namespace,
+        },
+        Client: d.kubeClient.GetCoordinationV1(),
+        LockConfig: resourcelock.ResourceLockConfig{
+            Identity: identity,
+        },
+    }
+
+    leaderElectionConfig := leaderelection.LeaderElectionConfig{
+        Lock:            lock,
+        ReleaseOnCancel: true,
+        LeaseDuration:   60 * time.Second, // Standard Kubernetes components duration
+        RenewDeadline:   30 * time.Second, // Standard Kubernetes components deadline
+        RetryPeriod:     20 * time.Second, // Standard Kubernetes components retry
+        Callbacks: leaderelection.LeaderCallbacks{
+            OnStartedLeading: func(ctx context.Context) {
+                log.Info().Msgf("Started leading with identity: %s", identity)
+                if err := d.becomeLeader(); err != nil {
+                    log.Error().Msgf("Failed to become leader: %v", err)
+                    // Cancel context to gracefully release lease and exit
+                    cancel()
+                    return
+                }
+            },
+            OnStoppedLeading: func() {
+                log.Error().Msgf("Lost leadership unexpectedly, identity: %s", identity)
+                // Leadership lost unexpectedly - force immediate restart for clean state
+                os.Exit(1)
+            },
+            OnNewLeader: func(leaderIdentity string) {
+                if leaderIdentity == identity {
+                    log.Info().Msgf("We are the new leader: %s", leaderIdentity)
+                } else {
+                    log.Info().Msgf("New leader elected: %s", leaderIdentity)
+                }
+            },
+        },
+    }
+
+    // Start leader election in background
+    leaderElectionDone := make(chan struct{})
+    go func() {
+        defer close(leaderElectionDone)
+        leaderelection.RunOrDie(ctx, leaderElectionConfig)
+    }()
+
+    // Wait for termination signal or leader election completion
+    select {
+    case sig := <-sigChan:
+        log.Info().Msgf("Received signal %s. Terminating...", sig)
+        cancel() // This triggers ReleaseOnCancel
+        // Wait for graceful lease release
+        select {
+        case <-leaderElectionDone:
+        case <-time.After(5 * time.Second):
+            log.Warn().Msg("Graceful shutdown timeout exceeded")
+        }
+    case <-leaderElectionDone:
+        log.Info().Msg("Leader election completed")
+    }
+}
+
+// becomeLeader is called when this instance becomes the leader
+func (d *daemon) becomeLeader() error {
+    log.Info().Msg("Becoming leader, initializing daemon logic")
+
+    // Initialize the GUID pool (rebuild state from existing pods)
     if err := d.initPool(); err != nil {
-        log.Error().Msgf("initPool(): Daemon could not init the guid pool: %v", err)
-        os.Exit(1)
+        log.Error().Msgf("initPool(): Leader could not init the guid pool: %v", err)
+        return fmt.Errorf("failed to initialize GUID pool as leader: %v", err)
     }
 
-    // Run periodic tasks
-    // closing the channel will stop the goroutines executed in the wait.Until() calls below
+    // Start the actual daemon logic
+    d.runLeaderLogic()
+    return nil
+}
+
+// runLeaderLogic runs the actual daemon operations, only called by the leader
+func (d *daemon) runLeaderLogic() {
+    log.Info().Msg("Starting leader daemon logic")
+
+    // Run periodic tasks (only leader should do this)
     stopPeriodicsChan := make(chan struct{})
+
     go wait.Until(d.AddPeriodicUpdate, time.Duration(d.config.PeriodicUpdate)*time.Second, stopPeriodicsChan)
     go wait.Until(d.DeletePeriodicUpdate, time.Duration(d.config.PeriodicUpdate)*time.Second, stopPeriodicsChan)
     defer close(stopPeriodicsChan)
@@ -180,6 +296,8 @@ func (d *daemon) Run() {
     defer watcherStopFunc()
 
     // Run until interrupted by os signals
+    sigChan := make(chan os.Signal, 1)
+    signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)
     sig := <-sigChan
     log.Info().Msgf("Received signal %s. Terminating...", sig)
 }
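
The wiring above follows the standard client-go pattern: RunOrDie blocks while contending, OnStartedLeading gates the daemon logic, and ReleaseOnCancel hands the lease back on shutdown. Below is a minimal self-contained sketch of the resulting standby behaviour, not part of this commit: it uses client-go's fake clientset, hypothetical identities, and much shorter timings than the daemon's 60s/30s/20s.

package main

import (
    "context"
    "fmt"
    "time"

    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/client-go/kubernetes/fake"
    "k8s.io/client-go/tools/leaderelection"
    "k8s.io/client-go/tools/leaderelection/resourcelock"
)

// contend starts one contender for the shared lease and returns immediately.
func contend(ctx context.Context, cs *fake.Clientset, id string) {
    lock := &resourcelock.LeaseLock{
        LeaseMeta:  metav1.ObjectMeta{Name: "ib-kubernetes-leader", Namespace: "kube-system"},
        Client:     cs.CoordinationV1(),
        LockConfig: resourcelock.ResourceLockConfig{Identity: id},
    }
    go leaderelection.RunOrDie(ctx, leaderelection.LeaderElectionConfig{
        Lock:            lock,
        ReleaseOnCancel: true,
        LeaseDuration:   5 * time.Second,
        RenewDeadline:   3 * time.Second,
        RetryPeriod:     1 * time.Second,
        Callbacks: leaderelection.LeaderCallbacks{
            OnStartedLeading: func(ctx context.Context) { fmt.Println(id, "started leading") },
            OnStoppedLeading: func() { fmt.Println(id, "stopped leading") },
        },
    })
}

func main() {
    ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
    defer cancel()

    // Both "replicas" talk to the same (fake) API server and contend for the same Lease object.
    cs := fake.NewSimpleClientset()
    contend(ctx, cs, "node-a_pod-1")
    contend(ctx, cs, "node-b_pod-2")

    // While the lease is held, only one contender prints "started leading";
    // the other keeps retrying every RetryPeriod, mirroring the standby replica.
    <-ctx.Done()
}

In the real daemon, losing the lease exits the process (OnStoppedLeading calls os.Exit(1)), so the standby replica can win the lease and rebuild its state through initPool().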

pkg/k8s-client/client.go

Lines changed: 7 additions & 0 deletions

@@ -28,6 +28,7 @@ import (
     metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
     "k8s.io/apimachinery/pkg/types"
     "k8s.io/client-go/kubernetes"
+    coordv1client "k8s.io/client-go/kubernetes/typed/coordination/v1"
     "k8s.io/client-go/rest"
     "sigs.k8s.io/controller-runtime/pkg/client/config"
 )
@@ -38,6 +39,7 @@ type Client interface {
     PatchPod(pod *kapi.Pod, patchType types.PatchType, patchData []byte) error
     GetNetworkAttachmentDefinition(namespace, name string) (*netapi.NetworkAttachmentDefinition, error)
     GetRestClient() rest.Interface
+    GetCoordinationV1() coordv1client.CoordinationV1Interface
 }
 
 type client struct {
@@ -113,3 +115,8 @@ func (c *client) GetNetworkAttachmentDefinition(namespace, name string) (*netapi
 func (c *client) GetRestClient() rest.Interface {
     return c.clientset.CoreV1().RESTClient()
 }
+
+// GetCoordinationV1 returns the coordination v1 client for leader election
+func (c *client) GetCoordinationV1() coordv1client.CoordinationV1Interface {
+    return c.clientset.CoordinationV1()
+}
