Skip to content

Commit 6599c91

Browse files
committed
Fix bug in UncordonNode() so that nodes get uncordoned properly
Our usage of the drain.RunCordonOrUncordon() helpher method was incorrect. We were always cordoning the node when we meant to uncordon. Signed-off-by: Christopher Desiniotis <[email protected]>
1 parent bce348d commit 6599c91

File tree

1 file changed

+10
-28
lines changed

1 file changed

+10
-28
lines changed

internal/kubernetes/client.go

Lines changed: 10 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -40,9 +40,6 @@ const (
4040
nvidiaDomainPrefix = "nvidia.com"
4141
nvidiaResourceNamePrefix = nvidiaDomainPrefix + "/" + "gpu"
4242
nvidiaMigResourcePrefix = nvidiaDomainPrefix + "/" + "mig-"
43-
44-
nodeOperationCordon = "cordon"
45-
nodeOperationUncordon = "uncordon"
4643
)
4744

4845
// Client represents a Kubernetes client wrapper use to perform all the Kubernetes operations required by k8s-driver-manager
@@ -133,42 +130,27 @@ func (c *Client) GetNodeAnnotationValue(nodeName, annotation string) (string, er
133130
// CordonNode cordons a Node given a Node name marking it as Unschedulable
134131
func (c *Client) CordonNode(nodeName string) error {
135132
c.log.Infof("Cordoning node %s", nodeName)
136-
return c.cordonOrUncordon(nodeName, nodeOperationCordon)
133+
134+
node, err := c.clientset.CoreV1().Nodes().Get(c.ctx, nodeName, metav1.GetOptions{})
135+
if err != nil {
136+
return fmt.Errorf("failed to get node %s: %w", nodeName, err)
137+
}
138+
139+
drainHelper := &drain.Helper{Ctx: c.ctx, Client: c.clientset}
140+
return drain.RunCordonOrUncordon(drainHelper, node, true)
137141
}
138142

139143
// UncordonNode uncordons a Node given a Node name marking it as Schedulable
140144
func (c *Client) UncordonNode(nodeName string) error {
141145
c.log.Infof("Uncordoning node %s", nodeName)
142-
return c.cordonOrUncordon(nodeName, nodeOperationUncordon)
143-
}
144146

145-
func (c *Client) cordonOrUncordon(nodeName string, operation string) error {
146-
// Get the node
147147
node, err := c.clientset.CoreV1().Nodes().Get(c.ctx, nodeName, metav1.GetOptions{})
148148
if err != nil {
149149
return fmt.Errorf("failed to get node %s: %w", nodeName, err)
150150
}
151151

152-
switch operation {
153-
case nodeOperationCordon:
154-
node.Spec.Unschedulable = true
155-
case nodeOperationUncordon:
156-
node.Spec.Unschedulable = false
157-
default:
158-
// this should never get executed
159-
panic(fmt.Errorf("unknown operation %q", operation))
160-
}
161-
162-
drainHelper := &drain.Helper{
163-
Ctx: c.ctx,
164-
Client: c.clientset,
165-
}
166-
err = drain.RunCordonOrUncordon(drainHelper, node, true)
167-
if err != nil {
168-
return fmt.Errorf("failed to perform %s of node %s: %w", operation, nodeName, err)
169-
}
170-
171-
return nil
152+
drainHelper := &drain.Helper{Ctx: c.ctx, Client: c.clientset}
153+
return drain.RunCordonOrUncordon(drainHelper, node, false)
172154
}
173155

174156
// WaitForPodTermination will wait for the termination of pods matching labels from the selectorMap on the node with the specified namespace.

0 commit comments

Comments
 (0)