Skip to content

Commit a837602

Browse files
authored
Merge pull request #116 from NVIDIA/fix-uncordon-bug
Fix bug in UncordonNode() so that nodes get uncordoned properly
2 parents 8fce1fe + 6599c91 commit a837602

File tree

1 file changed

+10
-28
lines changed

1 file changed

+10
-28
lines changed

internal/kubernetes/client.go

Lines changed: 10 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -40,9 +40,6 @@ const (
4040
nvidiaDomainPrefix = "nvidia.com"
4141
nvidiaResourceNamePrefix = nvidiaDomainPrefix + "/" + "gpu"
4242
nvidiaMigResourcePrefix = nvidiaDomainPrefix + "/" + "mig-"
43-
44-
nodeOperationCordon = "cordon"
45-
nodeOperationUncordon = "uncordon"
4643
)
4744

4845
// Client represents a Kubernetes client wrapper use to perform all the Kubernetes operations required by k8s-driver-manager
@@ -133,42 +130,27 @@ func (c *Client) GetNodeAnnotationValue(nodeName, annotation string) (string, er
133130
// CordonNode cordons a Node given a Node name marking it as Unschedulable
134131
func (c *Client) CordonNode(nodeName string) error {
135132
c.log.Infof("Cordoning node %s", nodeName)
136-
return c.cordonOrUncordon(nodeName, nodeOperationCordon)
133+
134+
node, err := c.clientset.CoreV1().Nodes().Get(c.ctx, nodeName, metav1.GetOptions{})
135+
if err != nil {
136+
return fmt.Errorf("failed to get node %s: %w", nodeName, err)
137+
}
138+
139+
drainHelper := &drain.Helper{Ctx: c.ctx, Client: c.clientset}
140+
return drain.RunCordonOrUncordon(drainHelper, node, true)
137141
}
138142

139143
// UncordonNode uncordons a Node given a Node name marking it as Schedulable
140144
func (c *Client) UncordonNode(nodeName string) error {
141145
c.log.Infof("Uncordoning node %s", nodeName)
142-
return c.cordonOrUncordon(nodeName, nodeOperationUncordon)
143-
}
144146

145-
func (c *Client) cordonOrUncordon(nodeName string, operation string) error {
146-
// Get the node
147147
node, err := c.clientset.CoreV1().Nodes().Get(c.ctx, nodeName, metav1.GetOptions{})
148148
if err != nil {
149149
return fmt.Errorf("failed to get node %s: %w", nodeName, err)
150150
}
151151

152-
switch operation {
153-
case nodeOperationCordon:
154-
node.Spec.Unschedulable = true
155-
case nodeOperationUncordon:
156-
node.Spec.Unschedulable = false
157-
default:
158-
// this should never get executed
159-
panic(fmt.Errorf("unknown operation %q", operation))
160-
}
161-
162-
drainHelper := &drain.Helper{
163-
Ctx: c.ctx,
164-
Client: c.clientset,
165-
}
166-
err = drain.RunCordonOrUncordon(drainHelper, node, true)
167-
if err != nil {
168-
return fmt.Errorf("failed to perform %s of node %s: %w", operation, nodeName, err)
169-
}
170-
171-
return nil
152+
drainHelper := &drain.Helper{Ctx: c.ctx, Client: c.clientset}
153+
return drain.RunCordonOrUncordon(drainHelper, node, false)
172154
}
173155

174156
// WaitForPodTermination will wait for the termination of pods matching labels from the selectorMap on the node with the specified namespace.

0 commit comments

Comments
 (0)