Skip to content

Commit 7da50fe

Browse files
committed
fix: remove interrupt timeout which was flawed by design
additionally changing the default copy dir to work with versions of linux that do not support exec in the /tmp dir
1 parent ae6b657 commit 7da50fe

File tree

11 files changed

+36
-43
lines changed

11 files changed

+36
-43
lines changed

.vscode/launch.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
"type": "go",
1010
"request": "launch",
1111
"mode": "debug",
12-
"program": "${workspaceRoot}/cmd/main.go",
12+
"program": "${workspaceRoot}/operator/cmd/main.go",
1313
"buildFlags": "--ldflags '-X github.com/NVIDIA/skyhook/internal/version.GIT_SHA=foobars -X github.com/NVIDIA/skyhook/internal/version.VERSION=v0.5.0'",
1414
"env": {
1515
"ENABLE_WEBHOOKS": "false",

chart/templates/skyhook-crd.yaml

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -126,12 +126,6 @@ spec:
126126
minimum: 0
127127
nullable: true
128128
type: integer
129-
timeout:
130-
default: 15m
131-
description: Timeout is how long we wait before we assume something
132-
bad happened to a give node, and are going to move forward.
133-
format: duration
134-
type: string
135129
type: object
136130
nodeSelectors:
137131
description: NodeSelector are a set of labels we want to monitor nodes

chart/values.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ controllerManager:
4343
env:
4444
## copyDirRoot is the directory for which the operator will work from on the host.
4545
## Some environments may require this to be set to a specific directory.
46-
copyDirRoot: /tmp
46+
copyDirRoot: /var/lib/skyhook
4747
## enableWebhooks: "true" will enable the webhook setup in the operator controller.
4848
## Default is "true" and is required for production.
4949
enableWebhooks: "true"

operator/Makefile

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -190,7 +190,7 @@ create-kind-cluster: ## deletes and creates a new kind cluster. versions is set
190190

191191
podman-create-machine: ## creates a podman machine
192192
podman machine stop podman-machine-default || true
193-
podman machine rm podman-machine-default || true
193+
podman machine rm -f podman-machine-default || true
194194
podman machine init --cpus=6 -m=12288 --disk-size=300 podman-machine-default
195195
podman machine start podman-machine-default
196196

@@ -271,10 +271,29 @@ ifndef ignore-not-found
271271
ignore-not-found = false
272272
endif
273273

274+
create-namespace: ## Create the namespace in the K8s cluster specified in ~/.kube/config.
275+
$(KUBECTL) create namespace $(SKYHOOK_NAMESPACE) --dry-run=client -o yaml | kubectl apply -f -
276+
274277
.PHONY: install
275-
install: manifests kustomize ## Install CRDs into the K8s cluster specified in ~/.kube/config.
278+
install: manifests kustomize create-namespace ## Install CRDs into the K8s cluster specified in ~/.kube/config.
276279
$(KUSTOMIZE) build config/crd | $(KUBECTL) apply -f -
277-
$(KUBECTL) create namespace $(SKYHOOK_NAMESPACE) --dry-run=client -o yaml | kubectl apply -f -
280+
281+
cert-manager-yaml-url=https://github.com/cert-manager/cert-manager/releases/download/v1.17.0/cert-manager.yaml
282+
.PHONY: install-cert-manager
283+
install-cert-manager: ## Install cert-manager into the K8s cluster specified in ~/.kube/config.
284+
kubectl apply -f $(cert-manager-yaml-url)
285+
sleep 30 ## wait for cert-manager to be ready
286+
287+
.PHONY: install-helm-chart
288+
install-helm-chart: helm create-namespace install-cert-manager ## Install helm chart into the K8s cluster specified in ~/.kube/config.
289+
$(HELM) install skyhook-operator ../chart -n $(SKYHOOK_NAMESPACE)
290+
291+
.PHONY: uninstall-helm-chart
292+
uninstall-helm-chart: helm ## Uninstall helm chart from the K8s cluster specified in ~/.kube/config.
293+
$(HELM) uninstall skyhook-operator -n $(SKYHOOK_NAMESPACE)
294+
295+
uninstall-cert-manager: ## Uninstall cert-manager from the K8s cluster specified in ~/.kube/config.
296+
kubectl delete -f $(cert-manager-yaml-url)
278297

279298
.PHONY: uninstall
280299
uninstall: manifests kustomize ## Uninstall CRDs from the K8s cluster specified in ~/.kube/config. Call with ignore-not-found=true to ignore resource not found errors during deletion.

operator/api/v1alpha1/skyhook_types.go

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -146,11 +146,6 @@ type InterruptionBudget struct {
146146
//+kubebuilder:validation:Minimum=0
147147
//+nullable
148148
Count *int `json:"count,omitempty"`
149-
150-
// Timeout is how long we wait before we assume something bad happened to a give node, and are going to move forward.
151-
//+kubebuilder:validation:Format=duration
152-
//+kubebuilder:default="15m"
153-
Timeout *metav1.Duration `json:"timeout,omitempty"`
154149
}
155150

156151
func (i *InterruptionBudget) Validate() error {

operator/api/v1alpha1/zz_generated.deepcopy.go

Lines changed: 0 additions & 5 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

operator/config/crd/bases/skyhook.nvidia.com_skyhooks.yaml

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -113,12 +113,6 @@ spec:
113113
minimum: 0
114114
nullable: true
115115
type: integer
116-
timeout:
117-
default: 15m
118-
description: Timeout is how long we wait before we assume something
119-
bad happened to a give node, and are going to move forward.
120-
format: duration
121-
type: string
122116
type: object
123117
nodeSelectors:
124118
description: NodeSelector are a set of labels we want to monitor nodes

operator/config/manager/manager.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,7 @@ spec:
8484
- name: IMAGE_PULL_SECRET
8585
value: node-init-secret
8686
- name: COPY_DIR_ROOT
87-
value: /tmp
87+
value: /var/lib/skyhook
8888
- name: REAPPLY_ON_REBOOT
8989
value: "false"
9090
- name: RUNTIME_REQUIRED_TAINT_KEY

operator/deps.mk

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,7 @@ GINKGO ?= $(LOCALBIN)/ginkgo
7373
MOCKERY ?= $(LOCALBIN)/mockery
7474
CHAINSAW ?= $(LOCALBIN)/chainsaw
7575
HELMIFY ?= $(LOCALBIN)/helmify
76+
HELM ?= $(LOCALBIN)/helm
7677

7778
.PHONY: kustomize
7879
kustomize: $(KUSTOMIZE) ## Download kustomize locally if necessary. If wrong version is installed, it will be removed before downloading.

operator/internal/controller/cluster_state_v2.go

Lines changed: 9 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -265,23 +265,18 @@ func NewNodePicker(runtimeRequiredToleration corev1.Toleration) *NodePicker {
265265
}
266266

267267
// primeAndPruneNodes add current priority from skyhook status, and check time removing old ones
268-
func (s *NodePicker) primeAndPruneNodes(skyhook *wrapper.Skyhook) {
269-
270-
now := time.Now()
271-
for n, t := range skyhook.Status.NodePriority {
272-
if now.Sub(t.Time) > skyhook.Spec.InterruptionBudget.Timeout.Duration {
273-
// prune
274-
// was not cleaned up before now, so it has timed out...
275-
// this could cause issues if the node is still in the cluster api,
276-
// but is bad, but seems like if that is true, there might be bigger problems
277-
// so removing from status
278-
delete(skyhook.Status.NodePriority, n)
279-
skyhook.Updated = true
268+
func (s *NodePicker) primeAndPruneNodes(skyhook *skyhookNodes) {
269+
270+
for n, t := range skyhook.skyhook.Status.NodePriority {
271+
// prune
272+
// if the node is complete, remove it from the priority list
273+
if nodeStatus, _ := skyhook.GetNode(n); nodeStatus == v1alpha1.StatusComplete {
274+
delete(skyhook.skyhook.Status.NodePriority, n)
275+
skyhook.skyhook.Updated = true
280276
} else {
281277
s.priorityNodes[n] = t.Time
282278
}
283279
}
284-
285280
}
286281

287282
// upsertPick updates or inserts the node priority for a given name in the Skyhook object.
@@ -328,7 +323,7 @@ func CheckTaintToleration(tolerations []corev1.Toleration, taints []corev1.Taint
328323

329324
func (np *NodePicker) SelectNodes(s *skyhookNodes) []wrapper.SkyhookNode {
330325

331-
np.primeAndPruneNodes(s.skyhook)
326+
np.primeAndPruneNodes(s)
332327

333328
nodes := make([]wrapper.SkyhookNode, 0)
334329

0 commit comments

Comments
 (0)