diff --git a/.github/workflows/agentless-container.yaml b/.github/workflows/agentless-container.yaml index 987411cc..e20c3a12 100644 --- a/.github/workflows/agentless-container.yaml +++ b/.github/workflows/agentless-container.yaml @@ -18,12 +18,6 @@ # LICENSE END # - - - - - - name: Build and push agentless container image # Configures this workflow to run every time a tag is created diff --git a/.github/workflows/commit-linting.yaml b/.github/workflows/commit-linting.yaml index c0ef45b0..e7da020a 100644 --- a/.github/workflows/commit-linting.yaml +++ b/.github/workflows/commit-linting.yaml @@ -18,12 +18,6 @@ # LICENSE END # - - - - - - name: Commit Linting on: pull_request jobs: diff --git a/.github/workflows/operator-ci.yaml b/.github/workflows/operator-ci.yaml index accd24b1..42289a50 100644 --- a/.github/workflows/operator-ci.yaml +++ b/.github/workflows/operator-ci.yaml @@ -49,26 +49,26 @@ on: env: REGISTRY: ghcr.io IMAGE_NAME: ${{ github.repository }} - GO_VERSION: 1.23.7 + GO_VERSION: 1.23.8 PLATFORMS: linux/amd64,linux/arm64 # There is a single job in this workflow. It's configured to run on the latest available version of Ubuntu. 
jobs: - unit-test: + ## move it all down to the tests job, should do all the tests then, and not double install stuff + # unit-test: + # runs-on: ubuntu-latest + # steps: + # - uses: actions/checkout@v4 + # - name: Setup Go 1.23 + # uses: actions/setup-go@v5 + # with: + # go-version: 1.23 + # - name: Unit tests + # run: | + # cd operator + # make unit-tests + tests: runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - name: Setup Go 1.23 - uses: actions/setup-go@v5 - with: - go-version: 1.23 - - name: Unit tests - run: | - cd operator - make unit-tests - k8s-tests: - runs-on: ubuntu-latest - needs: [unit-test] # Don't run the k8s tests if the unit tests fail steps: - uses: actions/checkout@v4 with: @@ -94,10 +94,10 @@ jobs: run: | cd operator GITHUB_TOKEN=${{ secrets.github_token }} make create-kind-cluster - make e2e-tests + make test build-and-push-operator: runs-on: ubuntu-latest - needs: [k8s-tests] # Don't run the build and push if the k8s tests fail + needs: [tests] # Don't run the build and push if the k8s tests fail # Sets the permissions granted to the `GITHUB_TOKEN` for the actions in this job. permissions: contents: read diff --git a/.gitignore b/.gitignore index 3c2c83e9..405983bd 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,3 @@ .cursorignore .pytest_cache -.vscode .idea \ No newline at end of file diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 00000000..184a15b8 --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,35 @@ +{ + // Use IntelliSense to learn about possible attributes. + // Hover to view descriptions of existing attributes. 
+ // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [ + { + "name": "Debug Manager", + "type": "go", + "request": "launch", + "mode": "debug", + "program": "${workspaceRoot}/operator/cmd/main.go", + "buildFlags": "--ldflags '-X github.com/NVIDIA/skyhook/internal/version.GIT_SHA=foobars -X github.com/NVIDIA/skyhook/internal/version.VERSION=v0.5.0'", + "env": { + "ENABLE_WEBHOOKS": "false", + "LOG_ENCODER": "console", + "REAPPLY_ON_REBOOT": "false", + // "AGENT_IMAGE": "ghcr.io/nvidia/skyhook/agent:latest", // the real agent image, for testing actual packages + "AGENT_IMAGE": "ghcr.io/nvidia/skyhook/agentless:6.2.0", // this is the mock image, if you need to test for real, use the real image. + }, + "args": [], + "showLog": true + }, + { + "name": "Test Current File", + "type": "go", + "request": "launch", + "mode": "test", + "program": "${file}", + "env": {}, + "args": [], + "showLog": true + } + ] +} \ No newline at end of file diff --git a/agent/skyhook-agent/src/skyhook_agent/__about__.py b/agent/skyhook-agent/src/skyhook_agent/__about__.py index 9ba14a20..2055bbc5 100644 --- a/agent/skyhook-agent/src/skyhook_agent/__about__.py +++ b/agent/skyhook-agent/src/skyhook_agent/__about__.py @@ -18,11 +18,4 @@ # LICENSE END # - - - - - - - __version__ = "0.0.0" diff --git a/agent/skyhook-agent/src/skyhook_agent/__init__.py b/agent/skyhook-agent/src/skyhook_agent/__init__.py index ae7de4e2..7099a60a 100644 --- a/agent/skyhook-agent/src/skyhook_agent/__init__.py +++ b/agent/skyhook-agent/src/skyhook_agent/__init__.py @@ -17,11 +17,3 @@ # # LICENSE END # - - - - - - - - diff --git a/agent/skyhook-agent/src/skyhook_agent/controller.py b/agent/skyhook-agent/src/skyhook_agent/controller.py index 93a7aec7..93dc8a6a 100644 --- a/agent/skyhook-agent/src/skyhook_agent/controller.py +++ b/agent/skyhook-agent/src/skyhook_agent/controller.py @@ -33,6 +33,7 @@ import os import shutil import glob +import signal from 
skyhook_agent.step import Step, UpgradeStep, Idempotence, Mode, CHECK_TO_APPLY from skyhook_agent import interrupts, config @@ -40,6 +41,18 @@ import logging as logger +# Global flag to track if we received SIGTERM +received_sigterm = False + +def sigterm_handler(signum, frame): + """Handle SIGTERM by setting a global flag and logging the event""" + global received_sigterm + received_sigterm = True + logger.info("Received SIGTERM signal - initiating graceful shutdown") + +# Register the SIGTERM handler +signal.signal(signal.SIGTERM, sigterm_handler) + class SkyhookValidationError(Exception): pass @@ -414,7 +427,11 @@ def remove_flags(step_data: dict[Mode, list[Step|UpgradeStep]], config_data: dic if os.path.exists(flag_file): # Check if the file exists before trying to remove it os.remove(flag_file) -def main(mode: Mode, root_mount: str, copy_dir: str, interrupt_data: None|str, always_run_step=False): +def main(mode: Mode, root_mount: str, copy_dir: str, interrupt_data: None|str, always_run_step=False) -> bool: + ''' + returns True if there is a failure in the steps, otherwise returns False + ''' + if mode not in set(map(str, Mode)): logger.warning(f"This version of the Agent doesn't support the {mode} mode. 
Options are: {','.join(map(str, Mode))}.") return False @@ -448,9 +465,19 @@ def main(mode: Mode, root_mount: str, copy_dir: str, interrupt_data: None|str, a if not os.path.exists(f"{root_mount}/{copy_dir}/configmaps/{f}"): raise SkyhookValidationError(f"Expected config file {f} not found in configmaps directory.") - return agent_main(mode, root_mount, copy_dir, config_data, interrupt_data, always_run_step) + try: + return agent_main(mode, root_mount, copy_dir, config_data, interrupt_data, always_run_step) + except Exception as e: + if received_sigterm: + logger.info("Gracefully shutting down due to SIGTERM") + # Perform any cleanup if needed + return True + raise -def agent_main(mode: Mode, root_mount: str, copy_dir: str, config_data: dict, interrupt_data: None|str, always_run_step=False): +def agent_main(mode: Mode, root_mount: str, copy_dir: str, config_data: dict, interrupt_data: None|str, always_run_step=False) -> bool: + ''' + returns True if there is a failure in the steps, otherwise returns False + ''' # Pull out step_data so it matches with existing code step_data = config_data["modes"] @@ -464,6 +491,11 @@ def agent_main(mode: Mode, root_mount: str, copy_dir: str, config_data: dict, in logger.warning(f" There are no {mode} steps defined. This will be ran as a no-op.") for step in step_data.get(mode, []): + # Check for SIGTERM + if received_sigterm: + logger.info("SIGTERM received, stopping step execution") + return True + # Make the flag file without the host path argument (first one). This is because in operator world # the host path is going to change every time the Skyhook Custom Resource changes so it would # look like a step hasn't been run when it fact it had. 
diff --git a/agent/skyhook-agent/src/skyhook_agent/enums.py b/agent/skyhook-agent/src/skyhook_agent/enums.py index 6135b9e1..3905ab39 100644 --- a/agent/skyhook-agent/src/skyhook_agent/enums.py +++ b/agent/skyhook-agent/src/skyhook_agent/enums.py @@ -18,13 +18,6 @@ # LICENSE END # - - - - - - - from enum import Enum class SortableEnum(Enum): diff --git a/agent/skyhook-agent/src/skyhook_agent/interrupts.py b/agent/skyhook-agent/src/skyhook_agent/interrupts.py index 974cbb59..b529d8f3 100644 --- a/agent/skyhook-agent/src/skyhook_agent/interrupts.py +++ b/agent/skyhook-agent/src/skyhook_agent/interrupts.py @@ -19,12 +19,6 @@ # - - - - - - import string import base64 import json diff --git a/agent/skyhook-agent/src/skyhook_agent/step.py b/agent/skyhook-agent/src/skyhook_agent/step.py index 33ba0f4c..3b2b5742 100644 --- a/agent/skyhook-agent/src/skyhook_agent/step.py +++ b/agent/skyhook-agent/src/skyhook_agent/step.py @@ -19,12 +19,6 @@ # - - - - - - from typing import IO from enum import Enum import json diff --git a/agent/skyhook-agent/tests/__init__.py b/agent/skyhook-agent/tests/__init__.py index ae7de4e2..7099a60a 100644 --- a/agent/skyhook-agent/tests/__init__.py +++ b/agent/skyhook-agent/tests/__init__.py @@ -17,11 +17,3 @@ # # LICENSE END # - - - - - - - - diff --git a/agent/skyhook-agent/tests/test_config.py b/agent/skyhook-agent/tests/test_config.py index 937d07fd..06071b56 100644 --- a/agent/skyhook-agent/tests/test_config.py +++ b/agent/skyhook-agent/tests/test_config.py @@ -18,13 +18,6 @@ # LICENSE END # - - - - - - - import unittest from tempfile import TemporaryDirectory diff --git a/agent/skyhook-agent/tests/test_controller.py b/agent/skyhook-agent/tests/test_controller.py index 26db6424..50900e50 100644 --- a/agent/skyhook-agent/tests/test_controller.py +++ b/agent/skyhook-agent/tests/test_controller.py @@ -18,6 +18,7 @@ # LICENSE END # + import unittest import tempfile import os diff --git a/agent/skyhook-agent/tests/test_enums.py 
b/agent/skyhook-agent/tests/test_enums.py index c9d0c7a3..1baf45a4 100644 --- a/agent/skyhook-agent/tests/test_enums.py +++ b/agent/skyhook-agent/tests/test_enums.py @@ -19,12 +19,6 @@ # - - - - - - import unittest from skyhook_agent.enums import SortableEnum, get_latest_schema diff --git a/agent/skyhook-agent/tests/test_interrupts.py b/agent/skyhook-agent/tests/test_interrupts.py index 662fc30d..f825e6c0 100644 --- a/agent/skyhook-agent/tests/test_interrupts.py +++ b/agent/skyhook-agent/tests/test_interrupts.py @@ -19,12 +19,6 @@ # - - - - - - import unittest import base64 import json diff --git a/agent/skyhook-agent/tests/test_steps.py b/agent/skyhook-agent/tests/test_steps.py index 1e5595bf..d6b03371 100644 --- a/agent/skyhook-agent/tests/test_steps.py +++ b/agent/skyhook-agent/tests/test_steps.py @@ -19,12 +19,6 @@ # - - - - - - import unittest, os from tempfile import TemporaryDirectory diff --git a/chart/templates/skyhook-crd.yaml b/chart/templates/skyhook-crd.yaml index b85764ef..ec1dbab2 100644 --- a/chart/templates/skyhook-crd.yaml +++ b/chart/templates/skyhook-crd.yaml @@ -349,6 +349,10 @@ spec: - name type: object type: array + gracefulShutdown: + description: GracefulShutdown is the graceful shutdown timeout + for the package, if not set, uses k8s default + type: string image: description: Image is the container image to run. Do not included the tag, that is set in the version. 
diff --git a/containers/agentless/Dockerfile b/containers/agentless/Dockerfile index 37c60502..39b98ce8 100644 --- a/containers/agentless/Dockerfile +++ b/containers/agentless/Dockerfile @@ -18,7 +18,6 @@ # LICENSE END # - ARG BUSYBOX_TAG=1.36.1 FROM busybox:${BUSYBOX_TAG} diff --git a/containers/agentless/entrypoint.sh b/containers/agentless/entrypoint.sh index c98e69ae..aaf914df 100755 --- a/containers/agentless/entrypoint.sh +++ b/containers/agentless/entrypoint.sh @@ -20,6 +20,13 @@ # LICENSE END # +# Handle SIGTERM gracefully +cleanup() { + echo "Received SIGTERM signal, shutting down gracefully..." + sleep 3 + exit 0 +} +trap cleanup SIGTERM SLEEP_LEN=${SLEEP_LEN:-$(($RANDOM % 5 + 5))} diff --git a/containers/ci.Dockerfile b/containers/ci.Dockerfile index 7a3454f5..674a1557 100644 --- a/containers/ci.Dockerfile +++ b/containers/ci.Dockerfile @@ -18,6 +18,10 @@ # LICENSE END # +## this container is not used in github CI, was from before we open sourced this project. +## should move to doing something like this in the github actions workflow to save time not installing all the deps all the time +## but for now this is just for when we get to that + ARG GO_VERSION FROM golang:${GO_VERSION}-bookworm as builder diff --git a/demos/interrupt-wait-for-pod/non-workload.yaml b/demos/interrupt-wait-for-pod/non-workload.yaml index face6753..699e0661 100644 --- a/demos/interrupt-wait-for-pod/non-workload.yaml +++ b/demos/interrupt-wait-for-pod/non-workload.yaml @@ -18,12 +18,6 @@ # LICENSE END # - - - - - - apiVersion: apps/v1 kind: ReplicaSet metadata: diff --git a/demos/interrupt-wait-for-pod/scr.yaml b/demos/interrupt-wait-for-pod/scr.yaml index a57d0ff3..6f05d644 100644 --- a/demos/interrupt-wait-for-pod/scr.yaml +++ b/demos/interrupt-wait-for-pod/scr.yaml @@ -18,12 +18,6 @@ # LICENSE END # - - - - - - apiVersion: skyhook.nvidia.com/v1alpha1 kind: Skyhook metadata: @@ -42,8 +36,8 @@ spec: app: skyhook-demo-workload packages: baz: - version: 2024.8.10 - image: 
nvcr.io/nvidian/swgpu-baseos/shellscript:1.3.1 + version: 1.1.0 + image: ghcr.io/nvidia/skyhook-packages/shellscript interrupt: type: reboot # type: service diff --git a/demos/interrupt-wait-for-pod/workload.yaml b/demos/interrupt-wait-for-pod/workload.yaml index fead5d4f..1cb8cebf 100644 --- a/demos/interrupt-wait-for-pod/workload.yaml +++ b/demos/interrupt-wait-for-pod/workload.yaml @@ -18,12 +18,6 @@ # LICENSE END # - - - - - - apiVersion: apps/v1 kind: DaemonSet metadata: diff --git a/k8s-tests/chainsaw/helm/helm-chart-test/assert-no-schedule.yaml b/k8s-tests/chainsaw/helm/helm-chart-test/assert-no-schedule.yaml index a3e3ff53..b7d6fa24 100644 --- a/k8s-tests/chainsaw/helm/helm-chart-test/assert-no-schedule.yaml +++ b/k8s-tests/chainsaw/helm/helm-chart-test/assert-no-schedule.yaml @@ -18,12 +18,6 @@ # LICENSE END # - - - - - - apiVersion: v1 kind: Pod metadata: @@ -59,8 +53,7 @@ spec: - command: - /manager ((env[?name == 'RUNTIME_REQUIRED_TAINT'].value)[0] == 'skyhook.nvidia.com=runtime-required:NoSchedule'): true - image: nvcr.io/nvidian/swgpu-baseos/skyhook-operator:test - imagePullPolicy: IfNotPresent + image: ghcr.io/nvidia/skyhook/operator:latest ## THIS should change to be like a tag so it can point at a specific commit livenessProbe: failureThreshold: 3 httpGet: @@ -109,7 +102,6 @@ spec: - name: KUBERNETES_CLUSTER_DOMAIN value: cluster.local image: quay.io/brancz/kube-rbac-proxy:v0.15.0 - imagePullPolicy: IfNotPresent name: kube-rbac-proxy ports: - containerPort: 8443 diff --git a/k8s-tests/chainsaw/helm/helm-chart-test/assert-scheduled.yaml b/k8s-tests/chainsaw/helm/helm-chart-test/assert-scheduled.yaml index 875b2ccd..2c4109d1 100644 --- a/k8s-tests/chainsaw/helm/helm-chart-test/assert-scheduled.yaml +++ b/k8s-tests/chainsaw/helm/helm-chart-test/assert-scheduled.yaml @@ -18,12 +18,6 @@ # LICENSE END # - - - - - - apiVersion: v1 kind: Pod metadata: @@ -59,7 +53,6 @@ spec: - command: - /manager ((env[?name == 'RUNTIME_REQUIRED_TAINT'].value)[0] == 
'skyhook.nvidia.com=runtime-required:NoSchedule'): true - imagePullPolicy: IfNotPresent livenessProbe: failureThreshold: 3 httpGet: @@ -108,7 +101,6 @@ spec: - name: KUBERNETES_CLUSTER_DOMAIN value: cluster.local image: quay.io/brancz/kube-rbac-proxy:v0.15.0 - imagePullPolicy: IfNotPresent name: kube-rbac-proxy ports: - containerPort: 8443 diff --git a/k8s-tests/chainsaw/helm/helm-chart-test/values.yaml b/k8s-tests/chainsaw/helm/helm-chart-test/values.yaml index b2bffdce..5a243142 100644 --- a/k8s-tests/chainsaw/helm/helm-chart-test/values.yaml +++ b/k8s-tests/chainsaw/helm/helm-chart-test/values.yaml @@ -18,12 +18,6 @@ # LICENSE END # - - - - - - controllerManager: # setting a toleration on the manager for testing purposes tolerations: @@ -35,5 +29,5 @@ controllerManager: # telling the helm chart to use the test operator image # for more info refer to the README image: - repository: nvcr.io/nvidian/swgpu-baseos/skyhook-operator - tag: test \ No newline at end of file + repository: ghcr.io/nvidia/skyhook/operator + tag: latest ## THIS should change to be like a tag so it can point at a specific commit \ No newline at end of file diff --git a/k8s-tests/chainsaw/helm/helm-scale-test/assert-override-resources.yaml b/k8s-tests/chainsaw/helm/helm-scale-test/assert-override-resources.yaml index 76335270..b676d158 100644 --- a/k8s-tests/chainsaw/helm/helm-scale-test/assert-override-resources.yaml +++ b/k8s-tests/chainsaw/helm/helm-scale-test/assert-override-resources.yaml @@ -18,12 +18,6 @@ # LICENSE END # - - - - - - apiVersion: v1 kind: Pod metadata: diff --git a/k8s-tests/chainsaw/helm/helm-scale-test/assert-scaled-resources.yaml b/k8s-tests/chainsaw/helm/helm-scale-test/assert-scaled-resources.yaml index 078af114..fd61d352 100644 --- a/k8s-tests/chainsaw/helm/helm-scale-test/assert-scaled-resources.yaml +++ b/k8s-tests/chainsaw/helm/helm-scale-test/assert-scaled-resources.yaml @@ -18,12 +18,6 @@ # LICENSE END # - - - - - - apiVersion: v1 kind: Pod metadata: diff --git 
a/k8s-tests/chainsaw/helm/helm-scale-test/chainsaw-test.yaml b/k8s-tests/chainsaw/helm/helm-scale-test/chainsaw-test.yaml index e21ab381..3d300f6c 100644 --- a/k8s-tests/chainsaw/helm/helm-scale-test/chainsaw-test.yaml +++ b/k8s-tests/chainsaw/helm/helm-scale-test/chainsaw-test.yaml @@ -22,7 +22,7 @@ apiVersion: chainsaw.kyverno.io/v1alpha1 kind: Test metadata: - name: helm-chart + name: helm-scale spec: description: This test asserts that the helm chart is working as expected. Specifically it asserts that the helm chart works when given a different deployment name than skyhook-operator and that the tolerations that are given to the chart through a values file work as expected. diff --git a/k8s-tests/chainsaw/helm/helm-scale-test/values-scale.yaml b/k8s-tests/chainsaw/helm/helm-scale-test/values-scale.yaml index db9c493a..54d5ed56 100644 --- a/k8s-tests/chainsaw/helm/helm-scale-test/values-scale.yaml +++ b/k8s-tests/chainsaw/helm/helm-scale-test/values-scale.yaml @@ -18,18 +18,12 @@ # LICENSE END # - - - - - - controllerManager: manager: # telling the helm chart to use the test operator image # for more info refer to the README image: - repository: nvcr.io/nvidian/swgpu-baseos/skyhook-operator - tag: test + repository: ghcr.io/nvidia/skyhook/operator + tag: latest ## THIS should change to be like a tag so it can point at a specific commit estimatedNodeCount: 400 estimatedPackageCount: 5 \ No newline at end of file diff --git a/k8s-tests/chainsaw/helm/helm-scale-test/vaules-override-resources.yaml b/k8s-tests/chainsaw/helm/helm-scale-test/vaules-override-resources.yaml index a632674e..8cb402ad 100644 --- a/k8s-tests/chainsaw/helm/helm-scale-test/vaules-override-resources.yaml +++ b/k8s-tests/chainsaw/helm/helm-scale-test/vaules-override-resources.yaml @@ -18,12 +18,6 @@ # LICENSE END # - - - - - - controllerManager: manager: resources: @@ -36,5 +30,5 @@ controllerManager: # telling the helm chart to use the test operator image # for more info refer to the README 
image: - repository: nvcr.io/nvidian/swgpu-baseos/skyhook-operator + repository: ghcr.io/nvidia/skyhook/operator tag: test \ No newline at end of file diff --git a/k8s-tests/chainsaw/helm/install-cert-manager.sh b/k8s-tests/chainsaw/helm/install-cert-manager.sh index a20a2f06..8c0905ef 100755 --- a/k8s-tests/chainsaw/helm/install-cert-manager.sh +++ b/k8s-tests/chainsaw/helm/install-cert-manager.sh @@ -21,18 +21,6 @@ # - - - - - - - - - - - - ## need to specify different paths for the helm binary ## depending on whether or not this is being ran in CI if [ -n "$GITLAB_CI" ]; then @@ -41,7 +29,7 @@ else HELM=$(which helm) fi -VERSION=${1:-v1.16.2} +VERSION=${1:-v1.17.0} ## add chart repo $HELM repo add jetstack https://charts.jetstack.io --force-update diff --git a/k8s-tests/chainsaw/helm/install-helm-chart.sh b/k8s-tests/chainsaw/helm/install-helm-chart.sh index 3bd3cbbd..a4f86f25 100755 --- a/k8s-tests/chainsaw/helm/install-helm-chart.sh +++ b/k8s-tests/chainsaw/helm/install-helm-chart.sh @@ -20,19 +20,6 @@ # LICENSE END # - - - - - - - - - - - - - OPERATOR_NAME=$1 VALUES_FILE_NAME=${2:-values.yaml} diff --git a/k8s-tests/chainsaw/helm/readme.md b/k8s-tests/chainsaw/helm/readme.md index 4912bcd1..dccab6bf 100644 --- a/k8s-tests/chainsaw/helm/readme.md +++ b/k8s-tests/chainsaw/helm/readme.md @@ -2,6 +2,6 @@ This directory holds all the tests for the skyhook operator's helm chart. Right now this mainly ensures that tolerations set in the helm chart actually work and that the operator can be deployed successfully under another deployment name than skyhook-operator. ## Test Image -The image that is used by these tests should be `nvcr.io/nvidian/swgpu-baseos/skyhook-operator:test` since this will be built in CI every time a commit is pushed to Gitlab and will make sure that you current changes to the operator are compatible with the helm chart still. 
+The image that is used by these tests should be `ghcr.io/nvidia/skyhook/operator:test` (NOTE: this does not exist in the current github CI, this needs to be fixed) since this will be built in CI every time a commit is pushed to Gitlab and will make sure that your current changes to the operator are compatible with the helm chart still. **NOTE:** When you run the helm chart tests locally it may be using an outdated version of the test image since it hasn't been pushed and built by the CI. Be careful in the assumptions you make as your changes to the operator may pass the helm chart tests locally but fail in CI. \ No newline at end of file diff --git a/k8s-tests/chainsaw/helm/taint-nodes.sh b/k8s-tests/chainsaw/helm/taint-nodes.sh index 05730301..82042dd2 100755 --- a/k8s-tests/chainsaw/helm/taint-nodes.sh +++ b/k8s-tests/chainsaw/helm/taint-nodes.sh @@ -21,18 +21,6 @@ # - - - - - - - - - - - - TAINT=$1 for node in $(kubectl get nodes -o name); do diff --git a/k8s-tests/chainsaw/helm/uninstall-cert-manager.sh b/k8s-tests/chainsaw/helm/uninstall-cert-manager.sh index d58bb0a4..75be04a5 100755 --- a/k8s-tests/chainsaw/helm/uninstall-cert-manager.sh +++ b/k8s-tests/chainsaw/helm/uninstall-cert-manager.sh @@ -21,18 +21,6 @@ # - - - - - - - - - - - - ## need to specify different paths for the helm binary ## depending on whether or not this is being ran in CI if [ -n "$GITLAB_CI" ]; then diff --git a/k8s-tests/chainsaw/helm/uninstall-helm-chart.sh b/k8s-tests/chainsaw/helm/uninstall-helm-chart.sh index c9c7a2a2..6c8a975e 100755 --- a/k8s-tests/chainsaw/helm/uninstall-helm-chart.sh +++ b/k8s-tests/chainsaw/helm/uninstall-helm-chart.sh @@ -21,18 +21,6 @@ # - - - - - - - - - - - - OPERATOR_NAME=$1 ## need to specify different paths for the helm binary diff --git a/k8s-tests/chainsaw/helm/untaint-nodes.sh b/k8s-tests/chainsaw/helm/untaint-nodes.sh index bed3bfc5..50e63398 100755 --- a/k8s-tests/chainsaw/helm/untaint-nodes.sh +++ b/k8s-tests/chainsaw/helm/untaint-nodes.sh @@ 
-22,18 +22,6 @@ - - - - - - - - - - - - set -x -e -o pipefail TAINT=$1 diff --git a/k8s-tests/chainsaw/skyhook/cleanup-pods/assert-cleaned-pods.yaml b/k8s-tests/chainsaw/skyhook/cleanup-pods/assert-cleaned-pods.yaml new file mode 100644 index 00000000..d33ca9e7 --- /dev/null +++ b/k8s-tests/chainsaw/skyhook/cleanup-pods/assert-cleaned-pods.yaml @@ -0,0 +1,165 @@ +# +# LICENSE START +# +# Copyright (c) NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# LICENSE END +# +--- +kind: Pod +apiVersion: v1 +metadata: + namespace: skyhook + labels: + skyhook.nvidia.com/name: cleanup-pods + skyhook.nvidia.com/package: aa-1.2.3 + annotations: + ("skyhook.nvidia.com/package" && parse_json("skyhook.nvidia.com/package")): + { + "name": "aa", + "version": "1.2.3", + "skyhook": "cleanup-pods", + "stage": "apply", + "image": "ghcr.io/nvidia/skyhook/agentless" + } + ownerReferences: + - apiVersion: skyhook.nvidia.com/v1alpha1 + kind: Skyhook + name: cleanup-pods +spec: + nodeName: kind-worker + initContainers: + - name: aa-init + - name: aa-apply + args: + ([0]): apply + ([1]): /root + (length(@)): 3 + - name: aa-applycheck + args: + ([0]): apply-check + ([1]): /root + (length(@)): 3 +--- +kind: Pod +apiVersion: v1 +metadata: + namespace: skyhook + labels: + skyhook.nvidia.com/name: cleanup-pods + skyhook.nvidia.com/package: aa-1.2.3 + annotations: + ("skyhook.nvidia.com/package" && parse_json("skyhook.nvidia.com/package")): + { + "name": "aa", 
+ "version": "1.2.3", + "skyhook": "cleanup-pods", + "stage": "config", + "image": "ghcr.io/nvidia/skyhook/agentless" + } + ownerReferences: + - apiVersion: skyhook.nvidia.com/v1alpha1 + kind: Skyhook + name: cleanup-pods +spec: + nodeName: kind-worker + initContainers: + - name: aa-init + - name: aa-config + args: + ([0]): config + ([1]): /root + (length(@)): 3 + - name: aa-configcheck + args: + ([0]): config-check + ([1]): /root + (length(@)): 3 +--- +kind: Pod +apiVersion: v1 +metadata: + namespace: skyhook + labels: + skyhook.nvidia.com/name: cleanup-pods + skyhook.nvidia.com/package: bb-1.2 + annotations: + ("skyhook.nvidia.com/package" && parse_json("skyhook.nvidia.com/package")): + { + "name": "bb", + "version": "1.2", + "skyhook": "cleanup-pods", + "stage": "apply", + "image": "ghcr.io/nvidia/skyhook/agentless" + } + ownerReferences: + - apiVersion: skyhook.nvidia.com/v1alpha1 + kind: Skyhook + name: cleanup-pods +spec: + nodeName: kind-worker + initContainers: + - name: bb-init + - name: bb-apply + args: + ([0]): apply + ([1]): /root + (length(@)): 3 + - name: bb-applycheck + args: + ([0]): apply-check + ([1]): /root + (length(@)): 3 +--- +apiVersion: skyhook.nvidia.com/v1alpha1 +kind: Skyhook +metadata: + name: cleanup-pods +status: + status: erroring + observedGeneration: 4 + nodeState: + kind-worker: + aa|1.2.3: + name: aa + state: complete + version: '1.2.3' + image: ghcr.io/nvidia/skyhook/agentless + stage: config + bb|1.2: + name: bb + state: erroring + version: '1.2' + image: ghcr.io/nvidia/skyhook/agentless + stage: apply + kind-worker2: + aa|1.2.3: + name: aa + state: complete + version: '1.2.3' + image: ghcr.io/nvidia/skyhook/agentless + stage: config + bb|1.2: + name: bb + state: erroring + version: '1.2' + image: ghcr.io/nvidia/skyhook/agentless + stage: config + + nodeStatus: + # grab values should be one and is complete + (values(@)): + - erroring + - erroring \ No newline at end of file diff --git 
a/k8s-tests/chainsaw/skyhook/cleanup-pods/assert-config-complete.yaml b/k8s-tests/chainsaw/skyhook/cleanup-pods/assert-config-complete.yaml new file mode 100644 index 00000000..b90574e4 --- /dev/null +++ b/k8s-tests/chainsaw/skyhook/cleanup-pods/assert-config-complete.yaml @@ -0,0 +1,82 @@ +# +# LICENSE START +# +# Copyright (c) NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# LICENSE END +# + +--- +kind: Pod +apiVersion: v1 +metadata: + namespace: skyhook + labels: + skyhook.nvidia.com/name: cleanup-pods + skyhook.nvidia.com/package: bb-1.2 + annotations: + ("skyhook.nvidia.com/package" && parse_json("skyhook.nvidia.com/package")): + { + "name": "bb", + "version": "1.2", + "skyhook": "cleanup-pods", + "stage": "config", + "image": "ghcr.io/nvidia/skyhook/agentless" + } + ownerReferences: + - apiVersion: skyhook.nvidia.com/v1alpha1 + kind: Skyhook + name: cleanup-pods +spec: + terminationGracePeriodSeconds: 46 + initContainers: + - name: bb-init + - name: bb-config + args: + ([0]): config + ([1]): /root + (length(@)): 3 + - name: bb-configcheck + args: + ([0]): config-check + ([1]): /root + (length(@)): 3 +--- +apiVersion: skyhook.nvidia.com/v1alpha1 +kind: Skyhook +metadata: + name: cleanup-pods +status: + status: complete + observedGeneration: 4 + nodeState: + (values(@)): + - aa|1.2.3: + image: ghcr.io/nvidia/skyhook/agentless + name: aa + stage: config + state: complete + version: 1.2.3 + - bb|1.2: + image: 
ghcr.io/nvidia/skyhook/agentless + name: bb + stage: config + state: complete + version: "1.2" + nodeStatus: + # grab values should be one and is complete + (values(@)): + - complete + - complete \ No newline at end of file diff --git a/k8s-tests/chainsaw/skyhook/cleanup-pods/assert-setup-complete.yaml b/k8s-tests/chainsaw/skyhook/cleanup-pods/assert-setup-complete.yaml new file mode 100644 index 00000000..17dc47b5 --- /dev/null +++ b/k8s-tests/chainsaw/skyhook/cleanup-pods/assert-setup-complete.yaml @@ -0,0 +1,48 @@ +# +# LICENSE START +# +# Copyright (c) NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# LICENSE END +# + +apiVersion: skyhook.nvidia.com/v1alpha1 +kind: Skyhook +metadata: + name: cleanup-pods +status: + status: complete + nodesInProgress: 0 + observedGeneration: 2 + packageList: aa:1.2.3,bb:1.2 + nodeState: + (values(@)): + - aa|1.2.3: + image: ghcr.io/nvidia/skyhook/agentless + name: aa + stage: config + state: complete + version: 1.2.3 + - bb|1.2: + image: ghcr.io/nvidia/skyhook/agentless + name: bb + stage: config + state: complete + version: "1.2" + nodeStatus: + # grab values should be one and is complete + (values(@)): + - complete + - complete diff --git a/k8s-tests/chainsaw/skyhook/cleanup-pods/chainsaw-test.yaml b/k8s-tests/chainsaw/skyhook/cleanup-pods/chainsaw-test.yaml new file mode 100644 index 00000000..1098a79d --- /dev/null +++ b/k8s-tests/chainsaw/skyhook/cleanup-pods/chainsaw-test.yaml @@ -0,0 +1,65 @@ +# +# LICENSE START +# +# Copyright (c) NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# LICENSE END +# + +# yaml-language-server: $schema=https://raw.githubusercontent.com/kyverno/chainsaw/main/.schemas/json/test-chainsaw-v1alpha1.json +apiVersion: chainsaw.kyverno.io/v1alpha1 +kind: Test +metadata: + name: cleanup-pods +spec: + concurrent: true + description: | + This test runs a simple skyhook with dependsOn. We wait until it completes, then trigger update to force config cycle on package B. 
Once config + is complete, we update again to make the package error, and at the same time clear out the node annotation to trigger cleanup. + The cleanup should remove pods that are running that should not be running because the node was "Reset". + timeouts: + assert: 240s + steps: + - try: + ## setup step, skyhook to complete + - script: + content: | + ## remove annotation from last run + ../rest_test.sh cleanup-pods "node-role.kubernetes.io/control-plane notin ()" + + - create: + file: setup.yaml + - assert: + file: assert-setup-complete.yaml + - sleep: + ## there is a race between pods marking the node complete, i think it will trigger eventually, but the event is missed + duration: 2s + - update: + file: force-config.yaml ## trigger config cycle + - assert: + file: assert-config-complete.yaml + - sleep: + ## there is a race between pods marking the node complete, i think it will trigger eventually, but the event is missed + duration: 2s + - try: + - update: + file: muck_up.yaml + ## whack the node annotation to reset the node + - script: + content: | + ## delete the node annotation on one node to trigger cleanup + kubectl annotate node/kind-worker skyhook.nvidia.com/nodeState_cleanup-pods- + - assert: + file: assert-cleaned-pods.yaml \ No newline at end of file diff --git a/k8s-tests/chainsaw/skyhook/cleanup-pods/force-config.yaml b/k8s-tests/chainsaw/skyhook/cleanup-pods/force-config.yaml new file mode 100644 index 00000000..68e1c4c2 --- /dev/null +++ b/k8s-tests/chainsaw/skyhook/cleanup-pods/force-config.yaml @@ -0,0 +1,60 @@ +# +# LICENSE START +# +# Copyright (c) NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# LICENSE END +# + +apiVersion: skyhook.nvidia.com/v1alpha1 +kind: Skyhook +metadata: + labels: + app.kubernetes.io/part-of: skyhook-operator + app.kubernetes.io/created-by: skyhook-operator + name: cleanup-pods +spec: + nodeSelectors: + matchExpressions: + - key: node-role.kubernetes.io/control-plane + operator: DoesNotExist ## all worker nodes + packages: + aa: + version: "1.2.3" + image: ghcr.io/nvidia/skyhook/agentless:bogus + env: + - name: SLEEP_LEN + value: "3" + resources: + cpuLimit: 50m + cpuRequest: 50m + memoryLimit: 32Mi + memoryRequest: 32Mi + bb: + version: "1.2" + gracefulShutdown: 46s + image: ghcr.io/nvidia/skyhook/agentless + dependsOn: + aa: "1.2.3" + env: + - name: SLEEP_LEN + value: "2" + resources: + cpuLimit: 50m + cpuRequest: 50m + memoryLimit: 32Mi + memoryRequest: 32Mi + configMap: + foo: buz \ No newline at end of file diff --git a/k8s-tests/chainsaw/skyhook/cleanup-pods/muck_up.yaml b/k8s-tests/chainsaw/skyhook/cleanup-pods/muck_up.yaml new file mode 100644 index 00000000..63a10451 --- /dev/null +++ b/k8s-tests/chainsaw/skyhook/cleanup-pods/muck_up.yaml @@ -0,0 +1,62 @@ +# +# LICENSE START +# +# Copyright (c) NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# LICENSE END +# + +apiVersion: skyhook.nvidia.com/v1alpha1 +kind: Skyhook +metadata: + labels: + app.kubernetes.io/part-of: skyhook-operator + app.kubernetes.io/created-by: skyhook-operator + name: cleanup-pods +spec: + nodeSelectors: + matchExpressions: + - key: node-role.kubernetes.io/control-plane + operator: DoesNotExist ## all worker nodes + packages: + aa: + version: "1.2.3" + image: ghcr.io/nvidia/skyhook/agentless:bogus + env: + - name: SLEEP_LEN + value: "3" + resources: + cpuLimit: 50m + cpuRequest: 50m + memoryLimit: 32Mi + memoryRequest: 32Mi + bb: + version: "1.2" + gracefulShutdown: 46s + image: ghcr.io/nvidia/skyhook/agentless + dependsOn: + aa: "1.2.3" + env: + - name: EXIT_CODE + value: "2" + - name: SLEEP_LEN + value: "2" + resources: + cpuLimit: 50m + cpuRequest: 50m + memoryLimit: 32Mi + memoryRequest: 32Mi + configMap: + foo: bur \ No newline at end of file diff --git a/k8s-tests/chainsaw/skyhook/cleanup-pods/setup.yaml b/k8s-tests/chainsaw/skyhook/cleanup-pods/setup.yaml new file mode 100644 index 00000000..cd734732 --- /dev/null +++ b/k8s-tests/chainsaw/skyhook/cleanup-pods/setup.yaml @@ -0,0 +1,59 @@ +# +# LICENSE START +# +# Copyright (c) NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# LICENSE END +# + +apiVersion: skyhook.nvidia.com/v1alpha1 +kind: Skyhook +metadata: + labels: + app.kubernetes.io/part-of: skyhook-operator + app.kubernetes.io/created-by: skyhook-operator + name: cleanup-pods +spec: + nodeSelectors: + matchExpressions: + - key: node-role.kubernetes.io/control-plane + operator: DoesNotExist ## all worker nodes + packages: + aa: + version: "1.2.3" + image: ghcr.io/nvidia/skyhook/agentless:bogus + env: + - name: SLEEP_LEN + value: "2" + resources: + cpuLimit: 50m + cpuRequest: 50m + memoryLimit: 32Mi + memoryRequest: 32Mi + bb: + version: "1.2" + image: ghcr.io/nvidia/skyhook/agentless + dependsOn: + aa: "1.2.3" + env: + - name: SLEEP_LEN + value: "2" + resources: + cpuLimit: 50m + cpuRequest: 50m + memoryLimit: 32Mi + memoryRequest: 32Mi + configMap: + foo: bar \ No newline at end of file diff --git a/k8s-tests/chainsaw/skyhook/config-skyhook/assert-update-no-interrupt.yaml b/k8s-tests/chainsaw/skyhook/config-skyhook/assert-update-no-interrupt.yaml index 4430b96a..91a7cdb4 100644 --- a/k8s-tests/chainsaw/skyhook/config-skyhook/assert-update-no-interrupt.yaml +++ b/k8s-tests/chainsaw/skyhook/config-skyhook/assert-update-no-interrupt.yaml @@ -18,12 +18,6 @@ # LICENSE END # - - - - - - --- apiVersion: v1 kind: Node diff --git a/k8s-tests/chainsaw/skyhook/config-skyhook/assert-update-while-running.yaml b/k8s-tests/chainsaw/skyhook/config-skyhook/assert-update-while-running.yaml index adb92de6..bb8f81de 100644 --- a/k8s-tests/chainsaw/skyhook/config-skyhook/assert-update-while-running.yaml +++ 
b/k8s-tests/chainsaw/skyhook/config-skyhook/assert-update-while-running.yaml @@ -18,12 +18,6 @@ # LICENSE END # - - - - - - --- apiVersion: v1 kind: Node diff --git a/k8s-tests/chainsaw/skyhook/config-skyhook/assert-update.yaml b/k8s-tests/chainsaw/skyhook/config-skyhook/assert-update.yaml index 00ad5dd5..b5ae5941 100644 --- a/k8s-tests/chainsaw/skyhook/config-skyhook/assert-update.yaml +++ b/k8s-tests/chainsaw/skyhook/config-skyhook/assert-update.yaml @@ -18,12 +18,6 @@ # LICENSE END # - - - - - - --- apiVersion: v1 kind: Node diff --git a/k8s-tests/chainsaw/skyhook/config-skyhook/assert.yaml b/k8s-tests/chainsaw/skyhook/config-skyhook/assert.yaml index 87e57dc2..7e4eacf6 100644 --- a/k8s-tests/chainsaw/skyhook/config-skyhook/assert.yaml +++ b/k8s-tests/chainsaw/skyhook/config-skyhook/assert.yaml @@ -18,12 +18,6 @@ # LICENSE END # - - - - - - apiVersion: v1 kind: Node metadata: diff --git a/k8s-tests/chainsaw/skyhook/config-skyhook/chainsaw-test.yaml b/k8s-tests/chainsaw/skyhook/config-skyhook/chainsaw-test.yaml index d787c1de..9a111dc0 100644 --- a/k8s-tests/chainsaw/skyhook/config-skyhook/chainsaw-test.yaml +++ b/k8s-tests/chainsaw/skyhook/config-skyhook/chainsaw-test.yaml @@ -18,12 +18,6 @@ # LICENSE END # - - - - - - # yaml-language-server: $schema=https://raw.githubusercontent.com/kyverno/chainsaw/main/.schemas/json/test-chainsaw-v1alpha1.json apiVersion: chainsaw.kyverno.io/v1alpha1 kind: Test diff --git a/k8s-tests/chainsaw/skyhook/config-skyhook/skyhook.yaml b/k8s-tests/chainsaw/skyhook/config-skyhook/skyhook.yaml index d34c1fdb..44a65390 100644 --- a/k8s-tests/chainsaw/skyhook/config-skyhook/skyhook.yaml +++ b/k8s-tests/chainsaw/skyhook/config-skyhook/skyhook.yaml @@ -18,12 +18,6 @@ # LICENSE END # - - - - - - apiVersion: skyhook.nvidia.com/v1alpha1 kind: Skyhook metadata: diff --git a/k8s-tests/chainsaw/skyhook/config-skyhook/update-no-interrupt.yaml b/k8s-tests/chainsaw/skyhook/config-skyhook/update-no-interrupt.yaml index ed514276..b6af9dd6 100644 --- 
a/k8s-tests/chainsaw/skyhook/config-skyhook/update-no-interrupt.yaml +++ b/k8s-tests/chainsaw/skyhook/config-skyhook/update-no-interrupt.yaml @@ -18,12 +18,6 @@ # LICENSE END # - - - - - - apiVersion: skyhook.nvidia.com/v1alpha1 kind: Skyhook metadata: diff --git a/k8s-tests/chainsaw/skyhook/config-skyhook/update-while-running.yaml b/k8s-tests/chainsaw/skyhook/config-skyhook/update-while-running.yaml index 976d19fd..a5e4ea32 100644 --- a/k8s-tests/chainsaw/skyhook/config-skyhook/update-while-running.yaml +++ b/k8s-tests/chainsaw/skyhook/config-skyhook/update-while-running.yaml @@ -18,12 +18,6 @@ # LICENSE END # - - - - - - apiVersion: skyhook.nvidia.com/v1alpha1 kind: Skyhook metadata: diff --git a/k8s-tests/chainsaw/skyhook/config-skyhook/update.yaml b/k8s-tests/chainsaw/skyhook/config-skyhook/update.yaml index 4616b6d5..3ea9ec2e 100644 --- a/k8s-tests/chainsaw/skyhook/config-skyhook/update.yaml +++ b/k8s-tests/chainsaw/skyhook/config-skyhook/update.yaml @@ -18,12 +18,6 @@ # LICENSE END # - - - - - - apiVersion: skyhook.nvidia.com/v1alpha1 kind: Skyhook metadata: diff --git a/k8s-tests/chainsaw/skyhook/failure-skyhook/chainsaw-test.yaml b/k8s-tests/chainsaw/skyhook/failure-skyhook/chainsaw-test.yaml index ac53447b..44fddd65 100644 --- a/k8s-tests/chainsaw/skyhook/failure-skyhook/chainsaw-test.yaml +++ b/k8s-tests/chainsaw/skyhook/failure-skyhook/chainsaw-test.yaml @@ -18,12 +18,6 @@ # LICENSE END # - - - - - - # yaml-language-server: $schema=https://raw.githubusercontent.com/kyverno/chainsaw/main/.schemas/json/test-chainsaw-v1alpha1.json apiVersion: chainsaw.kyverno.io/v1alpha1 kind: Test diff --git a/k8s-tests/chainsaw/skyhook/failure-skyhook/node-assert.yaml b/k8s-tests/chainsaw/skyhook/failure-skyhook/node-assert.yaml index 04c6897e..9cecbcb3 100644 --- a/k8s-tests/chainsaw/skyhook/failure-skyhook/node-assert.yaml +++ b/k8s-tests/chainsaw/skyhook/failure-skyhook/node-assert.yaml @@ -18,12 +18,6 @@ # LICENSE END # - - - - - - apiVersion: v1 kind: Node metadata: 
diff --git a/k8s-tests/chainsaw/skyhook/failure-skyhook/skyhook.yaml b/k8s-tests/chainsaw/skyhook/failure-skyhook/skyhook.yaml index 340d406a..eb1e582e 100644 --- a/k8s-tests/chainsaw/skyhook/failure-skyhook/skyhook.yaml +++ b/k8s-tests/chainsaw/skyhook/failure-skyhook/skyhook.yaml @@ -18,12 +18,6 @@ # LICENSE END # - - - - - - apiVersion: skyhook.nvidia.com/v1alpha1 kind: Skyhook metadata: diff --git a/k8s-tests/chainsaw/skyhook/hello-world/chainsaw-test.yaml b/k8s-tests/chainsaw/skyhook/hello-world/chainsaw-test.yaml index 5306dbee..8a2d609f 100644 --- a/k8s-tests/chainsaw/skyhook/hello-world/chainsaw-test.yaml +++ b/k8s-tests/chainsaw/skyhook/hello-world/chainsaw-test.yaml @@ -18,12 +18,6 @@ # LICENSE END # - - - - - - # yaml-language-server: $schema=https://raw.githubusercontent.com/kyverno/chainsaw/main/.schemas/json/test-chainsaw-v1alpha1.json apiVersion: chainsaw.kyverno.io/v1alpha1 kind: Test diff --git a/k8s-tests/chainsaw/skyhook/hello-world/configmap-assert.yaml b/k8s-tests/chainsaw/skyhook/hello-world/configmap-assert.yaml index e9a7ab26..3fb524c2 100644 --- a/k8s-tests/chainsaw/skyhook/hello-world/configmap-assert.yaml +++ b/k8s-tests/chainsaw/skyhook/hello-world/configmap-assert.yaml @@ -18,12 +18,6 @@ # LICENSE END # - - - - - - apiVersion: v1 kind: ConfigMap metadata: diff --git a/k8s-tests/chainsaw/skyhook/hello-world/configmap.yaml b/k8s-tests/chainsaw/skyhook/hello-world/configmap.yaml index e9a7ab26..3fb524c2 100644 --- a/k8s-tests/chainsaw/skyhook/hello-world/configmap.yaml +++ b/k8s-tests/chainsaw/skyhook/hello-world/configmap.yaml @@ -18,12 +18,6 @@ # LICENSE END # - - - - - - apiVersion: v1 kind: ConfigMap metadata: diff --git a/k8s-tests/chainsaw/skyhook/interrupt-grouping/chainsaw-test.yaml b/k8s-tests/chainsaw/skyhook/interrupt-grouping/chainsaw-test.yaml index e1f43350..fdfec731 100644 --- a/k8s-tests/chainsaw/skyhook/interrupt-grouping/chainsaw-test.yaml +++ b/k8s-tests/chainsaw/skyhook/interrupt-grouping/chainsaw-test.yaml @@ -18,12 
+18,6 @@ # LICENSE END # - - - - - - # yaml-language-server: $schema=https://raw.githubusercontent.com/kyverno/chainsaw/main/.schemas/json/test-chainsaw-v1alpha1.json apiVersion: chainsaw.kyverno.io/v1alpha1 kind: Test diff --git a/k8s-tests/chainsaw/skyhook/interrupt-grouping/skyhook.yaml b/k8s-tests/chainsaw/skyhook/interrupt-grouping/skyhook.yaml index cfa30e26..15f2e08b 100644 --- a/k8s-tests/chainsaw/skyhook/interrupt-grouping/skyhook.yaml +++ b/k8s-tests/chainsaw/skyhook/interrupt-grouping/skyhook.yaml @@ -18,12 +18,6 @@ # LICENSE END # - - - - - - --- apiVersion: skyhook.nvidia.com/v1alpha1 kind: Skyhook diff --git a/k8s-tests/chainsaw/skyhook/interrupt/assert.yaml b/k8s-tests/chainsaw/skyhook/interrupt/assert.yaml index 7e4d8d30..5d7bd016 100644 --- a/k8s-tests/chainsaw/skyhook/interrupt/assert.yaml +++ b/k8s-tests/chainsaw/skyhook/interrupt/assert.yaml @@ -18,40 +18,43 @@ # LICENSE END # ---- -kind: Pod -apiVersion: v1 -metadata: - namespace: skyhook - labels: - skyhook.nvidia.com/name: interrupt - skyhook.nvidia.com/package: invalid-1.2.3 - annotations: - ("skyhook.nvidia.com/package" && parse_json("skyhook.nvidia.com/package")): - { - "name": "invalid", - "version": "1.2.3", - "skyhook": "interrupt", - "stage": "uninstall", - "image": "ghcr.io/nvidia/skyhook/agentless" - } - ownerReferences: - - apiVersion: skyhook.nvidia.com/v1alpha1 - kind: Skyhook - name: interrupt -spec: - initContainers: - - name: invalid-init - - name: invalid-uninstall - args: - ([0]): uninstall - ([1]): /root - (length(@)): 3 - - name: invalid-uninstallcheck - args: - ([0]): uninstall-check - ([1]): /root - (length(@)): 3 +## REMOVE THIS part of the test. When adding more around invalid package cleanup, it seems like this might not make sense to be a test. This commit does break this, +## but making this test pass has implications for resting a node. If you reset a node it would mean you want to uninstall all the packages on the node. 
+## I could go either way on this, but for now I'm going to remove it. +# --- +# kind: Pod +# apiVersion: v1 +# metadata: +# namespace: skyhook +# labels: +# skyhook.nvidia.com/name: interrupt +# skyhook.nvidia.com/package: invalid-1.2.3 +# annotations: +# ("skyhook.nvidia.com/package" && parse_json("skyhook.nvidia.com/package")): +# { +# "name": "invalid", +# "version": "1.2.3", +# "skyhook": "interrupt", +# "stage": "uninstall", +# "image": "ghcr.io/nvidia/skyhook/agentless" +# } +# ownerReferences: +# - apiVersion: skyhook.nvidia.com/v1alpha1 +# kind: Skyhook +# name: interrupt +# spec: +# initContainers: +# - name: invalid-init +# - name: invalid-uninstall +# args: +# ([0]): uninstall +# ([1]): /root +# (length(@)): 3 +# - name: invalid-uninstallcheck +# args: +# ([0]): uninstall-check +# ([1]): /root +# (length(@)): 3 --- kind: Pod apiVersion: v1 diff --git a/k8s-tests/chainsaw/skyhook/interrupt/chainsaw-test.yaml b/k8s-tests/chainsaw/skyhook/interrupt/chainsaw-test.yaml index 4fbdf9d2..b015519c 100644 --- a/k8s-tests/chainsaw/skyhook/interrupt/chainsaw-test.yaml +++ b/k8s-tests/chainsaw/skyhook/interrupt/chainsaw-test.yaml @@ -18,12 +18,6 @@ # LICENSE END # - - - - - - # yaml-language-server: $schema=https://raw.githubusercontent.com/kyverno/chainsaw/main/.schemas/json/test-chainsaw-v1alpha1.json apiVersion: chainsaw.kyverno.io/v1alpha1 kind: Test diff --git a/k8s-tests/chainsaw/skyhook/nodes_add_taint.sh b/k8s-tests/chainsaw/skyhook/nodes_add_taint.sh index fa3ea3df..080d6af0 100755 --- a/k8s-tests/chainsaw/skyhook/nodes_add_taint.sh +++ b/k8s-tests/chainsaw/skyhook/nodes_add_taint.sh @@ -21,18 +21,6 @@ # - - - - - - - - - - - - n_to_taint=$1 taint=$2 label_match=${3} diff --git a/k8s-tests/chainsaw/skyhook/nodes_remove_taint.sh b/k8s-tests/chainsaw/skyhook/nodes_remove_taint.sh index b263f97a..48d1edd0 100755 --- a/k8s-tests/chainsaw/skyhook/nodes_remove_taint.sh +++ b/k8s-tests/chainsaw/skyhook/nodes_remove_taint.sh @@ -21,18 +21,6 @@ # - - - - - - - - - - 
- - n_to_taint=$1 taint=$2 label_match=${3} diff --git a/operator/config/local-dev/dashboard.yaml b/k8s-tests/chainsaw/skyhook/pause_skyhook.sh old mode 100644 new mode 100755 similarity index 54% rename from operator/config/local-dev/dashboard.yaml rename to k8s-tests/chainsaw/skyhook/pause_skyhook.sh index 9c95003e..b1b43ffd --- a/operator/config/local-dev/dashboard.yaml +++ b/k8s-tests/chainsaw/skyhook/pause_skyhook.sh @@ -1,3 +1,5 @@ +#!/bin/bash -x + # # LICENSE START # @@ -18,38 +20,13 @@ # LICENSE END # +skyhook="$1" +pause="$2" +## assert is true or false +if [[ "$pause" != "true" && "$pause" != "false" ]]; then + echo "pause must be true or false" + exit 1 +fi - - - - -## local dev only ---- -apiVersion: v1 -kind: ServiceAccount -metadata: - name: admin-user - namespace: dashboard ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: admin-user -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: cluster-admin -subjects: -- kind: ServiceAccount - name: admin-user - namespace: dashboard ---- -apiVersion: v1 -kind: Secret -metadata: - name: admin-user - namespace: dashboard - annotations: - kubernetes.io/service-account.name: "admin-user" -type: kubernetes.io/service-account-token \ No newline at end of file +kubectl patch skyhook ${skyhook} -p '{"spec":{"pause":'$pause'}}' --type=merge diff --git a/k8s-tests/chainsaw/skyhook/pod-finalizer/chainsaw-test.yaml b/k8s-tests/chainsaw/skyhook/pod-finalizer/chainsaw-test.yaml index 0ee1329f..c1ce4ce5 100644 --- a/k8s-tests/chainsaw/skyhook/pod-finalizer/chainsaw-test.yaml +++ b/k8s-tests/chainsaw/skyhook/pod-finalizer/chainsaw-test.yaml @@ -18,12 +18,6 @@ # LICENSE END # - - - - - - # yaml-language-server: $schema=https://raw.githubusercontent.com/kyverno/chainsaw/main/.schemas/json/test-chainsaw-v1alpha1.json apiVersion: chainsaw.kyverno.io/v1alpha1 kind: Test diff --git a/k8s-tests/chainsaw/skyhook/pod-finalizer/node-assert.yaml 
b/k8s-tests/chainsaw/skyhook/pod-finalizer/node-assert.yaml index 0ee1c39a..091b598f 100644 --- a/k8s-tests/chainsaw/skyhook/pod-finalizer/node-assert.yaml +++ b/k8s-tests/chainsaw/skyhook/pod-finalizer/node-assert.yaml @@ -18,12 +18,6 @@ # LICENSE END # - - - - - - apiVersion: v1 kind: Node metadata: diff --git a/k8s-tests/chainsaw/skyhook/pod-finalizer/pod.yaml b/k8s-tests/chainsaw/skyhook/pod-finalizer/pod.yaml index 657530c7..ab480b8e 100644 --- a/k8s-tests/chainsaw/skyhook/pod-finalizer/pod.yaml +++ b/k8s-tests/chainsaw/skyhook/pod-finalizer/pod.yaml @@ -18,12 +18,6 @@ # LICENSE END # - - - - - - apiVersion: v1 kind: Pod metadata: diff --git a/k8s-tests/chainsaw/skyhook/rest_nodes.sh b/k8s-tests/chainsaw/skyhook/rest_nodes.sh index 63f65d81..192f320c 100755 --- a/k8s-tests/chainsaw/skyhook/rest_nodes.sh +++ b/k8s-tests/chainsaw/skyhook/rest_nodes.sh @@ -21,18 +21,6 @@ # - - - - - - - - - - - - ## helper script ## clears labels and annotate from nodes with the prefix "skyhook.nvidia.com" ## note, a lot of tests have a label setup to target, so you might need to put that back diff --git a/k8s-tests/chainsaw/skyhook/rest_test.sh b/k8s-tests/chainsaw/skyhook/rest_test.sh index 9ce9a855..49634fa2 100755 --- a/k8s-tests/chainsaw/skyhook/rest_test.sh +++ b/k8s-tests/chainsaw/skyhook/rest_test.sh @@ -20,26 +20,23 @@ # LICENSE END # - - - - - - - - - - - - - ## helper script ## clears labels and annotate from nodes with the prefix for tests +## Usage: +## ./rest_test.sh [selector] +## Examples: +## ./rest_test.sh my-skyhook # uses default selector +## ./rest_test.sh my-skyhook "skyhook.nvidia.com/test-node=skyhooke2e" # label selector +## ./rest_test.sh my-skyhook "node-role.kubernetes.io/control-plane notin ()" # expression selector + +## the name of the skyhook to reset skyhook="$1" -label="${2:-skyhook.nvidia.com/test-node=skyhooke2e}" -for node in $(kubectl get nodes -l $label -o name); do +## the selector to use for finding nodes (defaults to test node selector) 
+selector="${2:-skyhook.nvidia.com/test-node=skyhooke2e}" + +for node in $(kubectl get nodes -l "$selector" -o name); do kubectl annotate ${node} skyhook.nvidia.com/nodeState_${skyhook}- kubectl annotate ${node} skyhook.nvidia.com/status_${skyhook}- kubectl annotate ${node} skyhook.nvidia.com/cordon_${skyhook}- diff --git a/k8s-tests/chainsaw/skyhook/runtime-required/assert.yaml b/k8s-tests/chainsaw/skyhook/runtime-required/assert.yaml index 9622a66b..a49a3177 100644 --- a/k8s-tests/chainsaw/skyhook/runtime-required/assert.yaml +++ b/k8s-tests/chainsaw/skyhook/runtime-required/assert.yaml @@ -18,12 +18,6 @@ # LICENSE END # - - - - - - apiVersion: v1 kind: Node metadata: diff --git a/k8s-tests/chainsaw/skyhook/runtime-required/chainsaw-test.yaml b/k8s-tests/chainsaw/skyhook/runtime-required/chainsaw-test.yaml index 2e01c07d..a08e90f9 100644 --- a/k8s-tests/chainsaw/skyhook/runtime-required/chainsaw-test.yaml +++ b/k8s-tests/chainsaw/skyhook/runtime-required/chainsaw-test.yaml @@ -18,12 +18,6 @@ # LICENSE END # - - - - - - # yaml-language-server: $schema=https://raw.githubusercontent.com/kyverno/chainsaw/main/.schemas/json/test-chainsaw-v1alpha1.json apiVersion: chainsaw.kyverno.io/v1alpha1 kind: Test diff --git a/k8s-tests/chainsaw/skyhook/runtime-required/skyhook.yaml b/k8s-tests/chainsaw/skyhook/runtime-required/skyhook.yaml index 8e05e782..58edebe2 100644 --- a/k8s-tests/chainsaw/skyhook/runtime-required/skyhook.yaml +++ b/k8s-tests/chainsaw/skyhook/runtime-required/skyhook.yaml @@ -18,12 +18,6 @@ # LICENSE END # - - - - - - apiVersion: skyhook.nvidia.com/v1alpha1 kind: Skyhook metadata: diff --git a/k8s-tests/chainsaw/skyhook/simple-skyhook/chainsaw-test.yaml b/k8s-tests/chainsaw/skyhook/simple-skyhook/chainsaw-test.yaml index 88438721..6b2eac00 100644 --- a/k8s-tests/chainsaw/skyhook/simple-skyhook/chainsaw-test.yaml +++ b/k8s-tests/chainsaw/skyhook/simple-skyhook/chainsaw-test.yaml @@ -18,12 +18,6 @@ # LICENSE END # - - - - - - # yaml-language-server: 
$schema=https://raw.githubusercontent.com/kyverno/chainsaw/main/.schemas/json/test-chainsaw-v1alpha1.json apiVersion: chainsaw.kyverno.io/v1alpha1 kind: Test diff --git a/k8s-tests/chainsaw/skyhook/simple-skyhook/skyhook.yaml b/k8s-tests/chainsaw/skyhook/simple-skyhook/skyhook.yaml index 2020b9d7..27a9ffc9 100644 --- a/k8s-tests/chainsaw/skyhook/simple-skyhook/skyhook.yaml +++ b/k8s-tests/chainsaw/skyhook/simple-skyhook/skyhook.yaml @@ -18,12 +18,6 @@ # LICENSE END # - - - - - - apiVersion: skyhook.nvidia.com/v1alpha1 kind: Skyhook metadata: diff --git a/k8s-tests/chainsaw/skyhook/simple-update-skyhook/assert-update.yaml b/k8s-tests/chainsaw/skyhook/simple-update-skyhook/assert-update.yaml index f855a0d6..ffc00daa 100644 --- a/k8s-tests/chainsaw/skyhook/simple-update-skyhook/assert-update.yaml +++ b/k8s-tests/chainsaw/skyhook/simple-update-skyhook/assert-update.yaml @@ -18,12 +18,6 @@ # LICENSE END # - - - - - - --- kind: ConfigMap apiVersion: v1 diff --git a/k8s-tests/chainsaw/skyhook/simple-update-skyhook/assert.yaml b/k8s-tests/chainsaw/skyhook/simple-update-skyhook/assert.yaml index 6de2e8e3..41210c2e 100644 --- a/k8s-tests/chainsaw/skyhook/simple-update-skyhook/assert.yaml +++ b/k8s-tests/chainsaw/skyhook/simple-update-skyhook/assert.yaml @@ -18,12 +18,6 @@ # LICENSE END # - - - - - - apiVersion: v1 kind: Node metadata: diff --git a/k8s-tests/chainsaw/skyhook/simple-update-skyhook/chainsaw-test.yaml b/k8s-tests/chainsaw/skyhook/simple-update-skyhook/chainsaw-test.yaml index 13229051..bd3c923c 100644 --- a/k8s-tests/chainsaw/skyhook/simple-update-skyhook/chainsaw-test.yaml +++ b/k8s-tests/chainsaw/skyhook/simple-update-skyhook/chainsaw-test.yaml @@ -18,12 +18,6 @@ # LICENSE END # - - - - - - # yaml-language-server: $schema=https://raw.githubusercontent.com/kyverno/chainsaw/main/.schemas/json/test-chainsaw-v1alpha1.json apiVersion: chainsaw.kyverno.io/v1alpha1 kind: Test diff --git a/k8s-tests/chainsaw/skyhook/simple-update-skyhook/skyhook.yaml 
b/k8s-tests/chainsaw/skyhook/simple-update-skyhook/skyhook.yaml index ba6838b8..2e03d623 100644 --- a/k8s-tests/chainsaw/skyhook/simple-update-skyhook/skyhook.yaml +++ b/k8s-tests/chainsaw/skyhook/simple-update-skyhook/skyhook.yaml @@ -18,12 +18,6 @@ # LICENSE END # - - - - - - apiVersion: skyhook.nvidia.com/v1alpha1 kind: Skyhook metadata: diff --git a/k8s-tests/chainsaw/skyhook/simple-update-skyhook/update.yaml b/k8s-tests/chainsaw/skyhook/simple-update-skyhook/update.yaml index 381866a2..852c87b5 100644 --- a/k8s-tests/chainsaw/skyhook/simple-update-skyhook/update.yaml +++ b/k8s-tests/chainsaw/skyhook/simple-update-skyhook/update.yaml @@ -18,12 +18,6 @@ # LICENSE END # - - - - - - apiVersion: skyhook.nvidia.com/v1alpha1 kind: Skyhook metadata: diff --git a/k8s-tests/chainsaw/skyhook/skyhook-upgrade/assert.yaml b/k8s-tests/chainsaw/skyhook/skyhook-upgrade/assert.yaml index 1e05c5a7..36d7998f 100644 --- a/k8s-tests/chainsaw/skyhook/skyhook-upgrade/assert.yaml +++ b/k8s-tests/chainsaw/skyhook/skyhook-upgrade/assert.yaml @@ -18,12 +18,6 @@ # LICENSE END # - - - - - - apiVersion: v1 kind: Node metadata: diff --git a/k8s-tests/chainsaw/skyhook/skyhook-upgrade/chainsaw-test.yaml b/k8s-tests/chainsaw/skyhook/skyhook-upgrade/chainsaw-test.yaml index aaa792fe..74a9dee3 100644 --- a/k8s-tests/chainsaw/skyhook/skyhook-upgrade/chainsaw-test.yaml +++ b/k8s-tests/chainsaw/skyhook/skyhook-upgrade/chainsaw-test.yaml @@ -18,12 +18,6 @@ # LICENSE END # - - - - - - # yaml-language-server: $schema=https://raw.githubusercontent.com/kyverno/chainsaw/main/.schemas/json/test-chainsaw-v1alpha1.json apiVersion: chainsaw.kyverno.io/v1alpha1 kind: Test diff --git a/k8s-tests/chainsaw/skyhook/skyhook-upgrade/skyhook.yaml b/k8s-tests/chainsaw/skyhook/skyhook-upgrade/skyhook.yaml index 97bb3b52..a518bd73 100644 --- a/k8s-tests/chainsaw/skyhook/skyhook-upgrade/skyhook.yaml +++ b/k8s-tests/chainsaw/skyhook/skyhook-upgrade/skyhook.yaml @@ -18,12 +18,6 @@ # LICENSE END # - - - - - - apiVersion: 
skyhook.nvidia.com/v1alpha1 kind: Skyhook metadata: diff --git a/k8s-tests/chainsaw/skyhook/taint-scheduling/assert-update.yaml b/k8s-tests/chainsaw/skyhook/taint-scheduling/assert-update.yaml index b657bed8..2b231d3d 100644 --- a/k8s-tests/chainsaw/skyhook/taint-scheduling/assert-update.yaml +++ b/k8s-tests/chainsaw/skyhook/taint-scheduling/assert-update.yaml @@ -18,12 +18,6 @@ # LICENSE END # - - - - - - apiVersion: v1 kind: Node metadata: diff --git a/k8s-tests/chainsaw/skyhook/taint-scheduling/assert.yaml b/k8s-tests/chainsaw/skyhook/taint-scheduling/assert.yaml index 05ed2baf..e4ae025e 100644 --- a/k8s-tests/chainsaw/skyhook/taint-scheduling/assert.yaml +++ b/k8s-tests/chainsaw/skyhook/taint-scheduling/assert.yaml @@ -18,12 +18,6 @@ # LICENSE END # - - - - - - apiVersion: v1 kind: Node metadata: diff --git a/k8s-tests/chainsaw/skyhook/taint-scheduling/chainsaw-test.yaml b/k8s-tests/chainsaw/skyhook/taint-scheduling/chainsaw-test.yaml index c3966aed..d4073cd5 100644 --- a/k8s-tests/chainsaw/skyhook/taint-scheduling/chainsaw-test.yaml +++ b/k8s-tests/chainsaw/skyhook/taint-scheduling/chainsaw-test.yaml @@ -18,12 +18,6 @@ # LICENSE END # - - - - - - # yaml-language-server: $schema=https://raw.githubusercontent.com/kyverno/chainsaw/main/.schemas/json/test-chainsaw-v1alpha1.json apiVersion: chainsaw.kyverno.io/v1alpha1 kind: Test diff --git a/k8s-tests/chainsaw/skyhook/taint-scheduling/skyhook.yaml b/k8s-tests/chainsaw/skyhook/taint-scheduling/skyhook.yaml index c5911d36..010d62e1 100644 --- a/k8s-tests/chainsaw/skyhook/taint-scheduling/skyhook.yaml +++ b/k8s-tests/chainsaw/skyhook/taint-scheduling/skyhook.yaml @@ -18,12 +18,6 @@ # LICENSE END # - - - - - - apiVersion: skyhook.nvidia.com/v1alpha1 kind: Skyhook metadata: diff --git a/k8s-tests/chainsaw/skyhook/taint-scheduling/update-skyhook.yaml b/k8s-tests/chainsaw/skyhook/taint-scheduling/update-skyhook.yaml index c7415c84..dd9e5c82 100644 --- a/k8s-tests/chainsaw/skyhook/taint-scheduling/update-skyhook.yaml +++ 
b/k8s-tests/chainsaw/skyhook/taint-scheduling/update-skyhook.yaml @@ -18,12 +18,6 @@ # LICENSE END # - - - - - - apiVersion: skyhook.nvidia.com/v1alpha1 kind: Skyhook metadata: diff --git a/k8s-tests/chainsaw/skyhook/uninstall-upgrade-skyhook/assert.yaml b/k8s-tests/chainsaw/skyhook/uninstall-upgrade-skyhook/assert.yaml index a97574c6..43ecd83f 100644 --- a/k8s-tests/chainsaw/skyhook/uninstall-upgrade-skyhook/assert.yaml +++ b/k8s-tests/chainsaw/skyhook/uninstall-upgrade-skyhook/assert.yaml @@ -18,12 +18,6 @@ # LICENSE END # - - - - - - apiVersion: v1 kind: Node metadata: diff --git a/k8s-tests/chainsaw/skyhook/uninstall-upgrade-skyhook/chainsaw-test.yaml b/k8s-tests/chainsaw/skyhook/uninstall-upgrade-skyhook/chainsaw-test.yaml index eb84a7a9..02c84761 100644 --- a/k8s-tests/chainsaw/skyhook/uninstall-upgrade-skyhook/chainsaw-test.yaml +++ b/k8s-tests/chainsaw/skyhook/uninstall-upgrade-skyhook/chainsaw-test.yaml @@ -18,12 +18,6 @@ # LICENSE END # - - - - - - # yaml-language-server: $schema=https://raw.githubusercontent.com/kyverno/chainsaw/main/.schemas/json/test-chainsaw-v1alpha1.json apiVersion: chainsaw.kyverno.io/v1alpha1 kind: Test diff --git a/k8s-tests/chainsaw/skyhook/uninstall-upgrade-skyhook/skyhook.yaml b/k8s-tests/chainsaw/skyhook/uninstall-upgrade-skyhook/skyhook.yaml index 16a09845..6175cf6c 100644 --- a/k8s-tests/chainsaw/skyhook/uninstall-upgrade-skyhook/skyhook.yaml +++ b/k8s-tests/chainsaw/skyhook/uninstall-upgrade-skyhook/skyhook.yaml @@ -18,12 +18,6 @@ # LICENSE END # - - - - - - apiVersion: skyhook.nvidia.com/v1alpha1 kind: Skyhook metadata: diff --git a/k8s-tests/chainsaw/skyhook/uninstall-upgrade-skyhook/update-no-packages.yaml b/k8s-tests/chainsaw/skyhook/uninstall-upgrade-skyhook/update-no-packages.yaml index 32625103..287dcffd 100644 --- a/k8s-tests/chainsaw/skyhook/uninstall-upgrade-skyhook/update-no-packages.yaml +++ b/k8s-tests/chainsaw/skyhook/uninstall-upgrade-skyhook/update-no-packages.yaml @@ -18,12 +18,6 @@ # LICENSE END # - - - - 
- - apiVersion: skyhook.nvidia.com/v1alpha1 kind: Skyhook metadata: diff --git a/k8s-tests/chainsaw/skyhook/uninstall-upgrade-skyhook/update.yaml b/k8s-tests/chainsaw/skyhook/uninstall-upgrade-skyhook/update.yaml index 82a47f0a..b7aec3a7 100644 --- a/k8s-tests/chainsaw/skyhook/uninstall-upgrade-skyhook/update.yaml +++ b/k8s-tests/chainsaw/skyhook/uninstall-upgrade-skyhook/update.yaml @@ -18,12 +18,6 @@ # LICENSE END # - - - - - - apiVersion: skyhook.nvidia.com/v1alpha1 kind: Skyhook metadata: diff --git a/k8s-tests/chainsaw/skyhook/validate-packages/assert.yaml b/k8s-tests/chainsaw/skyhook/validate-packages/assert.yaml index 46c044a6..10cd2bd6 100644 --- a/k8s-tests/chainsaw/skyhook/validate-packages/assert.yaml +++ b/k8s-tests/chainsaw/skyhook/validate-packages/assert.yaml @@ -18,12 +18,6 @@ # LICENSE END # - - - - - - apiVersion: v1 kind: Node metadata: diff --git a/k8s-tests/chainsaw/skyhook/validate-packages/chainsaw-test.yaml b/k8s-tests/chainsaw/skyhook/validate-packages/chainsaw-test.yaml index 720de55e..d41d936d 100644 --- a/k8s-tests/chainsaw/skyhook/validate-packages/chainsaw-test.yaml +++ b/k8s-tests/chainsaw/skyhook/validate-packages/chainsaw-test.yaml @@ -18,12 +18,6 @@ # LICENSE END # - - - - - - # yaml-language-server: $schema=https://raw.githubusercontent.com/kyverno/chainsaw/main/.schemas/json/test-chainsaw-v1alpha1.json apiVersion: chainsaw.kyverno.io/v1alpha1 kind: Test diff --git a/k8s-tests/chainsaw/skyhook/validate-packages/skyhook.yaml b/k8s-tests/chainsaw/skyhook/validate-packages/skyhook.yaml index ade772ff..f0d56a73 100644 --- a/k8s-tests/chainsaw/skyhook/validate-packages/skyhook.yaml +++ b/k8s-tests/chainsaw/skyhook/validate-packages/skyhook.yaml @@ -18,12 +18,6 @@ # LICENSE END # - - - - - - apiVersion: skyhook.nvidia.com/v1alpha1 kind: Skyhook metadata: diff --git a/k8s-tests/chainsaw/skyhook/validate-packages/update.yaml b/k8s-tests/chainsaw/skyhook/validate-packages/update.yaml index 76f448c7..261062f7 100644 --- 
a/k8s-tests/chainsaw/skyhook/validate-packages/update.yaml +++ b/k8s-tests/chainsaw/skyhook/validate-packages/update.yaml @@ -18,12 +18,6 @@ # LICENSE END # - - - - - - apiVersion: skyhook.nvidia.com/v1alpha1 kind: Skyhook metadata: diff --git a/k8s-tests/time_travel_tests/migration_0.5.0/migration.sh b/k8s-tests/time_travel_tests/migration_0.5.0/migration.sh index e7681b86..83b51039 100755 --- a/k8s-tests/time_travel_tests/migration_0.5.0/migration.sh +++ b/k8s-tests/time_travel_tests/migration_0.5.0/migration.sh @@ -20,19 +20,6 @@ # LICENSE END # - - - - - - - - - - - - - ## this test is to verify the migration from 0.4.0 to 0.5.0 works as expected ## NOTE: making this a script to be more reproducible, but today making this work in CI, might be hard diff --git a/kyverno/skyhook-viewer-binding.yaml b/kyverno/skyhook-viewer-binding.yaml index d1c159d4..b2eb132f 100644 --- a/kyverno/skyhook-viewer-binding.yaml +++ b/kyverno/skyhook-viewer-binding.yaml @@ -18,12 +18,6 @@ # LICENSE END # - - - - - - apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: diff --git a/operator/.golangci.yml b/operator/.golangci.yml index 3cb95eea..798df9a0 100644 --- a/operator/.golangci.yml +++ b/operator/.golangci.yml @@ -18,12 +18,6 @@ # LICENSE END # - - - - - - run: deadline: 5m allow-parallel-runners: true diff --git a/operator/Makefile b/operator/Makefile index ee1813c9..69fd34c3 100644 --- a/operator/Makefile +++ b/operator/Makefile @@ -19,10 +19,10 @@ VERSION ?= $(GIT_TAG_LAST) # Image URL to use all building/pushing image targets ## TODO: update this to the correct image location -IMG ?= nvcr.io/nvidian/swgpu-baseos/skyhook-operator:latest +IMG ?= ghcr.io/nvidia/skyhook/operator:latest ## default version of kind to use -KIND_VERSION?=1.30.8 +KIND_VERSION?=1.30.10 PLATFORM := $(shell uname -s 2>/dev/null || echo unknown) SKYHOOK_NAMESPACE ?= skyhook @@ -101,6 +101,7 @@ confirm: .PHONY: manifests manifests: controller-gen ## Generate WebhookConfiguration, 
ClusterRole and CustomResourceDefinition objects. $(CONTROLLER_GEN) rbac:roleName=manager-role crd webhook paths="./..." output:crd:artifacts:config=config/crd/bases + $(MAKE) license-fmt ## fix up license headers .PHONY: generate generate: controller-gen ## Generate code containing DeepCopy, DeepCopyInto, and DeepCopyObject method implementations. @@ -233,18 +234,6 @@ lint: golangci-lint license-check ## Run golangci-lint linter & yamllint lint-fix: golangci-lint ## Run golangci-lint linter and perform fixes $(GOLANGCI_LINT) run --fix -create-dashboard: ## create kubernetes dashboard for local testing - helm repo add kubernetes-dashboard https://kubernetes.github.io/dashboard/ - helm upgrade --install kubernetes-dashboard kubernetes-dashboard/kubernetes-dashboard --create-namespace --namespace dashboard - $(KUBECTL) apply -f config/local-dev/dashboard.yaml - -access-dashboard: ## portforwards and gets token for dashboard for local testing - token=$$($(KUBECTL) get secret admin-user -n dashboard -o jsonpath={".data.token"} | base64 -d); echo ""; echo "$$token" - $(KUBECTL) -n dashboard port-forward svc/kubernetes-dashboard-kong-proxy 8443:443 - -remove-dashboard: - $(KUBECTL) delete namespace/dashboard - ##@ Build .PHONY: build @@ -289,7 +278,7 @@ create-namespace: ## Create the namespace in the K8s cluster specified in ~/.kub $(KUBECTL) create namespace $(SKYHOOK_NAMESPACE) --dry-run=client -o yaml | kubectl apply -f - .PHONY: install -install: manifests kustomize create-namespace ## Install CRDs into the K8s cluster specified in ~/.kube/config. +install: manifests kustomize create-namespace license-fmt ## Install CRDs into the K8s cluster specified in ~/.kube/config. 
$(KUSTOMIZE) build config/crd | $(KUBECTL) apply -f - cert-manager-yaml-url=https://github.com/cert-manager/cert-manager/releases/download/v1.17.0/cert-manager.yaml diff --git a/operator/README.md b/operator/README.md index 10052481..57883026 100644 --- a/operator/README.md +++ b/operator/README.md @@ -90,10 +90,10 @@ Packages can depend on each other, so if you needed `something_important` to be ## Development ### Prerequisites -- go version v1.23.7+ +- go version v1.23.8+ - docker version 17.03+ or podman 4.9.4+ (project makefile kind of assumes podman) - kubectl version v1.27.3+. -- Access to a Kubernetes v1.27+ cluster. (we test on 1.27, should work on older if needed, just not tested.) +- Access to a Kubernetes v1.27+ cluster. (We test on 1.30, could work on older, not tested. Could be api compatibilities issues.) **Install the CRDs into the cluster:** @@ -166,8 +166,6 @@ Development podman-create-machine creates a podman machine lint Run golangci-lint linter & yamllint lint-fix Run golangci-lint linter and perform fixes - create-dashboard create kubernetes dashboard for local testing - access-dashboard portforwards and gets token for dashboard for local testing Build build Build manager binary. 
diff --git a/operator/api/v1alpha1/skyhook_types.go b/operator/api/v1alpha1/skyhook_types.go index 1db6ca2a..a369fb72 100644 --- a/operator/api/v1alpha1/skyhook_types.go +++ b/operator/api/v1alpha1/skyhook_types.go @@ -227,6 +227,10 @@ type Package struct { // More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ // +kubebuilder:default={} Resources ResourceRequirements `json:"resources,omitempty"` + + // GracefulShutdown is the graceful shutdown timeout for the package, if not set, uses k8s default + //+optional + GracefulShutdown *metav1.Duration `json:"gracefulShutdown,omitempty"` } func (f *Package) HasInterrupt() bool { @@ -652,6 +656,6 @@ func init() { } // WasUpdated returns true if this instance of skyhook has been updated -// func (s *Skyhook) WasUpdated() bool { -// return s.Generation > 1 && s.Generation > s.Status.ObservedGeneration -// } +func (s *Skyhook) WasUpdated() bool { + return s.Generation > 1 && s.Generation > s.Status.ObservedGeneration +} diff --git a/operator/api/v1alpha1/zz_generated.deepcopy.go b/operator/api/v1alpha1/zz_generated.deepcopy.go index b912688f..cb088b1c 100644 --- a/operator/api/v1alpha1/zz_generated.deepcopy.go +++ b/operator/api/v1alpha1/zz_generated.deepcopy.go @@ -1,5 +1,25 @@ //go:build !ignore_autogenerated +/* + * LICENSE START + * + * Copyright (c) NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * LICENSE END + */ + // Code generated by controller-gen. DO NOT EDIT. package v1alpha1 @@ -114,6 +134,11 @@ func (in *Package) DeepCopyInto(out *Package) { } } in.Resources.DeepCopyInto(&out.Resources) + if in.GracefulShutdown != nil { + in, out := &in.GracefulShutdown, &out.GracefulShutdown + *out = new(metav1.Duration) + **out = **in + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Package. diff --git a/operator/config/certmanager/certificate.yaml b/operator/config/certmanager/certificate.yaml index ab4d48a2..2d9dec87 100644 --- a/operator/config/certmanager/certificate.yaml +++ b/operator/config/certmanager/certificate.yaml @@ -18,12 +18,6 @@ # LICENSE END # - - - - - - # The following manifests contain a self-signed issuer CR and a certificate CR. # More document can be found at https://docs.cert-manager.io # WARNING: Targets CertManager v1.0. Check https://cert-manager.io/docs/installation/upgrading/ for breaking changes. diff --git a/operator/config/certmanager/kustomization.yaml b/operator/config/certmanager/kustomization.yaml index a7b34351..77da82dd 100644 --- a/operator/config/certmanager/kustomization.yaml +++ b/operator/config/certmanager/kustomization.yaml @@ -18,12 +18,6 @@ # LICENSE END # - - - - - - resources: - certificate.yaml diff --git a/operator/config/certmanager/kustomizeconfig.yaml b/operator/config/certmanager/kustomizeconfig.yaml index 1f6d3c42..ad57806a 100644 --- a/operator/config/certmanager/kustomizeconfig.yaml +++ b/operator/config/certmanager/kustomizeconfig.yaml @@ -18,12 +18,6 @@ # LICENSE END # - - - - - - # This configuration is for teaching kustomize how to update name ref substitution nameReference: - kind: Issuer diff --git a/operator/config/crd/bases/skyhook.nvidia.com_skyhooks.yaml b/operator/config/crd/bases/skyhook.nvidia.com_skyhooks.yaml index 2224a108..c08b7b93 100644 --- a/operator/config/crd/bases/skyhook.nvidia.com_skyhooks.yaml +++ 
b/operator/config/crd/bases/skyhook.nvidia.com_skyhooks.yaml @@ -17,6 +17,7 @@ # # LICENSE END # + --- apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition @@ -355,6 +356,10 @@ spec: - name type: object type: array + gracefulShutdown: + description: GracefulShutdown is the graceful shutdown timeout + for the package, if not set, uses k8s default + type: string image: description: Image is the container image to run. Do not included the tag, that is set in the version. diff --git a/operator/config/crd/kustomization.yaml b/operator/config/crd/kustomization.yaml index b6653223..541e9ea6 100644 --- a/operator/config/crd/kustomization.yaml +++ b/operator/config/crd/kustomization.yaml @@ -18,12 +18,6 @@ # LICENSE END # - - - - - - # This kustomization.yaml is not intended to be run by itself, # since it depends on service name and namespace that are out of this kustomize package. # It should be run by config/default diff --git a/operator/config/crd/kustomizeconfig.yaml b/operator/config/crd/kustomizeconfig.yaml index b1bb587d..5c0cd726 100644 --- a/operator/config/crd/kustomizeconfig.yaml +++ b/operator/config/crd/kustomizeconfig.yaml @@ -18,12 +18,6 @@ # LICENSE END # - - - - - - # This file is for teaching kustomize how to substitute name and namespace reference in CRD nameReference: - kind: Service diff --git a/operator/config/crd/patches/cainjection_in_skyhooks.yaml b/operator/config/crd/patches/cainjection_in_skyhooks.yaml index 081a569b..94eb7449 100644 --- a/operator/config/crd/patches/cainjection_in_skyhooks.yaml +++ b/operator/config/crd/patches/cainjection_in_skyhooks.yaml @@ -18,12 +18,6 @@ # LICENSE END # - - - - - - # The following patch adds a directive for certmanager to inject CA into the CRD apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition diff --git a/operator/config/crd/patches/webhook_in_skyhooks.yaml b/operator/config/crd/patches/webhook_in_skyhooks.yaml index ab4e29f0..be928302 100644 --- 
a/operator/config/crd/patches/webhook_in_skyhooks.yaml +++ b/operator/config/crd/patches/webhook_in_skyhooks.yaml @@ -18,12 +18,6 @@ # LICENSE END # - - - - - - # The following patch enables a conversion webhook for the CRD apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition diff --git a/operator/config/default/kustomization.yaml b/operator/config/default/kustomization.yaml index 3d9a2685..a50b4f71 100644 --- a/operator/config/default/kustomization.yaml +++ b/operator/config/default/kustomization.yaml @@ -18,12 +18,6 @@ # LICENSE END # - - - - - - # Adds namespace to all resources. namespace: skyhook-operator-system diff --git a/operator/config/default/manager_auth_proxy_patch.yaml b/operator/config/default/manager_auth_proxy_patch.yaml index cc7b066a..778c0c52 100644 --- a/operator/config/default/manager_auth_proxy_patch.yaml +++ b/operator/config/default/manager_auth_proxy_patch.yaml @@ -18,12 +18,6 @@ # LICENSE END # - - - - - - # This patch inject a sidecar container which is a HTTP proxy for the # controller manager, it performs RBAC authorization against the Kubernetes API using SubjectAccessReviews. 
apiVersion: apps/v1 diff --git a/operator/config/default/manager_config_patch.yaml b/operator/config/default/manager_config_patch.yaml index 31ffba13..32c55d06 100644 --- a/operator/config/default/manager_config_patch.yaml +++ b/operator/config/default/manager_config_patch.yaml @@ -18,12 +18,6 @@ # LICENSE END # - - - - - - apiVersion: apps/v1 kind: Deployment metadata: diff --git a/operator/config/default/manager_webhook_patch.yaml b/operator/config/default/manager_webhook_patch.yaml index 3f6e0ef0..7103ef70 100644 --- a/operator/config/default/manager_webhook_patch.yaml +++ b/operator/config/default/manager_webhook_patch.yaml @@ -18,12 +18,6 @@ # LICENSE END # - - - - - - apiVersion: apps/v1 kind: Deployment metadata: diff --git a/operator/config/default/webhookcainjection_patch.yaml b/operator/config/default/webhookcainjection_patch.yaml index f35efa32..dfce8a9c 100644 --- a/operator/config/default/webhookcainjection_patch.yaml +++ b/operator/config/default/webhookcainjection_patch.yaml @@ -18,12 +18,6 @@ # LICENSE END # - - - - - - # This patch add annotation to admission webhook config and # CERTIFICATE_NAMESPACE and CERTIFICATE_NAME will be substituted by kustomize apiVersion: admissionregistration.k8s.io/v1 diff --git a/operator/config/local-dev/kind-config.yaml b/operator/config/local-dev/kind-config.yaml index 7444a8b5..e3e7b8cf 100644 --- a/operator/config/local-dev/kind-config.yaml +++ b/operator/config/local-dev/kind-config.yaml @@ -18,12 +18,6 @@ # LICENSE END # - - - - - - # three node (two workers) local kind cluster config for testing # makefile references this to create this cluster kind: Cluster diff --git a/operator/config/manager/kustomization.yaml b/operator/config/manager/kustomization.yaml index 0c5f091a..959d4907 100644 --- a/operator/config/manager/kustomization.yaml +++ b/operator/config/manager/kustomization.yaml @@ -18,17 +18,11 @@ # LICENSE END # - - - - - - resources: - manager.yaml apiVersion: kustomize.config.k8s.io/v1beta1 kind: 
Kustomization images: - name: controller - newName: nvcr.io/nvidian/swgpu-baseos/skyhook-operator + newName: ghcr.io/nvidia/skyhook/operator newTag: latest diff --git a/operator/config/manager/manager.yaml b/operator/config/manager/manager.yaml index 1e525588..0e972907 100644 --- a/operator/config/manager/manager.yaml +++ b/operator/config/manager/manager.yaml @@ -18,12 +18,6 @@ # LICENSE END # - - - - - - apiVersion: v1 kind: Namespace metadata: @@ -118,7 +112,7 @@ spec: - name: PAUSE_IMAGE value: registry.k8s.io/pause:3.10 - name: AGENT_IMAGE - value: nvcr.io/nvidian/swgpu-baseos/skyhook-agent:latest + value: ghcr.io/nvidia/skyhook/agent:latest image: controller:latest name: manager securityContext: diff --git a/operator/config/prometheus/kustomization.yaml b/operator/config/prometheus/kustomization.yaml index 3518ad80..f0cc65b7 100644 --- a/operator/config/prometheus/kustomization.yaml +++ b/operator/config/prometheus/kustomization.yaml @@ -18,11 +18,5 @@ # LICENSE END # - - - - - - resources: - monitor.yaml diff --git a/operator/config/prometheus/monitor.yaml b/operator/config/prometheus/monitor.yaml index 119717cc..8df07144 100644 --- a/operator/config/prometheus/monitor.yaml +++ b/operator/config/prometheus/monitor.yaml @@ -18,12 +18,6 @@ # LICENSE END # - - - - - - # Prometheus Monitor Service (Metrics) apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor diff --git a/operator/config/rbac/auth_proxy_client_clusterrole.yaml b/operator/config/rbac/auth_proxy_client_clusterrole.yaml index a1615d17..a6dc2f0c 100644 --- a/operator/config/rbac/auth_proxy_client_clusterrole.yaml +++ b/operator/config/rbac/auth_proxy_client_clusterrole.yaml @@ -18,12 +18,6 @@ # LICENSE END # - - - - - - apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: diff --git a/operator/config/rbac/auth_proxy_role.yaml b/operator/config/rbac/auth_proxy_role.yaml index 4f969364..8f597408 100644 --- a/operator/config/rbac/auth_proxy_role.yaml +++ 
b/operator/config/rbac/auth_proxy_role.yaml @@ -18,12 +18,6 @@ # LICENSE END # - - - - - - apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: diff --git a/operator/config/rbac/auth_proxy_role_binding.yaml b/operator/config/rbac/auth_proxy_role_binding.yaml index 9e132192..5d8b7261 100644 --- a/operator/config/rbac/auth_proxy_role_binding.yaml +++ b/operator/config/rbac/auth_proxy_role_binding.yaml @@ -18,12 +18,6 @@ # LICENSE END # - - - - - - apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: diff --git a/operator/config/rbac/auth_proxy_service.yaml b/operator/config/rbac/auth_proxy_service.yaml index 1f9e5324..4ae2a176 100644 --- a/operator/config/rbac/auth_proxy_service.yaml +++ b/operator/config/rbac/auth_proxy_service.yaml @@ -18,12 +18,6 @@ # LICENSE END # - - - - - - apiVersion: v1 kind: Service metadata: diff --git a/operator/config/rbac/kustomization.yaml b/operator/config/rbac/kustomization.yaml index b01374a6..862d78b9 100644 --- a/operator/config/rbac/kustomization.yaml +++ b/operator/config/rbac/kustomization.yaml @@ -18,12 +18,6 @@ # LICENSE END # - - - - - - resources: # All RBAC will be applied under this service account in # the deployment namespace. You may comment out this resource diff --git a/operator/config/rbac/leader_election_role.yaml b/operator/config/rbac/leader_election_role.yaml index 498b9ce2..cdc7771f 100644 --- a/operator/config/rbac/leader_election_role.yaml +++ b/operator/config/rbac/leader_election_role.yaml @@ -18,12 +18,6 @@ # LICENSE END # - - - - - - # permissions to do leader election. 
apiVersion: rbac.authorization.k8s.io/v1 kind: Role diff --git a/operator/config/rbac/leader_election_role_binding.yaml b/operator/config/rbac/leader_election_role_binding.yaml index c9883567..81048069 100644 --- a/operator/config/rbac/leader_election_role_binding.yaml +++ b/operator/config/rbac/leader_election_role_binding.yaml @@ -18,12 +18,6 @@ # LICENSE END # - - - - - - apiVersion: rbac.authorization.k8s.io/v1 kind: RoleBinding metadata: diff --git a/operator/config/rbac/role.yaml b/operator/config/rbac/role.yaml index 06033f26..a3d5ff6f 100644 --- a/operator/config/rbac/role.yaml +++ b/operator/config/rbac/role.yaml @@ -17,6 +17,7 @@ # # LICENSE END # + --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole diff --git a/operator/config/rbac/role_binding.yaml b/operator/config/rbac/role_binding.yaml index 46662b3b..308a49db 100644 --- a/operator/config/rbac/role_binding.yaml +++ b/operator/config/rbac/role_binding.yaml @@ -18,12 +18,6 @@ # LICENSE END # - - - - - - apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: diff --git a/operator/config/rbac/service_account.yaml b/operator/config/rbac/service_account.yaml index 7359132e..99753a3c 100644 --- a/operator/config/rbac/service_account.yaml +++ b/operator/config/rbac/service_account.yaml @@ -18,12 +18,6 @@ # LICENSE END # - - - - - - apiVersion: v1 kind: ServiceAccount metadata: diff --git a/operator/config/rbac/skyhook_editor_role.yaml b/operator/config/rbac/skyhook_editor_role.yaml index d24030e2..8ec1cf2d 100644 --- a/operator/config/rbac/skyhook_editor_role.yaml +++ b/operator/config/rbac/skyhook_editor_role.yaml @@ -18,12 +18,6 @@ # LICENSE END # - - - - - - # permissions for end users to edit skyhooks. 
apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole diff --git a/operator/config/rbac/skyhook_viewer_role.yaml b/operator/config/rbac/skyhook_viewer_role.yaml index d89a96e9..3a8b69de 100644 --- a/operator/config/rbac/skyhook_viewer_role.yaml +++ b/operator/config/rbac/skyhook_viewer_role.yaml @@ -18,12 +18,6 @@ # LICENSE END # - - - - - - # permissions for end users to view skyhooks. apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole diff --git a/operator/config/samples/interrupt_group.yaml b/operator/config/samples/interrupt_group.yaml index 801eb596..45b0a1b5 100644 --- a/operator/config/samples/interrupt_group.yaml +++ b/operator/config/samples/interrupt_group.yaml @@ -18,12 +18,6 @@ # LICENSE END # - - - - - - apiVersion: skyhook.nvidia.com/v1alpha1 kind: Skyhook metadata: @@ -51,4 +45,4 @@ spec: image: ghcr.io/nvidia/skyhook/agentless:latest interrupt: type: service - services: [containerd, foobar] \ No newline at end of file + services: [containerd, foobar] diff --git a/operator/config/samples/kustomization.yaml b/operator/config/samples/kustomization.yaml index e498d8d7..739e4186 100644 --- a/operator/config/samples/kustomization.yaml +++ b/operator/config/samples/kustomization.yaml @@ -18,12 +18,6 @@ # LICENSE END # - - - - - - ## Append samples of your project ## resources: - skyhook_v1alpha1_skyhook.yaml diff --git a/operator/config/webhook/manifests.yaml b/operator/config/webhook/manifests.yaml index e7b42a80..0b5c0ed5 100644 --- a/operator/config/webhook/manifests.yaml +++ b/operator/config/webhook/manifests.yaml @@ -17,6 +17,7 @@ # # LICENSE END # + --- apiVersion: admissionregistration.k8s.io/v1 kind: MutatingWebhookConfiguration diff --git a/operator/internal/controller/annotations.go b/operator/internal/controller/annotations.go index bc2952ef..66d3468f 100644 --- a/operator/internal/controller/annotations.go +++ b/operator/internal/controller/annotations.go @@ -33,8 +33,10 @@ type PackageSkyhook struct { Skyhook string 
`json:"skyhook"` Stage v1alpha1.Stage `json:"stage"` Image string `json:"image"` + Invalid bool `json:"invalid,omitempty"` } +// GetPackage returns the package from the pod annotations func GetPackage(pod *corev1.Pod) (*PackageSkyhook, error) { if pod == nil { return nil, nil @@ -52,6 +54,7 @@ func GetPackage(pod *corev1.Pod) (*PackageSkyhook, error) { return ret, nil } +// SetPackages sets the package in the pod annotations func SetPackages(pod *corev1.Pod, skyhook *v1alpha1.Skyhook, image string, stage v1alpha1.Stage, _package *v1alpha1.Package) error { if pod == nil || _package == nil { return nil @@ -76,3 +79,39 @@ func SetPackages(pod *corev1.Pod, skyhook *v1alpha1.Skyhook, image string, stage return nil } + +// InvalidatePackage invalidates a package and updates the pod, which will trigger the pod to be deleted +func InvalidatePackage(pod *corev1.Pod) error { + if pod == nil { + return nil + } + + pkg, err := GetPackage(pod) + if err != nil { + return fmt.Errorf("error getting package: %w", err) + } + + pkg.Invalid = true + + data, err := json.Marshal(pkg) + if err != nil { + return fmt.Errorf("error marshalling package: %w", err) + } + + pod.Annotations[fmt.Sprintf("%s/package", v1alpha1.METADATA_PREFIX)] = string(data) + + return nil +} + +// IsInvalidPackage returns true if the package is invalid +func IsInvalidPackage(pod *corev1.Pod) (bool, error) { + if pod == nil { + return false, nil + } + + pkg, err := GetPackage(pod) + if err != nil { + return false, fmt.Errorf("error getting package: %w", err) + } + return pkg.Invalid, nil +} diff --git a/operator/internal/controller/pod_controller.go b/operator/internal/controller/pod_controller.go index d4cc75c1..b21c74e0 100644 --- a/operator/internal/controller/pod_controller.go +++ b/operator/internal/controller/pod_controller.go @@ -36,6 +36,10 @@ import ( "sigs.k8s.io/controller-runtime/pkg/reconcile" ) +// this is a sudo controller, it used to be one, but now are just functions of the skyhook controller +// 
the reason for this was to less issues around race conditions since they would be handled by one controller +// not sure that actually helped to be honest, but was the reason its acts like one. + // moved here for easier testing, vs being anonymous inline func podHandlerFunc(ctx context.Context, o client.Object) []reconcile.Request { // logger := log.FromContext(ctx) @@ -67,6 +71,14 @@ func podHandlerFunc(ctx context.Context, o client.Object) []reconcile.Request { func (r *SkyhookReconciler) PodReconcile(ctx context.Context, pod *corev1.Pod) (ctrl.Result, error) { logger := log.FromContext(ctx).WithName("pod-reconcile") + // check if the package is invalid, if it is then delete the pod and return + if invalid, err := r.HandleInvalidPackage(ctx, pod); invalid || err != nil { + if err != nil { + logger.Error(err, "error handling invalid package", "pod", pod.Name) + } + return ctrl.Result{}, err + } + containerName, state, restarts := containerExitedSuccessfully(pod) switch state { case containerStateSuccess: @@ -104,6 +116,24 @@ func (r *SkyhookReconciler) PodReconcile(ctx context.Context, pod *corev1.Pod) ( return ctrl.Result{}, nil } +// HandleInvalidPackage deletes invalid packages +func (r *SkyhookReconciler) HandleInvalidPackage(ctx context.Context, pod *corev1.Pod) (bool, error) { + invalid, err := IsInvalidPackage(pod) + if err != nil { + return false, err + } + + if invalid { + err := r.Delete(ctx, pod) + if err != nil { + return false, err + } + return true, nil + } + + return false, nil +} + // UpdateNodeState returns error and if to requeue, not all errors should be requeued, and some times there is no error but should be requeued func (r *SkyhookReconciler) UpdateNodeState(ctx context.Context, pod *corev1.Pod, state v1alpha1.State, containerName string, restarts int32) (bool, error) { packagePtr, err := GetPackage(pod) @@ -157,6 +187,8 @@ func (r *SkyhookReconciler) UpdateNodeState(ctx context.Context, pod *corev1.Pod return false, nil } +// 
HandleCompletePod handles the complete pod, this is called when the pod has exited successfully +// and we need to update the node state to complete and handles special cases like interrupts, upgrades, and uninstalls func (r *SkyhookReconciler) HandleCompletePod(ctx context.Context, skyhookNode wrapper.SkyhookNodeOnly, packagePtr *PackageSkyhook, containerName string) (bool, error) { updated := false diff --git a/operator/internal/controller/skyhook_controller.go b/operator/internal/controller/skyhook_controller.go index 221fdd9b..438dced7 100644 --- a/operator/internal/controller/skyhook_controller.go +++ b/operator/internal/controller/skyhook_controller.go @@ -81,7 +81,7 @@ type SkyhookOperatorOptions struct { ReapplyOnReboot bool `env:"REAPPLY_ON_REBOOT, default=false"` RuntimeRequiredTaint string `env:"RUNTIME_REQUIRED_TAINT, default=skyhook.nvidia.com=runtime-required:NoSchedule"` PauseImage string `env:"PAUSE_IMAGE, default=registry.k8s.io/pause:3.10"` - AgentImage string `env:"AGENT_IMAGE, default=nvcr.io/nvidian/swgpu-baseos/skyhook-agent:latest"` // TODO: this needs to be updated with a working default + AgentImage string `env:"AGENT_IMAGE, default=ghcr.io/nvidia/skyhook/agent:latest"` // TODO: this needs to be updated with a working default } func (o *SkyhookOperatorOptions) Validate() error { @@ -296,14 +296,16 @@ func (r *SkyhookReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct nodePicker := NewNodePicker(r.opts.GetRuntimeRequiredToleration()) errs := make([]error, 0) - // requeueNeeded := false var result *ctrl.Result for _, skyhook := range clusterState.skyhooks { - if yes, result, err := shouldReturn(r.HandleFinalizer(ctx, skyhook)); yes { return result, err } + if skyhook.skyhook.Spec.Pause { + continue + } + if yes, result, err := shouldReturn(r.ValidateRunningPackages(ctx, skyhook)); yes { return result, err } @@ -534,11 +536,6 @@ func (r *SkyhookReconciler) RunSkyhookPackages(ctx context.Context, clusterState logger := 
log.FromContext(ctx) requeue := false - if skyhook.skyhook.Spec.Pause { - // continue, paused - return nil, nil - } - toUninstall, err := HandleVersionChange(skyhook) if err != nil { return nil, fmt.Errorf("error getting packages to uninstall: %w", err) @@ -1760,6 +1757,9 @@ func (r *SkyhookReconciler) CreatePodFromPackage(_package *v1alpha1.Package, sky }, skyhook.Spec.AdditionalTolerations...), }, } + if _package.GracefulShutdown != nil { + pod.Spec.TerminationGracePeriodSeconds = ptr(int64(_package.GracefulShutdown.Duration.Seconds())) + } return pod } @@ -1821,7 +1821,7 @@ func (r *SkyhookReconciler) PodMatchesPackage(_package *v1alpha1.Package, pod co // compare the containers env vars except for the ones that are inserted // by the operator by default as the SKYHOOK_RESOURCE_ID will change every // time the skyhook is updated and would cause every pod to be removed - excludeEnvs := []string{"SKYHOOK_RESOURCE_ID", "NODEOS_DO_NOT_UPDATE_LABEL", "COPY_RESOLV"} + excludeEnvs := []string{"SKYHOOK_RESOURCE_ID", "NODEOS_DO_NOT_UPDATE_LABEL", "COPY_RESOLV", "SKYHOOK_DIR"} expectedFilteredEnv := FilterEnv(expectedContainer.Env, excludeEnvs...) actualFilteredEnv := FilterEnv(actualContainer.Env, excludeEnvs...) 
if !reflect.DeepEqual(expectedFilteredEnv, actualFilteredEnv) { @@ -1837,9 +1837,13 @@ func (r *SkyhookReconciler) PodMatchesPackage(_package *v1alpha1.Package, pod co return true } -// ValidateRunningPackages deletes pods that don't match the current spec +// ValidateRunningPackages deletes pods that don't match the current spec and checks if there are pods running +// that don't match the node state and removes them if they exist func (r *SkyhookReconciler) ValidateRunningPackages(ctx context.Context, skyhook *skyhookNodes) (bool, error) { + update := false + errs := make([]error, 0) + // get all pods for this skyhook packages pods, err := r.dal.GetPods(ctx, client.MatchingLabels{ fmt.Sprintf("%s/name", v1alpha1.METADATA_PREFIX): skyhook.skyhook.Name, @@ -1852,33 +1856,72 @@ func (r *SkyhookReconciler) ValidateRunningPackages(ctx context.Context, skyhook return false, nil // nothing running for this skyhook on this node } - errs := make([]error, 0) - update := false - + // group pods by node + podsbyNode := make(map[string][]corev1.Pod) for _, pod := range pods.Items { - found := false + podsbyNode[pod.Spec.NodeName] = append(podsbyNode[pod.Spec.NodeName], pod) + } - f, err := GetPackage(&pod) + for _, node := range skyhook.nodes { + nodeState, err := node.State() if err != nil { - errs = append(errs, fmt.Errorf("error getting package from pod [%s:%s] while validating packages: %w", pod.Namespace, pod.Name, err)) + return false, fmt.Errorf("error getting node state: %w", err) } - for _, v := range skyhook.skyhook.Spec.Packages { - if r.PodMatchesPackage(&v, pod, skyhook.skyhook, f.Stage) { + for _, pod := range podsbyNode[node.GetNode().Name] { + found := false + + runningPackage, err := GetPackage(&pod) + if err != nil { + errs = append(errs, fmt.Errorf("error getting package from pod [%s:%s] while validating packages: %w", pod.Namespace, pod.Name, err)) + } + + // check if the package is part of the skyhook spec, if not we need to delete it + for _, v := range 
skyhook.skyhook.Spec.Packages { + if r.PodMatchesPackage(&v, pod, skyhook.skyhook, runningPackage.Stage) { + found = true + } + } + + // uninstall is by definition not part of the skyhook spec, so we cant delete it (because it used to be but was removed, hence uninstalling it) + if runningPackage.Stage == v1alpha1.StageUninstall { found = true } - } - if f.Stage == v1alpha1.StageUninstall { - found = true - } + if !found { + update = true - if !found { - update = true + err := r.InvalidPackage(ctx, &pod) + if err != nil { + errs = append(errs, fmt.Errorf("error invalidating package: %w", err)) + } + continue + } - err = r.Delete(ctx, &pod) - if err != nil { - errs = append(errs, fmt.Errorf("error deleting invalid pod [%s:%s] while validating packages: %w", pod.Namespace, pod.Name, err)) + // Check if package exists in node state, ie a package running that the node state doesn't know about + // something that is often done to try to fix bad node state is to clear the node state completely + // which if a package is running, we want to terminate it gracefully. Ofthen what leads to this is + // the package is in a crashloop and the operator want to restart it the whole package. + // when we apply a package it just check if there is a running package on the node for the state of the package + // this can cause to leave a pod running in say config mode, and it there is a depends on you might not correctly + // run thins in the correct order. 
+ deleteMe := false + packageStatus, exists := nodeState[runningPackage.GetUniqueName()] + if !exists { // package not in node state, so we need to delete it + deleteMe = true + } else { // package in node state, so we need to check if it's running + // need check if the stats match, if not we need to delete it + if packageStatus.Stage != runningPackage.Stage { + deleteMe = true + } + } + + if deleteMe { + update = true + err := r.InvalidPackage(ctx, &pod) + if err != nil { + errs = append(errs, fmt.Errorf("error invalidating package: %w", err)) + } } } } @@ -1886,6 +1929,21 @@ func (r *SkyhookReconciler) ValidateRunningPackages(ctx context.Context, skyhook return update, utilerrors.NewAggregate(errs) } +// InvalidPackage invalidates a package and updates the pod, which will trigger the pod to be deleted +func (r *SkyhookReconciler) InvalidPackage(ctx context.Context, pod *corev1.Pod) error { + err := InvalidatePackage(pod) + if err != nil { + return fmt.Errorf("error invalidating package: %w", err) + } + + err = r.Update(ctx, pod) + if err != nil { + return fmt.Errorf("error updating pod: %w", err) + } + + return nil +} + // ProcessInterrupt will check and do the interrupt if need, and returns // false means we are waiting // true means we are good to proceed @@ -2020,6 +2078,20 @@ func (r *SkyhookReconciler) ApplyPackage(ctx context.Context, logger logr.Logger } } + // if stage != v1alpha1.StageApply { + // // If a node gets rest by a user, the about method will return the wrong node state. Above sources it from the skyhook status. + // // check if the node has nothing, reset it then apply the package. 
+ // nodeState, err := skyhookNode.State() + // if err != nil { + // return fmt.Errorf("error getting node state: %w", err) + // } + + // _, found := nodeState[_package.GetUniqueName()] + // if !found { + // stage = v1alpha1.StageApply + // } + // } + nextStage := skyhookNode.NextStage(_package) if nextStage != nil { stage = *nextStage diff --git a/scripts/format_license.py b/scripts/format_license.py index 4624338d..0bbbb542 100755 --- a/scripts/format_license.py +++ b/scripts/format_license.py @@ -170,15 +170,21 @@ def insert_license(file_path: str, formatted_license: str, verbose: bool = False lines = lines[:start_line] + lines[end_line:] content = '\n'.join(lines) + # Strip any leading/trailing whitespace from content + content = content.strip() + # For Python/Shell files, preserve any shebang line if file_path.endswith(('.py', '.sh')): lines = content.split('\n') if lines and lines[0].startswith('#!'): - content = lines[0] + '\n\n' + formatted_license + '\n' + '\n'.join(lines[1:]) + content = lines[0] + '\n\n' + formatted_license + '\n\n' + '\n'.join(lines[1:]) else: - content = formatted_license + '\n' + content + content = formatted_license + '\n\n' + content else: - content = formatted_license + '\n' + content + content = formatted_license + '\n\n' + content + + # Ensure file ends with exactly one newline + content = content.rstrip('\n') + '\n' with open(file_path, 'w') as f: f.write(content)