diff --git a/.github/workflows/security-checkov.yaml b/.github/workflows/security-checkov.yaml new file mode 100644 index 00000000..948d3cd1 --- /dev/null +++ b/.github/workflows/security-checkov.yaml @@ -0,0 +1,46 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: Checkov Security Scan + +on: + pull_request: + branches: [main] + paths: + - 'chart/**' + push: + branches: [main, checkov-testing] + paths: + - 'chart/**' + +jobs: + build: + name: checkov + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.x" + - name: Test with Checkov + id: checkov + uses: bridgecrewio/checkov-action@master + env: + HELM_NAMESPACE: skyhook + with: + directory: chart + framework: helm + output_format: cli diff --git a/chart/LICENSE b/chart/LICENSE deleted file mode 120000 index ea5b6064..00000000 --- a/chart/LICENSE +++ /dev/null @@ -1 +0,0 @@ -../LICENSE \ No newline at end of file diff --git a/chart/LICENSE b/chart/LICENSE new file mode 100644 index 00000000..371c80cd --- /dev/null +++ b/chart/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. 
+ + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of 
the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright (c) NVIDIA CORPORATION. All rights reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. \ No newline at end of file diff --git a/chart/README.md b/chart/README.md index 111355ac..ac7a8c66 100644 --- a/chart/README.md +++ b/chart/README.md @@ -30,8 +30,10 @@ Settings | Description | Default | | controllerManager.manager.env.runtimeRequiredTaint | This feature assumes nodes are added to the cluster with `--register-with-taints` kubelet flag. This taint is assume to be all new nodes, and skyhook pods will tolerate this taint, and remove it one the nodes packages are complete. 
| skyhook.nvidia.com=runtime-required:NoSchedule | | controllerManager.manager.image.repository | Where to get the image from | "ghcr.io/nvidia/skyhook/operator" | | controllerManager.manager.image.tag | what version of the operator to run | defaults to appVersion | +| controllerManager.manager.image.digest | content-addressable pin for the operator image. If set, the digest determines the pulled image. If both tag and digest are provided, the digest takes precedence; the rendered image may include `tag@digest` but the digest controls selection. | "" | | controllerManager.manager.agent.repository | Where to get the image from | "ghcr.io/nvidia/skyhook/agent" | | controllerManager.manager.agent.tag | what version of the agent to run | defaults to the current latest, but is not latest example v6.1.5 | +| controllerManager.manager.agent.digest | content-addressable pin for the agent image. Same precedence rules as above: if both tag and digest are provided, the digest controls which image is pulled. | "" | | imagePullSecret | the secret used to pull the operator controller image, agent image, and package images. | node-init-secret | | estimatedPackageCount | estimated number of packages to be installed on the cluster, this is used to calculate the resources for the operator controller. | 1 | | estimatedNodeCount | estimated number of nodes in the cluster, this is used to calculate the resources for the operator controller | 1 | @@ -40,6 +42,7 @@ Settings | Description | Default | - **estimatedPackageCount** and **estimatedNodeCount** are used to size the resource requirements. Default setting should be good for nodes > 1000 and packages 1-2 or nodes > 500 and packages >= 4. If your approaching this size deployment it would make sense to set these. You can also override them by explicitly with `controllerManager.manager.resources` the values file has an example. 
- **runtimeRequired**: If your systems nodes have this taint make sure to add the toleration to the controllerManager.tolerations - **CRD**: This project currently has one CRD and its not managed the ["recommended" way](https://helm.sh/docs/chart_best_practices/custom_resource_definitions/). Its part of the templates. Meaning it will be updated with the `helm upgrade`. We decided it was better do it this way for this project. Doing it either way has consequences and this route has worked well for upgrades so far our deployments. +- **Image pinning (tag vs digest)**: You can set either an image tag or a digest. If both are set, the digest is prioritized; the tag is ignored for selection and may appear as `tag@digest` only for readability. This applies to both operator and agent images. ### Resource Management Skyhook uses Kubernetes LimitRange to set default CPU/memory requests/limits for all containers in the namespace. You can override these per-package in your Skyhook CR. Strict validation is enforced. See [../docs/resource_management.md](../docs/resource_management.md) for details and examples. diff --git a/chart/templates/cleanup-webhook-job.yaml b/chart/templates/cleanup-webhook-job.yaml index e85fd612..c0557bba 100644 --- a/chart/templates/cleanup-webhook-job.yaml +++ b/chart/templates/cleanup-webhook-job.yaml @@ -3,6 +3,7 @@ apiVersion: batch/v1 kind: Job metadata: name: "{{ include "chart.fullname" . }}-webhook-cleanup" + namespace: "{{ .Release.Namespace }}" annotations: "helm.sh/hook": pre-delete "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded @@ -10,19 +11,63 @@ spec: template: spec: restartPolicy: Never + automountServiceAccountToken: false serviceAccountName: {{ include "chart.fullname" . 
}}-controller-manager + securityContext: + runAsNonRoot: true + runAsUser: 10001 + seccompProfile: + type: RuntimeDefault + volumes: + - name: kube-api-access + projected: + defaultMode: 420 + sources: + - serviceAccountToken: + path: token + expirationSeconds: 3607 + - configMap: + items: + - key: ca.crt + path: ca.crt + name: kube-root-ca.crt + - downwardAPI: + items: + - fieldRef: + apiVersion: v1 + fieldPath: metadata.namespace + path: namespace containers: - name: cleanup - image: {{ .Values.webhook.removalImage | default "bitnami/kubectl" }}:{{ .Values.webhook.removalTag | default "latest" }} + image: {{ .Values.webhook.removalImage | default "bitnami/kubectl" }}{{- if .Values.webhook.removalDigest }}{{- if .Values.webhook.removalTag }}:{{ .Values.webhook.removalTag | default "1.33.1" }}@{{ .Values.webhook.removalDigest }}{{- else }}@{{ .Values.webhook.removalDigest }}{{- end }}{{- else }}:{{ .Values.webhook.removalTag | default "1.33.1" }}{{- end }} + imagePullPolicy: Always + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + capabilities: + drop: + - ALL + seccompProfile: + type: RuntimeDefault + volumeMounts: + - mountPath: /var/run/secrets/kubernetes.io/serviceaccount + name: kube-api-access + readOnly: true + resources: + limits: + cpu: {{ .Values.limitRange.default.cpu }} + memory: {{ .Values.limitRange.default.memory }} + requests: + cpu: {{ .Values.limitRange.defaultRequest.cpu }} + memory: {{ .Values.limitRange.defaultRequest.memory }} command: - /bin/sh - -c - | NAMESPACE="{{ .Release.Namespace }}" - WEBHOOK_SECRET_NAME="{{ .Values.webhook.secretName | default "webhook-cert" }}" VALIDATING_WEBHOOK_CONFIGURATION_NAME="skyhook-operator-validating-webhook" MUTATING_WEBHOOK_CONFIGURATION_NAME="skyhook-operator-mutating-webhook" - kubectl delete secret -n $NAMESPACE $WEBHOOK_SECRET_NAME || true + kubectl delete secret -n $NAMESPACE "{{ .Values.webhook.secretName | default "webhook-cert" }}" || true kubectl delete 
validatingwebhookconfiguration $VALIDATING_WEBHOOK_CONFIGURATION_NAME || true kubectl delete mutatingwebhookconfiguration $MUTATING_WEBHOOK_CONFIGURATION_NAME || true {{- end }} \ No newline at end of file diff --git a/chart/templates/deployment.yaml b/chart/templates/deployment.yaml index 5ab5295f..bedca2a7 100644 --- a/chart/templates/deployment.yaml +++ b/chart/templates/deployment.yaml @@ -2,6 +2,7 @@ apiVersion: apps/v1 kind: Deployment metadata: name: {{ include "chart.fullname" . }}-controller-manager + namespace: "{{ .Release.Namespace }}" labels: app: {{ include "chart.fullname" . }}-controller-manager app.kubernetes.io/component: manager @@ -24,9 +25,8 @@ spec: annotations: kubectl.kubernetes.io/default-container: manager spec: - {{- if and .Values.controllerManager.selectors .Values.controllerManager.nodeAffinity.matchExpressions }} - {{- fail "Error: Cannot specify both controllerManager.selectors and controllerManager.nodeAffinity.matchExpressions. Use nodeAffinity.matchExpressions for complex node selection or selectors for simple key-value matching." 
}} - {{- end }} + + automountServiceAccountToken: false affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: @@ -89,10 +89,11 @@ spec: - name: PAUSE_IMAGE value: {{ quote .Values.controllerManager.manager.env.pauseImage }} - name: AGENT_IMAGE - value: {{ .Values.controllerManager.manager.agent.repository }}:{{ .Values.controllerManager.manager.agent.tag}} + value: {{ .Values.controllerManager.manager.agent.repository }}{{- if and (.Values.controllerManager.manager.agent.tag) (.Values.controllerManager.manager.agent.digest) }}:{{ .Values.controllerManager.manager.agent.tag }}@{{ .Values.controllerManager.manager.agent.digest }}{{- else if .Values.controllerManager.manager.agent.digest }}@{{ .Values.controllerManager.manager.agent.digest }}{{- else }}:{{ .Values.controllerManager.manager.agent.tag }}{{- end }} - name: KUBERNETES_CLUSTER_DOMAIN value: {{ quote .Values.kubernetesClusterDomain }} - image: {{ .Values.controllerManager.manager.image.repository }}:{{ .Values.controllerManager.manager.image.tag | default .Chart.AppVersion }} + image: {{ .Values.controllerManager.manager.image.repository }}{{- if .Values.controllerManager.manager.image.digest }}{{- if .Values.controllerManager.manager.image.tag }}:{{ .Values.controllerManager.manager.image.tag }}@{{ .Values.controllerManager.manager.image.digest }}{{- else }}@{{ .Values.controllerManager.manager.image.digest }}{{- end }}{{- else }}:{{ .Values.controllerManager.manager.image.tag | default .Chart.AppVersion }}{{- end }} + imagePullPolicy: IfNotPresent livenessProbe: httpGet: path: /healthz @@ -104,6 +105,14 @@ spec: - containerPort: 9443 name: webhook-server protocol: TCP + volumeMounts: + - mountPath: /var/run/secrets/kubernetes.io/serviceaccount + name: kube-api-access + readOnly: true + {{- if .Values.webhook.enable }} + - mountPath: /tmp + name: webhook-certs + {{- end }} readinessProbe: httpGet: path: /readyz @@ -127,26 +136,66 @@ spec: env: - name: KUBERNETES_CLUSTER_DOMAIN value: {{ 
quote .Values.kubernetesClusterDomain }} - image: {{ .Values.controllerManager.kubeRbacProxy.image.repository }}:{{ .Values.controllerManager.kubeRbacProxy.image.tag - | default .Chart.AppVersion }} + image: {{ .Values.controllerManager.kubeRbacProxy.image.repository }}{{- if .Values.controllerManager.kubeRbacProxy.image.digest }}{{- if .Values.controllerManager.kubeRbacProxy.image.tag }}:{{ .Values.controllerManager.kubeRbacProxy.image.tag }}@{{ .Values.controllerManager.kubeRbacProxy.image.digest }}{{- else }}@{{ .Values.controllerManager.kubeRbacProxy.image.digest }}{{- end }}{{- else }}:{{ .Values.controllerManager.kubeRbacProxy.image.tag | default .Chart.AppVersion }}{{- end }} + imagePullPolicy: Always + livenessProbe: + tcpSocket: + port: 8443 + initialDelaySeconds: 15 + periodSeconds: 20 + readinessProbe: + tcpSocket: + port: 8443 + initialDelaySeconds: 5 + periodSeconds: 20 + successThreshold: 1 + failureThreshold: 2 + timeoutSeconds: 3 name: kube-rbac-proxy ports: - containerPort: 8443 name: https protocol: TCP + volumeMounts: + - mountPath: /var/run/secrets/kubernetes.io/serviceaccount + name: kube-api-access + readOnly: true resources: {{- toYaml .Values.controllerManager.kubeRbacProxy.resources | nindent 10 }} securityContext: {{- toYaml .Values.controllerManager.kubeRbacProxy.containerSecurityContext | nindent 10 }} imagePullSecrets: - name: {{ quote .Values.imagePullSecret }} + volumes: + - name: kube-api-access + projected: + defaultMode: 420 + sources: + - serviceAccountToken: + path: token + expirationSeconds: 3607 + - configMap: + items: + - key: ca.crt + path: ca.crt + name: kube-root-ca.crt + - downwardAPI: + items: + - fieldRef: + apiVersion: v1 + fieldPath: metadata.namespace + path: namespace + {{- if .Values.webhook.enable }} + - name: webhook-certs + emptyDir: {} + {{- end }} securityContext: runAsNonRoot: true + runAsUser: 10001 + seccompProfile: + type: RuntimeDefault serviceAccountName: {{ include "chart.fullname" . 
}}-controller-manager terminationGracePeriodSeconds: 10 {{ if ((.Values.controllerManager.podDisruptionBudget).minAvailable) }} -{{ if ge .Values.controllerManager.podDisruptionBudget.minAvailable .Values.controllerManager.replicas }} -{{- $_ := required "minAvailable to be less than replicas" .nil }} -{{ end }} --- apiVersion: policy/v1 kind: PodDisruptionBudget diff --git a/chart/templates/leader-election-rbac.yaml b/chart/templates/leader-election-rbac.yaml index fd7731b6..d01862e6 100644 --- a/chart/templates/leader-election-rbac.yaml +++ b/chart/templates/leader-election-rbac.yaml @@ -2,6 +2,7 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: Role metadata: name: skyhook-operator-leader-election-role + namespace: "{{ .Release.Namespace }}" labels: app.kubernetes.io/component: rbac app.kubernetes.io/created-by: skyhook-operator @@ -44,6 +45,7 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: RoleBinding metadata: name: skyhook-operator-leader-election-rolebinding + namespace: "{{ .Release.Namespace }}" labels: app.kubernetes.io/component: rbac app.kubernetes.io/created-by: skyhook-operator diff --git a/chart/templates/manager-rbac.yaml b/chart/templates/manager-rbac.yaml index 9fcf1b35..aee633f5 100644 --- a/chart/templates/manager-rbac.yaml +++ b/chart/templates/manager-rbac.yaml @@ -2,6 +2,8 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: name: skyhook-operator-manager-role + annotations: + checkov.io/skip1: CKV_K8S_155=Operator must manage webhook configs for cert rotation labels: {{- include "chart.labels" . 
| nindent 4 }} rules: @@ -10,13 +12,19 @@ rules: resources: - mutatingwebhookconfigurations - validatingwebhookconfigurations + resourceNames: + - skyhook-operator-validating-webhook + - skyhook-operator-mutating-webhook verbs: - - create - - delete - get - - list - - patch - update +- apiGroups: + - admissionregistration.k8s.io + resources: + - mutatingwebhookconfigurations + - validatingwebhookconfigurations + verbs: + - list - watch - apiGroups: - "" diff --git a/chart/templates/metrics-service.yaml b/chart/templates/metrics-service.yaml index 472474ce..708dd615 100644 --- a/chart/templates/metrics-service.yaml +++ b/chart/templates/metrics-service.yaml @@ -2,6 +2,7 @@ apiVersion: v1 kind: Service metadata: name: skyhook-operator-controller-manager-metrics-service + namespace: "{{ .Release.Namespace }}" labels: app.kubernetes.io/component: kube-rbac-proxy app.kubernetes.io/created-by: skyhook-operator diff --git a/chart/templates/mutating-webhook.yaml b/chart/templates/mutating-webhook.yaml new file mode 100644 index 00000000..ff819deb --- /dev/null +++ b/chart/templates/mutating-webhook.yaml @@ -0,0 +1,35 @@ +{{- if .Values.webhook.enable }} +apiVersion: admissionregistration.k8s.io/v1 +kind: MutatingWebhookConfiguration +metadata: + name: skyhook-operator-mutating-webhook +webhooks: +- admissionReviewVersions: + - v1 + clientConfig: + caBundle: "" + service: + name: skyhook-operator-webhook-service + namespace: {{ .Release.Namespace }} + path: /mutate-skyhook-nvidia-com-v1alpha1-skyhook + port: 443 + failurePolicy: Fail + matchPolicy: Equivalent + name: mutate-skyhook.nvidia.com + namespaceSelector: {} + objectSelector: {} + reinvocationPolicy: Never + rules: + - apiGroups: + - skyhook.nvidia.com + apiVersions: + - v1alpha1 + operations: + - CREATE + - UPDATE + resources: + - skyhooks + scope: '*' + sideEffects: None + timeoutSeconds: 10 +{{- end }} \ No newline at end of file diff --git a/chart/templates/networkpolicy.yaml b/chart/templates/networkpolicy.yaml 
new file mode 100644 index 00000000..5a11abe2 --- /dev/null +++ b/chart/templates/networkpolicy.yaml @@ -0,0 +1,20 @@ +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: {{ include "chart.fullname" . }}-controller-manager-allow-all + namespace: "{{ .Release.Namespace }}" + labels: + {{- include "chart.labels" . | nindent 4 }} +spec: + podSelector: + matchLabels: + control-plane: controller-manager + {{- include "chart.selectorLabels" . | nindent 6 }} + policyTypes: + - Ingress + - Egress + ingress: + - {} + egress: + - {} + diff --git a/chart/templates/serviceaccount.yaml b/chart/templates/serviceaccount.yaml index 6db143ec..7a785749 100644 --- a/chart/templates/serviceaccount.yaml +++ b/chart/templates/serviceaccount.yaml @@ -2,6 +2,7 @@ apiVersion: v1 kind: ServiceAccount metadata: name: {{ include "chart.fullname" . }}-controller-manager + namespace: "{{ .Release.Namespace }}" labels: app.kubernetes.io/component: rbac app.kubernetes.io/created-by: skyhook-operator diff --git a/chart/templates/validating-webhook.yaml b/chart/templates/validating-webhook.yaml new file mode 100644 index 00000000..cdb75d9f --- /dev/null +++ b/chart/templates/validating-webhook.yaml @@ -0,0 +1,34 @@ +{{- if .Values.webhook.enable }} +apiVersion: admissionregistration.k8s.io/v1 +kind: ValidatingWebhookConfiguration +metadata: + name: skyhook-operator-validating-webhook +webhooks: +- admissionReviewVersions: + - v1 + clientConfig: + caBundle: "" + service: + name: skyhook-operator-webhook-service + namespace: {{ .Release.Namespace }} + path: /validate-skyhook-nvidia-com-v1alpha1-skyhook + port: 443 + failurePolicy: Fail + matchPolicy: Equivalent + name: validate-skyhook.nvidia.com + namespaceSelector: {} + objectSelector: {} + rules: + - apiGroups: + - skyhook.nvidia.com + apiVersions: + - v1alpha1 + operations: + - CREATE + - UPDATE + resources: + - skyhooks + scope: '*' + sideEffects: None + timeoutSeconds: 10 +{{- end }} \ No newline at end of file diff --git 
a/chart/templates/validations.yaml b/chart/templates/validations.yaml new file mode 100644 index 00000000..e56f14ad --- /dev/null +++ b/chart/templates/validations.yaml @@ -0,0 +1,18 @@ +{{- if eq .Release.Namespace "default" }} +{{- fail "Deployment to 'default' namespace is not allowed for security reasons. Please specify a different namespace." }} +{{- end }} + + +{{- /* Prevent conflicting node selection config */ -}} +{{- if and .Values.controllerManager.selectors .Values.controllerManager.nodeAffinity.matchExpressions }} +{{- fail "Error: Cannot specify both controllerManager.selectors and controllerManager.nodeAffinity.matchExpressions. Use nodeAffinity.matchExpressions for complex node selection or selectors for simple key-value matching." }} +{{- end }} + +{{- /* Validate PodDisruptionBudget: minAvailable must be < replicas */ -}} +{{- if ((.Values.controllerManager.podDisruptionBudget).minAvailable) -}} + {{- if ge .Values.controllerManager.podDisruptionBudget.minAvailable .Values.controllerManager.replicas -}} + {{- fail "Error: controllerManager.podDisruptionBudget.minAvailable must be less than controllerManager.replicas" -}} + {{- end -}} +{{- end -}} + + diff --git a/chart/templates/webhook-service.yaml b/chart/templates/webhook-service.yaml index fa8dd51b..8cdc2e12 100644 --- a/chart/templates/webhook-service.yaml +++ b/chart/templates/webhook-service.yaml @@ -2,6 +2,7 @@ apiVersion: v1 kind: Service metadata: name: {{ .Values.webhook.serviceName }} + namespace: "{{ .Release.Namespace }}" labels: app.kubernetes.io/component: webhook app.kubernetes.io/created-by: skyhook-operator diff --git a/chart/values.yaml b/chart/values.yaml index 38269b63..4e0c4d66 100644 --- a/chart/values.yaml +++ b/chart/values.yaml @@ -37,12 +37,14 @@ controllerManager: - --v=0 containerSecurityContext: allowPrivilegeEscalation: false + readOnlyRootFilesystem: true capabilities: drop: - ALL image: repository: quay.io/brancz/kube-rbac-proxy tag: v0.15.0 + digest: 
"sha256:2c7b120590cbe9f634f5099f2cbb91d0b668569023a81505ca124a5c437e7663" # manifest list digest (multi-arch) resources: limits: cpu: 500m @@ -53,6 +55,7 @@ controllerManager: manager: containerSecurityContext: allowPrivilegeEscalation: false + readOnlyRootFilesystem: true capabilities: drop: - ALL @@ -81,11 +84,13 @@ controllerManager: pauseImage: registry.k8s.io/pause:3.10 image: repository: nvcr.io/nvidia/skyhook/operator - tag: "" ## if omitted, default to the chart appVersion + tag: "" ## if both tag and digest are omitted, defaults to the chart appVersion + digest: "sha256:412880d97eab314275590068f993a371c772b19a1cb4b965fd6b9ca101f21b43" # manifest list digest (multi-arch) ## agentImage: is the image used for the agent container. This image is the default for this install, but can be overridden in the CR at package level. agent: repository: nvcr.io/nvidia/skyhook/agent tag: "v6.3.1" + digest: "sha256:c034866d1382c3372989c09839e5a0c0837cf2836d2a20a038649469a9dae18b" # manifest list digest (multi-arch) # resources: If this is defined it will override the default calculation for resources # from estimatedNodeCount and estimatedPackageCount. The below values are @@ -153,7 +158,8 @@ webhook: ## uninstall image for cleaning up webhook resources removalImage: bitnami/kubectl - removalTag: latest + removalTag: 1.33.1 + removalDigest: "sha256:9081a6f83f4febf47369fc46b6f0f7683c7db243df5b43fc9defe51b0471a950" metrics: addServiceAccountBinding: false diff --git a/docs/release-process.md b/docs/release-process.md index d551d13c..b31dce2e 100644 --- a/docs/release-process.md +++ b/docs/release-process.md @@ -115,6 +115,42 @@ git push origin chart/v1.2.3 - [ ] Tests passing - [ ] Documentation updated +### Pin multi-arch image digests in the chart + +Starting with digest pinning, the chart references images using tag@digest (or digest-only where applicable). 
For each image, fetch the multi-arch manifest digest and update `chart/values.yaml` so our releases are reproducible across architectures. + +Prerequisites: + +- Docker buildx (`docker-buildx version`) + +Fetch a multi-arch digest (example for bitnami/kubectl used by the webhook cleanup job): + +```bash +docker-buildx imagetools inspect bitnami/kubectl:1.33.1 +``` + +Example output (look for the top-level Digest): + +``` +Name: docker.io/bitnami/kubectl:1.33.1 +MediaType: application/vnd.docker.distribution.manifest.list.v2+json +Digest: sha256:9081a6f83f4febf47369fc46b6f0f7683c7db243df5b43fc9defe51b0471a950 + +Manifests: + Name: docker.io/bitnami/kubectl:1.33.1@sha256:c8efec87588c7a2d84c760d54446b2e081e607a709f16f19283774d5612191b7 + MediaType: application/vnd.docker.distribution.manifest.v2+json + Platform: linux/amd64 + + Name: docker.io/bitnami/kubectl:1.33.1@sha256:2af8ed9feaeada845f4d60f1fe4db951df2e5334ea01bec4b5ef4f191ad20d65 + MediaType: application/vnd.docker.distribution.manifest.v2+json + Platform: linux/arm64 +``` + +Copy the top-level digest into the matching field in `chart/values.yaml`: the `digest` field of the kube-rbac-proxy, operator, and agent images, and `removalDigest` for the webhook cleanup (kubectl) image. + +Note: +- Always use the multi-arch manifest digest (top-level Digest from imagetools), not a single-arch child manifest digest. + **After tagging:** - [ ] CI/CD pipeline completes - [ ] Images published successfully diff --git a/docs/versioning.md b/docs/versioning.md index 1822b9c0..e558b918 100644 --- a/docs/versioning.md +++ b/docs/versioning.md @@ -47,6 +47,10 @@ image: image: "ghcr.io/nvidia/skyhook/operator:0.7.0" ``` +### Image Pinning: Tag vs Digest +- You can specify a tag, a digest, or both for images. +- If both are provided, the **digest takes precedence** and determines the image pulled. The rendered image reference may display as `:tag@sha256:...`, but the digest controls selection. 
+ ## Release Branching Strategy Skyhook uses **release branches** to manage patches and maintenance releases: diff --git a/k8s-tests/chainsaw/helm/helm-chart-test/assert-no-schedule.yaml b/k8s-tests/chainsaw/helm/helm-chart-test/assert-no-schedule.yaml index 113b9aec..65420488 100644 --- a/k8s-tests/chainsaw/helm/helm-chart-test/assert-no-schedule.yaml +++ b/k8s-tests/chainsaw/helm/helm-chart-test/assert-no-schedule.yaml @@ -97,7 +97,7 @@ spec: env: - name: KUBERNETES_CLUSTER_DOMAIN value: cluster.local - image: quay.io/brancz/kube-rbac-proxy:v0.15.0 + image: quay.io/brancz/kube-rbac-proxy:v0.15.0@sha256:2c7b120590cbe9f634f5099f2cbb91d0b668569023a81505ca124a5c437e7663 name: kube-rbac-proxy ports: - containerPort: 8443 diff --git a/k8s-tests/chainsaw/helm/helm-chart-test/assert-scheduled.yaml b/k8s-tests/chainsaw/helm/helm-chart-test/assert-scheduled.yaml index 808b9f96..6f6d38fa 100644 --- a/k8s-tests/chainsaw/helm/helm-chart-test/assert-scheduled.yaml +++ b/k8s-tests/chainsaw/helm/helm-chart-test/assert-scheduled.yaml @@ -49,6 +49,7 @@ spec: - command: - /manager ((env[?name == 'RUNTIME_REQUIRED_TAINT'].value)[0] == 'skyhook.nvidia.com=runtime-required:NoSchedule'): true + image: ghcr.io/nvidia/skyhook/operator:latest livenessProbe: failureThreshold: 3 httpGet: @@ -96,7 +97,7 @@ spec: env: - name: KUBERNETES_CLUSTER_DOMAIN value: cluster.local - image: quay.io/brancz/kube-rbac-proxy:v0.15.0 + image: quay.io/brancz/kube-rbac-proxy:v0.15.0@sha256:2c7b120590cbe9f634f5099f2cbb91d0b668569023a81505ca124a5c437e7663 name: kube-rbac-proxy ports: - containerPort: 8443 diff --git a/k8s-tests/chainsaw/helm/helm-chart-test/values.yaml b/k8s-tests/chainsaw/helm/helm-chart-test/values.yaml index d73aa355..6ea40ddb 100644 --- a/k8s-tests/chainsaw/helm/helm-chart-test/values.yaml +++ b/k8s-tests/chainsaw/helm/helm-chart-test/values.yaml @@ -27,5 +27,6 @@ controllerManager: image: repository: ghcr.io/nvidia/skyhook/operator tag: latest ## THIS should change to be like a tag so it 
can point at a specific commit + digest: "" webhook: enable: false diff --git a/k8s-tests/chainsaw/helm/helm-node-affinity-test/values-conflict-test.yaml b/k8s-tests/chainsaw/helm/helm-node-affinity-test/values-conflict-test.yaml index a27053c2..9cd621df 100644 --- a/k8s-tests/chainsaw/helm/helm-node-affinity-test/values-conflict-test.yaml +++ b/k8s-tests/chainsaw/helm/helm-node-affinity-test/values-conflict-test.yaml @@ -29,5 +29,6 @@ controllerManager: image: repository: ghcr.io/nvidia/skyhook/operator tag: latest + digest: "" webhook: enable: false diff --git a/k8s-tests/chainsaw/helm/helm-node-affinity-test/values-match.yaml b/k8s-tests/chainsaw/helm/helm-node-affinity-test/values-match.yaml index 1dd8d920..825b7bcd 100644 --- a/k8s-tests/chainsaw/helm/helm-node-affinity-test/values-match.yaml +++ b/k8s-tests/chainsaw/helm/helm-node-affinity-test/values-match.yaml @@ -31,5 +31,6 @@ controllerManager: image: repository: ghcr.io/nvidia/skyhook/operator tag: latest ## THIS should change to be like a tag so it can point at a specific commit + digest: "" webhook: enable: false diff --git a/k8s-tests/chainsaw/helm/helm-node-affinity-test/values-no-match.yaml b/k8s-tests/chainsaw/helm/helm-node-affinity-test/values-no-match.yaml index fdd71a1f..667cc23c 100644 --- a/k8s-tests/chainsaw/helm/helm-node-affinity-test/values-no-match.yaml +++ b/k8s-tests/chainsaw/helm/helm-node-affinity-test/values-no-match.yaml @@ -31,5 +31,6 @@ controllerManager: image: repository: ghcr.io/nvidia/skyhook/operator tag: latest ## THIS should change to be like a tag so it can point at a specific commit + digest: "" webhook: enable: false diff --git a/k8s-tests/chainsaw/helm/helm-scale-test/values-scale.yaml b/k8s-tests/chainsaw/helm/helm-scale-test/values-scale.yaml index 095c0659..bb47aa52 100644 --- a/k8s-tests/chainsaw/helm/helm-scale-test/values-scale.yaml +++ b/k8s-tests/chainsaw/helm/helm-scale-test/values-scale.yaml @@ -21,6 +21,7 @@ controllerManager: image: repository: 
ghcr.io/nvidia/skyhook/operator tag: latest ## THIS should change to be like a tag so it can point at a specific commit + digest: "" estimatedNodeCount: 400 estimatedPackageCount: 5 webhook: diff --git a/k8s-tests/chainsaw/helm/helm-webhook-test/values.yaml b/k8s-tests/chainsaw/helm/helm-webhook-test/values.yaml index eb1a87e6..e8c9afd8 100644 --- a/k8s-tests/chainsaw/helm/helm-webhook-test/values.yaml +++ b/k8s-tests/chainsaw/helm/helm-webhook-test/values.yaml @@ -21,5 +21,6 @@ controllerManager: image: repository: ghcr.io/nvidia/skyhook/operator tag: v0.7.6-1ec0890 ## TODO: update this to latest onces this is merged + digest: "" webhook: enable: true diff --git a/operator/internal/controller/webhook_controller.go b/operator/internal/controller/webhook_controller.go index 8f7e7053..dd64f3af 100644 --- a/operator/internal/controller/webhook_controller.go +++ b/operator/internal/controller/webhook_controller.go @@ -216,53 +216,61 @@ func (r *WebhookController) CheckOrUpdateWebhookCertSecret(ctx context.Context, } func (r *WebhookController) CheckOrUpdateWebhookConfigurations(ctx context.Context, secret *corev1.Secret) (bool, error) { + // Update only CABundle fields of existing webhook configurations created by Helm + caBundle := secret.Data["ca.crt"] + changed := false + // ValidatingWebhookConfiguration - validatingWebhookConfiguration := webhookValidatingWebhookConfiguration(r.namespace, r.opts.ServiceName, secret) - existingValidatingWebhookConfiguration := &admissionregistrationv1.ValidatingWebhookConfiguration{} - err := r.Get(ctx, types.NamespacedName{Name: validatingWebhookConfiguration.Name}, existingValidatingWebhookConfiguration) - if err != nil { + validatingName := webhookValidatingWebhookConfiguration(r.namespace, r.opts.ServiceName, secret).GetName() + existingValidating := &admissionregistrationv1.ValidatingWebhookConfiguration{} + if err := r.Get(ctx, types.NamespacedName{Name: validatingName}, existingValidating); err != nil { if 
errors.IsNotFound(err) { - err := r.Create(ctx, validatingWebhookConfiguration) - if err != nil && !errors.IsAlreadyExists(err) { // race condition, ignore - return false, err - } - } else { - return false, err + return false, fmt.Errorf("ValidatingWebhookConfiguration %q not found; creation is handled by the Helm chart. Ensure the chart is installed and webhooks are enabled: %w", validatingName, err) } - } else { - if compareValidatingWebhookConfigurations(existingValidatingWebhookConfiguration, validatingWebhookConfiguration) { - existingValidatingWebhookConfiguration.Webhooks = validatingWebhookConfiguration.Webhooks - err := r.Update(ctx, existingValidatingWebhookConfiguration) - if err != nil { - return false, err - } + return false, fmt.Errorf("failed to get ValidatingWebhookConfiguration %q: %w", validatingName, err) + } + + needUpdate := false + for i := range existingValidating.Webhooks { + if len(existingValidating.Webhooks[i].ClientConfig.CABundle) == 0 { + existingValidating.Webhooks[i].ClientConfig.CABundle = caBundle + needUpdate = true + } + } + if needUpdate { + if err := r.Update(ctx, existingValidating); err != nil { + return false, err + } else { + changed = true } } // MutatingWebhookConfiguration - mutatingWebhookConfiguration := webhookMutatingWebhookConfiguration(r.namespace, r.opts.ServiceName, secret) - existingMutatingWebhookConfiguration := &admissionregistrationv1.MutatingWebhookConfiguration{} - err = r.Get(ctx, types.NamespacedName{Name: mutatingWebhookConfiguration.Name}, existingMutatingWebhookConfiguration) - if err != nil { + mutatingName := webhookMutatingWebhookConfiguration(r.namespace, r.opts.ServiceName, secret).GetName() + existingMutating := &admissionregistrationv1.MutatingWebhookConfiguration{} + if err := r.Get(ctx, types.NamespacedName{Name: mutatingName}, existingMutating); err != nil { if errors.IsNotFound(err) { - err := r.Create(ctx, mutatingWebhookConfiguration) - if err != nil && !errors.IsAlreadyExists(err) { // 
race condition, ignore - return false, err - } - } else { - return false, err + return changed, fmt.Errorf("MutatingWebhookConfiguration %q not found; creation is handled by the Helm chart. Ensure the chart is installed and webhooks are enabled: %w", mutatingName, err) } - } else { - if compareMutatingWebhookConfigurations(existingMutatingWebhookConfiguration, mutatingWebhookConfiguration) { - existingMutatingWebhookConfiguration.Webhooks = mutatingWebhookConfiguration.Webhooks - err := r.Update(ctx, existingMutatingWebhookConfiguration) - if err != nil { - return false, err - } + return false, fmt.Errorf("failed to get MutatingWebhookConfiguration %q: %w", mutatingName, err) + } + + needUpdate = false + for i := range existingMutating.Webhooks { + if len(existingMutating.Webhooks[i].ClientConfig.CABundle) == 0 { + existingMutating.Webhooks[i].ClientConfig.CABundle = caBundle + needUpdate = true + } + } + if needUpdate { + if err := r.Update(ctx, existingMutating); err != nil { + return false, err + } else { + changed = true } } - return false, nil + return changed, nil } // webhookValidatingWebhookConfiguration returns a new validating webhook configuration. @@ -383,6 +391,9 @@ func (r *WebhookController) WebhookSecretReadyzCheck(_ *http.Request) error { validatingWebhookConfiguration := &admissionregistrationv1.ValidatingWebhookConfiguration{} err = r.Get(context.Background(), types.NamespacedName{Name: validatingWebhookName}, validatingWebhookConfiguration) if err != nil { + if errors.IsNotFound(err) { + return fmt.Errorf("ValidatingWebhookConfiguration %q not found. 
Either disable webhooks (not recommended) or reinstall the operator via the Helm chart to provision webhooks", validatingWebhookName) + } return err } @@ -393,6 +404,9 @@ func (r *WebhookController) WebhookSecretReadyzCheck(_ *http.Request) error { mutatingWebhookConfiguration := webhookMutatingWebhookConfiguration(r.namespace, r.opts.ServiceName, secret) err = r.Get(context.Background(), types.NamespacedName{Name: mutatingWebhookConfiguration.Name}, mutatingWebhookConfiguration) if err != nil { + if errors.IsNotFound(err) { + return fmt.Errorf("MutatingWebhookConfiguration %q not found. Either disable webhooks (not recommended) or reinstall the operator via the Helm chart to provision webhooks", mutatingWebhookConfiguration.Name) + } return err }