Skip to content

Commit c0b728c

Browse files
authored
Merge pull request #201 from klueska/update-gke-demo
Update all demo scripts for use on GKE with a k8s 1.31 alpha cluster
2 parents 32805fe + 51040c4 commit c0b728c

File tree

9 files changed

+48 additions, -7 deletions

demo/clusters/gke/create-cluster.sh

Lines changed: 10 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -35,6 +35,7 @@ DRIVER_NAME=$(from_versions_mk "DRIVER_NAME")
3535

3636
NETWORK_NAME="${DRIVER_NAME}-net"
3737
CLUSTER_NAME="${DRIVER_NAME}-cluster"
38+
NODE_VERSION="1.31.1"
3839

3940
## Create the Network for the cluster
4041
gcloud compute networks create "${NETWORK_NAME}" \
@@ -52,16 +53,18 @@ gcloud container clusters create "${CLUSTER_NAME}" \
5253
--no-enable-autorepair \
5354
--no-enable-autoupgrade \
5455
--region us-west1 \
56+
--num-nodes "1" \
5557
--network "${NETWORK_NAME}" \
56-
--node-labels=nvidia.com/dra.controller=true
58+
--cluster-version "${NODE_VERSION}" \
59+
--node-version "${NODE_VERSION}"
5760

5861
# Create t4 node pool
5962
gcloud beta container node-pools create "pool-1" \
6063
--quiet \
6164
--project "${PROJECT_NAME}" \
6265
--cluster "${CLUSTER_NAME}" \
6366
--region "us-west1" \
64-
--node-version "1.27.3-gke.100" \
67+
--node-version "${NODE_VERSION}" \
6568
--machine-type "n1-standard-8" \
6669
--accelerator "type=nvidia-tesla-t4,count=1" \
6770
--image-type "UBUNTU_CONTAINERD" \
@@ -79,15 +82,15 @@ gcloud beta container node-pools create "pool-1" \
7982
--max-surge-upgrade 1 \
8083
--max-unavailable-upgrade 0 \
8184
--node-locations "us-west1-a" \
82-
--node-labels=gke-no-default-nvidia-gpu-device-plugin=true,nvidia.com/gpu=present,nvidia.com/dra.kubelet-plugin=true
85+
--node-labels=gke-no-default-nvidia-gpu-device-plugin=true,nvidia.com/gpu.present=true
8386

8487
# Create v100 node pool
8588
gcloud beta container node-pools create "pool-2" \
8689
--quiet \
8790
--project "${PROJECT_NAME}" \
8891
--cluster "${CLUSTER_NAME}" \
8992
--region "us-west1" \
90-
--node-version "1.27.3-gke.100" \
93+
--node-version "${NODE_VERSION}" \
9194
--machine-type "n1-standard-8" \
9295
--accelerator "type=nvidia-tesla-v100,count=1" \
9396
--image-type "UBUNTU_CONTAINERD" \
@@ -105,7 +108,7 @@ gcloud beta container node-pools create "pool-2" \
105108
--max-surge-upgrade 1 \
106109
--max-unavailable-upgrade 0 \
107110
--node-locations "us-west1-a" \
108-
--node-labels=gke-no-default-nvidia-gpu-device-plugin=true,nvidia.com/gpu=present,nvidia.com/dra.kubelet-plugin=true
111+
--node-labels=gke-no-default-nvidia-gpu-device-plugin=true,nvidia.com/gpu.present=true
109112

110113
## Allow the GPU nodes access to the internet
111114
gcloud compute routers create ${NETWORK_NAME}-nat-router \
@@ -126,10 +129,11 @@ gcloud compute routers nats create "${NETWORK_NAME}-nat-config" \
126129
gcloud container clusters get-credentials "${CLUSTER_NAME}" --location="us-west1"
127130

128131
## Launch the nvidia-driver-installer daemonset to install the GPU drivers on any GPU nodes that come online:
132+
kubectl label node --overwrite -l nvidia.com/gpu.present=true cloud.google.com/gke-gpu-driver-version-
129133
kubectl apply -f https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/nvidia-driver-installer/ubuntu/daemonset-preloaded.yaml
130134

131135
## Create the nvidia namespace
132136
kubectl create namespace nvidia
133137

134138
## Deploy a custom daemonset that prepares a node for use with DRA
135-
kubectl apply -f https://raw.githubusercontent.com/NVIDIA/k8s-dra-driver/456d097feb452cca1351817bab2ccd0782e96c9f/demo/prepare-gke-nodes-for-dra.yaml
139+
kubectl apply -f https://raw.githubusercontent.com/NVIDIA/k8s-dra-driver/3498c9a91cb594af94c9e8d65177b131e380e116/demo/prepare-gke-nodes-for-dra.yaml

demo/clusters/gke/install-dra-driver.sh

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,14 +27,15 @@ DRIVER_NAME=$(from_versions_mk "DRIVER_NAME")
2727

2828
: ${IMAGE_REGISTRY:=ghcr.io/nvidia}
2929
: ${IMAGE_NAME:=${DRIVER_NAME}}
30-
: ${IMAGE_TAG:=9323da2d-ubuntu20.04}
30+
: ${IMAGE_TAG:=32805fec-ubi8}
3131

3232
helm upgrade -i --create-namespace --namespace nvidia nvidia-dra-driver ${PROJECT_DIR}/deployments/helm/k8s-dra-driver \
3333
--set image.repository=${IMAGE_REGISTRY}/${IMAGE_NAME} \
3434
--set image.tag=${IMAGE_TAG} \
3535
--set image.pullPolicy=Always \
3636
--set controller.priorityClassName="" \
3737
--set kubeletPlugin.priorityClassName="" \
38+
--set deviceClasses="{gpu,mig}" \
3839
--set nvidiaDriverRoot="/opt/nvidia" \
3940
--set kubeletPlugin.tolerations[0].key=nvidia.com/gpu \
4041
--set kubeletPlugin.tolerations[0].operator=Exists \

demo/specs/quickstart/gpu-test-mps.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,3 +57,7 @@ spec:
5757
resourceClaims:
5858
- name: shared-gpu
5959
resourceClaimTemplateName: shared-gpu
60+
tolerations:
61+
- key: "nvidia.com/gpu"
62+
operator: "Exists"
63+
effect: "NoSchedule"

demo/specs/quickstart/gpu-test1.yaml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,10 @@ spec:
4040
resourceClaims:
4141
- name: gpu
4242
resourceClaimTemplateName: single-gpu
43+
tolerations:
44+
- key: "nvidia.com/gpu"
45+
operator: "Exists"
46+
effect: "NoSchedule"
4347

4448
---
4549
apiVersion: v1
@@ -61,3 +65,7 @@ spec:
6165
resourceClaims:
6266
- name: gpu
6367
resourceClaimTemplateName: single-gpu
68+
tolerations:
69+
- key: "nvidia.com/gpu"
70+
operator: "Exists"
71+
effect: "NoSchedule"

demo/specs/quickstart/gpu-test2.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,3 +45,7 @@ spec:
4545
resourceClaims:
4646
- name: shared-gpu
4747
resourceClaimTemplateName: single-gpu
48+
tolerations:
49+
- key: "nvidia.com/gpu"
50+
operator: "Exists"
51+
effect: "NoSchedule"

demo/specs/quickstart/gpu-test3.yaml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,10 @@ spec:
3939
resourceClaims:
4040
- name: shared-gpu
4141
resourceClaimName: single-gpu
42+
tolerations:
43+
- key: "nvidia.com/gpu"
44+
operator: "Exists"
45+
effect: "NoSchedule"
4246

4347
---
4448
apiVersion: v1
@@ -60,3 +64,7 @@ spec:
6064
resourceClaims:
6165
- name: shared-gpu
6266
resourceClaimName: single-gpu
67+
tolerations:
68+
- key: "nvidia.com/gpu"
69+
operator: "Exists"
70+
effect: "NoSchedule"

demo/specs/quickstart/gpu-test4.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,3 +97,7 @@ spec:
9797
claims:
9898
- name: mig-devices
9999
request: mig-3g-20gb
100+
tolerations:
101+
- key: "nvidia.com/gpu"
102+
operator: "Exists"
103+
effect: "NoSchedule"

demo/specs/quickstart/gpu-test5.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,3 +87,7 @@ spec:
8787
resourceClaims:
8888
- name: shared-gpus
8989
resourceClaimTemplateName: multiple-gpus
90+
tolerations:
91+
- key: "nvidia.com/gpu"
92+
operator: "Exists"
93+
effect: "NoSchedule"

demo/specs/quickstart/gpu-test6.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,3 +70,7 @@ spec:
7070
resourceClaims:
7171
- name: a100
7272
resourceClaimTemplateName: a100
73+
tolerations:
74+
- key: "nvidia.com/gpu"
75+
operator: "Exists"
76+
effect: "NoSchedule"

0 commit comments

Comments (0)