@@ -52,7 +52,7 @@ iupgrade_wait() {
5252 --timeout=1m5s \
5353 --create-namespace \
5454 --namespace nvidia-dra-driver-gpu \
55- --set resources.gpus.enabled=false \
55+ --set gpuResourcesEnabledOverride=true \
5656 --set nvidiaDriverRoot=" ${TEST_NVIDIA_DRIVER_ROOT} " " ${ADDITIONAL_INSTALL_ARGS[@]} "
5757
5858 # Valuable output to have in the logs in case things went pear-shaped.
@@ -139,7 +139,8 @@ show_kubelet_plugin_error_logs() {
139139 kubectl logs \
140140 -l nvidia-dra-driver-gpu-component=kubelet-plugin \
141141 -n nvidia-dra-driver-gpu \
142- --prefix --tail=-1 | grep -E " ^(E|W)[0-9]{4}"
142+ --all-containers \
143+ --prefix --tail=-1 | grep -E " ^(E|W)[0-9]{4}" -iE " error"
143144 ) || true
144145 echo -e " KUBELET PLUGIN ERROR LOGS END\n\n"
145146}
@@ -172,3 +173,70 @@ apply_check_delete_workload_imex_chan_inject() {
172173 kubectl delete -f demo/specs/imex/channel-injection.yaml
173174 kubectl wait --for=delete pods imex-channel-injection --timeout=10s
174175}
176+
177+
# Run a command in the nvidia-mig-manager pod on a given node, because that
# pod runs with the highest privileges. Example:
#   nvmm gb-nvl-027-compute06 nvidia-smi
#
# $1   - node name (as shown by `kubectl get nodes`)
# $@   - command (and args) to execute in the pod
# Returns non-zero if no argument is given or no pod is found on the node.
nvmm () {
    if [ -z "$1" ]; then
        # Fixed: usage previously advertised the old function name `nvcnt`.
        echo "Usage: nvmm <node-name> [command...]"
        return 1
    fi
    local node="$1"
    shift # Remove first argument, leaving the command in "$@".

    # Declare and assign separately so a kubectl failure is not masked
    # by `local` always succeeding (ShellCheck SC2155).
    local pod
    pod=$(kubectl get pod -n gpu-operator -l app=nvidia-mig-manager \
        --field-selector spec.nodeName="$node" \
        --no-headers -o custom-columns=":metadata.name")

    if [ -z "$pod" ]; then
        echo "get pod -n gpu-operator -l app=nvidia-mig-manager: no pod found on node $node"
        return 1
    fi

    echo "Executing on pod $pod (node: $node)..."
    kubectl -n gpu-operator exec -it "$pod" -- "$@"
}
201+
# Restart the kubelet on a single node via SSH.
# $1 - node IP address (InternalIP)
# Assumes the current user has password-less sudo privileges on the node.
restart_kubelet_on_node () {
    local NODEIP="$1"
    # Fixed: log message previously said "sytemctl".
    echo "systemctl restart kubelet.service on ${NODEIP}"
    ssh "${USER}@${NODEIP}" 'sudo systemctl restart kubelet.service'
}
208+
# Restart the kubelet on every node in the cluster, one node at a time,
# using each node's InternalIP address.
restart_kubelet_all_nodes () {
    local ip
    local ips
    ips=$(kubectl get nodes -o jsonpath='{range .items[*]}{.status.addresses[?(@.type=="InternalIP")].address}{"\n"}{end}')
    for ip in $ips; do
        restart_kubelet_on_node "$ip"
    done
    # Restarts are sequential; nothing to wait for here.
    echo "restart kubelets: done"
}
216+
# Show logs of a DRA kubelet-plugin container on a specific node.
# Usage: kplog [gpus|compute-domains] <node-hint-for-grep> [kubectl-logs-args]
#
# $1   - container name within the kubelet-plugin pod
# $2   - substring/pattern used to identify the node by name
# $@   - any further args are passed through to `kubectl logs`
# Returns non-zero on missing args, unresolved node, or missing pod.
kplog () {
    if [[ -z "$1" || -z "$2" ]]; then
        echo "Usage: kplog [gpus|compute-domains] <node-hint-for-grep> [args]"
        return 1
    fi
    local cont="$1"
    local nodehint="$2"
    shift 2 # Remaining args in "$@" go to `kubectl logs`.

    # Declare and assign separately so failures are not masked (SC2155);
    # take only the first match so an ambiguous hint cannot produce a
    # multi-line $node and a bogus field selector.
    local node
    node=$(kubectl get nodes --no-headers | awk -v h="$nodehint" '$0 ~ h {print $1; exit}')
    # Fail early with a clear message instead of proceeding with an empty
    # node name (which previously led to a confusing "no pod found" error).
    if [[ -z "$node" ]]; then
        echo "no node matching '$nodehint' found"
        return 1
    fi
    echo "identified node: $node"

    local pod
    pod=$(kubectl get pod -n nvidia-dra-driver-gpu -l nvidia-dra-driver-gpu-component=kubelet-plugin \
        --field-selector spec.nodeName="$node" \
        --no-headers -o custom-columns=":metadata.name")

    if [[ -z "$pod" ]]; then
        echo "get pod -n nvidia-dra-driver-gpu -l nvidia-dra-driver-gpu-component=kubelet-plugin: no pod found on node $node"
        return 1
    fi

    echo "Executing on pod $pod (node: $node)..."
    kubectl logs -n nvidia-dra-driver-gpu "$pod" -c "$cont" "$@"
}
0 commit comments