Commit f372804

Merge branch 'fix_namespace_flag' into 'master'
Avoid operator specific namespace flags

See merge request nvidia/cloud-native/mig-parted!57
2 parents 552c8b3 + b1a7b23 commit f372804

2 files changed: 34 additions and 34 deletions


deployments/gpu-operator/main.go  (8 additions, 8 deletions)

@@ -45,7 +45,7 @@ const (
 	DefaultHostNvidiaDir             = "/usr/local/nvidia"
 	DefaultHostMigManagerStateFile   = "/etc/systemd/system/nvidia-mig-manager.service.d/override.conf"
 	DefaultHostKubeletSystemdService = "kubelet.service"
-	DefaultOperatorNamespace         = "gpu-operator-resources"
+	DefaultGPUClientsNamespace       = "default"
 )

 var (
@@ -60,7 +60,7 @@ var (
 	hostNvidiaDirFlag              string
 	hostMigManagerStateFileFlag    string
 	hostKubeletSystemdServiceFlag  string
-	operatorNamespaceFlag          string
+	defaultGPUClientsNamespaceFlag string
 )

 type GPUClients struct {
@@ -194,12 +194,12 @@ func main() {
 			EnvVars: []string{"WITH_SHUTDOWN_HOST_GPU_CLIENTS"},
 		},
 		&cli.StringFlag{
-			Name:        "operator-namespace",
+			Name:        "default-gpu-clients-namespace",
 			Aliases:     []string{"p"},
-			Value:       DefaultOperatorNamespace,
-			Usage:       "name of the Kubernetes namespace in which the GPU Operator operands are installed in",
-			Destination: &operatorNamespaceFlag,
-			EnvVars:     []string{"OPERATOR_NAMESPACE"},
+			Value:       DefaultGPUClientsNamespace,
+			Usage:       "Default name of the Kubernetes namespace in which the GPU client Pods are installed in",
+			Destination: &defaultGPUClientsNamespaceFlag,
+			EnvVars:     []string{"DEFAULT_GPU_CLIENTS_NAMESPACE"},
 		},
 	}

@@ -283,7 +283,7 @@ func runScript(migConfigValue string) error {
 		"-o", hostMigManagerStateFileFlag,
 		"-g", strings.Join(gpuClients.SystemdServices, ","),
 		"-k", hostKubeletSystemdServiceFlag,
-		"-p", operatorNamespaceFlag,
+		"-p", defaultGPUClientsNamespaceFlag,
 	}
 	if withRebootFlag {
 		args = append(args, "-r")
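For orientation, a minimal sketch of how the renamed option might be supplied to the manager built from this main.go. Only the flag name default-gpu-clients-namespace, its -p alias, and the DEFAULT_GPU_CLIENTS_NAMESPACE environment variable come from the diff above; the binary name and the namespace value "gpu-operator" are assumptions for illustration.

# Sketch only: the binary name and namespace value below are assumptions.
export DEFAULT_GPU_CLIENTS_NAMESPACE="gpu-operator"   # read via the flag's EnvVars
k8s-mig-manager

# Equivalently, passing the flag directly (short alias -p):
k8s-mig-manager --default-gpu-clients-namespace "gpu-operator"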

deployments/gpu-operator/reconfigure-mig.sh  (26 additions, 26 deletions)

@@ -24,28 +24,28 @@ HOST_KUBELET_SERVICE=""
 NODE_NAME=""
 MIG_CONFIG_FILE=""
 SELECTED_MIG_CONFIG=""
-OPERATOR_NAMESPACE=""
+DEFAULT_GPU_CLIENTS_NAMESPACE=""

 export SYSTEMD_LOG_LEVEL="info"

 function usage() {
 	echo "USAGE:"
 	echo "  ${0} -h "
-	echo "  ${0} -n <node> -f <config-file> -c <selected-config> -p <operator-namespace> [ -m <host-root-mount> -i <host-nvidia-dir> -o <host-mig-manager-state-file> -g <host-gpu-client-services> -k <host-kubelet-service> -r -s ]"
+	echo "  ${0} -n <node> -f <config-file> -c <selected-config> -p <default-gpu-clients-namespace> [ -m <host-root-mount> -i <host-nvidia-dir> -o <host-mig-manager-state-file> -g <host-gpu-client-services> -k <host-kubelet-service> -r -s ]"
 	echo ""
 	echo "OPTIONS:"
-	echo "  -h                                   Display this help message"
-	echo "  -r                                   Automatically reboot the node if changing the MIG mode fails for any reason"
-	echo "  -d                                   Automatically shutdown/restart any required host GPU clients across a MIG configuration"
-	echo "  -n <node>                            The kubernetes node to change the MIG configuration on"
-	echo "  -f <config-file>                     The mig-parted configuration file"
-	echo "  -c <selected-config>                 The selected mig-parted configuration to apply to the node"
-	echo "  -m <host-root-mount>                 Container path where host root directory is mounted"
-	echo "  -i <host-nvidia-dir>                 Host path of the directory where NVIDIA managed software directory is typically located"
-	echo "  -o <host-mig-manager-state-file>     Host path where the systemd mig-manager state file is located"
-	echo "  -g <host-gpu-client-services>        Comma separated list of host systemd services to shutdown/restart across a MIG reconfiguration"
-	echo "  -k <host-kubelet-service>            Name of the host's 'kubelet' systemd service which may need to be shutdown/restarted across a MIG mode reconfiguration"
-	echo "  -p <operator-namespace>              Name of the Kubernetes namespace in which the GPU Operator operands are installed in"
+	echo "  -h                                     Display this help message"
+	echo "  -r                                     Automatically reboot the node if changing the MIG mode fails for any reason"
+	echo "  -d                                     Automatically shutdown/restart any required host GPU clients across a MIG configuration"
+	echo "  -n <node>                              The kubernetes node to change the MIG configuration on"
+	echo "  -f <config-file>                       The mig-parted configuration file"
+	echo "  -c <selected-config>                   The selected mig-parted configuration to apply to the node"
+	echo "  -m <host-root-mount>                   Container path where host root directory is mounted"
+	echo "  -i <host-nvidia-dir>                   Host path of the directory where NVIDIA managed software directory is typically located"
+	echo "  -o <host-mig-manager-state-file>       Host path where the systemd mig-manager state file is located"
+	echo "  -g <host-gpu-client-services>          Comma separated list of host systemd services to shutdown/restart across a MIG reconfiguration"
+	echo "  -k <host-kubelet-service>              Name of the host's 'kubelet' systemd service which may need to be shutdown/restarted across a MIG mode reconfiguration"
+	echo "  -p <default-gpu-clients-namespace>     Default name of the Kubernetes Namespace in which the GPU client Pods are installed in"
 }

 while getopts "hrdn:f:c:m:i:o:g:k:p:" opt; do
@@ -84,9 +84,9 @@ while getopts "hrdn:f:c:m:i:o:g:k:p:" opt; do
 			HOST_KUBELET_SERVICE=${OPTARG}
 			;;
 		p ) # process option p
-			OPERATOR_NAMESPACE=${OPTARG}
+			DEFAULT_GPU_CLIENTS_NAMESPACE=${OPTARG}
 			;;
-		\? ) echo "Usage: ${0} -n <node> -f <config-file> -c <selected-config> -p <operator-namespace> [ -m <host-root-mount> -i <host-nvidia-dir> -o <host-mig-manager-state-file> -g <host-gpu-client-services> -k <host-kubelet-service> -r -s ]"
+		\? ) echo "Usage: ${0} -n <node> -f <config-file> -c <selected-config> -p <default-gpu-clients-namespace> [ -m <host-root-mount> -i <host-nvidia-dir> -o <host-mig-manager-state-file> -g <host-gpu-client-services> -k <host-kubelet-service> -r -s ]"
 			;;
 	esac
 done
@@ -103,8 +103,8 @@ if [ "${SELECTED_MIG_CONFIG}" = "" ]; then
 	echo "Error: missing -c <selected-config> flag"
 	usage; exit 1
 fi
-if [ "${OPERATOR_NAMESPACE}" = "" ]; then
-	echo "Error: missing -p <operator-namespace> flag"
+if [ "${DEFAULT_GPU_CLIENTS_NAMESPACE}" = "" ]; then
+	echo "Error: missing -p <default-gpu-clients-namespace> flag"
 	usage; exit 1
 fi

@@ -144,7 +144,7 @@ function __set_state_and_exit() {
 			nvidia.com/gpu.deploy.dcgm-exporter=$(maybe_set_true ${DCGM_EXPORTER_DEPLOYED}) \
 			nvidia.com/gpu.deploy.dcgm=$(maybe_set_true ${DCGM_DEPLOYED})
 		if [ "${?}" != "0" ]; then
-			echo "Unable to bring up GPU operator components by setting their daemonset labels"
+			echo "Unable to bring up GPU client pods by setting their daemonset labels"
 			exit_code=1
 		fi
 	fi
@@ -403,36 +403,36 @@ kubectl label --overwrite \
 	nvidia.com/gpu.deploy.dcgm-exporter=$(maybe_set_paused ${DCGM_EXPORTER_DEPLOYED}) \
 	nvidia.com/gpu.deploy.dcgm=$(maybe_set_paused ${DCGM_DEPLOYED})
 if [ "${?}" != "0" ]; then
-	echo "Unable to tear down GPU operator components by setting their daemonset labels"
+	echo "Unable to tear down GPU client pods by setting their daemonset labels"
 	exit_failed
 fi

 echo "Waiting for the device-plugin to shutdown"
 kubectl wait --for=delete pod \
 	--timeout=5m \
 	--field-selector "spec.nodeName=${NODE_NAME}" \
-	-n "${OPERATOR_NAMESPACE}" \
+	-n "${DEFAULT_GPU_CLIENTS_NAMESPACE}" \
 	-l app=nvidia-device-plugin-daemonset

 echo "Waiting for gpu-feature-discovery to shutdown"
 kubectl wait --for=delete pod \
 	--timeout=5m \
 	--field-selector "spec.nodeName=${NODE_NAME}" \
-	-n "${OPERATOR_NAMESPACE}" \
+	-n "${DEFAULT_GPU_CLIENTS_NAMESPACE}" \
 	-l app=gpu-feature-discovery

 echo "Waiting for dcgm-exporter to shutdown"
 kubectl wait --for=delete pod \
 	--timeout=5m \
 	--field-selector "spec.nodeName=${NODE_NAME}" \
-	-n "${OPERATOR_NAMESPACE}" \
+	-n "${DEFAULT_GPU_CLIENTS_NAMESPACE}" \
 	-l app=nvidia-dcgm-exporter

 echo "Waiting for dcgm to shutdown"
 kubectl wait --for=delete pod \
 	--timeout=5m \
 	--field-selector "spec.nodeName=${NODE_NAME}" \
-	-n "${OPERATOR_NAMESPACE}" \
+	-n "${DEFAULT_GPU_CLIENTS_NAMESPACE}" \
 	-l app=nvidia-dcgm

 if [ "${WITH_SHUTDOWN_HOST_GPU_CLIENTS}" = "true" ]; then
@@ -496,14 +496,14 @@ kubectl label --overwrite \
 	nvidia.com/gpu.deploy.dcgm-exporter=$(maybe_set_true ${DCGM_EXPORTER_DEPLOYED}) \
 	nvidia.com/gpu.deploy.dcgm=$(maybe_set_true ${DCGM_DEPLOYED})
 if [ "${?}" != "0" ]; then
-	echo "Unable to bring up GPU operator components by setting their daemonset labels"
+	echo "Unable to bring up GPU client components by setting their daemonset labels"
 	exit_failed
 fi

 echo "Restarting validator pod to re-run all validations"
 kubectl delete pod \
 	--field-selector "spec.nodeName=${NODE_NAME}" \
-	-n "${OPERATOR_NAMESPACE}" \
+	-n "${DEFAULT_GPU_CLIENTS_NAMESPACE}" \
 	-l app=nvidia-operator-validator

 exit_success
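As a usage sketch, the reworked script would now be invoked with the renamed -p flag carrying the namespace of the GPU client Pods. The node name, config file path, and selected config below are placeholder values, not taken from this commit; "default" matches the new DefaultGPUClientsNamespace constant.

# Sketch only: all argument values below are placeholders.
./reconfigure-mig.sh \
    -n worker-0 \
    -f /mig-parted-config/config.yaml \
    -c all-disabled \
    -p default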
