Commit f372804

Merge branch 'fix_namespace_flag' into 'master'
Avoid operator specific namespace flags

See merge request nvidia/cloud-native/mig-parted!57
2 parents 552c8b3 + b1a7b23 commit f372804

2 files changed: 34 additions and 34 deletions


deployments/gpu-operator/main.go  (8 additions, 8 deletions)

@@ -45,7 +45,7 @@ const (
 	DefaultHostNvidiaDir             = "/usr/local/nvidia"
 	DefaultHostMigManagerStateFile   = "/etc/systemd/system/nvidia-mig-manager.service.d/override.conf"
 	DefaultHostKubeletSystemdService = "kubelet.service"
-	DefaultOperatorNamespace         = "gpu-operator-resources"
+	DefaultGPUClientsNamespace       = "default"
 )

 var (
@@ -60,7 +60,7 @@ var (
 	hostNvidiaDirFlag              string
 	hostMigManagerStateFileFlag    string
 	hostKubeletSystemdServiceFlag  string
-	operatorNamespaceFlag          string
+	defaultGPUClientsNamespaceFlag string
 )

 type GPUClients struct {
@@ -194,12 +194,12 @@ func main() {
 			EnvVars: []string{"WITH_SHUTDOWN_HOST_GPU_CLIENTS"},
 		},
 		&cli.StringFlag{
-			Name:        "operator-namespace",
+			Name:        "default-gpu-clients-namespace",
 			Aliases:     []string{"p"},
-			Value:       DefaultOperatorNamespace,
-			Usage:       "name of the Kubernetes namespace in which the GPU Operator operands are installed in",
-			Destination: &operatorNamespaceFlag,
-			EnvVars:     []string{"OPERATOR_NAMESPACE"},
+			Value:       DefaultGPUClientsNamespace,
+			Usage:       "Default name of the Kubernetes namespace in which the GPU client Pods are installed in",
+			Destination: &defaultGPUClientsNamespaceFlag,
+			EnvVars:     []string{"DEFAULT_GPU_CLIENTS_NAMESPACE"},
 		},
 	}

@@ -283,7 +283,7 @@ func runScript(migConfigValue string) error {
 		"-o", hostMigManagerStateFileFlag,
 		"-g", strings.Join(gpuClients.SystemdServices, ","),
 		"-k", hostKubeletSystemdServiceFlag,
-		"-p", operatorNamespaceFlag,
+		"-p", defaultGPUClientsNamespaceFlag,
 	}
 	if withRebootFlag {
 		args = append(args, "-r")
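For orientation, a minimal sketch of how the renamed option might be supplied to the manager built from this main.go. Only the flag name default-gpu-clients-namespace, its -p alias, and the DEFAULT_GPU_CLIENTS_NAMESPACE environment variable come from the diff above; the binary name and the namespace value "gpu-operator" are assumptions for illustration.

# Sketch only: the binary name and namespace value below are assumptions.
export DEFAULT_GPU_CLIENTS_NAMESPACE="gpu-operator"   # read via the flag's EnvVars
k8s-mig-manager

# Equivalently, passing the flag directly (short alias -p):
k8s-mig-manager --default-gpu-clients-namespace "gpu-operator"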

deployments/gpu-operator/reconfigure-mig.sh  (26 additions, 26 deletions)

@@ -24,28 +24,28 @@ HOST_KUBELET_SERVICE=""
 NODE_NAME=""
 MIG_CONFIG_FILE=""
 SELECTED_MIG_CONFIG=""
-OPERATOR_NAMESPACE=""
+DEFAULT_GPU_CLIENTS_NAMESPACE=""

 export SYSTEMD_LOG_LEVEL="info"

 function usage() {
 	echo "USAGE:"
 	echo "  ${0} -h "
-	echo "  ${0} -n <node> -f <config-file> -c <selected-config> -p <operator-namespace> [ -m <host-root-mount> -i <host-nvidia-dir> -o <host-mig-manager-state-file> -g <host-gpu-client-services> -k <host-kubelet-service> -r -s ]"
+	echo "  ${0} -n <node> -f <config-file> -c <selected-config> -p <default-gpu-clients-namespace> [ -m <host-root-mount> -i <host-nvidia-dir> -o <host-mig-manager-state-file> -g <host-gpu-client-services> -k <host-kubelet-service> -r -s ]"
 	echo ""
 	echo "OPTIONS:"
-	echo "  -h                                   Display this help message"
-	echo "  -r                                   Automatically reboot the node if changing the MIG mode fails for any reason"
-	echo "  -d                                   Automatically shutdown/restart any required host GPU clients across a MIG configuration"
-	echo "  -n <node>                            The kubernetes node to change the MIG configuration on"
-	echo "  -f <config-file>                     The mig-parted configuration file"
-	echo "  -c <selected-config>                 The selected mig-parted configuration to apply to the node"
-	echo "  -m <host-root-mount>                 Container path where host root directory is mounted"
-	echo "  -i <host-nvidia-dir>                 Host path of the directory where NVIDIA managed software directory is typically located"
-	echo "  -o <host-mig-manager-state-file>     Host path where the systemd mig-manager state file is located"
-	echo "  -g <host-gpu-client-services>        Comma separated list of host systemd services to shutdown/restart across a MIG reconfiguration"
-	echo "  -k <host-kubelet-service>            Name of the host's 'kubelet' systemd service which may need to be shutdown/restarted across a MIG mode reconfiguration"
-	echo "  -p <operator-namespace>              Name of the Kubernetes namespace in which the GPU Operator operands are installed in"
+	echo "  -h                                     Display this help message"
+	echo "  -r                                     Automatically reboot the node if changing the MIG mode fails for any reason"
+	echo "  -d                                     Automatically shutdown/restart any required host GPU clients across a MIG configuration"
+	echo "  -n <node>                              The kubernetes node to change the MIG configuration on"
+	echo "  -f <config-file>                       The mig-parted configuration file"
+	echo "  -c <selected-config>                   The selected mig-parted configuration to apply to the node"
+	echo "  -m <host-root-mount>                   Container path where host root directory is mounted"
+	echo "  -i <host-nvidia-dir>                   Host path of the directory where NVIDIA managed software directory is typically located"
+	echo "  -o <host-mig-manager-state-file>       Host path where the systemd mig-manager state file is located"
+	echo "  -g <host-gpu-client-services>          Comma separated list of host systemd services to shutdown/restart across a MIG reconfiguration"
+	echo "  -k <host-kubelet-service>              Name of the host's 'kubelet' systemd service which may need to be shutdown/restarted across a MIG mode reconfiguration"
+	echo "  -p <default-gpu-clients-namespace>     Default name of the Kubernetes Namespace in which the GPU client Pods are installed in"
 }

 while getopts "hrdn:f:c:m:i:o:g:k:p:" opt; do
@@ -84,9 +84,9 @@ while getopts "hrdn:f:c:m:i:o:g:k:p:" opt; do
 			HOST_KUBELET_SERVICE=${OPTARG}
 			;;
 		p ) # process option p
-			OPERATOR_NAMESPACE=${OPTARG}
+			DEFAULT_GPU_CLIENTS_NAMESPACE=${OPTARG}
 			;;
-		\? ) echo "Usage: ${0} -n <node> -f <config-file> -c <selected-config> -p <operator-namespace> [ -m <host-root-mount> -i <host-nvidia-dir> -o <host-mig-manager-state-file> -g <host-gpu-client-services> -k <host-kubelet-service> -r -s ]"
+		\? ) echo "Usage: ${0} -n <node> -f <config-file> -c <selected-config> -p <default-gpu-clients-namespace> [ -m <host-root-mount> -i <host-nvidia-dir> -o <host-mig-manager-state-file> -g <host-gpu-client-services> -k <host-kubelet-service> -r -s ]"
 			;;
 	esac
 done
@@ -103,8 +103,8 @@ if [ "${SELECTED_MIG_CONFIG}" = "" ]; then
 	echo "Error: missing -c <selected-config> flag"
 	usage; exit 1
 fi
-if [ "${OPERATOR_NAMESPACE}" = "" ]; then
-	echo "Error: missing -p <operator-namespace> flag"
+if [ "${DEFAULT_GPU_CLIENTS_NAMESPACE}" = "" ]; then
+	echo "Error: missing -p <default-gpu-clients-namespace> flag"
 	usage; exit 1
 fi

@@ -144,7 +144,7 @@ function __set_state_and_exit() {
 			nvidia.com/gpu.deploy.dcgm-exporter=$(maybe_set_true ${DCGM_EXPORTER_DEPLOYED}) \
 			nvidia.com/gpu.deploy.dcgm=$(maybe_set_true ${DCGM_DEPLOYED})
 		if [ "${?}" != "0" ]; then
-			echo "Unable to bring up GPU operator components by setting their daemonset labels"
+			echo "Unable to bring up GPU client pods by setting their daemonset labels"
 			exit_code=1
 		fi
 	fi
@@ -403,36 +403,36 @@ kubectl label --overwrite \
 	nvidia.com/gpu.deploy.dcgm-exporter=$(maybe_set_paused ${DCGM_EXPORTER_DEPLOYED}) \
 	nvidia.com/gpu.deploy.dcgm=$(maybe_set_paused ${DCGM_DEPLOYED})
 if [ "${?}" != "0" ]; then
-	echo "Unable to tear down GPU operator components by setting their daemonset labels"
+	echo "Unable to tear down GPU client pods by setting their daemonset labels"
 	exit_failed
 fi

 echo "Waiting for the device-plugin to shutdown"
 kubectl wait --for=delete pod \
 	--timeout=5m \
 	--field-selector "spec.nodeName=${NODE_NAME}" \
-	-n "${OPERATOR_NAMESPACE}" \
+	-n "${DEFAULT_GPU_CLIENTS_NAMESPACE}" \
 	-l app=nvidia-device-plugin-daemonset

 echo "Waiting for gpu-feature-discovery to shutdown"
 kubectl wait --for=delete pod \
 	--timeout=5m \
 	--field-selector "spec.nodeName=${NODE_NAME}" \
-	-n "${OPERATOR_NAMESPACE}" \
+	-n "${DEFAULT_GPU_CLIENTS_NAMESPACE}" \
 	-l app=gpu-feature-discovery

 echo "Waiting for dcgm-exporter to shutdown"
 kubectl wait --for=delete pod \
 	--timeout=5m \
 	--field-selector "spec.nodeName=${NODE_NAME}" \
-	-n "${OPERATOR_NAMESPACE}" \
+	-n "${DEFAULT_GPU_CLIENTS_NAMESPACE}" \
 	-l app=nvidia-dcgm-exporter

 echo "Waiting for dcgm to shutdown"
 kubectl wait --for=delete pod \
 	--timeout=5m \
 	--field-selector "spec.nodeName=${NODE_NAME}" \
-	-n "${OPERATOR_NAMESPACE}" \
+	-n "${DEFAULT_GPU_CLIENTS_NAMESPACE}" \
 	-l app=nvidia-dcgm

 if [ "${WITH_SHUTDOWN_HOST_GPU_CLIENTS}" = "true" ]; then
@@ -496,14 +496,14 @@ kubectl label --overwrite \
 	nvidia.com/gpu.deploy.dcgm-exporter=$(maybe_set_true ${DCGM_EXPORTER_DEPLOYED}) \
 	nvidia.com/gpu.deploy.dcgm=$(maybe_set_true ${DCGM_DEPLOYED})
 if [ "${?}" != "0" ]; then
-	echo "Unable to bring up GPU operator components by setting their daemonset labels"
+	echo "Unable to bring up GPU client components by setting their daemonset labels"
 	exit_failed
 fi

 echo "Restarting validator pod to re-run all validations"
 kubectl delete pod \
 	--field-selector "spec.nodeName=${NODE_NAME}" \
-	-n "${OPERATOR_NAMESPACE}" \
+	-n "${DEFAULT_GPU_CLIENTS_NAMESPACE}" \
 	-l app=nvidia-operator-validator

 exit_success
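As a usage sketch, the reworked script would now be invoked with the renamed -p flag carrying the namespace of the GPU client Pods. The node name, config file path, and selected config below are placeholder values, not taken from this commit; "default" matches the new DefaultGPUClientsNamespace constant.

# Sketch only: all argument values below are placeholders.
./reconfigure-mig.sh \
    -n worker-0 \
    -f /mig-parted-config/config.yaml \
    -c all-disabled \
    -p default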
