ods_ci/tasks/Resources/Provisioning/GPU: 8 files changed, +200 -6 lines changed

 NFD_INSTALL_DIR="$(dirname "$0")"
 NFD_INSTANCE=$NFD_INSTALL_DIR/nfd_deploy.yaml
+
 echo "Installing NFD operator"
 oc apply -f "$NFD_INSTALL_DIR/nfd_operator.yaml"
 oc wait --timeout=3m --for jsonpath='{.status.state}'=AtLatestKnown -n openshift-nfd sub nfd

+echo "Installing SR-IOV Network Operator"
+oc apply -f "$NFD_INSTALL_DIR/sriov_operator.yaml"
+oc wait --timeout=8m --for jsonpath='{.status.state}'=AtLatestKnown -n openshift-sriov-network-operator sub sriov-network-operator-subscription
+
 ocpVersion=$(oc version --output json | jq '.openshiftVersion' | tr -d '"')
 IFS='.' read -ra ocpVersionSplit <<< "$ocpVersion"
 xyVersion="${ocpVersionSplit[0]}.${ocpVersionSplit[1]}"
@@ -30,3 +35,6 @@ sed -i'' -e "s/<imageUrl>/$imageUrl/g" $NFD_INSTANCE
 # temporary sleep until latest oc binary is available and --for=create is supported
 sleep 10s
 oc apply -f "$NFD_INSTANCE"
+
+echo "Configuring SR-IOV Operator"
+oc apply -f "$NFD_INSTALL_DIR/sriov_network_node_policy.yaml"
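As a manual sanity check after this script runs (a sketch, not part of the change; it assumes the default SR-IOV operator layout), the operator pods and the per-node state objects it creates can be inspected with:

```bash
# Operator, webhook and injector pods should be Running in the operator namespace
oc get pods -n openshift-sriov-network-operator

# One SriovNetworkNodeState per selected node; syncStatus should eventually report "Succeeded"
oc get sriovnetworknodestates -n openshift-sriov-network-operator \
  -o custom-columns=NODE:.metadata.name,SYNC:.status.syncStatus
```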
==== next file ====

       - "DMI"
     pci:
       deviceClassWhitelist:
-        - "0200"
-        - "03"
-        - "12"
+        - "0200"  # Ethernet controllers
+        - "0207"  # InfiniBand controllers
+        - "03"    # Display controllers (includes 0300, 0302, 0380)
+        - "12"    # Processing accelerators
       deviceLabelFields:
         - "vendor"
==== next file ====

+apiVersion: sriovnetwork.openshift.io/v1
+kind: SriovOperatorConfig
+metadata:
+  name: default
+  namespace: openshift-sriov-network-operator
+spec:
+  configurationMode: daemon
+  disableDrain: false
+  enableInjector: true
+  enableOperatorWebhook: true
+  configDaemonNodeSelector:
+    node-role.kubernetes.io/worker: ""
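With enableInjector and enableOperatorWebhook set to true, the operator is expected to roll out its injector and webhook daemonsets on the selected workers; a manual check might look like the following (sketch only, not part of the change):

```bash
# Daemonsets created by the SR-IOV operator, plus the applied config itself
oc get daemonset -n openshift-sriov-network-operator
oc get sriovoperatorconfig default -n openshift-sriov-network-operator -o yaml
```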
==== next file ====

+apiVersion: v1
+kind: Namespace
+metadata:
+  name: openshift-sriov-network-operator
+
+---
+apiVersion: operators.coreos.com/v1
+kind: OperatorGroup
+metadata:
+  name: sriov-network-operators
+  namespace: openshift-sriov-network-operator
+spec:
+  targetNamespaces:
+    - openshift-sriov-network-operator
+
+---
+apiVersion: operators.coreos.com/v1alpha1
+kind: Subscription
+metadata:
+  name: sriov-network-operator-subscription
+  namespace: openshift-sriov-network-operator
+spec:
+  channel: "stable"
+  installPlanApproval: Automatic
+  name: sriov-network-operator
+  source: redhat-operators
+  sourceNamespace: openshift-marketplace
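The install script above waits for this Subscription to reach AtLatestKnown; the same state and the resolved CSV can be checked by hand, e.g.:

```bash
# Subscription state, the CSV it resolved to, and the CSV install phase
oc get sub sriov-network-operator-subscription -n openshift-sriov-network-operator \
  -o jsonpath='{.status.state}{"\n"}{.status.installedCSV}{"\n"}'
oc get csv -n openshift-sriov-network-operator
```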
==== next file ====

+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: kernel-module-params
+  namespace: nvidia-gpu-operator
+data:
+  nvidia.conf: |-
+    NVreg_RegistryDwords="PeerMappingOverride=1;"
+    NVreg_EnableStreamMemOPs=1
+---
+apiVersion: nvidia.com/v1
+kind: ClusterPolicy
+metadata:
+  name: gpu-cluster-policy
+spec:
+  daemonsets:
+    rollingUpdate:
+      maxUnavailable: "1"
+    updateStrategy: RollingUpdate
+  dcgm:
+    enabled: true
+  dcgmExporter:
+    config:
+      name: ""
+    enabled: true
+    serviceMonitor:
+      enabled: true
+  devicePlugin:
+    config:
+      default: ""
+      name: ""
+    enabled: true
+    mps:
+      root: /run/nvidia/mps
+  driver:
+    certConfig:
+      name: ""
+    enabled: true
+    kernelModuleConfig:
+      name: kernel-module-params
+    kernelModuleType: auto
+    licensingConfig:
+      configMapName: ""
+      nlsEnabled: false
+    rdma:
+      enabled: true
+      useHostMofed: false
+    repoConfig:
+      configMapName: ""
+    useNvidiaDriverCRD: false
+    upgradePolicy:
+      autoUpgrade: true
+      drain:
+        deleteEmptyDir: false
+        enable: false
+        force: false
+        timeoutSeconds: 300
+      maxParallelUpgrades: 1
+      maxUnavailable: 25%
+      podDeletion:
+        deleteEmptyDir: false
+        force: false
+        timeoutSeconds: 300
+      waitForCompletion:
+        timeoutSeconds: 0
+    virtualTopology:
+      config: ""
+  gdrcopy:
+    enabled: true
+  gds:
+    enabled: false
+  gfd:
+    enabled: true
+  mig:
+    strategy: single
+  migManager:
+    enabled: true
+  nodeStatusExporter:
+    enabled: true
+  operator:
+    defaultRuntime: crio
+    initContainer: {}
+    runtimeClass: nvidia
+    use_ocp_driver_toolkit: true
+  sandboxDevicePlugin:
+    enabled: true
+  sandboxWorkloads:
+    defaultWorkload: container
+    enabled: false
+  toolkit:
+    enabled: true
+    installDir: /usr/local/nvidia
+  validator:
+    driver:
+      env:
+        - name: DISABLE_DEV_CHAR_SYMLINK_CREATION
+          value: "true"
+    plugin:
+      env:
+        - name: WITH_WORKLOAD
+          value: "true"
+  vfioManager:
+    enabled: true
+  vgpuDeviceManager:
+    enabled: true
+  vgpuManager:
+    enabled: false
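Once this ClusterPolicy is applied, the GPU operator reports overall progress in the resource status; a manual check (sketch, not part of the change) would be:

```bash
# "ready" indicates all operands (driver, toolkit, device plugin, DCGM, ...) are up
oc get clusterpolicy gpu-cluster-policy -o jsonpath='{.status.state}{"\n"}'
oc get pods -n nvidia-gpu-operator
```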
==== next file ====

@@ -85,8 +85,33 @@ function rerun_accelerator_migration() {
 }

 wait_until_pod_ready_status "gpu-operator"
-oc get csv -n nvidia-gpu-operator "$CSVNAME" -o jsonpath='{.metadata.annotations.alm-examples}' | jq .[0] > clusterpolicy.json
-oc apply -f clusterpolicy.json
+
+echo "Applying NVIDIA vendor label NodeFeatureRule for GPU detection"
+oc apply -f "${GPU_INSTALL_DIR}/nvidia-vendor-label-rule.yaml"
+
+echo "Waiting for NFD to add NVIDIA vendor labels to GPU nodes..."
+timeout=300
+elapsed=0
+gpu_nodes_found=false
+while [ $elapsed -lt $timeout ]; do
+  gpu_node_count=$(oc get nodes -l feature.node.kubernetes.io/pci-10de.present=true --no-headers 2>/dev/null | wc -l)
+  if [ "$gpu_node_count" -gt 0 ]; then
+    echo "Found $gpu_node_count GPU node(s) with NVIDIA vendor label"
+    gpu_nodes_found=true
+    break
+  fi
+  echo "Waiting for NVIDIA vendor labels on GPU nodes... ($elapsed/$timeout)"
+  sleep 5
+  elapsed=$((elapsed + 5))
+done
+
+if [ "$gpu_nodes_found" = false ]; then
+  echo "WARNING: No GPU nodes found with NVIDIA vendor label after ${timeout}s"
+  echo "GPU operator may not be able to deploy to nodes"
+fi
+
+echo "Applying NVIDIA GPU ClusterPolicy"
+oc apply -f "${GPU_INSTALL_DIR}/cluster-policy.yaml"
 wait_until_pod_ready_status "nvidia-device-plugin-daemonset" 600
 wait_until_pod_ready_status "nvidia-container-toolkit-daemonset"
 wait_until_pod_ready_status "nvidia-dcgm-exporter"
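After those daemonsets are ready, the GPUs should appear as allocatable `nvidia.com/gpu` resources on the labelled nodes; a manual spot check (sketch, not part of the script) could be:

```bash
# Nodes carrying the NVIDIA vendor label and their allocatable GPU count
oc get nodes -l feature.node.kubernetes.io/pci-10de.present=true \
  -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.status.allocatable.nvidia\.com/gpu}{"\n"}{end}'
```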
==== next file ====

@@ -22,7 +22,7 @@ metadata:
   name: gpu-operator-certified
   namespace: nvidia-gpu-operator
 spec:
-  channel: "v1.11"
+  channel: "v25.3"
   installPlanApproval: Automatic
   name: gpu-operator-certified
   source: certified-operators
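Before bumping the channel, the ones currently published for the certified GPU operator can be listed from the catalog (manual check, not part of the change):

```bash
# Channels published for gpu-operator-certified in the certified-operators catalog
oc get packagemanifest gpu-operator-certified -n openshift-marketplace \
  -o jsonpath='{range .status.channels[*]}{.name}{"\n"}{end}'
```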
==== next file ====

+apiVersion: nfd.openshift.io/v1alpha1
+kind: NodeFeatureRule
+metadata:
+  name: nvidia-gpu-vendor-label
+  namespace: openshift-nfd
+spec:
+  rules:
+    - name: "nvidia.pci.vendor"
+      labels:
+        "feature.node.kubernetes.io/pci-10de.present": "true"
+      matchFeatures:
+        - feature: pci.device
+          matchExpressions:
+            vendor: {op: In, value: ["10de"]}
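This rule pins the `feature.node.kubernetes.io/pci-10de.present` label that the install script polls for, independent of the worker-config whitelist above (10de is NVIDIA's PCI vendor ID). A manual check that NFD picked the rule up (sketch):

```bash
# The rule object itself, and the nodes that ended up with the NVIDIA vendor label
oc get nodefeaturerules -A
oc get nodes -l feature.node.kubernetes.io/pci-10de.present=true
```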