Skip to content

Commit 85c42da

Browse files
pierDipi and bdattoma authored
Deploy SR-IOV operator and update ClusterPolicy for RDMA (#2668)
* Increase timeouts for GPU and NFD installation

  I've noticed a few times that the subscription status isn't reached within the given timeouts, but after waiting and re-running the script it works as expected, which signals that the timeout is too low in some cases.

  Signed-off-by: Pierangelo Di Pilato <[email protected]>

* Deploy SR-IOV operator and update ClusterPolicy for RDMA

  For testing llm-d, we might need to use SR-IOV and the NVIDIA kernel parameters `EnableStreamMemOPs=1` and `PeerMappingOverride=1`.

  Signed-off-by: Pierangelo Di Pilato <[email protected]>

* Revert "Increase timeouts for GPU and NFD installation"

  This reverts commit 7e7f47b.

  Signed-off-by: Pierangelo Di Pilato <[email protected]>

---------

Signed-off-by: Pierangelo Di Pilato <[email protected]>
Co-authored-by: Berto D'Attoma <[email protected]>
1 parent a495cb2 commit 85c42da

File tree

8 files changed

+200
-6
lines changed

8 files changed

+200
-6
lines changed

ods_ci/tasks/Resources/Provisioning/GPU/NFD/install_nfd.sh

File mode changed: 100644 → 100755
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,15 @@ set -e
33

44
# Resolve the manifests that sit next to this script, independent of the caller's working directory.
NFD_INSTALL_DIR="$(dirname "$0")"
55
NFD_INSTANCE=$NFD_INSTALL_DIR/nfd_deploy.yaml
6+
67
echo "Installing NFD operator"
78
oc apply -f "$NFD_INSTALL_DIR/nfd_operator.yaml"
89
# Block for up to 3 minutes until the "nfd" Subscription in openshift-nfd reports state AtLatestKnown.
oc wait --timeout=3m --for jsonpath='{.status.state}'=AtLatestKnown -n openshift-nfd sub nfd
910

11+
# Added by this commit: install the SR-IOV Network Operator from the sibling sriov_operator.yaml manifest.
echo "Installing SR-IOV Network Operator"
12+
oc apply -f "$NFD_INSTALL_DIR/sriov_operator.yaml"
13+
# Same readiness check as for NFD, but with an 8-minute budget and against the
# sriov-network-operator-subscription Subscription in openshift-sriov-network-operator.
oc wait --timeout=8m --for jsonpath='{.status.state}'=AtLatestKnown -n openshift-sriov-network-operator sub sriov-network-operator-subscription
14+
1015
# Read the cluster's OpenShift version from `oc version` JSON; tr strips the quotes jq leaves around the string.
# NOTE(review): `jq -r '.openshiftVersion'` would emit the raw string and make the tr step unnecessary.
ocpVersion=$(oc version --output json | jq '.openshiftVersion' | tr -d '"')
1116
# Split on '.' and keep only the first two components as "X.Y".
IFS='.' read -ra ocpVersionSplit <<< "$ocpVersion"
1217
xyVersion="${ocpVersionSplit[0]}.${ocpVersionSplit[1]}"
@@ -30,3 +35,6 @@ sed -i'' -e "s/<imageUrl>/$imageUrl/g" $NFD_INSTANCE
3035
# NOTE(review): the hunk header above shows the preceding sed call passing $NFD_INSTANCE unquoted;
# the rest of that region is outside this view, so only the quoting can be flagged from here.
# temporary sleep until latest oc binary is available and --for=create is supported
3136
sleep 10s
3237
oc apply -f "$NFD_INSTANCE"
38+
39+
# Added by this commit: apply the SR-IOV configuration manifest as the last step.
# NOTE(review): nothing waits on or verifies this apply; whether the operator's CRDs are already
# served at this point cannot be confirmed from this view.
echo "Configuring SR-IOV Operator"
40+
oc apply -f "$NFD_INSTALL_DIR/sriov_network_node_policy.yaml"

ods_ci/tasks/Resources/Provisioning/GPU/NFD/nfd_deploy.yaml

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -51,8 +51,9 @@ spec:
5151
- "DMI"
5252
pci:
5353
deviceClassWhitelist:
54-
- "0200"
55-
- "03"
56-
- "12"
54+
- "0200" # Ethernet controller (class 02 = network controller, subclass 00)
55+
- "0207" # InfiniBand controller (class 02 = network controller, subclass 07)
56+
- "03" # Display controllers (includes 0300, 0302, 0380)
57+
- "12" # Processing accelerators
5758
deviceLabelFields:
5859
- "vendor"
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
# SriovOperatorConfig named "default" in the openshift-sriov-network-operator namespace.
# NOTE(review): the file name was lost in this rendering. install_nfd.sh applies a file called
# sriov_network_node_policy.yaml after installing the operator; if this is that file, the name is
# misleading, because the kind below is SriovOperatorConfig rather than a node policy — confirm.
apiVersion: sriovnetwork.openshift.io/v1
2+
kind: SriovOperatorConfig
3+
metadata:
4+
name: default
5+
namespace: openshift-sriov-network-operator
6+
spec:
7+
configurationMode: daemon
8+
disableDrain: false
9+
# Both the injector and the operator webhook are switched on.
enableInjector: true
10+
enableOperatorWebhook: true
11+
# Selector keyed on the worker role label with an empty value; presumably restricts the
# config daemon to worker nodes — verify against the operator documentation.
configDaemonNodeSelector:
12+
node-role.kubernetes.io/worker: ""
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
# Three documents that install the SR-IOV Network Operator: a Namespace, an OperatorGroup, and a Subscription.
# Presumably the sriov_operator.yaml applied by install_nfd.sh (the file name was lost in this rendering).
apiVersion: v1
2+
kind: Namespace
3+
metadata:
4+
name: openshift-sriov-network-operator
5+
6+
---
7+
# OperatorGroup whose only target namespace is the operator's own namespace.
apiVersion: operators.coreos.com/v1
8+
kind: OperatorGroup
9+
metadata:
10+
name: sriov-network-operators
11+
namespace: openshift-sriov-network-operator
12+
spec:
13+
targetNamespaces:
14+
- openshift-sriov-network-operator
15+
16+
---
17+
# Subscription to the sriov-network-operator package on the "stable" channel of the redhat-operators
# catalog, with automatic install-plan approval. install_nfd.sh waits on this Subscription by its
# metadata.name (sriov-network-operator-subscription), so the two must stay in sync.
apiVersion: operators.coreos.com/v1alpha1
18+
kind: Subscription
19+
metadata:
20+
name: sriov-network-operator-subscription
21+
namespace: openshift-sriov-network-operator
22+
spec:
23+
channel: "stable"
24+
installPlanApproval: Automatic
25+
name: sriov-network-operator
26+
source: redhat-operators
27+
sourceNamespace: openshift-marketplace
Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
# ConfigMap "kernel-module-params" in nvidia-gpu-operator, carrying an nvidia.conf payload with two
# NVIDIA kernel-module parameters. The commit message names EnableStreamMemOPs=1 and
# PeerMappingOverride=1 as the parameters that llm-d testing might need.
apiVersion: v1
2+
kind: ConfigMap
3+
metadata:
4+
name: kernel-module-params
5+
namespace: nvidia-gpu-operator
6+
# Keep comments above this point: everything under `nvidia.conf: |-` is a literal block scalar,
# so a "#" line placed there would become part of the nvidia.conf content.
data:
7+
nvidia.conf: |-
8+
NVreg_RegistryDwords="PeerMappingOverride=1;"
9+
NVreg_EnableStreamMemOPs=1
10+
---
11+
# ClusterPolicy "gpu-cluster-policy" for the NVIDIA GPU operator.
# Presumably the cluster-policy.yaml that gpu_deploy.sh now applies in place of the policy it used to
# extract from the operator CSV's alm-examples annotation (the file name was lost in this rendering).
# NOTE(review): indentation was also lost in this rendering, so the nesting of the keys below cannot
# be read off this view; check the real file before relying on any parent/child relationship.
apiVersion: nvidia.com/v1
12+
kind: ClusterPolicy
13+
metadata:
14+
name: gpu-cluster-policy
15+
spec:
16+
daemonsets:
17+
rollingUpdate:
18+
maxUnavailable: "1"
19+
updateStrategy: RollingUpdate
20+
dcgm:
21+
enabled: true
22+
dcgmExporter:
23+
config:
24+
name: ""
25+
enabled: true
26+
serviceMonitor:
27+
enabled: true
28+
devicePlugin:
29+
config:
30+
default: ""
31+
name: ""
32+
enabled: true
33+
mps:
34+
root: /run/nvidia/mps
35+
driver:
36+
certConfig:
37+
name: ""
38+
enabled: true
39+
# Refers by name to the kernel-module-params ConfigMap defined in the first document of this file.
kernelModuleConfig:
40+
name: kernel-module-params
41+
kernelModuleType: auto
42+
licensingConfig:
43+
configMapName: ""
44+
nlsEnabled: false
45+
# RDMA is switched on here, in line with the commit title ("update ClusterPolicy for RDMA");
# useHostMofed stays false.
rdma:
46+
enabled: true
47+
useHostMofed: false
48+
repoConfig:
49+
configMapName: ""
50+
useNvidiaDriverCRD: false
51+
upgradePolicy:
52+
autoUpgrade: true
53+
drain:
54+
deleteEmptyDir: false
55+
enable: false
56+
force: false
57+
timeoutSeconds: 300
58+
maxParallelUpgrades: 1
59+
maxUnavailable: 25%
60+
podDeletion:
61+
deleteEmptyDir: false
62+
force: false
63+
timeoutSeconds: 300
64+
waitForCompletion:
65+
timeoutSeconds: 0
66+
virtualTopology:
67+
config: ""
68+
gdrcopy:
69+
enabled: true
70+
gds:
71+
enabled: false
72+
gfd:
73+
enabled: true
74+
mig:
75+
strategy: single
76+
migManager:
77+
enabled: true
78+
nodeStatusExporter:
79+
enabled: true
80+
operator:
81+
defaultRuntime: crio
82+
initContainer: { }
83+
runtimeClass: nvidia
84+
use_ocp_driver_toolkit: true
85+
sandboxDevicePlugin:
86+
enabled: true
87+
sandboxWorkloads:
88+
defaultWorkload: container
89+
enabled: false
90+
toolkit:
91+
enabled: true
92+
installDir: /usr/local/nvidia
93+
validator:
94+
driver:
95+
env:
96+
- name: DISABLE_DEV_CHAR_SYMLINK_CREATION
97+
value: "true"
98+
plugin:
99+
env:
100+
- name: WITH_WORKLOAD
101+
value: "true"
102+
vfioManager:
103+
enabled: true
104+
vgpuDeviceManager:
105+
enabled: true
106+
vgpuManager:
107+
enabled: false

ods_ci/tasks/Resources/Provisioning/GPU/NVIDIA/gpu_deploy.sh

Lines changed: 27 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -85,8 +85,33 @@ function rerun_accelerator_migration() {
8585
}
8686

8787
# Wait for the gpu-operator pod first (wait_until_pod_ready_status is defined outside this hunk).
wait_until_pod_ready_status "gpu-operator"
88-
# Removed by this commit: the ClusterPolicy used to be the first alm-examples entry of the operator
# CSV, dumped to clusterpolicy.json and applied as-is.
oc get csv -n nvidia-gpu-operator "$CSVNAME" -o jsonpath='{.metadata.annotations.alm-examples}' | jq .[0] > clusterpolicy.json
89-
oc apply -f clusterpolicy.json
88+
89+
# Added by this commit: create the NodeFeatureRule stored next to this script.
echo "Applying NVIDIA vendor label NodeFeatureRule for GPU detection"
90+
oc apply -f "${GPU_INSTALL_DIR}/nvidia-vendor-label-rule.yaml"
91+
92+
echo "Waiting for NFD to add NVIDIA vendor labels to GPU nodes..."
93+
# Polling budget in seconds; the loop below advances `elapsed` in 5-second steps.
timeout=300
94+
elapsed=0
95+
gpu_nodes_found=false
96+
# Poll until at least one node carries feature.node.kubernetes.io/pci-10de.present=true
# or the budget is used up.
while [ $elapsed -lt $timeout ]; do
97+
# Count matching nodes; stderr from `oc get nodes` is discarded and only the line count is used.
gpu_node_count=$(oc get nodes -l feature.node.kubernetes.io/pci-10de.present=true --no-headers 2>/dev/null | wc -l)
98+
if [ "$gpu_node_count" -gt 0 ]; then
99+
echo "Found $gpu_node_count GPU node(s) with NVIDIA vendor label"
100+
gpu_nodes_found=true
101+
break
102+
fi
103+
echo "Waiting for NVIDIA vendor labels on GPU nodes... ($elapsed/$timeout)"
104+
sleep 5
105+
elapsed=$((elapsed + 5))
106+
done
107+
108+
# Best-effort: a timeout only prints a warning (to stdout); the script still goes on to apply the
# ClusterPolicy below.
# NOTE(review): confirm that continuing without labelled GPU nodes is intended rather than failing here.
if [ "$gpu_nodes_found" = false ]; then
109+
echo "WARNING: No GPU nodes found with NVIDIA vendor label after ${timeout}s"
110+
echo "GPU operator may not be able to deploy to nodes"
111+
fi
112+
113+
# The ClusterPolicy now comes from a static manifest next to this script instead of the CSV annotation.
echo "Applying NVIDIA GPU ClusterPolicy"
114+
oc apply -f "${GPU_INSTALL_DIR}/cluster-policy.yaml"
90115
# Only the device-plugin wait passes an explicit second argument (600); the other two calls omit it.
wait_until_pod_ready_status "nvidia-device-plugin-daemonset" 600
91116
wait_until_pod_ready_status "nvidia-container-toolkit-daemonset"
92117
wait_until_pod_ready_status "nvidia-dcgm-exporter"

ods_ci/tasks/Resources/Provisioning/GPU/NVIDIA/gpu_install.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ metadata:
2222
name: gpu-operator-certified
2323
namespace: nvidia-gpu-operator
2424
spec:
25-
channel: "v1.11"
25+
channel: "v25.3"
2626
installPlanApproval: Automatic
2727
name: gpu-operator-certified
2828
source: certified-operators
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
# NodeFeatureRule "nvidia-gpu-vendor-label" in openshift-nfd.
# Presumably the nvidia-vendor-label-rule.yaml applied by gpu_deploy.sh (the file name was lost in
# this rendering). gpu_deploy.sh polls for the label set below before it applies the ClusterPolicy,
# so the label key here and the selector in that script must stay identical.
apiVersion: nfd.openshift.io/v1alpha1
2+
kind: NodeFeatureRule
3+
metadata:
4+
name: nvidia-gpu-vendor-label
5+
namespace: openshift-nfd
6+
spec:
7+
rules:
8+
- name: "nvidia.pci.vendor"
9+
# Label written to a node when the match below succeeds.
labels:
10+
"feature.node.kubernetes.io/pci-10de.present": "true"
11+
# Match on pci.device features whose vendor is 10de (NVIDIA's PCI vendor ID).
matchFeatures:
12+
- feature: pci.device
13+
matchExpressions:
14+
vendor: {op: In, value: ["10de"]}

0 commit comments

Comments
 (0)