Deploy SR-IOV operator and update ClusterPolicy for RDMA (#2668)

pierDipi · bdattoma · web-flow · commit 85c42da4ad59 · 2025-11-17T10:30:36.000+01:00
* Increase timeouts for GPU and NFD installation. I've noticed a few times that the subscription status isn't reached within the given timeouts, but after waiting and re-running the script it works as expected, signaling that the timeout is too low in some cases. Signed-off-by: Pierangelo Di Pilato <pierdipi@redhat.com> * Deploy SR-IOV operator and update ClusterPolicy for RDMA. For testing llm-d, we might need to use SR-IOV and the NVIDIA kernel module parameters `EnableStreamMemOPs=1` and `PeerMappingOverride=1`. Signed-off-by: Pierangelo Di Pilato <pierdipi@redhat.com> * Revert "Increase timeouts for GPU and NFD installation". This reverts commit 7e7f47b. Signed-off-by: Pierangelo Di Pilato <pierdipi@redhat.com> --------- Signed-off-by: Pierangelo Di Pilato <pierdipi@redhat.com> Co-authored-by: Berto D'Attoma <bdattoma@redhat.com>
diff --git a/ods_ci/tasks/Resources/Provisioning/GPU/NFD/install_nfd.sh b/ods_ci/tasks/Resources/Provisioning/GPU/NFD/install_nfd.sh
@@ -3,10 +3,15 @@ set -e
 
 NFD_INSTALL_DIR="$(dirname "$0")"
 NFD_INSTANCE=$NFD_INSTALL_DIR/nfd_deploy.yaml
+
 echo "Installing NFD operator"
 oc apply -f "$NFD_INSTALL_DIR/nfd_operator.yaml"
 oc wait --timeout=3m --for jsonpath='{.status.state}'=AtLatestKnown -n openshift-nfd sub nfd
 
+echo "Installing SR-IOV Network Operator"
+oc apply -f "$NFD_INSTALL_DIR/sriov_operator.yaml"  # creates the Namespace, OperatorGroup and Subscription
+oc wait --timeout=8m --for jsonpath='{.status.state}'=AtLatestKnown -n openshift-sriov-network-operator sub sriov-network-operator-subscription  # block (up to 8m) until the Subscription state is AtLatestKnown
+
 ocpVersion=$(oc version --output json | jq '.openshiftVersion' | tr -d '"')
 IFS='.' read -ra ocpVersionSplit <<< "$ocpVersion"
 xyVersion="${ocpVersionSplit[0]}.${ocpVersionSplit[1]}"
@@ -30,3 +35,6 @@ sed -i'' -e "s/<imageUrl>/$imageUrl/g" $NFD_INSTANCE
 # temporary sleep until latest oc binary is available and --for=create is supported
 sleep 10s
 oc apply -f "$NFD_INSTANCE"
+
+echo "Configuring SR-IOV Operator"
+oc apply -f "$NFD_INSTALL_DIR/sriov_network_node_policy.yaml"  # applies SriovOperatorConfig "default" (not a node policy); NOTE(review): presumably needs the operator CRDs to exist by now - confirm
diff --git a/ods_ci/tasks/Resources/Provisioning/GPU/NFD/nfd_deploy.yaml b/ods_ci/tasks/Resources/Provisioning/GPU/NFD/nfd_deploy.yaml
@@ -51,8 +51,9 @@ spec:
             - "DMI"
         pci:
           deviceClassWhitelist:
-            - "0200"
-            - "03"
-            - "12"
+            - "0200"  # Ethernet controller (network class 02, subclass 00)
+            - "0207"  # InfiniBand controller (network class 02, subclass 07)
+            - "03"    # Display controllers (includes 0300, 0302, 0380)
+            - "12"    # Processing accelerators
           deviceLabelFields:
              - "vendor"
diff --git a/ods_ci/tasks/Resources/Provisioning/GPU/NFD/sriov_network_node_policy.yaml b/ods_ci/tasks/Resources/Provisioning/GPU/NFD/sriov_network_node_policy.yaml
@@ -0,0 +1,12 @@
+apiVersion: sriovnetwork.openshift.io/v1
+kind: SriovOperatorConfig  # operator-wide configuration; despite the file name this is not a SriovNetworkNodePolicy
+metadata:
+  name: default
+  namespace: openshift-sriov-network-operator  # same namespace the operator Subscription is created in
+spec:
+  configurationMode: daemon
+  disableDrain: false  # node draining is left enabled
+  enableInjector: true
+  enableOperatorWebhook: true
+  configDaemonNodeSelector:  # selects nodes carrying the worker role label
+    node-role.kubernetes.io/worker: ""
diff --git a/ods_ci/tasks/Resources/Provisioning/GPU/NFD/sriov_operator.yaml b/ods_ci/tasks/Resources/Provisioning/GPU/NFD/sriov_operator.yaml
@@ -0,0 +1,27 @@
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: openshift-sriov-network-operator  # namespace that holds the OperatorGroup and Subscription below
+
+---
+apiVersion: operators.coreos.com/v1
+kind: OperatorGroup
+metadata:
+  name: sriov-network-operators
+  namespace: openshift-sriov-network-operator
+spec:
+  targetNamespaces:  # the operator targets only its own namespace
+    - openshift-sriov-network-operator
+
+---
+apiVersion: operators.coreos.com/v1alpha1
+kind: Subscription
+metadata:
+  name: sriov-network-operator-subscription  # install_nfd.sh waits on this exact Subscription name
+  namespace: openshift-sriov-network-operator
+spec:
+  channel: "stable"
+  installPlanApproval: Automatic
+  name: sriov-network-operator  # package to install from the catalog source below
+  source: redhat-operators
+  sourceNamespace: openshift-marketplace
diff --git a/ods_ci/tasks/Resources/Provisioning/GPU/NVIDIA/cluster-policy.yaml b/ods_ci/tasks/Resources/Provisioning/GPU/NVIDIA/cluster-policy.yaml
@@ -0,0 +1,107 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+    name: kernel-module-params  # referenced by spec.driver.kernelModuleConfig.name in the ClusterPolicy below
+    namespace: nvidia-gpu-operator
+data:
+    nvidia.conf: |-  # NVIDIA kernel module options, one per line
+        NVreg_RegistryDwords="PeerMappingOverride=1;"
+        NVreg_EnableStreamMemOPs=1
+---
+apiVersion: nvidia.com/v1
+kind: ClusterPolicy
+metadata:
+    name: gpu-cluster-policy
+spec:
+    daemonsets:
+        rollingUpdate:
+            maxUnavailable: "1"
+        updateStrategy: RollingUpdate
+    dcgm:
+        enabled: true
+    dcgmExporter:
+        config:
+            name: ""
+        enabled: true
+        serviceMonitor:
+            enabled: true
+    devicePlugin:
+        config:
+            default: ""
+            name: ""
+        enabled: true
+        mps:
+            root: /run/nvidia/mps
+    driver:
+        certConfig:
+            name: ""
+        enabled: true
+        kernelModuleConfig:
+            name: kernel-module-params  # the ConfigMap defined at the top of this file
+        kernelModuleType: auto
+        licensingConfig:
+            configMapName: ""
+            nlsEnabled: false
+        rdma:  # RDMA support is switched on for the driver
+            enabled: true
+            useHostMofed: false
+        repoConfig:
+            configMapName: ""
+        useNvidiaDriverCRD: false
+        upgradePolicy:
+            autoUpgrade: true
+            drain:
+                deleteEmptyDir: false
+                enable: false
+                force: false
+                timeoutSeconds: 300
+            maxParallelUpgrades: 1
+            maxUnavailable: 25%
+            podDeletion:
+                deleteEmptyDir: false
+                force: false
+                timeoutSeconds: 300
+            waitForCompletion:
+                timeoutSeconds: 0
+        virtualTopology:
+            config: ""
+    gdrcopy:
+        enabled: true
+    gds:
+        enabled: false
+    gfd:
+        enabled: true
+    mig:
+        strategy: single
+    migManager:
+        enabled: true
+    nodeStatusExporter:
+        enabled: true
+    operator:
+        defaultRuntime: crio
+        initContainer: { }
+        runtimeClass: nvidia
+        use_ocp_driver_toolkit: true
+    sandboxDevicePlugin:
+        enabled: true
+    sandboxWorkloads:
+        defaultWorkload: container
+        enabled: false
+    toolkit:
+        enabled: true
+        installDir: /usr/local/nvidia
+    validator:
+        driver:
+            env:
+                -   name: DISABLE_DEV_CHAR_SYMLINK_CREATION
+                    value: "true"  # quoted: env values are strings, not booleans
+        plugin:
+            env:
+                -   name: WITH_WORKLOAD
+                    value: "true"
+    vfioManager:
+        enabled: true
+    vgpuDeviceManager:
+        enabled: true
+    vgpuManager:
+        enabled: false
diff --git a/ods_ci/tasks/Resources/Provisioning/GPU/NVIDIA/gpu_deploy.sh b/ods_ci/tasks/Resources/Provisioning/GPU/NVIDIA/gpu_deploy.sh
@@ -85,8 +85,33 @@ function rerun_accelerator_migration() {
 }
 
 wait_until_pod_ready_status  "gpu-operator"
-oc get csv -n nvidia-gpu-operator "$CSVNAME" -o jsonpath='{.metadata.annotations.alm-examples}' | jq .[0] > clusterpolicy.json
-oc apply -f clusterpolicy.json
+
+echo "Applying NVIDIA vendor label NodeFeatureRule for GPU detection"
+oc apply -f "${GPU_INSTALL_DIR}/nvidia-vendor-label-rule.yaml"
+
+echo "Waiting for NFD to add NVIDIA vendor labels to GPU nodes..."
+timeout=300  # upper bound for the elapsed counter, which advances 5 per iteration
+elapsed=0
+gpu_nodes_found=false
+while [ $elapsed -lt $timeout ]; do  # poll until a labeled node shows up or the counter reaches $timeout
+  gpu_node_count=$(oc get nodes -l feature.node.kubernetes.io/pci-10de.present=true --no-headers 2>/dev/null | wc -l)
+  if [ "$gpu_node_count" -gt 0 ]; then  # a single labeled node is enough to stop waiting
+    echo "Found $gpu_node_count GPU node(s) with NVIDIA vendor label"
+    gpu_nodes_found=true
+    break
+  fi
+  echo "Waiting for NVIDIA vendor labels on GPU nodes... ($elapsed/$timeout)"
+  sleep 5
+  elapsed=$((elapsed + 5))  # counts sleep time only, not the time spent in oc
+done
+
+if [ "$gpu_nodes_found" = false ]; then  # best effort: only warn; the ClusterPolicy is still applied below
+  echo "WARNING: No GPU nodes found with NVIDIA vendor label after ${timeout}s"
+  echo "GPU operator may not be able to deploy to nodes"
+fi
+
+echo "Applying NVIDIA GPU ClusterPolicy"
+oc apply -f "${GPU_INSTALL_DIR}/cluster-policy.yaml"  # checked-in manifest holding the ConfigMap and the ClusterPolicy
 wait_until_pod_ready_status "nvidia-device-plugin-daemonset" 600
 wait_until_pod_ready_status "nvidia-container-toolkit-daemonset"
 wait_until_pod_ready_status "nvidia-dcgm-exporter"
diff --git a/ods_ci/tasks/Resources/Provisioning/GPU/NVIDIA/gpu_install.yaml b/ods_ci/tasks/Resources/Provisioning/GPU/NVIDIA/gpu_install.yaml
@@ -22,7 +22,7 @@ metadata:
   name: gpu-operator-certified
   namespace: nvidia-gpu-operator
 spec:
-  channel: "v1.11"
+  channel: "v25.3"  # quoted so the channel name stays a string
   installPlanApproval: Automatic
   name: gpu-operator-certified
   source: certified-operators
diff --git a/ods_ci/tasks/Resources/Provisioning/GPU/NVIDIA/nvidia-vendor-label-rule.yaml b/ods_ci/tasks/Resources/Provisioning/GPU/NVIDIA/nvidia-vendor-label-rule.yaml
@@ -0,0 +1,14 @@
+apiVersion: nfd.openshift.io/v1alpha1
+kind: NodeFeatureRule
+metadata:
+  name: nvidia-gpu-vendor-label
+  namespace: openshift-nfd
+spec:
+  rules:
+    - name: "nvidia.pci.vendor"
+      labels:
+        "feature.node.kubernetes.io/pci-10de.present": "true"  # the label gpu_deploy.sh polls for on nodes
+      matchFeatures:
+        - feature: pci.device
+          matchExpressions:
+            vendor: {op: In, value: ["10de"]}  # matches PCI devices whose vendor ID is 10de (NVIDIA), regardless of device class