
Commit 30caa73

Merge pull request #291 from rollandf/quick-start-rdma
fix: update quick start for sriov-rdma
2 parents 436d838 + 217deca

6 files changed: +207 −30 lines changed

docs/quick-start/sriov-network-rdma.rst

Lines changed: 42 additions & 3 deletions

@@ -66,12 +66,51 @@ Deploy SR-IOV Network with RDMA
 
    kubectl apply -f pod.yaml
 
-Verify the deployment:
+**Step 6**: Verify the deployment
+
+Check that the pods are running on different nodes:
+
+.. code-block:: bash
+
+   kubectl get pods -n default -o wide
+
+Verify RDMA devices are available in the pods:
+
+.. code-block:: bash
+
+   kubectl -n default exec sriov-rdma-server -- ibv_devices
+   kubectl -n default exec sriov-rdma-client -- ibv_devices
+
+Capture the server IP and RDMA device names in environment variables:
+
+.. code-block:: bash
+
+   export SERVER_IP=$(kubectl get pod sriov-rdma-server -n default -o jsonpath='{.metadata.annotations.k8s\.v1\.cni\.cncf\.io/network-status}' | jq -r '.[] | select(.name=="default/sriov-rdma-network") | .ips[0]')
+   export SERVER_RDMA_DEV=$(kubectl -n default exec sriov-rdma-server -- ibv_devices | awk 'NR==3 {print $1}')
+   export CLIENT_RDMA_DEV=$(kubectl -n default exec sriov-rdma-client -- ibv_devices | awk 'NR==3 {print $1}')
+
+   echo "Server IP: $SERVER_IP"
+   echo "Server RDMA Device: $SERVER_RDMA_DEV"
+   echo "Client RDMA Device: $CLIENT_RDMA_DEV"
+
+
+**Step 7**: Test RDMA connectivity
+
+Start the RDMA bandwidth test server:
 
 .. code-block:: bash
 
-   kubectl exec -it sriov-test-pod -- ip addr show
-   kubectl exec -it sriov-test-pod -- ibv_devices
+   kubectl -n default exec -it sriov-rdma-server -- bash -lc "ib_write_bw -d $SERVER_RDMA_DEV -R -a --report_gbits"
+
+In a separate terminal, run the RDMA bandwidth test client:
+
+.. code-block:: bash
+
+   kubectl -n default exec -it sriov-rdma-client -- bash -lc "ib_write_bw -d $CLIENT_RDMA_DEV -R -a --report_gbits $SERVER_IP"
+
+.. note::
+   The commands above automatically use the first available RDMA device from each pod.
+   If you need to use a different device, manually set the environment variables or replace them in the command.
 
 **Complete Configuration**
 
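A note on the device-name extraction added above: ``ibv_devices`` prints a two-line header (column titles plus a dashed separator) before the first device row, which is why ``awk 'NR==3 {print $1}'`` selects the first device. A quick sketch for checking this, and for targeting a different device, assuming that standard output layout:

.. code-block:: bash

   # Number the output lines; device rows should start at line 3.
   kubectl -n default exec sriov-rdma-server -- ibv_devices | cat -n

   # To use the second RDMA device instead of the first (NR==4 rather than NR==3):
   export SERVER_RDMA_DEV=$(kubectl -n default exec sriov-rdma-server -- ibv_devices | awk 'NR==4 {print $1}')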

examples/processed/sriov-network-rdma/10-nicclusterpolicy.yaml

Lines changed: 0 additions & 4 deletions

@@ -3,10 +3,6 @@ kind: NicClusterPolicy
 metadata:
   name: nic-cluster-policy
 spec:
-  sriovDevicePlugin:
-    image: sriov-network-device-plugin
-    repository: nvcr.io/nvstaging/mellanox
-    version: network-operator-v25.10.0-rc.2
   nvIpam:
     image: nvidia-k8s-ipam
     repository: nvcr.io/nvstaging/mellanox
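
With the sriovDevicePlugin section removed, this quick start no longer deploys the SR-IOV device plugin through NicClusterPolicy. A minimal sanity check after applying the trimmed policy, assuming the CRD reports an overall ``.status.state`` field as recent network-operator releases do:

.. code-block:: bash

   # Should eventually print "ready" once all managed components are deployed.
   kubectl get nicclusterpolicy nic-cluster-policy -o jsonpath='{.status.state}'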
Lines changed: 55 additions & 5 deletions

@@ -1,19 +1,69 @@
+---
 apiVersion: v1
 kind: Pod
 metadata:
-  name: sriov-test-pod
+  name: sriov-rdma-server
+  namespace: default
+  labels:
+    app: sriov-rdma
+    role: server
   annotations:
     k8s.v1.cni.cncf.io/networks: sriov-rdma-network
 spec:
+  tolerations:
+    - key: "node-role.kubernetes.io/control-plane"
+      operator: "Exists"
+      effect: "NoSchedule"
+    - key: "node-role.kubernetes.io/master"
+      operator: "Exists"
+      effect: "NoSchedule"
+  restartPolicy: Never
   containers:
-    - name: test-container
-      image: mellanox/rping-test
+    - name: rdma-test
+      image: nvcr.io/nvidia/doca/doca:3.1.0-full-rt-host
       command: ["/bin/bash", "-c", "sleep infinity"]
       securityContext:
         capabilities:
           add: ["IPC_LOCK"]
+        privileged: true
       resources:
         requests:
-          nvidia.com/sriov_resource: '1'
+          nvidia.com/sriov_resource: "1"
         limits:
-          nvidia.com/sriov_resource: '1'
+          nvidia.com/sriov_resource: "1"
+---
+apiVersion: v1
+kind: Pod
+metadata:
+  name: sriov-rdma-client
+  namespace: default
+  labels:
+    app: sriov-rdma
+    role: client
+  annotations:
+    k8s.v1.cni.cncf.io/networks: sriov-rdma-network
+spec:
+  affinity:
+    podAntiAffinity:
+      requiredDuringSchedulingIgnoredDuringExecution:
+        - labelSelector:
+            matchExpressions:
+              - key: role
+                operator: In
+                values:
+                  - server
+          topologyKey: kubernetes.io/hostname
+  restartPolicy: Never
+  containers:
+    - name: rdma-test
+      image: nvcr.io/nvidia/doca/doca:3.1.0-full-rt-host
+      command: ["/bin/bash", "-c", "sleep infinity"]
+      securityContext:
+        capabilities:
+          add: ["IPC_LOCK"]
+        privileged: true
+      resources:
+        requests:
+          nvidia.com/sriov_resource: "1"
+        limits:
+          nvidia.com/sriov_resource: "1"
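
The client pod's podAntiAffinity rule (``role In [server]`` with ``topologyKey: kubernetes.io/hostname``) forces the scheduler to place it on a different node than the server, so the bandwidth test crosses the fabric rather than staying on one host. A quick check that the two pods landed on distinct nodes, using the ``app: sriov-rdma`` label defined above:

.. code-block:: bash

   # The NODE column should show two different nodes.
   kubectl -n default get pods -l app=sriov-rdma -o wide

   # Programmatic variant (sketch; assumes exactly these two pods match the label):
   kubectl -n default get pods -l app=sriov-rdma -o jsonpath='{.items[*].spec.nodeName}' \
     | tr ' ' '\n' | sort -u | wc -l   # expect 2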

examples/processed/sriov-network-rdma/complete.yaml

Lines changed: 55 additions & 9 deletions

@@ -3,10 +3,6 @@ kind: NicClusterPolicy
 metadata:
   name: nic-cluster-policy
 spec:
-  sriovDevicePlugin:
-    image: sriov-network-device-plugin
-    repository: nvcr.io/nvstaging/mellanox
-    version: network-operator-v25.10.0-rc.2
   nvIpam:
     image: nvidia-k8s-ipam
     repository: nvcr.io/nvstaging/mellanox

@@ -64,22 +60,72 @@ spec:
   networkNamespace: default
   resourceName: sriov_resource
 ---
+---
+apiVersion: v1
+kind: Pod
+metadata:
+  name: sriov-rdma-server
+  namespace: default
+  labels:
+    app: sriov-rdma
+    role: server
+  annotations:
+    k8s.v1.cni.cncf.io/networks: sriov-rdma-network
+spec:
+  tolerations:
+    - key: "node-role.kubernetes.io/control-plane"
+      operator: "Exists"
+      effect: "NoSchedule"
+    - key: "node-role.kubernetes.io/master"
+      operator: "Exists"
+      effect: "NoSchedule"
+  restartPolicy: Never
+  containers:
+    - name: rdma-test
+      image: nvcr.io/nvidia/doca/doca:3.1.0-full-rt-host
+      command: ["/bin/bash", "-c", "sleep infinity"]
+      securityContext:
+        capabilities:
+          add: ["IPC_LOCK"]
+        privileged: true
+      resources:
+        requests:
+          nvidia.com/sriov_resource: "1"
+        limits:
+          nvidia.com/sriov_resource: "1"
+---
 apiVersion: v1
 kind: Pod
 metadata:
-  name: sriov-test-pod
+  name: sriov-rdma-client
+  namespace: default
+  labels:
+    app: sriov-rdma
+    role: client
   annotations:
     k8s.v1.cni.cncf.io/networks: sriov-rdma-network
 spec:
+  affinity:
+    podAntiAffinity:
+      requiredDuringSchedulingIgnoredDuringExecution:
+        - labelSelector:
+            matchExpressions:
+              - key: role
+                operator: In
+                values:
+                  - server
+          topologyKey: kubernetes.io/hostname
+  restartPolicy: Never
   containers:
-    - name: test-container
-      image: mellanox/rping-test
+    - name: rdma-test
+      image: nvcr.io/nvidia/doca/doca:3.1.0-full-rt-host
       command: ["/bin/bash", "-c", "sleep infinity"]
       securityContext:
         capabilities:
           add: ["IPC_LOCK"]
+        privileged: true
       resources:
         requests:
-          nvidia.com/sriov_resource: '1'
+          nvidia.com/sriov_resource: "1"
         limits:
-          nvidia.com/sriov_resource: '1'
+          nvidia.com/sriov_resource: "1"
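
Since complete.yaml bundles the NicClusterPolicy, the network definition, and both test pods, the whole example can be applied in one step. A minimal sketch, assuming the file path is relative to the repository root:

.. code-block:: bash

   kubectl apply -f examples/processed/sriov-network-rdma/complete.yaml

   # Wait for both test pods to become Ready before running the ib_write_bw steps.
   kubectl -n default wait --for=condition=Ready pod -l app=sriov-rdma --timeout=300s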

examples/templates/sriov-network-rdma/10-nicclusterpolicy.yaml

Lines changed: 0 additions & 4 deletions

@@ -3,10 +3,6 @@ kind: NicClusterPolicy
 metadata:
   name: nic-cluster-policy
 spec:
-  sriovDevicePlugin:
-    image: sriov-network-device-plugin
-    repository: |sriovnetop-repository|
-    version: |sriovnetop-sriov-device-plugin-version|
   nvIpam:
     image: nvidia-k8s-ipam
     repository: |nvidia-ipam-repository|
Lines changed: 55 additions & 5 deletions

@@ -1,19 +1,69 @@
+---
 apiVersion: v1
 kind: Pod
 metadata:
-  name: sriov-test-pod
+  name: sriov-rdma-server
+  namespace: default
+  labels:
+    app: sriov-rdma
+    role: server
   annotations:
     k8s.v1.cni.cncf.io/networks: sriov-rdma-network
 spec:
+  tolerations:
+    - key: "node-role.kubernetes.io/control-plane"
+      operator: "Exists"
+      effect: "NoSchedule"
+    - key: "node-role.kubernetes.io/master"
+      operator: "Exists"
+      effect: "NoSchedule"
+  restartPolicy: Never
   containers:
-    - name: test-container
-      image: mellanox/rping-test
+    - name: rdma-test
+      image: nvcr.io/nvidia/doca/doca:3.1.0-full-rt-host
       command: ["/bin/bash", "-c", "sleep infinity"]
       securityContext:
         capabilities:
           add: ["IPC_LOCK"]
+        privileged: true
       resources:
         requests:
-          nvidia.com/sriov_resource: '1'
+          nvidia.com/sriov_resource: "1"
         limits:
-          nvidia.com/sriov_resource: '1'
+          nvidia.com/sriov_resource: "1"
+---
+apiVersion: v1
+kind: Pod
+metadata:
+  name: sriov-rdma-client
+  namespace: default
+  labels:
+    app: sriov-rdma
+    role: client
+  annotations:
+    k8s.v1.cni.cncf.io/networks: sriov-rdma-network
+spec:
+  affinity:
+    podAntiAffinity:
+      requiredDuringSchedulingIgnoredDuringExecution:
+        - labelSelector:
+            matchExpressions:
+              - key: role
+                operator: In
+                values:
+                  - server
+          topologyKey: kubernetes.io/hostname
+  restartPolicy: Never
+  containers:
+    - name: rdma-test
+      image: nvcr.io/nvidia/doca/doca:3.1.0-full-rt-host
+      command: ["/bin/bash", "-c", "sleep infinity"]
+      securityContext:
+        capabilities:
+          add: ["IPC_LOCK"]
+        privileged: true
+      resources:
+        requests:
+          nvidia.com/sriov_resource: "1"
+        limits:
+          nvidia.com/sriov_resource: "1"
