Skip to content

Commit bf9602c

Browse files
authored
feat(nvidia): add nvbandwidth tests (#638)
1 parent 14087f3 commit bf9602c

File tree

7 files changed

+28
-19
lines changed

7 files changed

+28
-19
lines changed

hack/update-nvidia-dependencies.sh

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -9,16 +9,15 @@ set -o nounset
99
set -o errexit
1010
set -o pipefail
1111

12+
echo "Updating aws-ofi-nccl"
1213
AWS_OFI_NCCL_TAG=$(curl -s https://api.github.com/repos/aws/aws-ofi-nccl/releases/latest | jq -r .tag_name | sed 's/^v//')
13-
LIB_NCCL_TAG=$(curl -s https://api.github.com/repos/aws/aws-ofi-nccl/releases/latest | jq -r .body | grep -oP '\[NCCL \K(\S*)(?=\])' | head -n 1 | sed 's/^v//')
14-
1514
find . -type f -name Dockerfile -exec sed -i "s/AWS_OFI_NCCL_VERSION=.*/AWS_OFI_NCCL_VERSION=$AWS_OFI_NCCL_TAG/g" {} +
15+
16+
echo "Updating nccl"
17+
LIB_NCCL_TAG=$(curl -s https://api.github.com/repos/aws/aws-ofi-nccl/releases/latest | jq -r .body | grep -oP '\[NCCL \K(\S*)(?=\])' | head -n 1 | sed 's/^v//')
1618
find . -type f -name Dockerfile -exec sed -i "s/LIBNCCL_VERSION=.*/LIBNCCL_VERSION=$LIB_NCCL_TAG/g" {} +
1719

18-
CUDA=$(curl -s https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/Packages | grep 'Package: libnccl2' -A 1 | grep $LIB_NCCL_TAG | grep -oP 'Version: \K(\S*)(?=)' | sort -r | head -n 1 | sed "s/$LIB_NCCL_TAG+cuda//")
19-
CUDA_VERSION_PARTS=($(echo $CUDA | tr -s '.' ' '))
20-
CUDA_MAJOR_VERSION=${CUDA_VERSION_PARTS[0]}
21-
CUDA_MINOR_VERSION=${CUDA_VERSION_PARTS[1]}
20+
echo "Updating nvbandwidth"
# Resolve the latest nvbandwidth release tag from the GitHub API.
# -f makes curl exit non-zero on HTTP errors (e.g. API rate limiting) so that,
# together with errexit/pipefail, the script stops instead of continuing with
# an error payload; -S keeps the error message visible despite -s.
NVBANDWIDTH_TAG=$(curl -fsS https://api.github.com/repos/NVIDIA/nvbandwidth/releases/latest | jq -r .tag_name)
# jq prints the literal string "null" when .tag_name is absent; never write
# that (or an empty value) into the Dockerfiles.
if [ -z "$NVBANDWIDTH_TAG" ] || [ "$NVBANDWIDTH_TAG" = "null" ]; then
  echo "Failed to resolve the latest nvbandwidth release tag" >&2
  exit 1
fi
# Pin the resolved tag in every Dockerfile that declares NVBANDWIDTH_VERSION.
find . -type f -name Dockerfile -exec sed -i "s/NVBANDWIDTH_VERSION=.*/NVBANDWIDTH_VERSION=$NVBANDWIDTH_TAG/g" {} +
2223

23-
find . -type f -name Dockerfile -exec sed -i "s/CUDA_MINOR_VERSION=.*/CUDA_MINOR_VERSION=$CUDA_MINOR_VERSION/g" {} +
24-
find . -type f -name Dockerfile -exec sed -i "s/CUDA_MAJOR_VERSION=.*/CUDA_MAJOR_VERSION=$CUDA_MAJOR_VERSION/g" {} +

test/cases/nvidia/manifests/job-unit-test-single-node.yaml

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,6 @@ spec:
2828
imagePullPolicy: Always
2929
resources:
3030
limits:
31-
cpu: "4"
32-
memory: 4Gi
3331
nvidia.com/gpu: {{.GpuPerNode}}
3432
requests:
3533
cpu: "1"

test/images/nvidia-inference/Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
# Base image, arguments, and environment
33
###############################################################################
44
ARG CUDA_MAJOR_VERSION=12
5-
ARG CUDA_MINOR_VERSION=9
5+
ARG CUDA_MINOR_VERSION=8
66

77
FROM nvidia/cuda:$CUDA_MAJOR_VERSION.$CUDA_MINOR_VERSION.0-devel-ubuntu22.04
88

test/images/nvidia-training/Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
ARG CUDA_MAJOR_VERSION=12
2-
ARG CUDA_MINOR_VERSION=9
2+
ARG CUDA_MINOR_VERSION=8
33

44
# Use the NVIDIA CUDA runtime as a parent image
55
FROM nvidia/cuda:$CUDA_MAJOR_VERSION.$CUDA_MINOR_VERSION.0-devel-ubuntu22.04

test/images/nvidia/Dockerfile

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
ARG CUDA_MAJOR_VERSION=12
2-
ARG CUDA_MINOR_VERSION=9
2+
ARG CUDA_MINOR_VERSION=8
33

44
# Start with the NVIDIA CUDA base image
55
FROM nvidia/cuda:$CUDA_MAJOR_VERSION.$CUDA_MINOR_VERSION.0-devel-ubuntu22.04
@@ -41,6 +41,9 @@ RUN apt update -y \
4141

4242
RUN ldconfig
4343

44+
ENV LD_LIBRARY_PATH /opt/amazon/openmpi/lib:/opt/amazon/efa/lib:/opt/aws-ofi-nccl/install/lib:/usr/local/cuda/lib:/usr/local/lib/:/usr/lib64:/usr/lib/x86_64-linux-gnu/:/usr/lib/aarch64-linux-gnu/:$LD_LIBRARY_PATH
45+
ENV PATH /usr/local/cuda/bin:/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/usr/sbin:/usr/bin:/usr/local/bin:$PATH
46+
4447
RUN mkdir -p /var/run/sshd \
4548
&& sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config \
4649
&& echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config \
@@ -53,19 +56,21 @@ RUN git clone https://github.com/NVIDIA/cuda-samples.git /tmp/cuda-samples \
5356
&& cd /tmp/cuda-samples/Samples/1_Utilities/deviceQuery && cmake . && make -j$(nproc) && cp deviceQuery /usr/bin \
5457
&& cd && rm -rf /tmp/cuda-samples
5558

56-
# Build nvbandwidth
57-
# TODO: see https://github.com/NVIDIA/nvbandwidth
58-
59-
ENV LD_LIBRARY_PATH /opt/amazon/openmpi/lib:/opt/amazon/efa/lib:/opt/aws-ofi-nccl/install/lib:/usr/local/cuda/lib:/usr/local/lib/:/usr/lib64:/usr/lib/x86_64-linux-gnu/:/usr/lib/aarch64-linux-gnu/:$LD_LIBRARY_PATH
60-
ENV PATH /usr/local/cuda/bin:/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/usr/sbin:/usr/bin:/usr/local/bin:$PATH
61-
6259
# Install EFA
6360
ARG EFA_INSTALLER_VERSION=latest
6461
RUN curl -sL https://efa-installer.amazonaws.com/aws-efa-installer-$EFA_INSTALLER_VERSION.tar.gz | tar xvz -C /tmp \
6562
&& cd /tmp/aws-efa-installer \
6663
&& ./efa_installer.sh --yes --enable-gdr --skip-kmod --skip-limit-conf --no-verify --mpi openmpi4 \
6764
&& cd && rm -rf /tmp/aws-efa-installer
6865

66+
# Build nvbandwidth
67+
ARG NVBANDWIDTH_VERSION=v0.8
68+
RUN apt install -y libboost-program-options-dev
69+
# Clone the pinned nvbandwidth release, build it with multinode support,
# install the binary into /usr/bin, then remove the checkout so the source
# and build artifacts are not kept in this layer.
RUN git clone https://github.com/NVIDIA/nvbandwidth.git --branch $NVBANDWIDTH_VERSION /tmp/nvbandwidth \
    && cd /tmp/nvbandwidth \
    && cmake -DMULTINODE=1 . && make && cp nvbandwidth /usr/bin \
    && cd && rm -rf /tmp/nvbandwidth
73+
6974
# Install NCCL
7075
ARG LIBNCCL_VERSION=2.27.5-1
7176
RUN git clone https://github.com/NVIDIA/nccl.git --branch v$LIBNCCL_VERSION /tmp/nccl \
@@ -90,6 +95,7 @@ RUN curl -sL https://github.com/aws/aws-ofi-nccl/releases/download/v$AWS_OFI_NCC
9095
&& cd && rm -rf /tmp/aws-ofi-nccl-$AWS_OFI_NCCL_VERSION
9196

9297
# Install NCCL Tests
98+
# TODO: automate pin with version bump
9399
RUN git clone https://github.com/NVIDIA/nccl-tests /tmp/nccl-tests \
94100
&& cd /tmp/nccl-tests \
95101
&& make \

test/images/nvidia/gpu_unit_tests/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ Create PR with the new `tests/test_sysinfo.sh.data/xxx`
4141
Usually this is a side effect of system misconfiguration (the driver or fabric manager is not loaded)
4242
- test_01_device_query
4343
- test_02_vector_add
44+
- test_03_nvbandwidth
4445
- test_04_dcgm_diagnostics
4546

4647

test/images/nvidia/gpu_unit_tests/tests/test_basic.sh

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,11 @@ test_02_vector_add()
2424
assert_status_code 0 "$DEMO_SUITE_DIR/vectorAdd"
2525
}
2626

27+
# Runs the nvbandwidth binary with no arguments and asserts that it exits
# with status code 0.
# NOTE(review): the nvidia image build copies nvbandwidth into /usr/bin —
# confirm that DEMO_SUITE_DIR resolves to a directory containing the binary.
test_03_nvbandwidth()
28+
{
29+
assert_status_code 0 "$DEMO_SUITE_DIR/nvbandwidth"
30+
}
31+
2732
test_04_dcgm_diagnostics()
2833
{
2934
# https://docs.nvidia.com/datacenter/dcgm/latest/user-guide/dcgm-diagnostics.html#run-levels-and-tests

0 commit comments

Comments
 (0)