# CUDA toolkit version; both parts are substituted into the base image tag
# below as <major>.<minor>.0.
ARG CUDA_MAJOR_VERSION=12
ARG CUDA_MINOR_VERSION=8

# Start with the NVIDIA CUDA base image
FROM nvidia/cuda:${CUDA_MAJOR_VERSION}.${CUDA_MINOR_VERSION}.0-devel-ubuntu22.04
@@ -41,6 +41,9 @@ RUN apt update -y \
4141
4242RUN ldconfig
4343
# Prepend the Open MPI, EFA, aws-ofi-nccl and CUDA directories (plus the listed
# system directories) to the inherited library and executable search paths.
# Uses the key=value form; the space-separated `ENV key value` form is legacy
# syntax that BuildKit reports as LegacyKeyValueFormat.
ENV LD_LIBRARY_PATH=/opt/amazon/openmpi/lib:/opt/amazon/efa/lib:/opt/aws-ofi-nccl/install/lib:/usr/local/cuda/lib:/usr/local/lib/:/usr/lib64:/usr/lib/x86_64-linux-gnu/:/usr/lib/aarch64-linux-gnu/:$LD_LIBRARY_PATH
ENV PATH=/usr/local/cuda/bin:/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/usr/sbin:/usr/bin:/usr/local/bin:$PATH
46+
4447RUN mkdir -p /var/run/sshd \
4548 && sed -i 's/[ #]\( .*StrictHostKeyChecking \) .*/ \1 no/g' /etc/ssh/ssh_config \
4649 && echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config \
@@ -53,19 +56,21 @@ RUN git clone https://github.com/NVIDIA/cuda-samples.git /tmp/cuda-samples \
5356 && cd /tmp/cuda-samples/Samples/1_Utilities/deviceQuery && cmake . && make -j$(nproc) && cp deviceQuery /usr/bin \
5457 && cd && rm -rf /tmp/cuda-samples
5558
56- # Build nvbandwidth
57- # TODO: see https://github.com/NVIDIA/nvbandwidth
58-
59- ENV LD_LIBRARY_PATH /opt/amazon/openmpi/lib:/opt/amazon/efa/lib:/opt/aws-ofi-nccl/install/lib:/usr/local/cuda/lib:/usr/local/lib/:/usr/lib64:/usr/lib/x86_64-linux-gnu/:/usr/lib/aarch64-linux-gnu/:$LD_LIBRARY_PATH
60- ENV PATH /usr/local/cuda/bin:/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/usr/sbin:/usr/bin:/usr/local/bin:$PATH
61-
# Install EFA
# Streams the aws-efa-installer tarball for EFA_INSTALLER_VERSION into tar,
# unpacks it under /tmp, runs the installer from the unpacked directory, and
# deletes that directory in the same layer.
ARG EFA_INSTALLER_VERSION=latest
RUN curl --silent --location "https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz" \
        | tar --extract --verbose --gzip --directory /tmp \
    && cd /tmp/aws-efa-installer \
    && ./efa_installer.sh --yes --enable-gdr --skip-kmod --skip-limit-conf --no-verify --mpi openmpi4 \
    && cd \
    && rm -rf /tmp/aws-efa-installer
6865
# Build nvbandwidth
# NOTE(review): this step sits after the EFA install, presumably because the
# MULTINODE build uses the Open MPI that step installs (--mpi openmpi4) — confirm.
ARG NVBANDWIDTH_VERSION=v0.8

# Refresh the package index in the same layer as the install so a cached, stale
# index from an earlier layer cannot break it. apt-get is used instead of apt,
# whose CLI is intended for interactive use.
# NOTE(review): package lists are deliberately not removed here; steps outside
# this view may still rely on them — confirm before adding cleanup.
RUN apt-get update -y \
    && apt-get install -y libboost-program-options-dev

# Clone the ref named by NVBANDWIDTH_VERSION, configure with MULTINODE enabled,
# build in parallel (as the deviceQuery build does), copy the binary to
# /usr/bin, then delete the checkout in the same layer so it does not persist
# in the image. The cleanup previously removed /tmp/cuda-samples by mistake.
RUN git clone https://github.com/NVIDIA/nvbandwidth.git --branch "$NVBANDWIDTH_VERSION" /tmp/nvbandwidth \
    && cd /tmp/nvbandwidth \
    && cmake -DMULTINODE=1 . \
    && make -j"$(nproc)" \
    && cp nvbandwidth /usr/bin \
    && cd \
    && rm -rf /tmp/nvbandwidth
73+
6974# Install NCCL
7075ARG LIBNCCL_VERSION=2.27.5-1
7176RUN git clone https://github.com/NVIDIA/nccl.git --branch v$LIBNCCL_VERSION /tmp/nccl \
@@ -90,6 +95,7 @@ RUN curl -sL https://github.com/aws/aws-ofi-nccl/releases/download/v$AWS_OFI_NCC
9095 && cd && rm -rf /tmp/aws-ofi-nccl-$AWS_OFI_NCCL_VERSION
9196
9297# Install NCCL Tests
98+ # TODO: automate pin with version bump
9399RUN git clone https://github.com/NVIDIA/nccl-tests /tmp/nccl-tests \
94100 && cd /tmp/nccl-tests \
95101 && make \
0 commit comments