@@ -52,9 +52,10 @@ zypper --non-interactive --no-gpg-checks ref

zypper --gpg-auto-import-keys install -y rocm

# Installing MPICH 3.4.2
# Installing MPICH 3.4a2
# Information about the version of MPICH to use
export MPICH_VERSION=3.4.2
# see "module show cray-mpich/8.1.31" for the ANL MPICH version that Cray MPICH is based on
export MPICH_VERSION=3.4a2
export MPICH_URL="http://www.mpich.org/static/downloads/$MPICH_VERSION/mpich-$MPICH_VERSION.tar.gz"
export MPICH_DIR=/opt/mpich

@@ -0,0 +1,81 @@
Bootstrap: docker
From: opensuse/leap:15.6

%environment
# Point to MPICH binaries, libraries, and man pages
export MPICH_DIR=/opt/mpich
export PATH="$MPICH_DIR/bin:$PATH"
export LD_LIBRARY_PATH="$MPICH_DIR/lib:$LD_LIBRARY_PATH"
export MANPATH=$MPICH_DIR/share/man:$MANPATH
# Point to rocm locations
export ROCM_PATH=/opt/rocm
export LD_LIBRARY_PATH="/opt/rocm/lib:/opt/rocm/lib64:$LD_LIBRARY_PATH"
export PATH="/opt/rocm/bin:$PATH"

%files
ping_pong_gpu_aware.cpp /ping_pong_gpu_aware.cpp

%post
# By default you start at the root '/' location in the container image's filesystem
ls

# Installing some prerequisites
zypper install -y wget sudo gzip gcc-c++ gcc-fortran gcc14-c++ gcc14-fortran tar make autoconf automake binutils cpp glibc-devel m4 makeinfo zlib-devel gcc-info git glibc-info patch pkg-config which

# installing rocm 6.4.2 (see docs: https://rocm.docs.amd.com/en/latest/deploy/linux/installer/install.html)
## prereqs for rocm
zypper --non-interactive --no-gpg-checks addrepo https://download.opensuse.org/repositories/devel:/languages:/perl/15.6/devel:languages:perl.repo
zypper --non-interactive --no-gpg-checks addrepo https://download.opensuse.org/repositories/Education/15.6/Education.repo
zypper --non-interactive --no-gpg-checks addrepo https://download.opensuse.org/repositories/science/SLE_15_SP5/science.repo

ver=6.4.2
tee /etc/zypp/repos.d/amdgpu.repo <<EOF
[amdgpu]
name=amdgpu
baseurl=https://repo.radeon.com/amdgpu/$ver/sle/15.6/main/x86_64
enabled=1
gpgcheck=1
gpgkey=https://repo.radeon.com/rocm/rocm.gpg.key
EOF
zypper --non-interactive --no-gpg-checks ref

tee --append /etc/zypp/repos.d/rocm.repo <<EOF
[ROCm-$ver]
name=ROCm$ver
baseurl=https://repo.radeon.com/rocm/zyp/$ver/main
enabled=1
gpgcheck=1
gpgkey=https://repo.radeon.com/rocm/rocm.gpg.key
EOF
zypper --non-interactive --no-gpg-checks ref

zypper --gpg-auto-import-keys install -y rocm
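
# Optional sanity check (a sketch, not required by the recipe) that the rocm metapackage
# landed under /opt/rocm. hipcc is used later in this %post section; rocminfo needs a GPU,
# so let it fail harmlessly at build time.
/opt/rocm/bin/hipcc --version || true
/opt/rocm/bin/rocminfo || true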

# Installing MPICH 4.1.2
# Information about the version of MPICH to use
# see "module show cray-mpich/9.0.2" for the ANL MPICH version that Cray MPICH is based on
export MPICH_VERSION=4.1.2
export MPICH_URL="http://www.mpich.org/static/downloads/$MPICH_VERSION/mpich-$MPICH_VERSION.tar.gz"
export MPICH_DIR=/opt/mpich
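
# As noted above, MPICH_VERSION should track the ANL MPICH release that the host's Cray MPICH
# module is based on. A quick way to inspect that module on a Frontier login node (sketch; the
# version here mirrors the comment above, adjust it to the cray-mpich-abi module you load at run time):
#   module show cray-mpich/9.0.2 2>&1 | grep -i mpich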

echo "Installing MPICH..."
mkdir -p /mpich
mkdir -p /opt
# Download
cd /mpich && wget -O mpich-$MPICH_VERSION.tar.gz $MPICH_URL && tar --no-same-owner -xzf mpich-$MPICH_VERSION.tar.gz
# Compile and install
cd /mpich/mpich-$MPICH_VERSION && ./configure --with-device=ch4:ofi --prefix=$MPICH_DIR && make install
rm -rf /mpich
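
# Optional: confirm the MPICH install before moving on (mpichversion is installed in
# $MPICH_DIR/bin alongside mpicc)
$MPICH_DIR/bin/mpichversion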

# build ping_pong_gpu_aware.cpp
cd /
which hipcc
echo $PATH
export PATH="$MPICH_DIR/bin:/opt/rocm/bin:$PATH"
export ROCM_PATH=/opt/rocm

# ping pong GPU aware MPI
hipcc -c ping_pong_gpu_aware.cpp -I${MPICH_DIR}/include
mpicc -o ping_pong_gpu_aware.exe ping_pong_gpu_aware.o -L${ROCM_PATH}/lib -lamdhip64
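
# Optional check (a sketch) that the executable picked up both the HIP runtime and the MPICH
# built above. Some libraries may show as "not found" here, since LD_LIBRARY_PATH is only set
# in %environment, which does not apply during %post.
ldd ping_pong_gpu_aware.exe | grep -E "amdhip64|mpi" || true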

@@ -7,12 +7,15 @@
#SBATCH -e logs/%x_%j.out

module reset
module load cpe/24.11
# Loading rocm/6.2.4 since that is what is installed in the container
module load rocm/6.2.4

# The cray-mpich-abi module contains the libmpi.so that the MPI application in
# the container (that wasn't compiled with a Cray compiler) will look for.
module load cray-mpich-abi/8.1.31

module list

# This is set to turn on GPU aware MPI
export MPICH_GPU_SUPPORT_ENABLED=1
@@ -58,5 +61,5 @@ export APPTAINERENV_LD_PRELOAD=$CRAY_MPICH_ROOTDIR/gtl/lib/libmpi_gtl_hsa.so.0:
# The program passes data allocated on GPU memory of increasing size back and forth between
# two MPI processes across two nodes.
# The --rocm flag is required to support AMD GPUs inside the container.
srun -N2 --tasks-per-node 1 --gpus-per-task=1 apptainer exec --workdir `pwd` --rocm opensusempich342rocm624.sif /ping_pong_gpu_aware.exe
srun -N2 --tasks-per-node 1 --gpus-per-task=1 apptainer --silent exec --workdir `pwd` --rocm opensusempich342rocm624.sif /ping_pong_gpu_aware.exe

@@ -0,0 +1,65 @@
#!/bin/bash
#SBATCH -t00:10:00
#SBATCH -Astf007
#SBATCH -N2
#SBATCH -J frontier_gpu_aware_mpi_example
#SBATCH -o logs/%x_%j.out
#SBATCH -e logs/%x_%j.out

module reset
module load cpe/25.09
# Loading rocm/6.4.2 since that is what is installed in the container
module load rocm/6.4.2

# The cray-mpich-abi module contains the libmpi.so that the MPI application in
# the container (that wasn't compiled with a Cray compiler) will look for.
module load cray-mpich-abi/9.0.1

module list

# This is set to turn on GPU aware MPI
export MPICH_GPU_SUPPORT_ENABLED=1

# We are setting this as a convenient alternative to passing these paths to apptainer's --bind flag.
# What this does is mount the given directories from the host into the container so that they are
# visible within the container. Any application inside the container is then able to reach the files in these locations
# on the host. We are binding these specific locations as they are required for the Cray MPICH
# libraries to be visible inside the container and for MPI to work with good performance. The
# application in the container will be able to see and link to the Cray MPICH libraries instead of
# using the MPICH libraries we install in the container.
export APPTAINER_BINDPATH=/usr/share/libdrm,/var/spool/slurmd,/opt/cray,${PWD}
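
# Note: setting APPTAINER_BINDPATH is equivalent to passing the same comma-separated list
# to the --bind flag on the command line, e.g. (sketch with the same paths as above):
#   apptainer exec --bind /usr/share/libdrm,/var/spool/slurmd,/opt/cray,${PWD} <image> <command>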

# What APPTAINERENV_* does is set that environment variable inside the container environment.
# For example, if you `export APPTAINERENV_BLAH=foo` before you do an `apptainer (run|exec|shell)`,
# within your container the environment variable BLAH=foo will be set.
# So here, we are setting the value of LD_LIBRARY_PATH inside the container to include the Cray
# MPICH locations (which we have access to because we mount /opt/cray into the container, see
# the APPTAINER_BINDPATH environment variable), the host's LD_LIBRARY_PATH and CRAY_LD_LIBRARY_PATH contents,
# and the container's (not host's) rocm location. This will replace any preexisting LD_LIBRARY_PATH
# that might've been defined as part of the container build, so make sure to explicitly include
# any paths from within the container's file tree that you want available to the application
# in the container.
export APPTAINERENV_LD_LIBRARY_PATH="$CRAY_MPICH_ROOTDIR/gtl/lib:/opt/rocm/lib:/opt/rocm/lib64:$CRAY_LD_LIBRARY_PATH:$LD_LIBRARY_PATH:/opt/cray/pe/lib64" #:/opt/cray/xpmem/2.10.6-1.2_gfaa90a94be64/lib64"
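
# As a concrete sketch of the APPTAINERENV_* mechanism described above (image name taken
# from the srun line at the bottom of this script):
#   export APPTAINERENV_BLAH=foo
#   apptainer exec opensusempich412rocm642.sif sh -c 'echo $BLAH'   # prints "foo"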

# APPTAINER_CONTAINLIBS allows you to specify individual libraries from the host that you want
# visible inside the container and added to the LD_LIBRARY_PATH inside the container.
# In this GPU aware MPI example, some specific libraries from the host's /usr/lib64 need to
# be visible inside the container for the Cray MPICH libraries (that we are making visible
# inside the container with the earlier steps) to work, as the Cray MPICH libraries are
# linked against them.
export APPTAINER_CONTAINLIBS="/usr/lib64/libcxi.so.1,/usr/lib64/libjson-c.so.5,/lib64/libtinfo.so.6,/usr/lib64/libnl-3.so.200,/usr/lib64/libgfortran.so.5,/usr/lib64/libjansson.so.4,/usr/lib64/libxpmem.so.0"
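
# A quick pre-flight check (sketch) that every host library listed above actually exists;
# run it on the host before submitting:
#   for lib in ${APPTAINER_CONTAINLIBS//,/ }; do [ -e "$lib" ] || echo "missing: $lib"; done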

# This is required when you have an application that is compiled inside a container that
# doesn't have access to the Cray MPICH libraries during the container build process (such as our
# example opensusempich412rocm642.def). The libmpi_gtl_hsa.so is what provides GPU aware MPI,
# but the application is not linked to it because it wasn't available in the container when the
# application was being built during the container build process. So we
# need to make sure it is preloaded so that the application uses it.
export APPTAINERENV_LD_PRELOAD=$CRAY_MPICH_ROOTDIR/gtl/lib/libmpi_gtl_hsa.so.0:
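
# To verify that the GTL library really is preloaded inside the container, a quick check
# (sketch; same image and flags as the srun line below) is:
#   srun -N1 -n1 --gpus-per-task=1 apptainer exec --rocm opensusempich412rocm642.sif \
#     sh -c 'echo $LD_PRELOAD'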

# This is executing the ping_pong_gpu_aware.exe in the container with `apptainer exec`.
# The program passes data allocated on GPU memory of increasing size back and forth between
# two MPI processes across two nodes.
# The --rocm flag is required to support AMD GPUs inside the container.
srun -N2 --tasks-per-node 1 --gpus-per-task=1 apptainer --silent exec --workdir `pwd` --rocm opensusempich412rocm642.sif /ping_pong_gpu_aware.exe
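
# Submitting and following the run (sketch; assumes this script is saved as
# submit_frontier_gpu_aware.sbatch, and note that the #SBATCH -o/-e paths above require a
# logs/ directory to exist):
#   mkdir -p logs
#   sbatch submit_frontier_gpu_aware.sbatch
#   tail -f logs/frontier_gpu_aware_mpi_example_<jobid>.out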