Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 2 additions & 3 deletions common/nxdt_requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ hydra-core>=1.3.0
omegaconf>=2.2,<2.3
pyyaml==6.0.1
torchmetrics>=0.4.1rc0,<=0.10.3
transformers==4.52.4
transformers==4.56.*
wandb
webdataset>=0.1.48,<=0.1.62
pandas
Expand All @@ -22,7 +22,7 @@ ftfy
gdown
inflect
jieba
opencc==1.1.6
opencc==1.1.9
pangu
rapidfuzz
pybind11
Expand All @@ -39,7 +39,6 @@ python-daemon
huggingface_hub>=0.27.1
multiprocess==0.70.16
numba<=0.60.0
numpy>=1.24.3,<=1.25.2
rouge_score
setuptools>=70.0
lightning==2.5.0
Expand Down
224 changes: 224 additions & 0 deletions jax/training/0.7/Dockerfile.neuronx
Original file line number Diff line number Diff line change
@@ -0,0 +1,224 @@
ARG BUILD_STAGE=prod

FROM public.ecr.aws/docker/library/ubuntu:24.04 AS base

LABEL dlc_major_version="1"
LABEL maintainer="Amazon AI"

# This arg required to stop docker build waiting for region configuration while installing tz data from ubuntu 24
ARG DEBIAN_FRONTEND=noninteractive
ARG PYTHON=python3.12
ARG PYTHON_VERSION=3.12.11
ARG PIP=pip3
ARG OMPI_VERSION=4.1.5
ARG PYPI_SIMPLE_URL="https://pypi.org/simple/"

# Python won’t try to write .pyc or .pyo files on the import of source modules
# Force stdin, stdout and stderr to be totally unbuffered. Good for logging
ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1
ENV PYTHONIOENCODING=UTF-8
ENV LANG=C.UTF-8
ENV LC_ALL=C.UTF-8
ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/aws/neuron/lib"
ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/amazon/efa/lib"
ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/amazon/efa/lib64"
ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/amazon/openmpi/lib64"
ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/lib"
ENV PATH="/opt/aws/neuron/bin:${PATH}"

RUN apt-get update \
&& apt-get upgrade -y \
&& apt-get install -y --no-install-recommends \
build-essential \
ca-certificates \
cmake \
curl \
emacs \
git \
gnupg2 \
gpg-agent \
jq \
libopencv-dev \
libglib2.0-0 \
libgl1-mesa-dri \
libsm6 \
libxext6 \
libxrender-dev \
libssl-dev \
libsqlite3-dev \
libgdbm-dev \
libc6-dev \
libbz2-dev \
libncurses-dev \
libffi-dev \
libcap-dev \
libhwloc-dev \
openjdk-8-jdk-headless \
openjdk-8-jdk \
openjdk-8-jre \
openjdk-11-jdk \
openssl \
software-properties-common \
tk-dev \
unzip \
wget \
vim \
zlib1g-dev \
&& rm -rf /var/lib/apt/lists/* \
&& rm -rf /tmp/tmp* \
&& apt-get clean

# Install Open MPI and configure SSH for MPI operator in k8s
RUN mkdir -p /tmp/openmpi \
&& cd /tmp/openmpi \
&& wget --quiet https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-${OMPI_VERSION}.tar.gz \
&& tar zxf openmpi-${OMPI_VERSION}.tar.gz \
&& cd openmpi-${OMPI_VERSION} \
&& ./configure --enable-orterun-prefix-by-default \
&& make -j $(nproc) all \
&& make install \
&& ldconfig \
&& rm -rf /tmp/openmpi

# Install packages and configure SSH for MPI operator in k8s
RUN apt-get update \
&& apt-get install -y openmpi-bin openssh-server \
&& mkdir -p /var/run/sshd \
&& echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config \
&& echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config \
&& sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config \
&& rm -rf /var/lib/apt/lists/* \
&& rm -rf /tmp/tmp* \
&& apt-get clean

# Install Python
RUN wget -q https://www.python.org/ftp/python/$PYTHON_VERSION/Python-$PYTHON_VERSION.tgz \
&& tar -xzf Python-$PYTHON_VERSION.tgz \
&& cd Python-$PYTHON_VERSION \
&& ./configure --enable-shared --prefix=/usr/local \
&& make -j $(nproc) && make install \
&& cd .. && rm -rf ../Python-$PYTHON_VERSION* \
&& ln -s /usr/local/bin/pip3 /usr/bin/pip \
&& ln -s /usr/local/bin/$PYTHON /usr/local/bin/python \
&& ${PIP} --no-cache-dir install --upgrade \
"awscli<2" \
pip \
requests \
setuptools \
&& rm -rf ~/.cache/pip/*

# U24 will not allow installation of pip packages outside of venv without this flag
# This is because U24 ships with Python 3.12 by default and installation into the Python
# interpreter’s directory are disabled outside of a virtual environment.
# https://peps.python.org/pep-0668/
RUN ${PIP} config set global.break-system-packages true

# Install EFA
RUN apt-get update \
&& cd $HOME \
&& curl -O https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz \
&& wget https://efa-installer.amazonaws.com/aws-efa-installer.key && gpg --import aws-efa-installer.key \
&& cat aws-efa-installer.key | gpg --fingerprint \
&& wget https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz.sig && gpg --verify ./aws-efa-installer-latest.tar.gz.sig \
&& tar -xf aws-efa-installer-latest.tar.gz \
&& cd aws-efa-installer \
&& ./efa_installer.sh -y -g --skip-kmod --skip-limit-conf --no-verify \
&& cd $HOME \
&& rm -rf /var/lib/apt/lists/* \
&& rm -rf /tmp/tmp* \
&& apt-get clean

WORKDIR /

# The ENV variables declared below are changed in the previous section
# Grouping these ENV variables in the first section causes
# ompi_info to fail. This is only observed in CPU containers
ENV PATH="$PATH:/home/.openmpi/bin"
ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/home/.openmpi/lib/"
RUN ompi_info --parsable --all | grep mpi_built_with_cuda_support:value

RUN mkdir -p /etc/pki/tls/certs && cp /etc/ssl/certs/ca-certificates.crt /etc/pki/tls/certs/ca-bundle.crt

# Copy workaround script for incorrect hostname
COPY changehostname.c /
COPY --chmod=755 start_with_right_hostname.sh deep_learning_container.py /usr/local/bin/

RUN HOME_DIR=/root \
&& curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \
&& unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \
&& cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \
&& chmod +x /usr/local/bin/testOSSCompliance \
&& chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \
&& ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \
&& rm -rf ${HOME_DIR}/oss_compliance* \
&& rm -rf /tmp/tmp*

# Setting up APT and PIP repo for neuron artifacts
ARG NEURON_APT_REPO=apt.repos.neuron.amazonaws.com
ARG NEURON_APT_REPO_KEY
ARG NEURON_PIP_REPO=pip.repos.neuron.amazonaws.com
ARG NEURON_PIP_REPO_KEY
RUN mkdir -p /etc/apt/keyrings \
&& APT_REPO_PREFIX=$([ -n "${NEURON_APT_REPO_KEY}" ] && echo "${NEURON_APT_REPO_KEY}@" || echo "") \
&& echo "deb [signed-by=/etc/apt/keyrings/neuron.gpg] https://${APT_REPO_PREFIX}${NEURON_APT_REPO} jammy main" > /etc/apt/sources.list.d/neuron.list \
&& curl $([ -n "${NEURON_APT_REPO_KEY}" ] && echo "-u ${NEURON_APT_REPO_KEY}") --retry 3 --retry-delay 1 --retry-all-errors -fSL "https://${NEURON_APT_REPO}/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB" | gpg --dearmor > /etc/apt/keyrings/neuron.gpg

# Neuron SDK components version numbers
ARG NEURONX_RUNTIME_LIB_VERSION=2.28.23.0-dd5879008
ARG NEURONX_COLLECTIVES_LIB_VERSION=2.28.27.0-bc30ece58
ARG NEURONX_TOOLS_VERSION=2.26.14.0
ARG NEURONX_CC_VERSION=2.22.7534.0+c05a3358
ARG NEURONX_JAX_TRAINING_VERSION=0.7.0.1.0.7377+5e6a4049

FROM base AS repo

# Install Neuron components from the apt and pip repos (latest versions)
RUN apt-get update \
&& apt-get install -y \
aws-neuronx-tools \
aws-neuronx-collectives \
aws-neuronx-runtime-lib \
&& rm -rf /var/lib/apt/lists/* \
&& rm -rf /tmp/tmp* \
&& apt-get clean

RUN PIP_REPO_URL=$([ -n "${NEURON_PIP_REPO_KEY}" ] && echo "https://${NEURON_PIP_REPO_KEY}@${NEURON_PIP_REPO}" || echo "https://${NEURON_PIP_REPO}") \
&& ${PIP} install --no-cache-dir --force-reinstall \
--index-url ${PIP_REPO_URL} \
--extra-index-url ${PYPI_SIMPLE_URL} \
--trusted-host ${NEURON_PIP_REPO} \
"neuronx-cc>=2.0" \
jax-neuronx \
&& rm -rf ~/.cache/pip/*

FROM base AS prod

# Install Neuron components
# Install Neuron Driver, Runtime and Tools
RUN apt-get update \
&& apt-get install -y \
aws-neuronx-tools=$NEURONX_TOOLS_VERSION \
aws-neuronx-collectives=$NEURONX_COLLECTIVES_LIB_VERSION \
aws-neuronx-runtime-lib=$NEURONX_RUNTIME_LIB_VERSION \
&& rm -rf /var/lib/apt/lists/* \
&& rm -rf /tmp/tmp* \
&& apt-get clean

# Install JAX & Neuron CC
RUN PIP_REPO_URL=$([ -n "${NEURON_PIP_REPO_KEY}" ] && echo "https://${NEURON_PIP_REPO_KEY}@${NEURON_PIP_REPO}" || echo "https://${NEURON_PIP_REPO}") \
&& ${PIP} install --no-cache-dir --force-reinstall \
--index-url ${PIP_REPO_URL} \
--trusted-host ${NEURON_PIP_REPO} \
--extra-index-url ${PYPI_SIMPLE_URL} \
neuronx-cc==$NEURONX_CC_VERSION \
jax-neuronx==$NEURONX_JAX_TRAINING_VERSION \
&& rm -rf ~/.cache/pip/*

FROM ${BUILD_STAGE} AS final

# Starts framework
ENTRYPOINT ["bash", "-m", "start_with_right_hostname.sh"]
CMD ["/bin/bash"]

HEALTHCHECK CMD curl --fail http://localhost:8080/ping || exit 1
Loading
Loading