diff --git a/common/nxdt_requirements.txt b/common/nxdt_requirements.txt
index 4f40ab0..148b452 100644
--- a/common/nxdt_requirements.txt
+++ b/common/nxdt_requirements.txt
@@ -2,7 +2,7 @@ hydra-core>=1.3.0
 omegaconf>=2.2,<2.3
 pyyaml==6.0.1
 torchmetrics>=0.4.1rc0,<=0.10.3
-transformers==4.52.4
+transformers==4.56.*
 wandb
 webdataset>=0.1.48,<=0.1.62
 pandas
@@ -22,7 +22,7 @@ ftfy
 gdown
 inflect
 jieba
-opencc==1.1.6
+opencc==1.1.9
 pangu
 rapidfuzz
 pybind11
@@ -39,7 +39,6 @@ python-daemon
 huggingface_hub>=0.27.1
 multiprocess==0.70.16
 numba<=0.60.0
-numpy>=1.24.3,<=1.25.2
 rouge_score
 setuptools>=70.0
 lightning==2.5.0
diff --git a/jax/training/0.7/Dockerfile.neuronx b/jax/training/0.7/Dockerfile.neuronx
new file mode 100644
index 0000000..6fa25cf
--- /dev/null
+++ b/jax/training/0.7/Dockerfile.neuronx
@@ -0,0 +1,224 @@
+ARG BUILD_STAGE=prod
+
+FROM public.ecr.aws/docker/library/ubuntu:24.04 AS base
+
+LABEL dlc_major_version="1"
+LABEL maintainer="Amazon AI"
+
+# This arg is required to stop docker build from waiting for region configuration while installing tz data from ubuntu 24
+ARG DEBIAN_FRONTEND=noninteractive
+ARG PYTHON=python3.12
+ARG PYTHON_VERSION=3.12.11
+ARG PIP=pip3
+ARG OMPI_VERSION=4.1.5
+ARG PYPI_SIMPLE_URL="https://pypi.org/simple/"
+
+# Python won't try to write .pyc or .pyo files on the import of source modules
+# Force stdin, stdout and stderr to be totally unbuffered. Good for logging
+ENV PYTHONDONTWRITEBYTECODE=1
+ENV PYTHONUNBUFFERED=1
+ENV PYTHONIOENCODING=UTF-8
+ENV LANG=C.UTF-8
+ENV LC_ALL=C.UTF-8
+ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/aws/neuron/lib"
+ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/amazon/efa/lib"
+ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/amazon/efa/lib64"
+ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/amazon/openmpi/lib64"
+ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/lib"
+ENV PATH="/opt/aws/neuron/bin:${PATH}"
+
+RUN apt-get update \
+    && apt-get upgrade -y \
+    && apt-get install -y --no-install-recommends \
+    build-essential \
+    ca-certificates \
+    cmake \
+    curl \
+    emacs \
+    git \
+    gnupg2 \
+    gpg-agent \
+    jq \
+    libopencv-dev \
+    libglib2.0-0 \
+    libgl1-mesa-dri \
+    libsm6 \
+    libxext6 \
+    libxrender-dev \
+    libssl-dev \
+    libsqlite3-dev \
+    libgdbm-dev \
+    libc6-dev \
+    libbz2-dev \
+    libncurses-dev \
+    libffi-dev \
+    libcap-dev \
+    libhwloc-dev \
+    openjdk-8-jdk-headless \
+    openjdk-8-jdk \
+    openjdk-8-jre \
+    openjdk-11-jdk \
+    openssl \
+    software-properties-common \
+    tk-dev \
+    unzip \
+    wget \
+    vim \
+    zlib1g-dev \
+    && rm -rf /var/lib/apt/lists/* \
+    && rm -rf /tmp/tmp* \
+    && apt-get clean
+
+# Install Open MPI and configure SSH for MPI operator in k8s
+RUN mkdir -p /tmp/openmpi \
+    && cd /tmp/openmpi \
+    && wget --quiet https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-${OMPI_VERSION}.tar.gz \
+    && tar zxf openmpi-${OMPI_VERSION}.tar.gz \
+    && cd openmpi-${OMPI_VERSION} \
+    && ./configure --enable-orterun-prefix-by-default \
+    && make -j $(nproc) all \
+    && make install \
+    && ldconfig \
+    && rm -rf /tmp/openmpi
+
+# Install packages and configure SSH for MPI operator in k8s
+RUN apt-get update \
+    && apt-get install -y openmpi-bin openssh-server \
+    && mkdir -p /var/run/sshd \
+    && echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config \
+    && echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config \
+    && sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config \
+    && rm -rf /var/lib/apt/lists/* \
+    && rm -rf /tmp/tmp* \
+    && apt-get clean
+
+# Install Python
+RUN wget -q https://www.python.org/ftp/python/$PYTHON_VERSION/Python-$PYTHON_VERSION.tgz \
+    && tar -xzf Python-$PYTHON_VERSION.tgz \
+    && cd Python-$PYTHON_VERSION \
+    && ./configure --enable-shared --prefix=/usr/local \
+    && make -j $(nproc) && make install \
+    && cd .. && rm -rf ../Python-$PYTHON_VERSION* \
+    && ln -s /usr/local/bin/pip3 /usr/bin/pip \
+    && ln -s /usr/local/bin/$PYTHON /usr/local/bin/python \
+    && ${PIP} --no-cache-dir install --upgrade \
+    "awscli<2" \
+    pip \
+    requests \
+    setuptools \
+    && rm -rf ~/.cache/pip/*
+
+# U24 will not allow installation of pip packages outside of venv without this flag
+# This is because U24 ships with Python 3.12 by default and installation into the Python
+# interpreter's directory is disabled outside of a virtual environment.
+# https://peps.python.org/pep-0668/
+RUN ${PIP} config set global.break-system-packages true
+
+# Install EFA
+RUN apt-get update \
+    && cd $HOME \
+    && curl -O https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz \
+    && wget https://efa-installer.amazonaws.com/aws-efa-installer.key && gpg --import aws-efa-installer.key \
+    && cat aws-efa-installer.key | gpg --fingerprint \
+    && wget https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz.sig && gpg --verify ./aws-efa-installer-latest.tar.gz.sig \
+    && tar -xf aws-efa-installer-latest.tar.gz \
+    && cd aws-efa-installer \
+    && ./efa_installer.sh -y -g --skip-kmod --skip-limit-conf --no-verify \
+    && cd $HOME \
+    && rm -rf /var/lib/apt/lists/* \
+    && rm -rf /tmp/tmp* \
+    && apt-get clean
+
+WORKDIR /
+
+# The ENV variables declared below are changed in the previous section
+# Grouping these ENV variables in the first section causes
+# ompi_info to fail. This is only observed in CPU containers
+ENV PATH="$PATH:/home/.openmpi/bin"
+ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/home/.openmpi/lib/"
+RUN ompi_info --parsable --all | grep mpi_built_with_cuda_support:value
+
+RUN mkdir -p /etc/pki/tls/certs && cp /etc/ssl/certs/ca-certificates.crt /etc/pki/tls/certs/ca-bundle.crt
+
+# Copy workaround script for incorrect hostname
+COPY changehostname.c /
+COPY --chmod=755 start_with_right_hostname.sh deep_learning_container.py /usr/local/bin/
+
+RUN HOME_DIR=/root \
+    && curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \
+    && unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \
+    && cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \
+    && chmod +x /usr/local/bin/testOSSCompliance \
+    && chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \
+    && ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \
+    && rm -rf ${HOME_DIR}/oss_compliance* \
+    && rm -rf /tmp/tmp*
+
+# Setting up APT and PIP repo for neuron artifacts
+ARG NEURON_APT_REPO=apt.repos.neuron.amazonaws.com
+ARG NEURON_APT_REPO_KEY
+ARG NEURON_PIP_REPO=pip.repos.neuron.amazonaws.com
+ARG NEURON_PIP_REPO_KEY
+RUN mkdir -p /etc/apt/keyrings \
+    && APT_REPO_PREFIX=$([ -n "${NEURON_APT_REPO_KEY}" ] && echo "${NEURON_APT_REPO_KEY}@" || echo "") \
+    && echo "deb [signed-by=/etc/apt/keyrings/neuron.gpg] https://${APT_REPO_PREFIX}${NEURON_APT_REPO} jammy main" > /etc/apt/sources.list.d/neuron.list \
+    && curl $([ -n "${NEURON_APT_REPO_KEY}" ] && echo "-u ${NEURON_APT_REPO_KEY}") --retry 3 --retry-delay 1 --retry-all-errors -fSL "https://${NEURON_APT_REPO}/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB" | gpg --dearmor > /etc/apt/keyrings/neuron.gpg
+
+# Neuron SDK components version numbers
+ARG NEURONX_RUNTIME_LIB_VERSION=2.28.23.0-dd5879008
+ARG NEURONX_COLLECTIVES_LIB_VERSION=2.28.27.0-bc30ece58
+ARG NEURONX_TOOLS_VERSION=2.26.14.0
+ARG NEURONX_CC_VERSION=2.22.7534.0+c05a3358
+ARG NEURONX_JAX_TRAINING_VERSION=0.7.0.1.0.7377+5e6a4049
+
+FROM base AS repo
+
+# Install Neuron components from the apt and pip repos (latest versions)
+RUN apt-get update \
+    && apt-get install -y \
+    aws-neuronx-tools \
+    aws-neuronx-collectives \
+    aws-neuronx-runtime-lib \
+    && rm -rf /var/lib/apt/lists/* \
+    && rm -rf /tmp/tmp* \
+    && apt-get clean
+
+RUN PIP_REPO_URL=$([ -n "${NEURON_PIP_REPO_KEY}" ] && echo "https://${NEURON_PIP_REPO_KEY}@${NEURON_PIP_REPO}" || echo "https://${NEURON_PIP_REPO}") \
+    && ${PIP} install --no-cache-dir --force-reinstall \
+    --index-url ${PIP_REPO_URL} \
+    --extra-index-url ${PYPI_SIMPLE_URL} \
+    --trusted-host ${NEURON_PIP_REPO} \
+    "neuronx-cc>=2.0" \
+    jax-neuronx \
+    && rm -rf ~/.cache/pip/*
+
+FROM base AS prod
+
+# Install Neuron components
+# Install Neuron Runtime, Collectives and Tools
+RUN apt-get update \
+    && apt-get install -y \
+    aws-neuronx-tools=$NEURONX_TOOLS_VERSION \
+    aws-neuronx-collectives=$NEURONX_COLLECTIVES_LIB_VERSION \
+    aws-neuronx-runtime-lib=$NEURONX_RUNTIME_LIB_VERSION \
+    && rm -rf /var/lib/apt/lists/* \
+    && rm -rf /tmp/tmp* \
+    && apt-get clean
+
+# Install JAX & Neuron CC
+RUN PIP_REPO_URL=$([ -n "${NEURON_PIP_REPO_KEY}" ] && echo "https://${NEURON_PIP_REPO_KEY}@${NEURON_PIP_REPO}" || echo "https://${NEURON_PIP_REPO}") \
+    && ${PIP} install --no-cache-dir --force-reinstall \
+    --index-url ${PIP_REPO_URL} \
+    --trusted-host ${NEURON_PIP_REPO} \
+    --extra-index-url ${PYPI_SIMPLE_URL} \
+    neuronx-cc==$NEURONX_CC_VERSION \
+    jax-neuronx==$NEURONX_JAX_TRAINING_VERSION \
+    && rm -rf ~/.cache/pip/*
+
+FROM ${BUILD_STAGE} AS final
+
+# Starts framework
+ENTRYPOINT ["bash", "-m", "start_with_right_hostname.sh"]
+CMD ["/bin/bash"]
+
+HEALTHCHECK CMD curl --fail http://localhost:8080/ping || exit 1
diff --git a/pytorch/inference/2.9.0/Dockerfile.neuronx b/pytorch/inference/2.9.0/Dockerfile.neuronx
new file mode 100644
index 0000000..18aea79
--- /dev/null
+++ b/pytorch/inference/2.9.0/Dockerfile.neuronx
@@ -0,0 +1,242 @@
+ARG BUILD_STAGE=prod
+
+FROM public.ecr.aws/docker/library/ubuntu:24.04 AS base
+
+LABEL dlc_major_version="1"
+LABEL maintainer="Amazon AI"
+LABEL com.amazonaws.sagemaker.capabilities.accept-bind-to-port=true
+
+ARG DEBIAN_FRONTEND=noninteractive
+ARG PIP=pip3
+ARG PYTHON=python3.12
+ARG PYTHON_VERSION=3.12.11
+ARG TORCHSERVE_VERSION=0.11.0
+ARG SM_TOOLKIT_VERSION=2.0.25
+ARG MINIFORGE_VERSION=25.3.1-0
+ARG PYPI_SIMPLE_URL="https://pypi.org/simple/"
+
+# See http://bugs.python.org/issue19846
+ENV LANG=C.UTF-8
+ENV LD_LIBRARY_PATH=/opt/aws/neuron/lib:/lib/x86_64-linux-gnu:/opt/conda/lib/:$LD_LIBRARY_PATH
+ENV PATH=/opt/conda/bin:/opt/aws/neuron/bin:$PATH
+
+RUN apt-get update \
+    && apt-get upgrade -y \
+    && apt-get install -y --no-install-recommends \
+    apt-transport-https \
+    build-essential \
+    ca-certificates \
+    cmake \
+    curl \
+    dmidecode \
+    emacs \
+    environment-modules \
+    ethtool \
+    git \
+    gnupg2 \
+    gpg-agent \
+    iproute2 \
+    jq \
+    libevent-core-2.1-7t64 \
+    libevent-pthreads-2.1-7t64 \
+    libgl1-mesa-dri \
+    libglib2.0-0 \
+    libnl-3-200 \
+    libnl-3-dev \
+    libnl-route-3-200 \
+    libnl-route-3-dev \
+    libsm6 \
+    libxext6 \
+    libxrender-dev \
+    libcap-dev \
+    libhwloc-dev \
+    openjdk-11-jdk \
+    openssh-client \
+    pciutils \
+    tcl \
+    udev \
+    unzip \
+    vim \
+    wget \
+    zlib1g-dev \
+    && rm -rf /var/lib/apt/lists/* \
+    && rm -rf /tmp/tmp* \
+    && apt-get clean
+
+# https://github.com/docker-library/openjdk/issues/261 https://github.com/docker-library/openjdk/pull/263/files
+RUN keytool -importkeystore -srckeystore /etc/ssl/certs/java/cacerts -destkeystore /etc/ssl/certs/java/cacerts.jks -deststoretype JKS -srcstorepass changeit -deststorepass changeit -noprompt; \
+    mv /etc/ssl/certs/java/cacerts.jks /etc/ssl/certs/java/cacerts; \
+    /var/lib/dpkg/info/ca-certificates-java.postinst configure;
+
+RUN curl -L -o ~/miniforge.sh https://github.com/conda-forge/miniforge/releases/download/${MINIFORGE_VERSION}/Miniforge3-${MINIFORGE_VERSION}-Linux-x86_64.sh \
+    && chmod +x ~/miniforge.sh \
+    && ~/miniforge.sh -b -p /opt/conda \
+    && rm ~/miniforge.sh \
+    && /opt/conda/bin/conda update -y conda \
+    && /opt/conda/bin/mamba install -c conda-forge -y \
+    python=$PYTHON_VERSION \
+    pyopenssl \
+    cython \
+    mkl-include \
+    mkl \
+    parso \
+    typing \
+    # Below 2 are included in miniconda base, but not mamba so need to install
+    conda-content-trust \
+    charset-normalizer \
+    && /opt/conda/bin/conda clean -ya
+
+RUN ${PIP} config set global.break-system-packages true
+
+RUN /opt/conda/bin/mamba install -c conda-forge \
+    python=$PYTHON_VERSION \
+    scikit-learn \
+    h5py \
+    requests \
+    && conda clean -ya \
+    && ${PIP} install --upgrade pip --no-cache-dir -U \
+    --trusted-host pypi.org --trusted-host files.pythonhosted.org \
+    && ln -s /opt/conda/bin/pip /usr/local/bin/pip3 \
+    && ${PIP} install --no-cache-dir -U \
+    packaging \
+    enum-compat \
+    ipython \
+    && rm -rf ~/.cache/pip/*
+
+# Install EFA
+RUN apt-get update \
+    && cd $HOME \
+    && curl -O https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz \
+    && wget https://efa-installer.amazonaws.com/aws-efa-installer.key && gpg --import aws-efa-installer.key \
+    && cat aws-efa-installer.key | gpg --fingerprint \
+    && wget https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz.sig && gpg --verify ./aws-efa-installer-latest.tar.gz.sig \
+    && tar -xf aws-efa-installer-latest.tar.gz \
+    && cd aws-efa-installer \
+    && ./efa_installer.sh -y -g --skip-kmod --skip-limit-conf --no-verify \
+    && cd $HOME \
+    && rm -rf /var/lib/apt/lists/* \
+    && rm -rf /tmp/tmp* \
+    && apt-get clean
+
+RUN ${PIP} install --upgrade pip --trusted-host pypi.org --trusted-host files.pythonhosted.org \
+    && ${PIP} install --no-cache-dir -U \
+    opencv-python>=4.8.1.78 \
+    "scipy>=1.8.0" \
+    six \
+    "awscli<2" \
+    pandas==1.* \
+    boto3 \
+    cryptography \
+    "protobuf>=3.18.3,<4" \
+    torchserve==${TORCHSERVE_VERSION} \
+    torch-model-archiver==${TORCHSERVE_VERSION} \
+    && rm -rf ~/.cache/pip/*
+
+ENV SAGEMAKER_SERVING_MODULE=sagemaker_pytorch_serving_container.serving:main
+ENV TEMP=/home/model-server/tmp
+
+RUN useradd -m model-server \
+    && mkdir -p /home/model-server/tmp /opt/ml/model \
+    && chown -R model-server /home/model-server /opt/ml/model
+
+COPY --chmod=755 neuron-entrypoint.py /usr/local/bin/dockerd-entrypoint.py
+COPY --chmod=755 neuron-monitor.sh deep_learning_container.py /usr/local/bin/
+COPY --chmod=755 torchserve-neuron.sh /usr/local/bin/entrypoint.sh
+COPY config.properties /home/model-server
+
+RUN ${PIP} install --no-cache-dir "sagemaker-pytorch-inference==${SM_TOOLKIT_VERSION}" \
+    # patch default_pytorch_inference_handler.py to import torch_neuronx
+    && DEST_DIR=$(python -c "import os.path, sagemaker_pytorch_serving_container; print(os.path.dirname(sagemaker_pytorch_serving_container.__file__))") \
+    && DEST_FILE=${DEST_DIR}/default_pytorch_inference_handler.py \
+    && sed -i "s/import torch/import torch, torch_neuronx/" ${DEST_FILE} \
+    && rm -rf ~/.cache/pip/*
+
+# Compliance
+RUN HOME_DIR=/root \
+    && curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \
+    && unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \
+    && cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \
+    && chmod +x /usr/local/bin/testOSSCompliance \
+    && chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \
+    && ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \
+    && rm -rf ${HOME_DIR}/oss_compliance* \
+    # conda leaves an empty /root/.cache/conda/notices.cache file which is not removed by conda clean -ya
+    && rm -rf ${HOME_DIR}/.cache/conda
+
+RUN curl -o /license.txt https://aws-dlc-licenses.s3.amazonaws.com/pytorch-2.9/license.txt
+
+# Setting up APT and PIP repo for neuron artifacts
+ARG NEURON_APT_REPO=apt.repos.neuron.amazonaws.com
+ARG NEURON_APT_REPO_KEY
+ARG NEURON_PIP_REPO=pip.repos.neuron.amazonaws.com
+ARG NEURON_PIP_REPO_KEY
+RUN mkdir -p /etc/apt/keyrings \
+    && APT_REPO_PREFIX=$([ -n "${NEURON_APT_REPO_KEY}" ] && echo "${NEURON_APT_REPO_KEY}@" || echo "") \
+    && echo "deb [signed-by=/etc/apt/keyrings/neuron.gpg] https://${APT_REPO_PREFIX}${NEURON_APT_REPO} jammy main" > /etc/apt/sources.list.d/neuron.list \
+    && curl $([ -n "${NEURON_APT_REPO_KEY}" ] && echo "-u ${NEURON_APT_REPO_KEY}") --retry 3 --retry-delay 1 --retry-all-errors -fSL "https://${NEURON_APT_REPO}/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB" | gpg --dearmor > /etc/apt/keyrings/neuron.gpg
+
+# Neuron SDK components version numbers
+ARG NEURONX_COLLECTIVES_LIB_VERSION=2.28.27.0-bc30ece58
+ARG NEURONX_RUNTIME_LIB_VERSION=2.28.23.0-dd5879008
+ARG NEURONX_TOOLS_VERSION=2.26.14.0
+
+ARG NEURONX_CC_VERSION=2.22.7534.0+c05a3358
+ARG NEURONX_FRAMEWORK_VERSION=2.9.0.2.11.18533+7b57219b
+ARG NEURONX_DISTRIBUTED_VERSION=0.15.25852+4660bc97
+ARG NEURONX_DISTRIBUTED_INFERENCE_VERSION=0.6.13291+f42296f9
+
+FROM base AS repo
+
+
+# Install Neuron components from the apt and pip repos (latest versions)
+RUN apt-get update \
+    && apt-get install -y \
+    aws-neuronx-tools \
+    aws-neuronx-collectives \
+    aws-neuronx-runtime-lib \
+    && rm -rf /var/lib/apt/lists/* \
+    && rm -rf /tmp/tmp* \
+    && apt-get clean
+
+RUN PIP_REPO_URL=$([ -n "${NEURON_PIP_REPO_KEY}" ] && echo "https://${NEURON_PIP_REPO_KEY}@${NEURON_PIP_REPO}" || echo "https://${NEURON_PIP_REPO}") \
+    && ${PIP} install --no-cache-dir \
+    --index-url ${PIP_REPO_URL} \
+    --trusted-host ${NEURON_PIP_REPO} \
+    --extra-index-url ${PYPI_SIMPLE_URL} \
+    "neuronx-cc>=2.0" \
+    "torch-neuronx==2.9.*" \
+    neuronx_distributed \
+    neuronx_distributed_inference \
+    && rm -rf ~/.cache/pip/*
+
+FROM base AS prod
+
+# Install Neuron components with specific versions
+RUN apt-get update \
+    && apt-get install -y \
+    aws-neuronx-tools=$NEURONX_TOOLS_VERSION \
+    aws-neuronx-collectives=$NEURONX_COLLECTIVES_LIB_VERSION \
+    aws-neuronx-runtime-lib=$NEURONX_RUNTIME_LIB_VERSION \
+    && rm -rf /var/lib/apt/lists/* \
+    && rm -rf /tmp/tmp* \
+    && apt-get clean
+
+RUN PIP_REPO_URL=$([ -n "${NEURON_PIP_REPO_KEY}" ] && echo "https://${NEURON_PIP_REPO_KEY}@${NEURON_PIP_REPO}" || echo "https://${NEURON_PIP_REPO}") \
+    && ${PIP} install --no-cache-dir \
+    --index-url ${PIP_REPO_URL} \
+    --trusted-host ${NEURON_PIP_REPO} \
+    --extra-index-url ${PYPI_SIMPLE_URL} \
+    neuronx-cc==$NEURONX_CC_VERSION \
+    torch-neuronx==$NEURONX_FRAMEWORK_VERSION \
+    neuronx_distributed==$NEURONX_DISTRIBUTED_VERSION \
+    neuronx_distributed_inference==$NEURONX_DISTRIBUTED_INFERENCE_VERSION \
+    && rm -rf ~/.cache/pip/*
+
+FROM ${BUILD_STAGE} AS final
+
+EXPOSE 8080 8081
+
+ENTRYPOINT ["python", "/usr/local/bin/dockerd-entrypoint.py"]
+CMD ["/usr/local/bin/entrypoint.sh"]
+
+HEALTHCHECK CMD curl --fail http://localhost:8080/ping || exit 1
\ No newline at end of file
diff --git a/pytorch/training/2.9.0/Dockerfile.neuronx b/pytorch/training/2.9.0/Dockerfile.neuronx
new file mode 100644
index 0000000..f7ca056
--- /dev/null
+++ b/pytorch/training/2.9.0/Dockerfile.neuronx
@@ -0,0 +1,290 @@
+ARG BUILD_STAGE=prod
+
+FROM public.ecr.aws/docker/library/ubuntu:24.04 AS base
+
+LABEL maintainer="Amazon AI"
+LABEL dlc_major_version="1"
+
+ARG PYTHON=python3.12
+ARG PYTHON_VERSION=3.12.11
+ARG PIP=pip3
+ARG OMPI_VERSION=4.1.5
+ARG PYPI_SIMPLE_URL="https://pypi.org/simple/"
+
+# This arg is required to stop docker build from waiting for region configuration while installing tz data from ubuntu 24
+ARG DEBIAN_FRONTEND=noninteractive
+
+# Python won't try to write .pyc or .pyo files on the import of source modules
+# Force stdin, stdout and stderr to be totally unbuffered. Good for logging
+ENV PYTHONDONTWRITEBYTECODE=1
+ENV PYTHONUNBUFFERED=1
+ENV PYTHONIOENCODING=UTF-8
+ENV LANG=C.UTF-8
+ENV LC_ALL=C.UTF-8
+ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/aws/neuron/lib"
+ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/amazon/efa/lib"
+ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/amazon/efa/lib64"
+ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/amazon/openmpi/lib64"
+ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/lib"
+ENV PATH="/opt/aws/neuron/bin:${PATH}"
+ENV SAGEMAKER_TRAINING_MODULE=sagemaker_pytorch_container.training:main
+ENV DGLBACKEND=pytorch
+
+RUN apt-get update \
+    && apt-get upgrade -y \
+    && apt-get install -y --no-install-recommends \
+    build-essential \
+    ca-certificates \
+    cmake \
+    curl \
+    emacs \
+    git \
+    gnupg2 \
+    gpg-agent \
+    jq \
+    libopencv-dev \
+    libglib2.0-0 \
+    libgl1-mesa-dri \
+    libsm6 \
+    libxext6 \
+    libxrender-dev \
+    libssl-dev \
+    libsqlite3-dev \
+    libgdbm-dev \
+    libc6-dev \
+    libbz2-dev \
+    libncurses-dev \
+    libffi-dev \
+    libcap-dev \
+    libhwloc-dev \
+    openjdk-8-jdk-headless \
+    openjdk-8-jdk \
+    openjdk-8-jre \
+    openjdk-11-jdk \
+    openssl \
+    software-properties-common \
+    tk-dev \
+    unzip \
+    wget \
+    vim \
+    zlib1g-dev \
+    && rm -rf /tmp/tmp* \
+    && rm -rf /var/lib/apt/lists/* \
+    && apt-get clean
+
+
+# Install Open MPI
+RUN mkdir -p /tmp/openmpi \
+    && cd /tmp/openmpi \
+    && wget --quiet https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-${OMPI_VERSION}.tar.gz \
+    && tar zxf openmpi-${OMPI_VERSION}.tar.gz \
+    && cd openmpi-${OMPI_VERSION} \
+    && ./configure --enable-orterun-prefix-by-default \
+    && make -j $(nproc) all \
+    && make install \
+    && ldconfig \
+    && rm -rf /tmp/openmpi
+
+# Install packages and configure SSH for MPI in K8s
+RUN apt-get update && apt-get install -y openmpi-bin openssh-server \
+    && mkdir -p /var/run/sshd \
+    && echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config \
+    && echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config \
+    && sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config \
+    && rm -rf /var/lib/apt/lists/* \
+    && rm -rf /tmp/tmp* \
+    && apt-get clean
+
+
+# Install Python
+RUN wget -q https://www.python.org/ftp/python/$PYTHON_VERSION/Python-$PYTHON_VERSION.tgz \
+    && tar -xzf Python-$PYTHON_VERSION.tgz \
+    && cd Python-$PYTHON_VERSION \
+    && ./configure --enable-shared --prefix=/usr/local \
+    && make -j $(nproc) && make install \
+    && cd .. && rm -rf ../Python-$PYTHON_VERSION* \
+    && ln -s /usr/local/bin/pip3 /usr/bin/pip \
+    && ln -s /usr/local/bin/$PYTHON /usr/local/bin/python \
+    && ${PIP} --no-cache-dir install --upgrade pip \
+    && rm -rf ~/.cache/pip/*
+
+WORKDIR /
+
+# The ENV variables declared below are changed in the previous section
+# Grouping these ENV variables in the first section causes
+# ompi_info to fail. This is only observed in CPU containers
+ENV PATH="$PATH:/home/.openmpi/bin"
+ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/home/.openmpi/lib/"
+RUN ompi_info --parsable --all | grep mpi_built_with_cuda_support:value
+
+RUN ${PIP} install --no-cache-dir -U \
+    "bokeh>=2.3,<3" \
+    "awscli<2" \
+    scipy \
+    click \
+    "cryptography" \
+    "sagemaker>=2,<3" \
+    "sagemaker-pytorch-training" \
+    psutil==5.6.7 \
+    dataset \
+    Pillow \
+    && rm -rf ~/.cache/pip/*
+
+RUN mkdir -p /etc/pki/tls/certs && cp /etc/ssl/certs/ca-certificates.crt /etc/pki/tls/certs/ca-bundle.crt
+
+# Copy the NxDT Installation files
+COPY --chmod=755 apex_setup.py nxdt_install_setup.sh nxdt_requirements.txt /root/
+
+# attrs, neuronx-cc required: >=19.2.0, sagemaker <24,>=23.1.0
+# protobuf neuronx-cc<4, sagemaker-training >=3.9.2,<=3.20.3
+# awscli 1.25.47 has requirement docutils<0.17,>=0.10
+# etcd for kubernetes installation
+# awscli 1.27.127 has requirement rsa<4.8,>=3.1.2, but you have rsa 4.9.
+# awscli 1.27.127 requires urllib3 < 1.27, python-etcd requires urllib3 >= 1.7, latest urllib3 release is 2.0.2
+RUN ${PIP} install --no-cache-dir -U \
+    "attrs<24,>=23.1.0" \
+    "docutils>=0.10,<0.17" \
+    "rsa<4.8,>=3.1.2" \
+    "python-etcd" \
+    "urllib3>=1.26.0,<1.27" \
+    # Install extra packages needed by sagemaker (for passing test_utility_packages_using_import)
+    && ${PIP} install --no-cache-dir -U \
+    "bokeh>=3.0.1,<4" \
+    "imageio>=2.22,<3" \
+    "opencv-python>=4.8.1.78" \
+    "plotly>=5.11,<6" \
+    "seaborn>=0.12,<1" \
+    "shap>=0.41,<1" \
+    && rm -rf ~/.cache/pip/*
+
+# EFA Installer does apt get. Make sure to run apt update before that
+RUN apt-get update \
+    && cd $HOME \
+    && curl -O https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz \
+    && wget https://efa-installer.amazonaws.com/aws-efa-installer.key && gpg --import aws-efa-installer.key \
+    && cat aws-efa-installer.key | gpg --fingerprint \
+    && wget https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz.sig && gpg --verify ./aws-efa-installer-latest.tar.gz.sig \
+    && tar -xf aws-efa-installer-latest.tar.gz \
+    && cd aws-efa-installer \
+    && ./efa_installer.sh -y -g --skip-kmod --skip-limit-conf --no-verify \
+    && cd $HOME \
+    && rm -rf /var/lib/apt/lists/* \
+    && rm -rf /tmp/tmp* \
+    && apt-get clean
+
+# Install some common packages used by training scripts
+# torchvision is needed for MLP. Since it depends on torch, and torch-neuronx/torch
+# is already installed, install it with --no-deps
+RUN pip3 install --no-cache-dir --no-deps -U \
+    torchvision==0.24.0 \
+    # Needed for running bert training scripts
+    && pip3 install --no-cache-dir -U \
+    graphviz \
+    tensorboard==2.6 \
+    accelerate \
+    # Install NxDT dependencies
+    && ${PIP} install --no-cache-dir \
+    Cython \
+    wheel \
+    && rm -rf ~/.cache/pip/*
+
+# Copy workaround script for incorrect hostname
+COPY changehostname.c /
+COPY --chmod=755 start_with_right_hostname.sh deep_learning_container.py /usr/local/bin/
+
+RUN HOME_DIR=/root \
+    && curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \
+    && unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \
+    && cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \
+    && chmod +x /usr/local/bin/testOSSCompliance \
+    && chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \
+    && ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \
+    && rm -rf ${HOME_DIR}/oss_compliance* \
+    && rm -rf /tmp/tmp*
+
+RUN curl -o /license.txt https://aws-dlc-licenses.s3.amazonaws.com/pytorch-2.9/license.txt
+
+# Setting up APT and PIP repo for neuron artifacts
+ARG NEURON_APT_REPO=apt.repos.neuron.amazonaws.com
+ARG NEURON_APT_REPO_KEY
+ARG NEURON_PIP_REPO=pip.repos.neuron.amazonaws.com
+ARG NEURON_PIP_REPO_KEY
+RUN mkdir -p /etc/apt/keyrings \
+    && APT_REPO_PREFIX=$([ -n "${NEURON_APT_REPO_KEY}" ] && echo "${NEURON_APT_REPO_KEY}@" || echo "") \
+    && echo "deb [signed-by=/etc/apt/keyrings/neuron.gpg] https://${APT_REPO_PREFIX}${NEURON_APT_REPO} jammy main" > /etc/apt/sources.list.d/neuron.list \
+    && curl $([ -n "${NEURON_APT_REPO_KEY}" ] && echo "-u ${NEURON_APT_REPO_KEY}") --retry 3 --retry-delay 1 --retry-all-errors -fSL "https://${NEURON_APT_REPO}/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB" | gpg --dearmor > /etc/apt/keyrings/neuron.gpg
+
+# Neuron SDK components
+ARG NEURONX_COLLECTIVES_LIB_VERSION=2.28.27.0-bc30ece58
+ARG NEURONX_RUNTIME_LIB_VERSION=2.28.23.0-dd5879008
+ARG NEURONX_TOOLS_VERSION=2.26.14.0
+ARG NEURONX_FRAMEWORK_VERSION=2.9.0.2.11.18533+7b57219b
+ARG NEURONX_CC_VERSION=2.22.7534.0+c05a3358
+ARG NEURONX_DISTRIBUTED_VERSION=0.15.25852+4660bc97
+ARG NEURONX_DISTRIBUTED_TRAINING_VERSION=1.6.0
+
+FROM base AS repo
+
+# Install Neuron components from the apt and pip repos (latest versions)
+RUN apt-get update \
+    && apt-get install -y \
+    aws-neuronx-tools \
+    aws-neuronx-collectives \
+    aws-neuronx-runtime-lib \
+    && rm -rf /var/lib/apt/lists/* \
+    && rm -rf /tmp/tmp* \
+    && apt-get clean
+
+RUN PIP_REPO_URL=$([ -n "${NEURON_PIP_REPO_KEY}" ] && echo "https://${NEURON_PIP_REPO_KEY}@${NEURON_PIP_REPO}" || echo "https://${NEURON_PIP_REPO}") \
+    && ${PIP} install --no-cache-dir --force-reinstall \
+    --index-url ${PIP_REPO_URL} \
+    --trusted-host ${NEURON_PIP_REPO} \
+    --extra-index-url ${PYPI_SIMPLE_URL} \
+    "torch-neuronx==2.9.*" \
+    "neuronx-cc>=2.0" \
+    neuronx_distributed \
+    neuronx_distributed_training \
+    && rm -rf ~/.cache/pip/*
+
+FROM base AS prod
+
+# Install Neuron components with specific versions
+RUN apt-get update \
+    && apt-get install -y \
+    aws-neuronx-tools=$NEURONX_TOOLS_VERSION \
+    aws-neuronx-collectives=$NEURONX_COLLECTIVES_LIB_VERSION \
+    aws-neuronx-runtime-lib=$NEURONX_RUNTIME_LIB_VERSION \
+    && rm -rf /var/lib/apt/lists/* \
+    && rm -rf /tmp/tmp* \
+    && apt-get clean
+
+RUN PIP_REPO_URL=$([ -n "${NEURON_PIP_REPO_KEY}" ] && echo "https://${NEURON_PIP_REPO_KEY}@${NEURON_PIP_REPO}" || echo "https://${NEURON_PIP_REPO}") \
+    && ${PIP} install --no-cache-dir --force-reinstall \
+    --index-url ${PIP_REPO_URL} \
+    --trusted-host ${NEURON_PIP_REPO} \
+    --extra-index-url ${PYPI_SIMPLE_URL} \
+    torch-neuronx==$NEURONX_FRAMEWORK_VERSION \
+    neuronx-cc==$NEURONX_CC_VERSION \
+    neuronx_distributed==$NEURONX_DISTRIBUTED_VERSION \
+    neuronx_distributed_training==$NEURONX_DISTRIBUTED_TRAINING_VERSION \
+    && rm -rf ~/.cache/pip/*
+
+FROM ${BUILD_STAGE} AS final
+
+## Installation for Neuronx Distributed Training framework
+# Clone and build Apex
+RUN git clone https://github.com/NVIDIA/apex.git /root/apex \
+    && cd /root/apex \
+    && git checkout 23.05 \
+    && cp /root/apex_setup.py setup.py \
+    # Install dependencies from requirements and extras for the SageMaker use case
+    && ${PIP} install --no-cache-dir --no-build-isolation -r /root/nxdt_requirements.txt /root/apex \
+    && /root/nxdt_install_setup.sh \
+    && ${PIP} install --force-reinstall "torch==2.9.0" \
+    && rm -rf ~/.cache/pip/*
+
+# Starts framework
+ENTRYPOINT ["bash", "-m", "start_with_right_hostname.sh"]
+CMD ["/bin/bash"]
+
+HEALTHCHECK CMD curl --fail http://localhost:8080/ping || exit 1
\ No newline at end of file
diff --git a/vllm/inference/0.11.0/Dockerfile.neuronx b/vllm/inference/0.11.0/Dockerfile.neuronx
new file mode 100644
index 0000000..2f0d679
--- /dev/null
+++ b/vllm/inference/0.11.0/Dockerfile.neuronx
@@ -0,0 +1,243 @@
+ARG BUILD_STAGE=prod
+
+FROM public.ecr.aws/docker/library/ubuntu:24.04 AS base
+
+LABEL dlc_major_version="1"
+LABEL maintainer="Amazon AI"
+
+ARG DEBIAN_FRONTEND=noninteractive
+ARG PIP=pip3
+ARG PYTHON=python3.12
+ARG PYTHON_VERSION=3.12.11
+ARG MINIFORGE_VERSION=25.3.1-0
+ARG TORCHSERVE_VERSION=0.11.0
+ARG PYPI_SIMPLE_URL="https://pypi.org/simple/"
+
+
+# See http://bugs.python.org/issue19846
+ENV LANG=C.UTF-8
+ENV LD_LIBRARY_PATH=/opt/aws/neuron/lib:/lib/x86_64-linux-gnu:/opt/conda/lib/:$LD_LIBRARY_PATH
+ENV PATH=/opt/conda/bin:/opt/aws/neuron/bin:$PATH
+
+RUN apt-get update \
+    && apt-get upgrade -y \
+    && apt-get install -y --no-install-recommends \
+    apt-transport-https \
+    build-essential \
+    ca-certificates \
+    cmake \
+    curl \
+    emacs \
+    ffmpeg \
+    gcc \
+    git \
+    gnupg2 \
+    gpg-agent \
+    jq \
+    libgl1 \
+    libgl1-mesa-dri \
+    libglib2.0-0 \
+    libsm6 \
+    libxext6 \
+    libxrender-dev \
+    libcap-dev \
+    libhwloc-dev \
+    openssh-client \
+    openjdk-11-jdk \
+    unzip \
+    vim \
+    wget \
+    zlib1g-dev \
+    && rm -rf /var/lib/apt/lists/* \
+    && rm -rf /tmp/tmp* \
+    && apt-get clean
+
+
+# https://github.com/docker-library/openjdk/issues/261 https://github.com/docker-library/openjdk/pull/263/files
+RUN keytool -importkeystore -srckeystore /etc/ssl/certs/java/cacerts -destkeystore /etc/ssl/certs/java/cacerts.jks -deststoretype JKS -srcstorepass changeit -deststorepass changeit -noprompt; \
+    mv /etc/ssl/certs/java/cacerts.jks /etc/ssl/certs/java/cacerts; \
+    /var/lib/dpkg/info/ca-certificates-java.postinst configure;
+
+RUN curl -L -o ~/miniforge.sh https://github.com/conda-forge/miniforge/releases/download/${MINIFORGE_VERSION}/Miniforge3-${MINIFORGE_VERSION}-Linux-x86_64.sh \
+    && chmod +x ~/miniforge.sh \
+    && ~/miniforge.sh -b -p /opt/conda \
+    && rm ~/miniforge.sh \
+    && /opt/conda/bin/conda update -y conda \
+    && /opt/conda/bin/mamba install -c conda-forge -y \
+    python=$PYTHON_VERSION \
+    pyopenssl \
+    cython \
+    mkl-include \
+    mkl \
+    parso \
+    typing \
+    # Below 2 are included in miniconda base, but not mamba so need to install
+    conda-content-trust \
+    charset-normalizer \
+    && /opt/conda/bin/conda clean -ya
+
+RUN /opt/conda/bin/mamba install -c conda-forge \
+    python=$PYTHON_VERSION \
+    scikit-learn \
+    h5py \
+    requests \
+    && conda clean -ya \
+    && pip install --upgrade pip \
+    --trusted-host pypi.org --trusted-host files.pythonhosted.org \
+    && ln -s /opt/conda/bin/pip /usr/local/bin/pip3 \
+    && pip install \
+    enum-compat \
+    ipython \
+    && rm -rf ~/.cache/pip/*
+
+# Install EFA
+RUN apt-get update \
+    && cd $HOME \
+    && curl -O https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz \
+    && wget https://efa-installer.amazonaws.com/aws-efa-installer.key && gpg --import aws-efa-installer.key \
+    && cat aws-efa-installer.key | gpg --fingerprint \
+    && wget https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz.sig && gpg --verify ./aws-efa-installer-latest.tar.gz.sig \
+    && tar -xf aws-efa-installer-latest.tar.gz \
+    && cd aws-efa-installer \
+    && ./efa_installer.sh -y -g --skip-kmod --skip-limit-conf --no-verify \
+    && cd $HOME \
+    && rm -rf /var/lib/apt/lists/* \
+    && rm -rf /tmp/tmp* \
+    && apt-get clean
+
+COPY --chmod=755 vllm_entrypoint.py neuron-monitor.sh deep_learning_container.py /usr/local/bin/
+
+### Mount Point ###
+# When launching the container, mount the code directory to /workspace
+ARG APP_MOUNT=/workspace
+VOLUME [ ${APP_MOUNT} ]
+WORKDIR ${APP_MOUNT}/vllm
+
+RUN ${PIP} install --no-cache-dir -U \
+    "opencv-python" \
+    "awscli" \
+    "pandas" \
+    "boto3" \
+    "cryptography" \
+    "pytest" \
+    "wheel" \
+    "cmake>=3.26" \
+    "setuptools-scm>=8" \
+    "jinja2" \
+    torchserve==${TORCHSERVE_VERSION} \
+    torch-model-archiver==${TORCHSERVE_VERSION} \
+    && rm -rf ~/.cache/pip/*
+
+RUN useradd -m model-server \
+    && mkdir -p /home/model-server/tmp /opt/ml/model \
+    && chown -R model-server /home/model-server /opt/ml/model
+COPY config.properties /home/model-server
+
+# Compliance
+RUN HOME_DIR=/root \
+    && curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \
+    && unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \
+    && cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \
+    && chmod +x /usr/local/bin/testOSSCompliance \
+    && chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \
+    && ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \
+    && rm -rf ${HOME_DIR}/oss_compliance* \
+    # conda leaves an empty /root/.cache/conda/notices.cache file which is not removed by conda clean -ya
+    && rm -rf ${HOME_DIR}/.cache/conda
+
+# Setting up APT and PIP repo for neuron artifacts
+ARG NEURON_APT_REPO=apt.repos.neuron.amazonaws.com
+ARG NEURON_APT_REPO_KEY
+ARG NEURON_PIP_REPO=pip.repos.neuron.amazonaws.com
+ARG NEURON_PIP_REPO_KEY
+RUN mkdir -p /etc/apt/keyrings \
+    && APT_REPO_PREFIX=$([ -n "${NEURON_APT_REPO_KEY}" ] && echo "${NEURON_APT_REPO_KEY}@" || echo "") \
+    && echo "deb [signed-by=/etc/apt/keyrings/neuron.gpg] https://${APT_REPO_PREFIX}${NEURON_APT_REPO} jammy main" > /etc/apt/sources.list.d/neuron.list \
+    && curl $([ -n "${NEURON_APT_REPO_KEY}" ] && echo "-u ${NEURON_APT_REPO_KEY}") --retry 3 --retry-delay 1 --retry-all-errors -fSL "https://${NEURON_APT_REPO}/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB" | gpg --dearmor > /etc/apt/keyrings/neuron.gpg
+
+# Neuron SDK components version numbers
+ARG NEURONX_COLLECTIVES_LIB_VERSION=2.28.27.0-bc30ece58
+ARG NEURONX_RUNTIME_LIB_VERSION=2.28.23.0-dd5879008
+ARG NEURONX_TOOLS_VERSION=2.26.14.0
+ARG NEURONX_CC_VERSION=2.22.7534.0+c05a3358
+ARG NEURONX_FRAMEWORK_VERSION=2.9.0.2.11.18533+7b57219b
+ARG NEURONX_DISTRIBUTED_VERSION=0.15.25852+4660bc97
+ARG NEURONX_DISTRIBUTED_INFERENCE_VERSION=0.6.13291+f42296f9
+
+# vLLM branch names
+ARG VLLM_PRIVATE_BRANCH=neuron-release-2.27
+ARG VLLM_PUBLIC_BRANCH=0.2.1-lts
+
+FROM base AS vllm-clone
+
+RUN mkdir -p /root/.ssh && \
+    echo "StrictHostKeyChecking no" >> /root/.ssh/config && \
+    ssh-keyscan -t rsa github.com >> /root/.ssh/known_hosts
+
+WORKDIR /vllm
+
+RUN --mount=type=secret,id=ssh_key,target=/root/.ssh/id_ed25519,mode=0600 \
+    git clone -b ${VLLM_PRIVATE_BRANCH} git@github.com:aws-neuron/private-vllm-neuron.git .
+
+FROM base AS repo
+
+
+# Install Neuron components from the apt and pip repos (latest versions)
+RUN apt-get update \
+    && apt-get install -y \
+    aws-neuronx-tools \
+    aws-neuronx-collectives \
+    aws-neuronx-runtime-lib \
+    && rm -rf /var/lib/apt/lists/* \
+    && rm -rf /tmp/tmp* \
+    && apt-get clean
+
+# Install VLLM from source
+COPY --from=vllm-clone /vllm /opt/vllm
+
+RUN PIP_REPO_URL=$([ -n "${NEURON_PIP_REPO_KEY}" ] && echo "https://${NEURON_PIP_REPO_KEY}@${NEURON_PIP_REPO}" || echo "https://${NEURON_PIP_REPO}") \
+    && ${PIP} install --no-cache-dir \
+    --index-url ${PIP_REPO_URL} \
+    --trusted-host ${NEURON_PIP_REPO} \
+    --extra-index-url ${PYPI_SIMPLE_URL} \
+    "neuronx-cc>=2.0" \
+    "torch-neuronx==2.8.*" \
+    neuronx_distributed \
+    neuronx_distributed_inference \
+    -e /opt/vllm \
+    && rm -rf ~/.cache/pip/*
+
+FROM base AS prod
+
+# Install Neuron components with specific versions
+RUN apt-get update \
+    && apt-get install -y \
+    aws-neuronx-tools=$NEURONX_TOOLS_VERSION \
+    aws-neuronx-collectives=$NEURONX_COLLECTIVES_LIB_VERSION \
+    aws-neuronx-runtime-lib=$NEURONX_RUNTIME_LIB_VERSION \
+    && rm -rf /var/lib/apt/lists/* \
+    && rm -rf /tmp/tmp* \
+    && apt-get clean
+
+# Clone VLLM source before pip installations
+RUN git clone -b ${VLLM_PUBLIC_BRANCH} https://github.com/vllm-project/vllm-neuron.git /opt/vllm
+
+RUN PIP_REPO_URL=$([ -n "${NEURON_PIP_REPO_KEY}" ] && echo "https://${NEURON_PIP_REPO_KEY}@${NEURON_PIP_REPO}" || echo "https://${NEURON_PIP_REPO}") \
+    && ${PIP} install --no-cache-dir \
+    --index-url ${PIP_REPO_URL} \
+    --trusted-host ${NEURON_PIP_REPO} \
+    --extra-index-url ${PYPI_SIMPLE_URL} \
+    neuronx-cc==$NEURONX_CC_VERSION \
+    torch-neuronx==$NEURONX_FRAMEWORK_VERSION \
+    neuronx_distributed==$NEURONX_DISTRIBUTED_VERSION \
+    neuronx_distributed_inference==$NEURONX_DISTRIBUTED_INFERENCE_VERSION \
+    -e /opt/vllm \
+    && rm -rf ~/.cache/pip/*
+
+FROM ${BUILD_STAGE} AS final
+
+EXPOSE 8080 8081
+
+ENTRYPOINT ["python", "/usr/local/bin/vllm_entrypoint.py"]
+CMD ["/bin/bash"]
+HEALTHCHECK CMD curl --fail http://localhost:8080/ping || exit 1
\ No newline at end of file
diff --git a/vllm/inference/0.9.1/Dockerfile.neuronx b/vllm/inference/0.9.1/Dockerfile.neuronx
index f151c5b..7f76ef6 100644
--- a/vllm/inference/0.9.1/Dockerfile.neuronx
+++ b/vllm/inference/0.9.1/Dockerfile.neuronx
@@ -1,16 +1,17 @@
 ARG BUILD_STAGE=prod
 
-FROM public.ecr.aws/docker/library/ubuntu:22.04 AS base
+FROM public.ecr.aws/docker/library/ubuntu:24.04 AS base
 
 LABEL dlc_major_version="1"
 LABEL maintainer="Amazon AI"
 
 ARG DEBIAN_FRONTEND=noninteractive
 ARG PIP=pip3
-ARG PYTHON=python3.11
-ARG PYTHON_VERSION=3.11.13
+ARG PYTHON=python3.12
+ARG PYTHON_VERSION=3.12.11
 ARG MINIFORGE_VERSION=25.3.1-0
 ARG TORCHSERVE_VERSION=0.11.0
+ARG PYPI_SIMPLE_URL="https://pypi.org/simple/"
 
 
 # See http://bugs.python.org/issue19846
@@ -35,7 +36,7 @@ RUN apt-get update \
     gpg-agent \
     jq \
     libgl1 \
-    libgl1-mesa-glx \
+    libgl1-mesa-dri \
     libglib2.0-0 \
     libsm6 \
     libxext6 \
@@ -52,6 +53,7 @@ RUN apt-get update \
     && rm -rf /tmp/tmp* \
     && apt-get clean
 
+
 # https://github.com/docker-library/openjdk/issues/261 https://github.com/docker-library/openjdk/pull/263/files
 RUN keytool -importkeystore -srckeystore /etc/ssl/certs/java/cacerts -destkeystore /etc/ssl/certs/java/cacerts.jks -deststoretype JKS -srcstorepass changeit -deststorepass changeit -noprompt; \
     mv /etc/ssl/certs/java/cacerts.jks /etc/ssl/certs/java/cacerts; \
     /var/lib/dpkg/info/ca-certificates-java.postinst configure;
@@ -151,19 +153,21 @@ ARG NEURON_PIP_REPO=pip.repos.neuron.amazonaws.com
 ARG NEURON_PIP_REPO_KEY
 RUN mkdir -p /etc/apt/keyrings \
     && APT_REPO_PREFIX=$([ -n "${NEURON_APT_REPO_KEY}" ] && echo "${NEURON_APT_REPO_KEY}@" || echo "") \
-    && echo "deb [signed-by=/etc/apt/keyrings/neuron.gpg] https://${APT_REPO_PREFIX}${NEURON_APT_REPO} focal main" > /etc/apt/sources.list.d/neuron.list \
-    && curl $([ -n "${NEURON_APT_REPO_KEY}" ] && echo "-u ${NEURON_APT_REPO_KEY}") -sSL "https://${NEURON_APT_REPO}/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB" | gpg --dearmor > /etc/apt/keyrings/neuron.gpg \
-    && PIP_REPO_URL=$([ -n "${NEURON_PIP_REPO_KEY}" ] && echo "https://${NEURON_PIP_REPO_KEY}@${NEURON_PIP_REPO}" || echo "https://${NEURON_PIP_REPO}") \
-    && ${PIP} config set global.extra-index-url "${PIP_REPO_URL}"
+    && echo "deb [signed-by=/etc/apt/keyrings/neuron.gpg] https://${APT_REPO_PREFIX}${NEURON_APT_REPO} jammy main" > /etc/apt/sources.list.d/neuron.list \
+    && curl $([ -n "${NEURON_APT_REPO_KEY}" ] && echo "-u ${NEURON_APT_REPO_KEY}") --retry 3 --retry-delay 1 --retry-all-errors -fSL "https://${NEURON_APT_REPO}/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB" | gpg --dearmor > /etc/apt/keyrings/neuron.gpg
 
 # Neuron SDK components version numbers
 ARG NEURONX_COLLECTIVES_LIB_VERSION=2.28.27.0-bc30ece58
 ARG NEURONX_RUNTIME_LIB_VERSION=2.28.23.0-dd5879008
 ARG NEURONX_TOOLS_VERSION=2.26.14.0
-ARG NEURONX_CC_VERSION=2.21.33363.0+82129205
-ARG NEURONX_FRAMEWORK_VERSION=2.8.0.2.10.16998+e9bf8a50
-ARG NEURONX_DISTRIBUTED_VERSION=0.15.22404+1f27bddf
-ARG NEURONX_DISTRIBUTED_INFERENCE_VERSION=0.6.10598+a59fdc00
+ARG NEURONX_CC_VERSION=2.22.7534.0+c05a3358
+ARG NEURONX_FRAMEWORK_VERSION=2.9.0.2.11.18533+7b57219b
+ARG NEURONX_DISTRIBUTED_VERSION=0.15.25852+4660bc97
+ARG NEURONX_DISTRIBUTED_INFERENCE_VERSION=0.6.13291+f42296f9
+
+# vLLM branch names
+ARG VLLM_PRIVATE_BRANCH=2.26.1
+ARG VLLM_PUBLIC_BRANCH=2.26.1
 
 FROM base AS vllm-clone
 
@@ -174,7 +178,7 @@ RUN mkdir -p /root/.ssh && \
 WORKDIR /vllm
 
 RUN --mount=type=secret,id=ssh_key,target=/root/.ssh/id_ed25519,mode=0600 \
-    git clone -b 2.26.1 git@github.com:aws-neuron/private-neuronx-vllm-staging.git .
+    git clone -b ${VLLM_PRIVATE_BRANCH} git@github.com:aws-neuron/private-neuronx-vllm-staging.git .
 
 FROM base AS repo
 
@@ -188,19 +192,27 @@ RUN apt-get update \
     && rm -rf /tmp/tmp* \
     && apt-get clean
 
-RUN ${PIP} install --no-cache-dir \
-    neuronx-cc \
-    torch-neuronx \
-    neuronx_distributed \
-    neuronx_distributed_inference \
-    && rm -rf ~/.cache/pip/*
-
 # Install VLLM from source
 COPY --from=vllm-clone /vllm /opt/vllm
-WORKDIR /opt/vllm
 
-RUN ${PIP} install --no-cache-dir --no-deps -r requirements/neuron.txt \
-    && VLLM_TARGET_DEVICE="neuron" ${PIP} install --no-cache-dir -e .
+ARG NEURON_PUBLIC_PIP_REPO="https://pip.repos.neuron.amazonaws.com"
+
+RUN PIP_REPO_URL=$([ -n "${NEURON_PIP_REPO_KEY}" ] && echo "https://${NEURON_PIP_REPO_KEY}@${NEURON_PIP_REPO}" || echo "https://${NEURON_PIP_REPO}") \
+    && ${PIP} install --no-cache-dir \
+    --index-url ${PIP_REPO_URL} \
+    --trusted-host ${NEURON_PIP_REPO} \
+    --extra-index-url ${PYPI_SIMPLE_URL} \
+    --extra-index-url ${NEURON_PUBLIC_PIP_REPO} \
+    "neuronx-cc>=2.0" \
+    "torch-neuronx==2.8.*" \
+    neuronx_distributed \
+    neuronx_distributed_inference==$NEURONX_DISTRIBUTED_INFERENCE_VERSION \
+    && VLLM_TARGET_DEVICE="neuron" ${PIP} install --no-cache-dir \
+    --index-url ${PIP_REPO_URL} \
+    --trusted-host ${NEURON_PIP_REPO} \
+    --extra-index-url ${PYPI_SIMPLE_URL} \
+    -e /opt/vllm \
+    && rm -rf ~/.cache/pip/*
 
 FROM base AS prod
 
@@ -214,20 +226,25 @@ RUN apt-get update \
     && rm -rf /tmp/tmp* \
     && apt-get clean
 
-RUN ${PIP} install --no-cache-dir \
+# Clone VLLM source before pip installations
+RUN git clone -b ${VLLM_PUBLIC_BRANCH} https://github.com/aws-neuron/upstreaming-to-vllm.git /opt/vllm
+
+RUN PIP_REPO_URL=$([ -n "${NEURON_PIP_REPO_KEY}" ] && echo "https://${NEURON_PIP_REPO_KEY}@${NEURON_PIP_REPO}" || echo "https://${NEURON_PIP_REPO}") \
+    && ${PIP} install --no-cache-dir \
+    --index-url ${PIP_REPO_URL} \
+    --trusted-host ${NEURON_PIP_REPO} \
+    --extra-index-url ${PYPI_SIMPLE_URL} \
     neuronx-cc==$NEURONX_CC_VERSION \
     torch-neuronx==$NEURONX_FRAMEWORK_VERSION \
     neuronx_distributed==$NEURONX_DISTRIBUTED_VERSION \
     neuronx_distributed_inference==$NEURONX_DISTRIBUTED_INFERENCE_VERSION \
+    && VLLM_TARGET_DEVICE="neuron" ${PIP} install --no-cache-dir \
+    --index-url ${PIP_REPO_URL} \
+    --trusted-host ${NEURON_PIP_REPO} \
+    --extra-index-url ${PYPI_SIMPLE_URL} \
+    -e /opt/vllm \
     && rm -rf ~/.cache/pip/*
 
-# Install VLLM from source
-RUN git clone -b 2.26.1 https://github.com/aws-neuron/upstreaming-to-vllm.git /opt/vllm
-WORKDIR /opt/vllm
-
-RUN ${PIP} install --no-cache-dir --no-deps -r requirements/neuron.txt \
-    && VLLM_TARGET_DEVICE="neuron" ${PIP} install --no-cache-dir -e .
-
 FROM ${BUILD_STAGE} AS final
 
 EXPOSE 8080 8081