## Global Args #################################################################
ARG BASE_UBI_IMAGE_TAG=9.5-1742914212
ARG PYTHON_VERSION=3.12
ARG VLLM_VERSION="v0.10.0.2"
ARG VLLM_TGIS_ADAPTER_VERSION="0.8.0"
ARG TORCH_CUDA_ARCH_LIST="7.5 8.0 8.6 8.9 9.0 10.0 12.0+PTX"
ARG max_jobs=2
ARG nvcc_threads=8
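# Example build invocation (illustrative; the image tag and arg values are
# assumptions, adjust them for your environment):
#   docker build --target vllm-openai \
#     --build-arg CUDA_MAJOR=12 --build-arg CUDA_MINOR=8 \
#     --build-arg max_jobs=8 --build-arg nvcc_threads=4 \
#     -t vllm-openai:cuda12.8 .
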
## Base Layer ##################################################################
FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} AS base
ARG PYTHON_VERSION
ENV PYTHON_VERSION=${PYTHON_VERSION}
RUN microdnf -y update && microdnf install -y --nodocs \
    python${PYTHON_VERSION}-pip python${PYTHON_VERSION}-wheel \
    && microdnf clean all

WORKDIR /workspace

ENV LANG=C.UTF-8 \
    LC_ALL=C.UTF-8

# Some utils for dev purposes - tar is required for kubectl cp
RUN microdnf install -y --nodocs \
    which procps findutils tar vim git \
    && microdnf clean all

## Python Installer ############################################################
FROM base AS python-install
ARG PYTHON_VERSION
# CUDA version used to select the PyTorch wheel index; the 12.8 defaults are
# an assumption matching the toolkit installed in the cuda-base stage below
ARG CUDA_MAJOR=12
ARG CUDA_MINOR=8

ENV CUDA_MAJOR=${CUDA_MAJOR}
ENV CUDA_MINOR=${CUDA_MINOR}
ENV UV_EXTRA_INDEX_URL=https://download.pytorch.org/whl/cu${CUDA_MAJOR}${CUDA_MINOR}
ENV UV_INDEX_STRATEGY=unsafe-best-match
ENV PIP_EXTRA_INDEX_URL=https://download.pytorch.org/whl/cu${CUDA_MAJOR}${CUDA_MINOR}

ENV VIRTUAL_ENV=/opt/vllm
ENV PATH="$VIRTUAL_ENV/bin:$PATH"
ENV PYTHON_VERSION=${PYTHON_VERSION}
RUN microdnf install -y --nodocs \
    python${PYTHON_VERSION}-devel && \
    python${PYTHON_VERSION} -m venv $VIRTUAL_ENV && \
    pip install --no-cache -U pip wheel uv && \
    microdnf clean all
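# (placing the venv's bin directory first on PATH means that every later
# python3/pip/uv invocation resolves to /opt/vllm without a `source activate`)
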
## CUDA Base ###################################################################
FROM python-install AS cuda-base

RUN curl -Lo /etc/yum.repos.d/cuda-rhel9.repo \
    https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo

ENV CUDA_HOME="/usr/local/cuda" \
    PATH="${CUDA_HOME}/bin:${PATH}"
ENV LD_LIBRARY_PATH="${CUDA_HOME}/lib64:${CUDA_HOME}/lib64/stubs/:${CUDA_HOME}/extras/CUPTI/lib64:${LD_LIBRARY_PATH}"
RUN microdnf install -y \
    cuda-nvcc-12-8 cuda-nvtx-12-8 cuda-libraries-devel-12-8 && \
    microdnf clean all && \
    ln -s ${CUDA_HOME}/lib64/stubs/libcuda.so /usr/lib64/

## Python CUDA Base ############################################################
FROM cuda-base AS python-cuda-base

ENV VIRTUAL_ENV=/opt/vllm
ENV PATH="$VIRTUAL_ENV/bin:$PATH"

# install CUDA and common dependencies (cuda.txt references common.txt via -r,
# hence both bind mounts)
RUN --mount=type=cache,target=/root/.cache/uv \
    --mount=type=bind,source=requirements/common.txt,target=requirements/common.txt \
    --mount=type=bind,source=requirements/cuda.txt,target=requirements/cuda.txt \
    uv pip install \
    -r requirements/cuda.txt

## Builder #####################################################################
FROM python-cuda-base AS build

# install build dependencies
RUN --mount=type=cache,target=/root/.cache/uv \
    --mount=type=bind,source=requirements/build.txt,target=requirements/build.txt \
    uv pip install -r requirements/build.txt

# install compiler cache to speed up compilation leveraging local or remote caching
# git is required for the cutlass kernels
RUN rpm -ivh https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && \
    rpm -ql epel-release && \
    microdnf install -y --nodocs git ccache && \
    microdnf clean all

COPY . .

ARG TORCH_CUDA_ARCH_LIST
ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST

# max jobs used by Ninja to build extensions
ARG max_jobs
ENV MAX_JOBS=${max_jobs}
# number of threads used by nvcc
ARG nvcc_threads
ENV NVCC_THREADS=$nvcc_threads
# make sure punica kernels are built (for LoRA)
ENV VLLM_INSTALL_PUNICA_KERNELS=1
ARG VLLM_VERSION

# Make sure the CUDA environment is in the PATH
ENV PATH=/usr/local/cuda/bin:$PATH

ENV CCACHE_DIR=/root/.cache/ccache
RUN --mount=type=cache,target=/root/.cache/ccache \
    --mount=type=cache,target=/root/.cache/pip \
    --mount=type=cache,target=/root/.cache/uv \
    --mount=type=bind,src=.git,target=/workspace/.git \
    # CXXFLAGS is spelled out because "$CFLAGS" would expand from the outer
    # (empty) shell environment, not from this env invocation
    env CFLAGS="-march=haswell" \
    CXXFLAGS="-march=haswell" \
    CMAKE_BUILD_TYPE=Release \
    SETUPTOOLS_SCM_PRETEND_VERSION="${VLLM_VERSION}" \
    python3 setup.py bdist_wheel --dist-dir=dist
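# To extract the wheel without building the full runtime image (illustrative;
# the tag name is an assumption):
#   docker build --target build -t vllm-wheel-build .
#   id=$(docker create vllm-wheel-build) && docker cp "$id:/workspace/dist" . && docker rm "$id"
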
## Release #####################################################################
FROM python-install AS vllm-openai
ARG PYTHON_VERSION

WORKDIR /workspace

ENV VIRTUAL_ENV=/opt/vllm
ENV PATH=$VIRTUAL_ENV/bin:$PATH

# force using the Python venv's CUDA runtime libraries
ENV LD_LIBRARY_PATH="${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/nvidia/cuda_nvrtc/lib:${LD_LIBRARY_PATH}"
ENV LD_LIBRARY_PATH="${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/nvidia/cuda_runtime/lib:${LD_LIBRARY_PATH}"
ENV LD_LIBRARY_PATH="${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/nvidia/nvtx/lib:${LD_LIBRARY_PATH}"

# Triton needs a C compiler (it JIT-compiles kernels at runtime)
RUN microdnf install -y --nodocs gcc \
    rsync \
    && microdnf clean all

# install the vllm wheel first, so that torch etc. are pulled in as its dependencies
# the nccl pin is temporary until torch is upgraded: https://github.com/vllm-project/vllm/issues/19166
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/workspace/dist \
    --mount=type=cache,target=/root/.cache/uv \
    uv pip install \
    --extra-index-url="https://download.pytorch.org/whl/cu128" --index-strategy='unsafe-best-match' \
    "$(echo dist/*.whl)[audio,video,tensorizer]" --verbose \
    "https://storage.googleapis.com/nm-public-pypi/dist/flashinfer_python-0.2.8-cp39-abi3-linux_x86_64.whl" \
    && uv pip install -U nvidia-nccl-cu12==2.26.5 blobfile

ENV HF_HUB_OFFLINE=1 \
    HOME=/home/vllm \
    # Allow requested max length to exceed what is extracted from the
    # config.json
    # see: https://github.com/vllm-project/vllm/pull/7080
    VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 \
    VLLM_USAGE_SOURCE=production-docker-image \
    VLLM_WORKER_MULTIPROC_METHOD=fork \
    VLLM_NO_USAGE_STATS=1 \
    OUTLINES_CACHE_DIR=/tmp/outlines \
    NUMBA_CACHE_DIR=/tmp/numba \
    TRITON_CACHE_DIR=/tmp/triton \
    # Set up NCCL monitoring with torch
    # For tensor-parallel workloads, this monitors for NCCL deadlocks when
    # one rank dies, and tears down the NCCL process groups so that the driver
    # can cleanly exit.
    TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC=15 \
    TORCH_NCCL_DUMP_ON_TIMEOUT=0

# setup non-root user for OpenShift
RUN umask 002 && \
    useradd --uid 2000 --gid 0 vllm && \
    mkdir -p /home/vllm && \
    chown -R vllm /home/vllm && \
    chmod g+rwx /home/vllm
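# (OpenShift runs containers under an arbitrary UID belonging to GID 0, which
# is why the home directory is made group-writable above)
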
COPY LICENSE /licenses/vllm.md
COPY examples/*.jinja /app/data/template/
COPY examples/*.jinja /opt/app-root/template/
RUN chown -R vllm /opt/app-root/template && chmod -R g+r /opt/app-root/template

USER 2000
WORKDIR /home/vllm

ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
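
# Example run (illustrative; image tag and model path are assumptions; the
# image sets HF_HUB_OFFLINE=1, so mount a local model):
#   docker run --gpus all -p 8000:8000 \
#     -v /path/to/model:/mnt/model \
#     vllm-openai:cuda12.8 --model /mnt/model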

## gRPC Adapter ################################################################
FROM vllm-openai AS vllm-grpc-adapter

USER root

ARG VLLM_TGIS_ADAPTER_VERSION
RUN --mount=type=cache,target=/root/.cache/uv \
    --mount=type=bind,from=build,src=/workspace/dist,target=/workspace/dist \
    HOME=/root uv pip install \
    --extra-index-url="https://download.pytorch.org/whl/cu128" --index-strategy='unsafe-best-match' \
    "$(echo /workspace/dist/*.whl)[audio,video,tensorizer]" \
    vllm-tgis-adapter==${VLLM_TGIS_ADAPTER_VERSION}

# Re-pin NCCL to the required version after vllm-tgis-adapter installation
RUN --mount=type=cache,target=/root/.cache/uv \
    HOME=/root uv pip install -U nvidia-nccl-cu12==2.26.5

ENV GRPC_PORT=8033 \
    PORT=8000 \
    # As an optimization, vLLM disables logprobs when using spec decoding by
    # default, but this would be unexpected for users of a hosted model that
    # happens to use spec decoding
    # see: https://github.com/vllm-project/vllm/pull/6485
    DISABLE_LOGPROBS_DURING_SPEC_DECODING=false

USER 2000
ENTRYPOINT ["python3", "-m", "vllm_tgis_adapter", "--uvicorn-log-level=warning"]
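
# Example run (illustrative; image tag and model path are assumptions; flags
# follow vLLM's engine args, with gRPC on $GRPC_PORT and HTTP on $PORT):
#   docker run --gpus all -p 8000:8000 -p 8033:8033 \
#     -v /path/to/model:/mnt/model \
#     vllm-grpc-adapter:cuda12.8 --model /mnt/model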