## Global Args #################################################################
ARG BASE_UBI_IMAGE_TAG=9.5-1742914212
ARG PYTHON_VERSION=3.12
ARG VLLM_VERSION="v0.10.0.2"
ARG VLLM_TGIS_ADAPTER_VERSION="0.8.0"
ARG TORCH_CUDA_ARCH_LIST="7.5 8.0 8.6 8.9 9.0 10.0 12.0+PTX"
ARG max_jobs=2
ARG nvcc_threads=8
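# Example build invocation (illustrative; the image tag and arg values are
# assumptions, adjust them for your environment):
#   docker build --target vllm-openai \
#     --build-arg CUDA_MAJOR=12 --build-arg CUDA_MINOR=8 \
#     --build-arg max_jobs=8 --build-arg nvcc_threads=4 \
#     -t vllm-openai:cuda12.8 .
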
## Base Layer ##################################################################
FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} AS base
ARG PYTHON_VERSION
ENV PYTHON_VERSION=${PYTHON_VERSION}
RUN microdnf -y update && microdnf install -y --nodocs \
    python${PYTHON_VERSION}-pip python${PYTHON_VERSION}-wheel \
    && microdnf clean all

WORKDIR /workspace

ENV LANG=C.UTF-8 \
    LC_ALL=C.UTF-8

# Some utils for dev purposes - tar is required for kubectl cp
RUN microdnf install -y --nodocs \
    which procps findutils tar vim git \
    && microdnf clean all

## Python Installer ############################################################
FROM base AS python-install
ARG PYTHON_VERSION
# CUDA version used to select the PyTorch wheel index; the 12.8 defaults are
# an assumption matching the toolkit installed in the cuda-base stage below
ARG CUDA_MAJOR=12
ARG CUDA_MINOR=8

ENV CUDA_MAJOR=${CUDA_MAJOR}
ENV CUDA_MINOR=${CUDA_MINOR}
ENV UV_EXTRA_INDEX_URL=https://download.pytorch.org/whl/cu${CUDA_MAJOR}${CUDA_MINOR}
ENV UV_INDEX_STRATEGY=unsafe-best-match
ENV PIP_EXTRA_INDEX_URL=https://download.pytorch.org/whl/cu${CUDA_MAJOR}${CUDA_MINOR}

ENV VIRTUAL_ENV=/opt/vllm
ENV PATH="$VIRTUAL_ENV/bin:$PATH"
ENV PYTHON_VERSION=${PYTHON_VERSION}
RUN microdnf install -y --nodocs \
    python${PYTHON_VERSION}-devel && \
    python${PYTHON_VERSION} -m venv $VIRTUAL_ENV && \
    pip install --no-cache -U pip wheel uv && \
    microdnf clean all
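# (placing the venv's bin directory first on PATH means that every later
# python3/pip/uv invocation resolves to /opt/vllm without a `source activate`)
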
## CUDA Base ###################################################################
FROM python-install AS cuda-base

RUN curl -Lo /etc/yum.repos.d/cuda-rhel9.repo \
    https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo

ENV CUDA_HOME="/usr/local/cuda" \
    PATH="${CUDA_HOME}/bin:${PATH}"
ENV LD_LIBRARY_PATH="${CUDA_HOME}/lib64:${CUDA_HOME}/lib64/stubs/:${CUDA_HOME}/extras/CUPTI/lib64:${LD_LIBRARY_PATH}"
RUN microdnf install -y \
    cuda-nvcc-12-8 cuda-nvtx-12-8 cuda-libraries-devel-12-8 && \
    microdnf clean all && \
    ln -s ${CUDA_HOME}/lib64/stubs/libcuda.so /usr/lib64/

## Python CUDA Base ############################################################
FROM cuda-base AS python-cuda-base

ENV VIRTUAL_ENV=/opt/vllm
ENV PATH="$VIRTUAL_ENV/bin:$PATH"

# install CUDA and common dependencies (cuda.txt references common.txt via -r,
# hence both bind mounts)
RUN --mount=type=cache,target=/root/.cache/uv \
    --mount=type=bind,source=requirements/common.txt,target=requirements/common.txt \
    --mount=type=bind,source=requirements/cuda.txt,target=requirements/cuda.txt \
    uv pip install \
    -r requirements/cuda.txt

## Builder #####################################################################
FROM python-cuda-base AS build

# install build dependencies
RUN --mount=type=cache,target=/root/.cache/uv \
    --mount=type=bind,source=requirements/build.txt,target=requirements/build.txt \
    uv pip install -r requirements/build.txt

# install compiler cache to speed up compilation leveraging local or remote caching
# git is required for the cutlass kernels
RUN rpm -ivh https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && \
    rpm -ql epel-release && \
    microdnf install -y --nodocs git ccache && \
    microdnf clean all

COPY . .

ARG TORCH_CUDA_ARCH_LIST
ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST

# max jobs used by Ninja to build extensions
ARG max_jobs
ENV MAX_JOBS=${max_jobs}
# number of threads used by nvcc
ARG nvcc_threads
ENV NVCC_THREADS=$nvcc_threads
# make sure punica kernels are built (for LoRA)
ENV VLLM_INSTALL_PUNICA_KERNELS=1
ARG VLLM_VERSION

# Make sure the CUDA environment is in the PATH
ENV PATH=/usr/local/cuda/bin:$PATH

ENV CCACHE_DIR=/root/.cache/ccache
RUN --mount=type=cache,target=/root/.cache/ccache \
    --mount=type=cache,target=/root/.cache/pip \
    --mount=type=cache,target=/root/.cache/uv \
    --mount=type=bind,src=.git,target=/workspace/.git \
    # CXXFLAGS is spelled out because "$CFLAGS" would expand from the outer
    # (empty) shell environment, not from this env invocation
    env CFLAGS="-march=haswell" \
    CXXFLAGS="-march=haswell" \
    CMAKE_BUILD_TYPE=Release \
    SETUPTOOLS_SCM_PRETEND_VERSION="${VLLM_VERSION}" \
    python3 setup.py bdist_wheel --dist-dir=dist
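# To extract the wheel without building the full runtime image (illustrative;
# the tag name is an assumption):
#   docker build --target build -t vllm-wheel-build .
#   id=$(docker create vllm-wheel-build) && docker cp "$id:/workspace/dist" . && docker rm "$id"
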
## Release #####################################################################
FROM python-install AS vllm-openai
ARG PYTHON_VERSION

WORKDIR /workspace

ENV VIRTUAL_ENV=/opt/vllm
ENV PATH=$VIRTUAL_ENV/bin:$PATH

# force using the Python venv's CUDA runtime libraries
ENV LD_LIBRARY_PATH="${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/nvidia/cuda_nvrtc/lib:${LD_LIBRARY_PATH}"
ENV LD_LIBRARY_PATH="${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/nvidia/cuda_runtime/lib:${LD_LIBRARY_PATH}"
ENV LD_LIBRARY_PATH="${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/nvidia/nvtx/lib:${LD_LIBRARY_PATH}"

# Triton needs a C compiler (it JIT-compiles kernels at runtime)
RUN microdnf install -y --nodocs gcc \
    rsync \
    && microdnf clean all

# install the vllm wheel first, so that torch etc. are pulled in as its dependencies
# the nccl pin is temporary until torch is upgraded: https://github.com/vllm-project/vllm/issues/19166
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/workspace/dist \
    --mount=type=cache,target=/root/.cache/uv \
    uv pip install \
    --extra-index-url="https://download.pytorch.org/whl/cu128" --index-strategy='unsafe-best-match' \
    "$(echo dist/*.whl)[audio,video,tensorizer]" --verbose \
    "https://storage.googleapis.com/nm-public-pypi/dist/flashinfer_python-0.2.8-cp39-abi3-linux_x86_64.whl" \
    && uv pip install -U nvidia-nccl-cu12==2.26.5 blobfile

ENV HF_HUB_OFFLINE=1 \
    HOME=/home/vllm \
    # Allow requested max length to exceed what is extracted from the
    # config.json
    # see: https://github.com/vllm-project/vllm/pull/7080
    VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 \
    VLLM_USAGE_SOURCE=production-docker-image \
    VLLM_WORKER_MULTIPROC_METHOD=fork \
    VLLM_NO_USAGE_STATS=1 \
    OUTLINES_CACHE_DIR=/tmp/outlines \
    NUMBA_CACHE_DIR=/tmp/numba \
    TRITON_CACHE_DIR=/tmp/triton \
    # Set up NCCL monitoring with torch
    # For tensor-parallel workloads, this monitors for NCCL deadlocks when
    # one rank dies, and tears down the NCCL process groups so that the driver
    # can cleanly exit.
    TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC=15 \
    TORCH_NCCL_DUMP_ON_TIMEOUT=0

# setup non-root user for OpenShift
RUN umask 002 && \
    useradd --uid 2000 --gid 0 vllm && \
    mkdir -p /home/vllm && \
    chown -R vllm /home/vllm && \
    chmod g+rwx /home/vllm
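# (OpenShift runs containers under an arbitrary UID belonging to GID 0, which
# is why the home directory is made group-writable above)
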
COPY LICENSE /licenses/vllm.md
COPY examples/*.jinja /app/data/template/
COPY examples/*.jinja /opt/app-root/template/
RUN chown -R vllm /opt/app-root/template && chmod -R g+r /opt/app-root/template

USER 2000
WORKDIR /home/vllm

ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
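
# Example run (illustrative; image tag and model path are assumptions; the
# image sets HF_HUB_OFFLINE=1, so mount a local model):
#   docker run --gpus all -p 8000:8000 \
#     -v /path/to/model:/mnt/model \
#     vllm-openai:cuda12.8 --model /mnt/model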

## gRPC Adapter ################################################################
FROM vllm-openai AS vllm-grpc-adapter

USER root

ARG VLLM_TGIS_ADAPTER_VERSION
RUN --mount=type=cache,target=/root/.cache/uv \
    --mount=type=bind,from=build,src=/workspace/dist,target=/workspace/dist \
    HOME=/root uv pip install \
    --extra-index-url="https://download.pytorch.org/whl/cu128" --index-strategy='unsafe-best-match' \
    "$(echo /workspace/dist/*.whl)[audio,video,tensorizer]" \
    vllm-tgis-adapter==${VLLM_TGIS_ADAPTER_VERSION}

# Re-pin NCCL to the required version after vllm-tgis-adapter installation
RUN --mount=type=cache,target=/root/.cache/uv \
    HOME=/root uv pip install -U nvidia-nccl-cu12==2.26.5

ENV GRPC_PORT=8033 \
    PORT=8000 \
    # As an optimization, vLLM disables logprobs when using spec decoding by
    # default, but this would be unexpected for users of a hosted model that
    # happens to use spec decoding
    # see: https://github.com/vllm-project/vllm/pull/6485
    DISABLE_LOGPROBS_DURING_SPEC_DECODING=false

USER 2000
ENTRYPOINT ["python3", "-m", "vllm_tgis_adapter", "--uvicorn-log-level=warning"]
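
# Example run (illustrative; image tag and model path are assumptions; flags
# follow vLLM's engine args, with gRPC on $GRPC_PORT and HTTP on $PORT):
#   docker run --gpus all -p 8000:8000 -p 8033:8033 \
#     -v /path/to/model:/mnt/model \
#     vllm-grpc-adapter:cuda12.8 --model /mnt/model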