diff --git a/modules/src/vllm_module/item.yaml b/modules/src/vllm_module/item.yaml
index ca684340..1e9b7d59 100644
--- a/modules/src/vllm_module/item.yaml
+++ b/modules/src/vllm_module/item.yaml
@@ -14,3 +14,4 @@ spec:
   image: mlrun/mlrun
   kind: generic
 version: 1.0.0
+
diff --git a/modules/src/vllm_module/vllm-module.ipynb b/modules/src/vllm_module/vllm-module.ipynb
new file mode 100644
index 00000000..05b584e4
--- /dev/null
+++ b/modules/src/vllm_module/vllm-module.ipynb
@@ -0,0 +1,234 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "7d551647-dfc2-47da-bc8a-3792af622073",
+   "metadata": {},
+   "source": [
+    "# vLLM Module with MLRun\n",
+    "\n",
+    "This notebook shows how to configure and deploy a vLLM OpenAI-compatible server as an MLRun application runtime, and then demonstrates sending a chat request to the deployed server."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "7707b270-30cc-448a-a828-cb93aa28030d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import mlrun\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d5cff681-bfdf-4468-a1d1-2aeadb56065e",
+   "metadata": {},
+   "source": [
+    "## Prerequisites\n",
+    "* At least one GPU is required to run this notebook."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d5c84798-289f-4b4f-8c1b-f4dd12a3bda5",
+   "metadata": {},
+   "source": [
+    "## What this notebook does\n",
+    "\n",
+    "In this notebook we will:\n",
+    "\n",
+    "- Create or load an **MLRun project**\n",
+    "- Import a custom **vLLM module** from the MLRun Hub\n",
+    "- Deploy a **vLLM OpenAI-compatible server** as an MLRun application runtime\n",
+    "- Configure deployment parameters such as model, GPU count, memory, node selector, port, and log level\n",
+    "- Invoke the deployed service using the `/v1/chat/completions` endpoint\n",
+    "- Parse the response and extract only the assistant’s generated text\n",
+    "\n",
+    "By the end of this notebook, you will have a working vLLM deployment that can be queried directly from a Jupyter notebook using OpenAI-style APIs.\n",
+    "\n",
+    "For more information, see the [vLLM OpenAI-compatible server documentation](https://docs.vllm.ai/en/latest/serving/openai_compatible_server/)."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "879ca641-ee35-4682-9995-4eb319d89090",
+   "metadata": {},
+   "source": [
+    "## 1. Create an MLRun project\n",
+    "\n",
+    "In this section we create or load an MLRun project that will own the deployed vLLM application runtime."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6eac263a-17d1-4454-9e19-459dfbe2f231",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "project = mlrun.get_or_create_project(name=\"vllm-module\", context=\"\", user_project=True)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "da49d335-b704-4fb6-801f-4d07b64f9be6",
+   "metadata": {},
+   "source": [
+    "## 2. Import the vLLM module from the MLRun Hub\n",
+    "\n",
+    "In this section we import the vLLM module from the MLRun Hub so we can instantiate `VLLMModule` and deploy it as an application runtime."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e6d89dee-db58-4c0c-8009-b37020c9599a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "vllm = mlrun.import_module(\"hub://vllm-module\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1202ddd5-0ce7-4769-be29-8fc264c1f80e",
+   "metadata": {},
+   "source": [
+    "## 3. Deploy the vLLM application runtime\n",
+    "\n",
+    "Configure the vLLM deployment parameters and deploy the application.\n",
+    "\n",
+    "The returned address is the service URL for the application runtime."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e433123a-e64b-4a7a-8c7f-8165bcdcc6d1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Initialize the vLLM app\n",
+    "vllm_module = vllm.VLLMModule(\n",
+    "    project=project,\n",
+    "    node_selector={\"alpha.eksctl.io/nodegroup-name\": \"added-gpu\"},\n",
+    "    name=\"qwen-vllm\",\n",
+    "    image=\"vllm/vllm-openai:latest\",\n",
+    "    model=\"Qwen/Qwen2.5-Omni-3B\",\n",
+    "    gpus=1,\n",
+    "    mem=\"10G\",\n",
+    "    port=8000,\n",
+    "    dtype=\"auto\",\n",
+    "    uvicorn_log_level=\"info\",\n",
+    "    max_tokens=501,\n",
+    ")\n",
+    "\n",
+    "# Deploy the vLLM app\n",
+    "addr = vllm_module.vllm_app.deploy(with_mlrun=True)\n",
+    "addr"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "06832de3-5c31-43bf-b07b-0e71fb2d072d",
+   "metadata": {},
+   "source": [
+    "## 4. Get the runtime handle\n",
+    "\n",
+    "Fetch the runtime object and invoke the service using `app.invoke(...)`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "102d3fd0-1ee6-49b8-8c86-df742ac1c559",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Optional: get_runtime() returns the MLRun application runtime handle\n",
+    "app = vllm_module.get_runtime()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "925730c1-0ac5-454b-8fb2-ab8cebb3f3ac",
+   "metadata": {},
+   "source": [
+    "## 5. Send a chat request for testing\n",
+    "\n",
+    "Call the OpenAI-compatible endpoint `/v1/chat/completions`, parse the JSON response, and print only the assistant message text."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "id": "31bc78d4-1c6f-439c-b894-1522e3a6d3e6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "body = {\n",
+    "    \"model\": vllm_module.model,\n",
+    "    \"messages\": [{\"role\": \"user\", \"content\": \"what are the 3 countries with the most gpu as far as you know\"}],\n",
+    "    \"max_tokens\": vllm_module.max_tokens,  # start smaller for testing\n",
+    "}\n",
+    "\n",
+    "resp = app.invoke(path=\"/v1/chat/completions\", body=body)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "id": "a459d5f8-dad0-4735-94c2-3801d4f94bb5",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "assistant:\n",
+      "\n",
+      "As of the most commonly cited estimates, the three countries with the largest GPU capacity for AI workloads are the United States, China, and India.\n"
+     ]
+    }
+   ],
+   "source": [
+    "data = resp\n",
+    "assistant_text = data[\"choices\"][0][\"message\"][\"content\"]\n",
+    "\n",
+    "print(\"\\nassistant:\\n\")\n",
+    "print(assistant_text.strip())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "957b5d21-7ade-4131-9100-878652c477fc",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "mlrun-base",
+   "language": "python",
+   "name": "conda-env-mlrun-base-py"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.22"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/modules/src/vllm_module/vllm_module.py b/modules/src/vllm_module/vllm_module.py
index 52145bf1..b9bad014 100644
--- a/modules/src/vllm_module/vllm_module.py
+++ b/modules/src/vllm_module/vllm_module.py
@@ -12,9 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-#This module acts as a lightweight gateway to OpenAI-compatible APIs.
-#You can send chat prompts, create embeddings, or get model responses without worrying about authentication or endpoint differences.
-#It simplifies access so you can test, analyze, or integrate AI features directly into your projects or notebooks with minimal setup.
+
+# This module acts as a lightweight gateway to OpenAI-compatible APIs.
+# You can send chat prompts, create embeddings, or get model responses without worrying about authentication or endpoint differences.
+# It simplifies access so you can test, analyze, or integrate AI features directly into your projects or notebooks with minimal setup.
 
 from typing import Dict, Optional, List
 
@@ -67,11 +68,9 @@ def __init__(
             f"tensor_parallel_size ({tensor_parallel_size}) cannot be greater than gpus ({gpus})"
         )
 
-
-
         if node_selector is None:
             node_selector = {"alpha.eksctl.io/nodegroup-name": "added-gpu"}
-        
+
         if not isinstance(max_tokens, int):
             raise TypeError("max_tokens must be an integer")
 
@@ -124,8 +123,6 @@ def __init__(
         self.vllm_app.spec.volumes = [{"name": "dshm", "emptyDir": {"medium": "Memory"}}]
         self.vllm_app.spec.volume_mounts = [{"name": "dshm", "mountPath": "/dev/shm"}]
 
-
-
         self.vllm_app.spec.command = "vllm"
         self.vllm_app.spec.args = args
 
@@ -139,4 +136,3 @@ def add_args(self, extra_args: List[str]):
         if not isinstance(extra_args, list) or not all(isinstance(x, str) for x in extra_args):
             raise ValueError("extra_args must be a list of strings")
         self.vllm_app.spec.args += extra_args
-
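Since the notebook exercises the deployment only through `app.invoke(...)`, a minimal sketch of querying the same server with the standard `openai` client may help when testing from outside MLRun. This is not part of the diff; it assumes the `openai` package is installed, that `addr` is the service URL returned by `vllm_module.vllm_app.deploy(...)` in the notebook and is reachable from the client, and that the model name matches the one configured above.

```python
# Minimal sketch (assumption, not part of this PR): query the deployed vLLM
# OpenAI-compatible server directly with the `openai` client instead of app.invoke().
from openai import OpenAI

client = OpenAI(
    base_url=f"{addr.rstrip('/')}/v1",  # vLLM serves the OpenAI-compatible API under /v1
    api_key="EMPTY",                    # vLLM accepts any placeholder key unless --api-key is set
)

completion = client.chat.completions.create(
    model="Qwen/Qwen2.5-Omni-3B",  # same model configured for VLLMModule in the notebook
    messages=[{"role": "user", "content": "Say hello in one short sentence."}],
    max_tokens=64,
)
print(completion.choices[0].message.content)
```

The same request can also be sent through `app.invoke(path="/v1/chat/completions", body=body)` as shown in the notebook; the direct client is only a convenience for ad-hoc testing.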