diff --git a/chart/templates/backends/vllm-cpu.yaml b/chart/templates/backends/vllm-cpu.yaml
new file mode 100644
index 00000000..d140e2ba
--- /dev/null
+++ b/chart/templates/backends/vllm-cpu.yaml
@@ -0,0 +1,75 @@
+{{- if .Values.backendRuntime.enabled -}}
+apiVersion: inference.llmaz.io/v1alpha1
+kind: BackendRuntime
+metadata:
+  labels:
+    app.kubernetes.io/name: backendruntime
+    app.kubernetes.io/part-of: llmaz
+    app.kubernetes.io/created-by: llmaz
+  name: vllmcpu
+spec:
+  image: {{ .Values.backendRuntime.vllmcpu.image.repository }}
+  version: {{ .Values.backendRuntime.vllmcpu.image.tag }}
+  envs:
+    - name: VLLM_CPU_KVCACHE_SPACE
+      value: "8"
+  lifecycle:
+    preStop:
+      exec:
+        command:
+          - /bin/sh
+          - -c
+          - |
+            while true; do
+              RUNNING=$(curl -s http://localhost:8080/metrics | grep 'vllm:num_requests_running' | grep -v '#' | awk '{print $2}')
+              WAITING=$(curl -s http://localhost:8080/metrics | grep 'vllm:num_requests_waiting' | grep -v '#' | awk '{print $2}')
+              if [ "$RUNNING" = "0.0" ] && [ "$WAITING" = "0.0" ]; then
+                echo "Terminating: No active or waiting requests, safe to terminate" >> /proc/1/fd/1
+                exit 0
+              else
+                echo "Terminating: Running: $RUNNING, Waiting: $WAITING" >> /proc/1/fd/1
+                sleep 5
+              fi
+            done
+  # Do not edit the preset argument names unless you know what you're doing.
+  # Feel free to add more arguments to fit your requirements.
+  recommendedConfigs:
+    - name: default
+      args:
+        - --model
+        - "{{`{{ .ModelPath }}`}}"
+        - --served-model-name
+        - "{{`{{ .ModelName }}`}}"
+        - --host
+        - "0.0.0.0"
+        - --port
+        - "8080"
+      sharedMemorySize: 2Gi
+      resources:
+        requests:
+          cpu: 10
+          memory: 32Gi
+        limits:
+          cpu: 10
+          memory: 32Gi
+  startupProbe:
+    periodSeconds: 10
+    failureThreshold: 30
+    httpGet:
+      path: /health
+      port: 8080
+  livenessProbe:
+    initialDelaySeconds: 15
+    periodSeconds: 10
+    failureThreshold: 3
+    httpGet:
+      path: /health
+      port: 8080
+  readinessProbe:
+    initialDelaySeconds: 5
+    periodSeconds: 5
+    failureThreshold: 3
+    httpGet:
+      path: /health
+      port: 8080
+{{- end }}
\ No newline at end of file
diff --git a/chart/values.global.yaml b/chart/values.global.yaml
index 4b467479..37a62016 100644
--- a/chart/values.global.yaml
+++ b/chart/values.global.yaml
@@ -26,6 +26,11 @@ backendRuntime:
     image:
       repository: vllm/vllm-openai
       tag: v0.7.3
+  vllmcpu:
+    image:
+      # More image details: https://gallery.ecr.aws/q9t5s3a7/vllm-cpu-release-repo
+      repository: public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo
+      tag: v0.8.5
 
 leaderWorkerSet:
   enabled: true
diff --git a/docs/examples/README.md b/docs/examples/README.md
index 31b188b0..80e63462 100644
--- a/docs/examples/README.md
+++ b/docs/examples/README.md
@@ -12,6 +12,7 @@ We provide a set of examples to help you serve large language models, by default
 - [Deploy models via TensorRT-LLM](#deploy-models-via-tensorrt-llm)
 - [Deploy models via text-generation-inference](#deploy-models-via-text-generation-inference)
 - [Deploy models via ollama](#deploy-models-via-ollama)
+- [Deploy models via vLLM CPU](#deploy-models-via-vllm-cpu)
 - [Speculative Decoding with llama.cpp](#speculative-decoding-with-llamacpp)
 - [Speculative Decoding with vLLM](#speculative-decoding-with-vllm)
 - [Multi-Host Inference](#multi-host-inference)
@@ -64,6 +65,11 @@ By default, we use [vLLM](https://github.com/vllm-project/vllm) as the inference
 
 llama.cpp supports speculative decoding to significantly improve inference performance, see [example](./speculative-decoding/llamacpp/) here.
 
+### Deploy models via vLLM CPU
+
+[vLLM](https://github.com/vllm-project/vllm) is an efficient and high-throughput LLM inference engine. It also provides a **CPU version** for environments without GPU support, see [example](./vllm-cpu/) here.
+
+
 ### Speculative Decoding with vLLM
 
 [Speculative Decoding](https://arxiv.org/abs/2211.17192) can improve inference performance efficiently, see [example](./speculative-decoding/vllm/) here.
diff --git a/docs/examples/vllm-cpu/playground.yaml b/docs/examples/vllm-cpu/playground.yaml
new file mode 100644
index 00000000..01ac9b0c
--- /dev/null
+++ b/docs/examples/vllm-cpu/playground.yaml
@@ -0,0 +1,20 @@
+apiVersion: llmaz.io/v1alpha1
+kind: OpenModel
+metadata:
+  name: qwen3-0--6b
+spec:
+  familyName: qwen3
+  source:
+    modelHub:
+      modelID: Qwen/Qwen3-0.6B
+---
+apiVersion: inference.llmaz.io/v1alpha1
+kind: Playground
+metadata:
+  name: qwen3-0--6b
+spec:
+  replicas: 1
+  modelClaim:
+    modelName: qwen3-0--6b
+  backendRuntimeConfig:
+    backendName: vllmcpu
\ No newline at end of file
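
A minimal way to exercise this patch end to end is sketched below. It assumes the chart is installed with `backendRuntime.enabled=true`, reuses the `qwen3-0--6b` names and port `8080` from the manifests above, and the pod name in the port-forward step is a placeholder to be replaced with the pod reported by `kubectl get pods`.

```bash
# Deploy the OpenModel + Playground pair added in docs/examples/vllm-cpu/.
kubectl apply -f docs/examples/vllm-cpu/playground.yaml

# Wait for the Playground and its serving pod to become ready; the first run
# can take a while since the CPU image and the Qwen3-0.6B weights are pulled.
kubectl get playground qwen3-0--6b
kubectl get pods

# Smoke-test the OpenAI-compatible API on the port configured in the
# BackendRuntime args (8080). <qwen3-pod-name> is a placeholder.
kubectl port-forward pod/<qwen3-pod-name> 8080:8080 &
curl http://localhost:8080/v1/models
```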