Support vLLM CPU in BackendRuntime #472

Open: wants to merge 1 commit into base: main
75 changes: 75 additions & 0 deletions chart/templates/backends/vllm-cpu.yaml
@@ -0,0 +1,75 @@
{{- if .Values.backendRuntime.enabled -}}
Member Author:

To be honest, I think this is a bit redundant: the vLLM CPU image is maintained separately (see: https://gallery.ecr.aws/q9t5s3a7/vllm-cpu-release-repo), and at this stage we seem to have to maintain another backend runtime just for it. 🤔

apiVersion: inference.llmaz.io/v1alpha1
kind: BackendRuntime
metadata:
labels:
app.kubernetes.io/name: backendruntime
app.kubernetes.io/part-of: llmaz
app.kubernetes.io/created-by: llmaz
name: vllmcpu
spec:
image: {{ .Values.backendRuntime.vllmcpu.image.repository }}
version: {{ .Values.backendRuntime.vllmcpu.image.tag }}
envs:
- name: VLLM_CPU_KVCACHE_SPACE
value: "8"
lifecycle:
preStop:
exec:
command:
- /bin/sh
- -c
- |
while true; do
# The port must match the --port value set in recommendedConfigs below (8080),
# otherwise the metrics query returns nothing and the loop never exits.
RUNNING=$(curl -s http://localhost:8080/metrics | grep 'vllm:num_requests_running' | grep -v '#' | awk '{print $2}')
WAITING=$(curl -s http://localhost:8080/metrics | grep 'vllm:num_requests_waiting' | grep -v '#' | awk '{print $2}')
if [ "$RUNNING" = "0.0" ] && [ "$WAITING" = "0.0" ]; then
echo "Terminating: No active or waiting requests, safe to terminate" >> /proc/1/fd/1
exit 0
else
echo "Terminating: Running: $RUNNING, Waiting: $WAITING" >> /proc/1/fd/1
sleep 5
fi
done
# Do not edit the preset argument names unless you know what you're doing.
# Feel free to add more arguments to suit your requirements.
recommendedConfigs:
- name: default
args:
- --model
- "{{`{{ .ModelPath }}`}}"
- --served-model-name
- "{{`{{ .ModelName }}`}}"
- --host
- "0.0.0.0"
- --port
- "8080"
sharedMemorySize: 2Gi
resources:
requests:
cpu: 10
memory: 32Gi
limits:
cpu: 10
memory: 32Gi
startupProbe:
periodSeconds: 10
failureThreshold: 30
httpGet:
path: /health
port: 8080
livenessProbe:
initialDelaySeconds: 15
periodSeconds: 10
failureThreshold: 3
httpGet:
path: /health
port: 8080
readinessProbe:
initialDelaySeconds: 5
periodSeconds: 5
failureThreshold: 3
httpGet:
path: /health
port: 8080
{{- end }}
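
For reference, the preStop hook above simply polls vLLM's Prometheus gauges until the engine is idle. A minimal sketch of checking those same gauges by hand against a running pod, assuming the placeholder pod name is filled in (curl is available since the hook itself uses it):

```bash
# Inspect the gauges the preStop hook polls; the port must match the
# --port argument in recommendedConfigs (8080 here). <vllm-cpu-pod> is a placeholder.
kubectl exec <vllm-cpu-pod> -- sh -c \
  "curl -s http://localhost:8080/metrics | grep -E '^vllm:num_requests_(running|waiting)'"
```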
5 changes: 5 additions & 0 deletions chart/values.global.yaml
@@ -26,6 +26,11 @@ backendRuntime:
image:
repository: vllm/vllm-openai
tag: v0.7.3
vllmcpu:
image:
# More image details: https://gallery.ecr.aws/q9t5s3a7/vllm-cpu-release-repo
repository: public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo
tag: v0.8.5

leaderWorkerSet:
enabled: true
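
These values can be overridden at deploy time; a minimal sketch, assuming the chart is installed from a local checkout under the release name `llmaz` (both the release name and the chart path are assumptions):

```bash
# Pin a different vLLM CPU image tag via the values added above
# (release name and chart path are assumptions; adjust to your setup).
helm upgrade --install llmaz ./chart \
  --set backendRuntime.vllmcpu.image.tag=v0.8.5
```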
6 changes: 6 additions & 0 deletions docs/examples/README.md
@@ -12,6 +12,7 @@ We provide a set of examples to help you serve large language models, by default
- [Deploy models via TensorRT-LLM](#deploy-models-via-tensorrt-llm)
- [Deploy models via text-generation-inference](#deploy-models-via-text-generation-inference)
- [Deploy models via ollama](#deploy-models-via-ollama)
- [Deploy models via vLLM CPU](#deploy-models-via-vllm-cpu)
- [Speculative Decoding with llama.cpp](#speculative-decoding-with-llamacpp)
- [Speculative Decoding with vLLM](#speculative-decoding-with-vllm)
- [Multi-Host Inference](#multi-host-inference)
@@ -64,6 +65,11 @@ By default, we use [vLLM](https://github.com/vllm-project/vllm) as the inference

llama.cpp supports speculative decoding to significantly improve inference performance, see [example](./speculative-decoding/llamacpp/) here.

### Deploy models via vLLM CPU

[vLLM](https://github.com/vllm-project/vllm) is an efficient and high-throughput LLM inference engine. It also provides a **CPU version** for environments without GPU support, see [example](./vllm-cpu/) here.
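
For instance, the example added in this PR can be applied directly, assuming the cluster can reach the model hub to pull Qwen/Qwen3-0.6B:

```bash
kubectl apply -f docs/examples/vllm-cpu/playground.yaml
```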


### Speculative Decoding with vLLM

[Speculative Decoding](https://arxiv.org/abs/2211.17192) can improve inference performance efficiently, see [example](./speculative-decoding/vllm/) here.
20 changes: 20 additions & 0 deletions docs/examples/vllm-cpu/playground.yaml
@@ -0,0 +1,20 @@
apiVersion: llmaz.io/v1alpha1
kind: OpenModel
metadata:
name: qwen3-0--6b
spec:
familyName: qwen3
source:
modelHub:
modelID: Qwen/Qwen3-0.6B
---
apiVersion: inference.llmaz.io/v1alpha1
kind: Playground
metadata:
name: qwen3-0--6b
spec:
replicas: 1
modelClaim:
modelName: qwen3-0--6b
backendRuntimeConfig:
backendName: vllmcpu
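
Once the Playground above is applied, a hedged smoke test is possible; the Service name is not defined in this PR, so it is left as a placeholder to look up with `kubectl get svc`:

```bash
# Look up the Service created for the Playground (e.g. `kubectl get svc`),
# then substitute it for the placeholder below; 8080 matches the backend's --port.
kubectl port-forward svc/<playground-service> 8080:8080 &
# vLLM serves an OpenAI-compatible API; listing models is a quick readiness check.
curl http://localhost:8080/v1/models
```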