diff --git a/ChatQnA/kubernetes/helm/README.md b/ChatQnA/kubernetes/helm/README.md index 8ada19b070..f301458bbb 100644 --- a/ChatQnA/kubernetes/helm/README.md +++ b/ChatQnA/kubernetes/helm/README.md @@ -28,3 +28,224 @@ helm install chatqna oci://ghcr.io/opea-project/charts/chatqna --set global.HUG ``` See other *-values.yaml files in this directory for more reference. + +## Deploy on AMD ROCm using Helm charts from the binary Helm repository + +```bash +mkdir ~/chatqna-k8s-install && cd ~/chatqna-k8s-install +``` + +### Cloning repos + +```bash +git clone https://github.com/opea-project/GenAIExamples.git +``` + +### Go to the installation directory + +```bash +cd GenAIExamples/ChatQnA/kubernetes/helm +``` + +### Setting system variables + +```bash +export HFTOKEN="your_huggingface_token" +export MODELDIR="/mnt/opea-models" +export MODELNAME="meta-llama/Meta-Llama-3-8B-Instruct" +``` + +### Setting variables in Values files + +#### If ROCm vLLM used +```bash +nano ~/chatqna-k8s-install/GenAIExamples/ChatQnA/kubernetes/helm/rocm-values.yaml +``` + +#### If deploy FaqGen based application on AMD ROCm device with vLLM +```bash +nano ~/chatqna-k8s-install/GenAIExamples/ChatQnA/kubernetes/helm/faqgen-rocm-values.yaml +``` + +- HIP_VISIBLE_DEVICES - this variable specifies the ID of the GPU that you want to use. + You can specify either one or several comma-separated ones - "0" or "0,1,2,3" +- TENSOR_PARALLEL_SIZE - must match the number of GPUs used +- ```yaml + resources: + limits: + amd.com/gpu: "1" # replace "1" with the number of GPUs used + ``` + +#### If ROCm TGI used + +```bash +nano ~/chatqna-k8s-install/GenAIExamples/ChatQnA/kubernetes/helm/rocm-tgi-values.yaml +``` + +#### If deploy FaqGen based application on AMD ROCm device with TGI + +```bash +nano ~/chatqna-k8s-install/GenAIExamples/ChatQnA/kubernetes/helm/faqgen-rocm-tgi-values.yaml +``` + +- HIP_VISIBLE_DEVICES - this variable specifies the ID of the GPU that you want to use. 
+ You can specify either one or several comma-separated ones - "0" or "0,1,2,3" +- extraCmdArgs: [ "--num-shard","1" ] - replace "1" with the number of GPUs used +- ```yaml + resources: + limits: + amd.com/gpu: "1" # replace "1" with the number of GPUs used + ``` + +### Installing the Helm Chart + +#### If ROCm vLLM used +```bash +helm upgrade --install chatqna oci://ghcr.io/opea-project/charts/chatqna \ + --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} \ + --values rocm-values.yaml +``` + +#### If ROCm TGI used +```bash +helm upgrade --install chatqna oci://ghcr.io/opea-project/charts/chatqna \ + --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} \ + --values rocm-tgi-values.yaml +``` + +#### If deploy FaqGen based application on AMD ROCm device with vLLM +```bash +helm upgrade --install chatqna oci://ghcr.io/opea-project/charts/chatqna \ + --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} \ + --values faqgen-rocm-values.yaml +``` + +#### If deploy FaqGen based application on AMD ROCm device with TGI +```bash +helm upgrade --install chatqna oci://ghcr.io/opea-project/charts/chatqna \ + --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} \ + --values faqgen-rocm-tgi-values.yaml +``` + +## Deploy on AMD ROCm using Helm charts from Git repositories + +### Creating working dirs + +```bash +mkdir ~/chatqna-k8s-install && cd ~/chatqna-k8s-install +``` + +### Cloning repos + +```bash +git clone https://github.com/opea-project/GenAIExamples.git +git clone https://github.com/opea-project/GenAIInfra.git +``` + +### Go to the installation directory + +```bash +cd GenAIExamples/ChatQnA/kubernetes/helm +``` + +### Setting system variables + +```bash +export HFTOKEN="your_huggingface_token" +export MODELDIR="/mnt/opea-models" +export MODELNAME="Intel/neural-chat-7b-v3-3" +``` + +### Setting variables in Values files + +#### If ROCm vLLM used +```bash +nano ~/chatqna-k8s-install/GenAIExamples/ChatQnA/kubernetes/helm/rocm-values.yaml +``` + +- HIP_VISIBLE_DEVICES 
- this variable specifies the ID of the GPU that you want to use. + You can specify either one or several comma-separated ones - "0" or "0,1,2,3" +- TENSOR_PARALLEL_SIZE - must match the number of GPUs used +- resources: + limits: + amd.com/gpu: "1" - replace "1" with the number of GPUs used + +#### If ROCm TGI used + +```bash +nano ~/chatqna-k8s-install/GenAIExamples/ChatQnA/kubernetes/helm/rocm-tgi-values.yaml +``` + +- HIP_VISIBLE_DEVICES - this variable specifies the ID of the GPU that you want to use. + You can specify either one or several comma-separated ones - "0" or "0,1,2,3" +- extraCmdArgs: [ "--num-shard","1" ] - replace "1" with the number of GPUs used +- resources: + limits: + amd.com/gpu: "1" - replace "1" with the number of GPUs used + +#### If deploy FaqGen based application on AMD ROCm device with vLLM +```bash +nano ~/chatqna-k8s-install/GenAIExamples/ChatQnA/kubernetes/helm/faqgen-rocm-values.yaml +``` + +- HIP_VISIBLE_DEVICES - this variable specifies the ID of the GPU that you want to use. + You can specify either one or several comma-separated ones - "0" or "0,1,2,3" +- TENSOR_PARALLEL_SIZE - must match the number of GPUs used +- resources: + limits: + amd.com/gpu: "1" - replace "1" with the number of GPUs used + +#### If deploy FaqGen based application on AMD ROCm device with TGI + +```bash +nano ~/chatqna-k8s-install/GenAIExamples/ChatQnA/kubernetes/helm/faqgen-rocm-tgi-values.yaml +``` + +- HIP_VISIBLE_DEVICES - this variable specifies the ID of the GPU that you want to use. 
+ You can specify either one or several comma-separated ones - "0" or "0,1,2,3" +- extraCmdArgs: [ "--num-shard","1" ] - replace "1" with the number of GPUs used +- resources: + limits: + amd.com/gpu: "1" - replace "1" with the number of GPUs used + +### Installing the Helm Chart + +#### If ROCm vLLM used +```bash +cd ~/chatqna-k8s-install/GenAIInfra/helm-charts +./update_dependency.sh +helm dependency update chatqna +helm upgrade --install chatqna chatqna \ + --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} \ + --values ../../GenAIExamples/ChatQnA/kubernetes/helm/rocm-values.yaml +``` + +#### If ROCm TGI used +```bash +cd ~/chatqna-k8s-install/GenAIInfra/helm-charts +./update_dependency.sh +helm dependency update chatqna +helm upgrade --install chatqna chatqna \ + --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} \ + --values ../../GenAIExamples/ChatQnA/kubernetes/helm/rocm-tgi-values.yaml +``` + +#### If deploy FaqGen based application on AMD ROCm device with vLLM +```bash +cd ~/chatqna-k8s-install/GenAIInfra/helm-charts +./update_dependency.sh +helm dependency update chatqna +helm upgrade --install chatqna chatqna \ + --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} \ + --values ../../GenAIExamples/ChatQnA/kubernetes/helm/faqgen-rocm-values.yaml +``` + +#### If deploy FaqGen based application on AMD ROCm device with TGI +```bash +cd ~/chatqna-k8s-install/GenAIInfra/helm-charts +./update_dependency.sh +helm dependency update chatqna +helm upgrade --install chatqna chatqna \ + --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} \ + --values ../../GenAIExamples/ChatQnA/kubernetes/helm/faqgen-rocm-tgi-values.yaml +``` diff --git a/ChatQnA/kubernetes/helm/faqgen-rocm-tgi-values.yaml b/ChatQnA/kubernetes/helm/faqgen-rocm-tgi-values.yaml new file mode 100644 index 0000000000..f261cad95e --- /dev/null +++ b/ChatQnA/kubernetes/helm/faqgen-rocm-tgi-values.yaml @@ -0,0 +1,66 @@ +# Copyright (C) 2025 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +CHATQNA_TYPE: 
"CHATQNA_FAQGEN" +llm-uservice: + enabled: true + image: + repository: opea/llm-faqgen + LLM_MODEL_ID: meta-llama/Meta-Llama-3-8B-Instruct + FAQGEN_BACKEND: "TGI" + service: + port: 80 +tgi: + enabled: true + accelDevice: "rocm" + image: + repository: ghcr.io/huggingface/text-generation-inference + tag: "2.4.1-rocm" + LLM_MODEL_ID: meta-llama/Meta-Llama-3-8B-Instruct + MAX_INPUT_LENGTH: "3072" + MAX_TOTAL_TOKENS: "4096" + PYTORCH_TUNABLEOP_ENABLED: "0" + USE_FLASH_ATTENTION: "true" + FLASH_ATTENTION_RECOMPUTE: "false" + HIP_VISIBLE_DEVICES: "0,1" + MAX_BATCH_SIZE: "2" + extraCmdArgs: [ "--num-shard","2" ] + resources: + limits: + amd.com/gpu: "2" + requests: + cpu: 1 + memory: 16Gi + securityContext: + readOnlyRootFilesystem: false + runAsNonRoot: false + runAsUser: 0 + capabilities: + add: + - SYS_PTRACE + readinessProbe: + initialDelaySeconds: 60 + periodSeconds: 5 + timeoutSeconds: 1 + failureThreshold: 120 + startupProbe: + initialDelaySeconds: 60 + periodSeconds: 5 + timeoutSeconds: 1 + failureThreshold: 120 +vllm: + enabled: false + +# Reranking: second largest bottleneck when reranking is in use +# (i.e. query context docs have been uploaded with data-prep) +# +# TODO: could vLLM be used also for reranking / embedding? 
+teirerank: + accelDevice: "cpu" + image: + repository: ghcr.io/huggingface/text-embeddings-inference + tag: cpu-1.5 + # securityContext: + # readOnlyRootFilesystem: false + readinessProbe: + timeoutSeconds: 1 diff --git a/ChatQnA/kubernetes/helm/faqgen-rocm-values.yaml b/ChatQnA/kubernetes/helm/faqgen-rocm-values.yaml new file mode 100644 index 0000000000..279c59721d --- /dev/null +++ b/ChatQnA/kubernetes/helm/faqgen-rocm-values.yaml @@ -0,0 +1,59 @@ +# Copyright (C) 2025 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +CHATQNA_TYPE: "CHATQNA_FAQGEN" +llm-uservice: + enabled: true + image: + repository: opea/llm-faqgen + LLM_MODEL_ID: meta-llama/Meta-Llama-3-8B-Instruct + FAQGEN_BACKEND: "vLLM" + service: + port: 80 +tgi: + enabled: false +vllm: + enabled: true + accelDevice: "rocm" + image: + repository: opea/vllm-rocm + tag: latest + env: + HIP_VISIBLE_DEVICES: "0" + TENSOR_PARALLEL_SIZE: "1" + HF_HUB_DISABLE_PROGRESS_BARS: "1" + HF_HUB_ENABLE_HF_TRANSFER: "0" + VLLM_USE_TRITON_FLASH_ATTN: "0" + VLLM_WORKER_MULTIPROC_METHOD: "spawn" + PYTORCH_JIT: "0" + HF_HOME: "/data" + extraCmd: + command: [ "python3", "/workspace/api_server.py" ] + extraCmdArgs: [ "--swap-space", "16", + "--disable-log-requests", + "--dtype", "float16", + "--num-scheduler-steps", "1", + "--distributed-executor-backend", "mp" ] + resources: + limits: + amd.com/gpu: "1" + startupProbe: + failureThreshold: 180 + securityContext: + readOnlyRootFilesystem: false + runAsNonRoot: false + runAsUser: 0 + +# Reranking: second largest bottleneck when reranking is in use +# (i.e. query context docs have been uploaded with data-prep) +# +# TODO: could vLLM be used also for reranking / embedding? 
+teirerank: + accelDevice: "cpu" + image: + repository: ghcr.io/huggingface/text-embeddings-inference + tag: cpu-1.5 + # securityContext: + # readOnlyRootFilesystem: false + readinessProbe: + timeoutSeconds: 1 diff --git a/ChatQnA/kubernetes/helm/rocm-tgi-values.yaml b/ChatQnA/kubernetes/helm/rocm-tgi-values.yaml new file mode 100644 index 0000000000..ab6006e06c --- /dev/null +++ b/ChatQnA/kubernetes/helm/rocm-tgi-values.yaml @@ -0,0 +1,61 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +# Accelerate inferencing in heaviest components to improve performance +# by overriding their subchart values + +tgi: + enabled: true + accelDevice: "rocm" + image: + repository: ghcr.io/huggingface/text-generation-inference + tag: "2.4.1-rocm" + LLM_MODEL_ID: meta-llama/Meta-Llama-3-8B-Instruct + MAX_INPUT_LENGTH: "3072" + MAX_TOTAL_TOKENS: "4096" + PYTORCH_TUNABLEOP_ENABLED: "0" + USE_FLASH_ATTENTION: "true" + FLASH_ATTENTION_RECOMPUTE: "false" + HIP_VISIBLE_DEVICES: "0,1" + MAX_BATCH_SIZE: "2" + extraCmdArgs: [ "--num-shard","2" ] + resources: + limits: + amd.com/gpu: "2" + requests: + cpu: 1 + memory: 16Gi + securityContext: + readOnlyRootFilesystem: false + runAsNonRoot: false + runAsUser: 0 + capabilities: + add: + - SYS_PTRACE + readinessProbe: + initialDelaySeconds: 60 + periodSeconds: 5 + timeoutSeconds: 1 + failureThreshold: 120 + startupProbe: + initialDelaySeconds: 60 + periodSeconds: 5 + timeoutSeconds: 1 + failureThreshold: 120 + +vllm: + enabled: false + +# Reranking: second largest bottleneck when reranking is in use +# (i.e. query context docs have been uploaded with data-prep) +# +# TODO: could vLLM be used also for reranking / embedding? 
+teirerank: + accelDevice: "cpu" + image: + repository: ghcr.io/huggingface/text-embeddings-inference + tag: cpu-1.5 + securityContext: + readOnlyRootFilesystem: false + readinessProbe: + timeoutSeconds: 1 diff --git a/ChatQnA/kubernetes/helm/rocm-values.yaml b/ChatQnA/kubernetes/helm/rocm-values.yaml new file mode 100644 index 0000000000..085b044084 --- /dev/null +++ b/ChatQnA/kubernetes/helm/rocm-values.yaml @@ -0,0 +1,53 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +# Accelerate inferencing in heaviest components to improve performance +# by overriding their subchart values + +tgi: + enabled: false +vllm: + enabled: true + accelDevice: "rocm" + image: + repository: opea/vllm-rocm + tag: latest + env: + HIP_VISIBLE_DEVICES: "0" + TENSOR_PARALLEL_SIZE: "1" + HF_HUB_DISABLE_PROGRESS_BARS: "1" + HF_HUB_ENABLE_HF_TRANSFER: "0" + VLLM_USE_TRITON_FLASH_ATTN: "0" + VLLM_WORKER_MULTIPROC_METHOD: "spawn" + PYTORCH_JIT: "0" + HF_HOME: "/data" + extraCmd: + command: [ "python3", "/workspace/api_server.py" ] + extraCmdArgs: [ "--swap-space", "16", + "--disable-log-requests", + "--dtype", "float16", + "--num-scheduler-steps", "1", + "--distributed-executor-backend", "mp" ] + resources: + limits: + amd.com/gpu: "1" + startupProbe: + failureThreshold: 180 + securityContext: + readOnlyRootFilesystem: false + runAsNonRoot: false + runAsUser: 0 + +# Reranking: second largest bottleneck when reranking is in use +# (i.e. query context docs have been uploaded with data-prep) +# +# TODO: could vLLM be used also for reranking / embedding? +teirerank: + accelDevice: "cpu" + image: + repository: ghcr.io/huggingface/text-embeddings-inference + tag: cpu-1.5 + securityContext: + readOnlyRootFilesystem: false + readinessProbe: + timeoutSeconds: 1