ChatQnA - Adding files to deploy an application in the K8S environment using Helm #1759
@@ -28,3 +28,224 @@ helm install chatqna oci://ghcr.io/opea-project/charts/chatqna --set global.HUG

See other *-values.yaml files in this directory for further reference.

## Deploy on AMD ROCm using Helm charts from the binary Helm repository

### Creating working dirs

```bash
mkdir ~/chatqna-k8s-install && cd ~/chatqna-k8s-install
```

### Cloning repos

```bash
git clone https://github.com/opea-project/GenAIExamples.git
```

### Go to the installation directory

```bash
cd GenAIExamples/ChatQnA/kubernetes/helm
```

### Setting system variables

```bash
export HFTOKEN="your_huggingface_token"
export MODELDIR="/mnt/opea-models"
export MODELNAME="meta-llama/Meta-Llama-3-8B-Instruct"
```
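
If the chart mounts `MODELDIR` as a host path for model caching (an assumption; check the chart's values before relying on it), the directory must already exist on each worker node:

```bash
# Hypothetical prerequisite: create the model cache directory on every node
sudo mkdir -p /mnt/opea-models
```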

### Setting variables in values files

#### If using ROCm vLLM

```bash
nano ~/chatqna-k8s-install/GenAIExamples/ChatQnA/kubernetes/helm/rocm-values.yaml
```

#### If deploying a FaqGen-based application on AMD ROCm with vLLM

```bash
nano ~/chatqna-k8s-install/GenAIExamples/ChatQnA/kubernetes/helm/faqgen-rocm-values.yaml
```

> Review comment: Ditto, could assume you are in the correct directory.

- HIP_VISIBLE_DEVICES - specifies the ID(s) of the GPU(s) to use; either a single ID or several comma-separated ones, e.g. "0" or "0,1,2,3"
- TENSOR_PARALLEL_SIZE - must match the number of GPUs used (a combined two-GPU example follows this list)
- ```yaml
  resources:
    limits:
      amd.com/gpu: "1" # replace "1" with the number of GPUs used
  ```
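
For instance, a two-GPU vLLM setup would change all three entries together. This is a sketch; the key names are taken from the values files added in this PR:

```yaml
# Hypothetical two-GPU setup: the GPU list, the tensor-parallel size,
# and the resource limit must all agree.
HIP_VISIBLE_DEVICES: "0,1"
TENSOR_PARALLEL_SIZE: "2"
resources:
  limits:
    amd.com/gpu: "2"
```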

#### If using ROCm TGI

```bash
nano ~/chatqna-k8s-install/GenAIExamples/ChatQnA/kubernetes/helm/rocm-tgi-values.yaml
```

> Review comment: ditto on directory.

#### If deploying a FaqGen-based application on AMD ROCm with TGI

```bash
nano ~/chatqna-k8s-install/GenAIExamples/ChatQnA/kubernetes/helm/faqgen-rocm-tgi-values.yaml
```

- HIP_VISIBLE_DEVICES - specifies the ID(s) of the GPU(s) to use; either a single ID or several comma-separated ones, e.g. "0" or "0,1,2,3"
- extraCmdArgs: [ "--num-shard","1" ] - replace "1" with the number of GPUs used (a combined two-GPU example follows this list)
- ```yaml
  resources:
    limits:
      amd.com/gpu: "1" # replace "1" with the number of GPUs used
  ```
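
The matching two-GPU TGI setup would look like this; a sketch mirroring the faqgen-rocm-tgi-values.yaml added in this PR:

```yaml
# Hypothetical two-GPU setup: --num-shard, the GPU list,
# and the resource limit must all agree.
HIP_VISIBLE_DEVICES: "0,1"
extraCmdArgs: [ "--num-shard", "2" ]
resources:
  limits:
    amd.com/gpu: "2"
```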

### Installing the Helm Chart

#### If using ROCm vLLM

> Review comment: why is this heading not bold faced while others below are?

```bash
helm upgrade --install chatqna oci://ghcr.io/opea-project/charts/chatqna \
  --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} \
  --values rocm-values.yaml
```

#### If using ROCm TGI

```bash
helm upgrade --install chatqna oci://ghcr.io/opea-project/charts/chatqna \
  --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} \
  --values rocm-tgi-values.yaml
```

#### If deploying a FaqGen-based application on AMD ROCm with vLLM

```bash
helm upgrade --install chatqna oci://ghcr.io/opea-project/charts/chatqna \
  --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} \
  --values faqgen-rocm-values.yaml
```

#### If deploying a FaqGen-based application on AMD ROCm with TGI

```bash
helm upgrade --install chatqna oci://ghcr.io/opea-project/charts/chatqna \
  --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} \
  --values faqgen-rocm-tgi-values.yaml
```
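
Whichever variant you installed, you can watch the rollout before moving on. This is a generic check, assuming the release name chatqna used above and Helm's usual per-release instance label:

```bash
# Show release status, then watch the release's pods until they are Ready
helm status chatqna
kubectl get pods -l app.kubernetes.io/instance=chatqna --watch
```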

## Deploy on AMD ROCm using Helm charts from Git repositories

### Creating working dirs

```bash
mkdir ~/chatqna-k8s-install && cd ~/chatqna-k8s-install
```

### Cloning repos

GenAIInfra provides the chatqna chart sources and the update_dependency.sh helper used in the install step below.

```bash
git clone https://github.com/opea-project/GenAIExamples.git
git clone https://github.com/opea-project/GenAIInfra.git
```

### Go to the installation directory

```bash
cd GenAIExamples/ChatQnA/kubernetes/helm
```

### Setting system variables

```bash
export HFTOKEN="your_huggingface_token"
export MODELDIR="/mnt/opea-models"
export MODELNAME="Intel/neural-chat-7b-v3-3"
```

### Setting variables in values files

#### If using ROCm vLLM

```bash
nano ~/chatqna-k8s-install/GenAIExamples/ChatQnA/kubernetes/helm/rocm-values.yaml
```

- HIP_VISIBLE_DEVICES - specifies the ID(s) of the GPU(s) to use; either a single ID or several comma-separated ones, e.g. "0" or "0,1,2,3"
- TENSOR_PARALLEL_SIZE - must match the number of GPUs used
- ```yaml
  resources:
    limits:
      amd.com/gpu: "1" # replace "1" with the number of GPUs used
  ```

#### If using ROCm TGI

```bash
nano ~/chatqna-k8s-install/GenAIExamples/ChatQnA/kubernetes/helm/rocm-tgi-values.yaml
```

- HIP_VISIBLE_DEVICES - specifies the ID(s) of the GPU(s) to use; either a single ID or several comma-separated ones, e.g. "0" or "0,1,2,3"
- extraCmdArgs: [ "--num-shard","1" ] - replace "1" with the number of GPUs used
- ```yaml
  resources:
    limits:
      amd.com/gpu: "1" # replace "1" with the number of GPUs used
  ```

#### If deploying a FaqGen-based application on AMD ROCm with vLLM

```bash
nano ~/chatqna-k8s-install/GenAIExamples/ChatQnA/kubernetes/helm/faqgen-rocm-values.yaml
```

- HIP_VISIBLE_DEVICES - specifies the ID(s) of the GPU(s) to use; either a single ID or several comma-separated ones, e.g. "0" or "0,1,2,3"
- TENSOR_PARALLEL_SIZE - must match the number of GPUs used
- ```yaml
  resources:
    limits:
      amd.com/gpu: "1" # replace "1" with the number of GPUs used
  ```

#### If deploying a FaqGen-based application on AMD ROCm with TGI

```bash
nano ~/chatqna-k8s-install/GenAIExamples/ChatQnA/kubernetes/helm/faqgen-rocm-tgi-values.yaml
```

- HIP_VISIBLE_DEVICES - specifies the ID(s) of the GPU(s) to use; either a single ID or several comma-separated ones, e.g. "0" or "0,1,2,3"
- extraCmdArgs: [ "--num-shard","1" ] - replace "1" with the number of GPUs used
- ```yaml
  resources:
    limits:
      amd.com/gpu: "1" # replace "1" with the number of GPUs used
  ```
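
Before installing, you can dry-render the chart with an edited values file to catch YAML mistakes early. A sketch using helm template against the published OCI chart; any of the values files above can be substituted:

```bash
# Renders the manifests locally; nothing is deployed to the cluster
helm template chatqna oci://ghcr.io/opea-project/charts/chatqna \
  --values faqgen-rocm-values.yaml > /dev/null && echo "values render OK"
```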

### Installing the Helm Chart

#### If using ROCm vLLM
```bash
cd ~/chatqna-k8s-install/GenAIInfra/helm-charts
./update_dependency.sh
helm dependency update chatqna
helm upgrade --install chatqna chatqna \
  --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} \
  --values ../../GenAIExamples/ChatQnA/kubernetes/helm/rocm-values.yaml
```

#### If using ROCm TGI
```bash
cd ~/chatqna-k8s-install/GenAIInfra/helm-charts
./update_dependency.sh
helm dependency update chatqna
helm upgrade --install chatqna chatqna \
  --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} \
  --values ../../GenAIExamples/ChatQnA/kubernetes/helm/rocm-tgi-values.yaml
```

#### If deploying a FaqGen-based application on AMD ROCm with vLLM
```bash
cd ~/chatqna-k8s-install/GenAIInfra/helm-charts
./update_dependency.sh
helm dependency update chatqna
helm upgrade --install chatqna chatqna \
  --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} \
  --values ../../GenAIExamples/ChatQnA/kubernetes/helm/faqgen-rocm-values.yaml
```

#### If deploying a FaqGen-based application on AMD ROCm with TGI
```bash
cd ~/chatqna-k8s-install/GenAIInfra/helm-charts
./update_dependency.sh
helm dependency update chatqna
helm upgrade --install chatqna chatqna \
  --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} \
  --values ../../GenAIExamples/ChatQnA/kubernetes/helm/faqgen-rocm-tgi-values.yaml
```
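
Once the pods are ready, a quick smoke test is to port-forward the ChatQnA service and send a single request. The service name and port 8888 are assumed from the chart's defaults; adjust them if your release differs:

```bash
# Forward the ChatQnA mega-service locally, then send a test query
kubectl port-forward svc/chatqna 8888:8888 &
curl http://localhost:8888/v1/chatqna \
  -H "Content-Type: application/json" \
  -d '{"messages": "What is the revenue of Nike in 2023?"}'
```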

@@ -0,0 +1,66 @@
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

CHATQNA_TYPE: "CHATQNA_FAQGEN"
llm-uservice:
  enabled: true
  image:
    repository: opea/llm-faqgen
  LLM_MODEL_ID: meta-llama/Meta-Llama-3-8B-Instruct
  FAQGEN_BACKEND: "TGI"
  service:
    port: 80
tgi:
  enabled: true
  accelDevice: "rocm"
  image:
    repository: ghcr.io/huggingface/text-generation-inference
    tag: "2.4.1-rocm"
  LLM_MODEL_ID: meta-llama/Meta-Llama-3-8B-Instruct
  MAX_INPUT_LENGTH: "3072"
  MAX_TOTAL_TOKENS: "4096"
  PYTORCH_TUNABLEOP_ENABLED: "0"
  USE_FLASH_ATTENTION: "true"
  FLASH_ATTENTION_RECOMPUTE: "false"
  HIP_VISIBLE_DEVICES: "0,1"
  MAX_BATCH_SIZE: "2"
  extraCmdArgs: [ "--num-shard","2" ]
  resources:
    limits:
      amd.com/gpu: "2"
    requests:
      cpu: 1
      memory: 16Gi
  securityContext:
    readOnlyRootFilesystem: false
    runAsNonRoot: false
    runAsUser: 0
    capabilities:
      add:
        - SYS_PTRACE
  readinessProbe:
    initialDelaySeconds: 60
    periodSeconds: 5
    timeoutSeconds: 1
    failureThreshold: 120
  startupProbe:
    initialDelaySeconds: 60
    periodSeconds: 5
    timeoutSeconds: 1
    failureThreshold: 120
vllm:
  enabled: false

# Reranking: second largest bottleneck when reranking is in use
# (i.e. query context docs have been uploaded with data-prep)
#
# TODO: could vLLM be used also for reranking / embedding?
teirerank:
  accelDevice: "cpu"
  image:
    repository: ghcr.io/huggingface/text-embeddings-inference
    tag: cpu-1.5
  # securityContext:
  #   readOnlyRootFilesystem: false
  readinessProbe:
    timeoutSeconds: 1

@@ -0,0 +1,59 @@
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

CHATQNA_TYPE: "CHATQNA_FAQGEN"
llm-uservice:
  enabled: true
  image:
    repository: opea/llm-faqgen
  LLM_MODEL_ID: meta-llama/Meta-Llama-3-8B-Instruct
  FAQGEN_BACKEND: "vLLM"
  service:
    port: 80
tgi:
  enabled: false
vllm:
  enabled: true
  accelDevice: "rocm"
  image:
    repository: opea/vllm-rocm
    tag: latest
  env:
    HIP_VISIBLE_DEVICES: "0"
    TENSOR_PARALLEL_SIZE: "1"
    HF_HUB_DISABLE_PROGRESS_BARS: "1"
    HF_HUB_ENABLE_HF_TRANSFER: "0"
    VLLM_USE_TRITON_FLASH_ATTN: "0"
    VLLM_WORKER_MULTIPROC_METHOD: "spawn"
    PYTORCH_JIT: "0"
    HF_HOME: "/data"
  extraCmd:
    command: [ "python3", "/workspace/api_server.py" ]
  extraCmdArgs: [ "--swap-space", "16",
                  "--disable-log-requests",
                  "--dtype", "float16",
                  "--num-scheduler-steps", "1",
                  "--distributed-executor-backend", "mp" ]
  resources:
    limits:
      amd.com/gpu: "1"
  startupProbe:
    failureThreshold: 180
  securityContext:
    readOnlyRootFilesystem: false
    runAsNonRoot: false
    runAsUser: 0

# Reranking: second largest bottleneck when reranking is in use
# (i.e. query context docs have been uploaded with data-prep)
#
# TODO: could vLLM be used also for reranking / embedding?
teirerank:
  accelDevice: "cpu"
  image:
    repository: ghcr.io/huggingface/text-embeddings-inference
    tag: cpu-1.5
  # securityContext:
  #   readOnlyRootFilesystem: false
  readinessProbe:
    timeoutSeconds: 1

> Review comment: Could assume you are already in the correct directory, namely ~/chatqna-k8s-install/GenAIExamples/ChatQnA/kubernetes/helm